def process_event(event):
    '''
    Download the CFP page of an event from eventseer.net, save the CFP as
    plain text under data/ and serialise an RDF description of the event.

    @param event: a dict with keys 'Event' (HTML snippet containing the
        event href), 'StartDate', 'EndDate', 'City' and 'Country'
        (each of the last four may be None)
    '''
    # Get the event page and compute its id (e.g. '/e/12345/' -> 'event_12345')
    url = re.search('href="([^"]*)"', event['Event']).group(1)
    event_id = url.replace('/', '').replace('e', 'event_')

    # Fetch and parse the event page; bail out when it carries no CFP
    document = BeautifulSoup(urllib2.urlopen("http://eventseer.net" + url).read())
    cfp_node = document.find(id='cfp-content')
    if cfp_node is None:
        return

    # Convert the CFP to text and save it
    cfp = cfp_node.renderContents()
    cfp = cfp.replace('<p>', '\n').replace('</p>', '').replace('<br />', '')
    cfp = cfp.replace('<ul>', '\n').replace('</ul>', '').replace('<li>', '').replace('</li>', '')
    cfp = re.sub(r'<a [^>]*>([^<]*)</a>', '\g<1>', cfp, flags=re.IGNORECASE)
    cfp = re.sub(r'^\n*', '', cfp)
    # 'with' guarantees the handle is closed even if write() raises
    with open('data/' + event_id + '_cfp.txt', 'w') as f:
        f.write(cfp)

    # Create and populate the RDF graph
    graph = ConjunctiveGraph()
    graph.bind('swc', SWC)
    graph.bind('cfp', CFP)
    graph.bind('ical', ICAL)
    graph.bind('foaf', FOAF)
    graph.bind('dct', DCT)
    graph.bind('lode', LODE)

    ### Event
    resource_event = LDES[event_id]
    graph.add((resource_event, RDF.type, SWC['AcademicEvent']))
    if event['StartDate'] is not None:
        graph.add((resource_event, ICAL['dtstart'],
                   Literal(datetime.strptime(event['StartDate'], "%d %b %Y"))))
    if event['EndDate'] is not None:
        graph.add((resource_event, ICAL['dtend'],
                   Literal(datetime.strptime(event['EndDate'], "%d %b %Y"))))
    if event['City'] is not None and event['Country'] is not None:
        city = get_location(event['City'], event['Country'])
        if city is not None:
            city = URIRef(city)
        else:
            # No geo URI found: fall back to a plain "City, Country" literal
            city = Literal(event['City'] + ", " + event['Country'])
        graph.add((resource_event, FOAF['based_near'], city))

    ### CFP
    resource_cfp = LDES[event_id + '_cfp']
    graph.add((resource_cfp, RDF.type, CFP['CallForPapers']))
    graph.add((resource_cfp, CFP['for'], resource_event))
    graph.add((resource_cfp, CFP['details'], URIRef(BASE + 'data/' + event_id + '_cfp.txt')))

    ### Deadlines (scraped from the inline 'deadlineList' JavaScript variable)
    deadlines = []
    for script in document.findAll('script'):
        res = re.search('var deadlineList = ([^;]*);', script.renderContents())
        if res is not None:
            # Massage the JS object literal into valid JSON before parsing
            txt = res.group(1).replace('\n', '').replace('\t', '').replace("'", '"')
            txt = re.sub(r'<span [^>]*>([^<]*)</span>', '\g<1>', txt, flags=re.IGNORECASE)
            txt = txt.replace('Date:', '"Date":').replace('Title:', '"Title":')
            deadlines = json.loads(txt)
    for i, deadline in enumerate(deadlines):
        resource_deadline = LDES[event_id + '_deadline-' + str(i)]
        graph.add((resource_deadline, RDF.type, ICAL['Vevent']))
        graph.add((resource_deadline, ICAL['dtstart'],
                   Literal(datetime.strptime(deadline['Date'], "%d %b %Y"))))
        graph.add((resource_deadline, ICAL['dtend'],
                   Literal(datetime.strptime(deadline['Date'], "%d %b %Y"))))
        graph.add((resource_deadline, ICAL['summary'], Literal(deadline['Title'])))
        graph.add((resource_deadline, ICAL['relatedTo'], resource_event))

    ### Topics and persons
    for anchor in cfp_node.findAll('a'):
        href = anchor.get('href')
        if href is None:
            continue
        if href[:3] == '/t/':
            topic_id = href.replace('/t/', 'topic_').replace('/', '')
            graph.add((resource_event, DCT['subject'], LDES[topic_id]))
        elif href[:3] == '/p/':
            person_id = href.replace('/p/', 'person_').replace('/', '')
            graph.add((resource_event, LODE['involvedAgent'], LDES[person_id]))

    ### Save
    with open('data/' + event_id + '.rdf', 'w') as f:
        f.write(graph.serialize())
def process_event(event):
    '''
    Scrape a single eventseer.net event: store its call-for-papers as a
    text file under data/ and write an RDF description of the event.

    @param event: dict carrying 'Event' (HTML with the event href),
        'StartDate', 'EndDate', 'City' and 'Country'
    '''
    # Derive the event identifier from the link embedded in event['Event']
    href = re.search('href="([^"]*)"', event['Event']).group(1)
    event_id = href.replace('/', '').replace('e', 'event_')

    # Fetch the page; events without a CFP section are skipped entirely
    page = BeautifulSoup(urllib2.urlopen("http://eventseer.net" + href).read())
    if page.find(id='cfp-content') is None:
        return

    # Strip the CFP markup down to plain text and persist it
    text = page.find(id='cfp-content').renderContents()
    for tag, replacement in (('<p>', '\n'), ('</p>', ''), ('<br />', ''),
                             ('<ul>', '\n'), ('</ul>', ''), ('<li>', ''), ('</li>', '')):
        text = text.replace(tag, replacement)
    text = re.sub(r'<a [^>]*>([^<]*)</a>', '\g<1>', text, flags=re.IGNORECASE)
    text = re.sub(r'^\n*', '', text, flags=re.IGNORECASE)
    out = open('data/' + event_id + '_cfp.txt', 'w')
    out.write(text)
    out.close()

    # Build the RDF graph describing the event
    graph = ConjunctiveGraph()
    for prefix, namespace in (('swc', SWC), ('cfp', CFP), ('ical', ICAL),
                              ('foaf', FOAF), ('dct', DCT), ('lode', LODE)):
        graph.bind(prefix, namespace)

    ### Event
    resource_event = LDES[event_id]
    graph.add((resource_event, RDF.type, SWC['AcademicEvent']))
    if event['StartDate'] is not None:
        start = datetime.strptime(event['StartDate'], "%d %b %Y")
        graph.add((resource_event, ICAL['dtstart'], Literal(start)))
    if event['EndDate'] is not None:
        end = datetime.strptime(event['EndDate'], "%d %b %Y")
        graph.add((resource_event, ICAL['dtend'], Literal(end)))
    if event['City'] is not None and event['Country'] is not None:
        located = get_location(event['City'], event['Country'])
        if located is not None:
            graph.add((resource_event, FOAF['based_near'], URIRef(located)))
        else:
            graph.add((resource_event, FOAF['based_near'],
                       Literal(event['City'] + ", " + event['Country'])))

    ### CFP
    resource_cfp = LDES[event_id + '_cfp']
    graph.add((resource_cfp, RDF.type, CFP['CallForPapers']))
    graph.add((resource_cfp, CFP['for'], resource_event))
    graph.add((resource_cfp, CFP['details'], URIRef(BASE + 'data/' + event_id + '_cfp.txt')))

    ### Deadlines
    deadlines = []
    for script in page.findAll('script'):
        match = re.search('var deadlineList = ([^;]*);', script.renderContents())
        if match is None:
            continue
        # Rewrite the embedded JavaScript literal into parseable JSON
        raw = match.group(1).replace('\n', '').replace('\t', '').replace("'", '"')
        raw = re.sub(r'<span [^>]*>([^<]*)</span>', '\g<1>', raw, flags=re.IGNORECASE)
        raw = raw.replace('Date:', '"Date":').replace('Title:', '"Title":')
        deadlines = json.loads(raw)
    for index, deadline in enumerate(deadlines):
        resource_deadline = LDES[event_id + '_deadline-' + str(index)]
        stamp = Literal(datetime.strptime(deadline['Date'], "%d %b %Y"))
        graph.add((resource_deadline, RDF.type, ICAL['Vevent']))
        graph.add((resource_deadline, ICAL['dtstart'], stamp))
        graph.add((resource_deadline, ICAL['dtend'], stamp))
        graph.add((resource_deadline, ICAL['summary'], Literal(deadline['Title'])))
        graph.add((resource_deadline, ICAL['relatedTo'], resource_event))

    ### Topics and persons
    for anchor in page.find(id='cfp-content').findAll('a'):
        target = anchor.get('href')
        if target is None:
            continue
        if target[:3] == '/t/':
            graph.add((resource_event, DCT['subject'],
                       LDES[target.replace('/t/', 'topic_').replace('/', '')]))
        if target[:3] == '/p/':
            graph.add((resource_event, LODE['involvedAgent'],
                       LDES[target.replace('/p/', 'person_').replace('/', '')]))

    ### Save
    out = open('data/' + event_id + '.rdf', 'w')
    out.write(graph.serialize())
    out.close()
def _process_data(self, document):
    '''
    Creates the RDF graph describing the event and stores its
    serialisation in self.rdf_data

    @param document: the DOM document of the event
    '''
    # Create the graph and bind the namespace prefixes
    graph = ConjunctiveGraph()
    graph.bind('swc', SWC)
    graph.bind('cfp', CFP)
    graph.bind('ical', ICAL)
    graph.bind('foaf', FOAF)
    graph.bind('dct', DCT)
    graph.bind('lode', LODE)

    # Init the event
    resource_event = LDES[self.get_resource_name()]
    graph.add((resource_event, RDF.type, SWC['AcademicEvent']))

    # Get the title
    if document.find(id='inner_left') is not None:
        title = document.find(id='inner_left').find('h1').text
        graph.add((resource_event, RDFS.label, Literal(title)))

    # Get the location
    if document.find(text='City:') is not None and document.find(text='Country:') is not None:
        city = document.find(text='City:').findParent().findNextSibling().renderContents()
        country = document.find(text='Country:').findParent().findNextSibling().renderContents()
        location = get_location(city, country)
        if location is None:
            # No geo URI available: keep a plain "City, Country" literal
            location = Literal("%s, %s" % (city, country))
        graph.add((resource_event, FOAF['based_near'], location))

    # Get the starting and ending dates, e.g. "July 12-15, 2011"
    if document.find(text='Period:') is not None:
        text = document.find(text='Period:').findParent().findNextSibling().renderContents()
        parts = re.search(r'(?P<begin>[^-,]*)(-(?P<end>[^,]*))?, (?P<year>\d{4})', text).groupdict()
        if parts['begin'] is not None and parts['year'] is not None:
            (month, start_day) = parts['begin'].split(' ')
            begin_date = datetime.strptime("%s %s %s" % (start_day, month, parts['year']), "%d %B %Y")
            graph.add((resource_event, ICAL['dtstart'], Literal(begin_date)))
            if parts['end'] is not None:
                end_parts = parts['end'].split(' ')
                end_date = None
                if len(end_parts) == 2:
                    # The end carries its own month, e.g. "August 15"
                    end_date = datetime.strptime("%s %s %s" % (end_parts[1], end_parts[0], parts['year']), "%d %B %Y")
                elif len(end_parts) == 1:
                    # The end is just a day within the starting month
                    end_date = datetime.strptime("%s %s %s" % (end_parts[0], month, parts['year']), "%d %B %Y")
                if end_date is not None:
                    graph.add((resource_event, ICAL['dtend'], Literal(end_date)))

    # Get the data for the CFP
    resource_cfp = LDES[self.get_resource_name() + "_cfp"]
    graph.add((resource_cfp, RDF.type, CFP['CallForPapers']))
    graph.add((resource_cfp, CFP['for'], LDES[self.entity_id]))
    graph.add((resource_cfp, CFP['details'], URIRef(BASE + 'data/' + self.get_resource_name() + '_cfp.txt')))

    # Get the deadlines from the inline 'deadlineList' JavaScript variable
    deadlines = []
    for script in document.findAll('script'):
        res = re.search('var deadlineList = ([^;]*);', script.renderContents())
        if res is not None:
            # Turn the JS object literal into valid JSON before parsing
            txt = res.group(1).replace('\n', '').replace('\t', '').replace("'", '"')
            txt = re.sub(r'<span [^>]*>([^<]*)</span>', '\g<1>', txt, flags=re.IGNORECASE)
            txt = txt.replace('Date:', '"Date":').replace('Title:', '"Title":')
            deadlines = json.loads(txt)
    for i, deadline in enumerate(deadlines):
        resource_deadline = LDES[self.get_resource_name() + '_deadline_' + str(i)]
        graph.add((resource_deadline, RDF.type, ICAL['Vevent']))
        graph.add((resource_deadline, ICAL['dtstart'], Literal(datetime.strptime(deadline['Date'], "%d %b %Y"))))
        graph.add((resource_deadline, ICAL['dtend'], Literal(datetime.strptime(deadline['Date'], "%d %b %Y"))))
        graph.add((resource_deadline, ICAL['summary'], Literal(deadline['Title'])))
        graph.add((resource_deadline, ICAL['relatedTo'], resource_event))

    # Add the topics and persons
    cfp_content = document.find(id='cfp-content')
    if cfp_content is not None:
        for anchor in cfp_content.findAll('a'):
            link = anchor.get('href')
            if link is None:
                continue
            # BUGFIX: the de-duplication check now uses the same key that is
            # stored in the set (link[1:-1]); the original compared the full
            # link, so the membership test never matched
            if link[:3] == '/t/' and link[1:-1] not in self.topics_set:
                try:
                    graph.add((resource_event, DCT['subject'], LDES[Topic(link[1:-1]).get_resource_name()]))
                    self.topics_set.add(link[1:-1])
                except Exception:
                    # Ignore bad topic links
                    pass
            if link[:3] == '/p/' and link[1:-1] not in self.persons_set:
                try:
                    graph.add((resource_event, LODE['involvedAgent'], LDES[Person(link[1:-1]).get_resource_name()]))
                    self.persons_set.add(link[1:-1])
                except Exception:
                    # Ignore bad person links
                    pass

    # Set the last modification date
    graph.add((self.get_named_graph(), DCT['modified'], Literal(datetime.now())))

    # Save the data
    self.rdf_data = graph.serialize()
def _process_data(self, document):
    '''
    Build the RDF description of the event from its DOM document and keep
    the serialised graph in self.rdf_data.

    @param document: the DOM document of the event
    '''
    # Fresh graph with all the namespace prefixes bound
    graph = ConjunctiveGraph()
    for prefix, namespace in (('swc', SWC), ('cfp', CFP), ('ical', ICAL),
                              ('foaf', FOAF), ('dct', DCT), ('lode', LODE)):
        graph.bind(prefix, namespace)

    # The event itself
    resource_event = LDES[self.get_resource_name()]
    graph.add((resource_event, RDF.type, SWC['AcademicEvent']))

    # Title
    inner = document.find(id='inner_left')
    if inner is not None:
        graph.add((resource_event, RDFS.label, Literal(inner.find('h1').text)))

    # Location
    city_label = document.find(text='City:')
    country_label = document.find(text='Country:')
    if city_label is not None and country_label is not None:
        city = city_label.findParent().findNextSibling().renderContents()
        country = country_label.findParent().findNextSibling().renderContents()
        location = get_location(city, country)
        graph.add((resource_event, FOAF['based_near'],
                   location if location is not None else Literal("%s, %s" % (city, country))))

    # Start / end dates, e.g. "July 12-15, 2011"
    period_label = document.find(text='Period:')
    if period_label is not None:
        period = period_label.findParent().findNextSibling().renderContents()
        parts = re.search(r'(?P<begin>[^-,]*)(-(?P<end>[^,]*))?, (?P<year>\d{4})', period).groupdict()
        if parts['begin'] is not None and parts['year'] is not None:
            month, start_day = parts['begin'].split(' ')
            start = datetime.strptime("%s %s %s" % (start_day, month, parts['year']), "%d %B %Y")
            graph.add((resource_event, ICAL['dtstart'], Literal(start)))
            if parts['end'] is not None:
                end_bits = parts['end'].split(' ')
                finish = None
                if len(end_bits) == 2:
                    finish = datetime.strptime("%s %s %s" % (end_bits[1], end_bits[0], parts['year']), "%d %B %Y")
                elif len(end_bits) == 1:
                    finish = datetime.strptime("%s %s %s" % (end_bits[0], month, parts['year']), "%d %B %Y")
                if finish is not None:
                    graph.add((resource_event, ICAL['dtend'], Literal(finish)))

    # The CFP resource
    cfp_resource = LDES[self.get_resource_name() + "_cfp"]
    graph.add((cfp_resource, RDF.type, CFP['CallForPapers']))
    graph.add((cfp_resource, CFP['for'], LDES[self.entity_id]))
    graph.add((cfp_resource, CFP['details'], URIRef(BASE + 'data/' + self.get_resource_name() + '_cfp.txt')))

    # Deadlines scraped from the page's inline JavaScript
    deadlines = []
    for script in document.findAll('script'):
        found = re.search('var deadlineList = ([^;]*);', script.renderContents())
        if found is None:
            continue
        payload = found.group(1).replace('\n', '').replace('\t', '').replace("'", '"')
        payload = re.sub(r'<span [^>]*>([^<]*)</span>', '\g<1>', payload, flags=re.IGNORECASE)
        payload = payload.replace('Date:', '"Date":').replace('Title:', '"Title":')
        deadlines = json.loads(payload)
    for index, deadline in enumerate(deadlines):
        deadline_resource = LDES[self.get_resource_name() + '_deadline_' + str(index)]
        stamp = Literal(datetime.strptime(deadline['Date'], "%d %b %Y"))
        graph.add((deadline_resource, RDF.type, ICAL['Vevent']))
        graph.add((deadline_resource, ICAL['dtstart'], stamp))
        graph.add((deadline_resource, ICAL['dtend'], stamp))
        graph.add((deadline_resource, ICAL['summary'], Literal(deadline['Title'])))
        graph.add((deadline_resource, ICAL['relatedTo'], resource_event))

    # Topics and persons linked from the CFP body
    cfp_body = document.find(id='cfp-content')
    if cfp_body is not None:
        for anchor in cfp_body.findAll('a'):
            target = anchor.get('href')
            if target is None:
                continue
            if target[:3] == '/t/' and target not in self.topics_set:
                try:
                    topic = Topic(target[1:-1])
                    graph.add((resource_event, DCT['subject'], LDES[topic.get_resource_name()]))
                    self.topics_set.add(target[1:-1])
                except:
                    # Ignore bad topic links
                    pass
            if target[:3] == '/p/' and target not in self.persons_set:
                try:
                    person = Person(target[1:-1])
                    graph.add((resource_event, LODE['involvedAgent'], LDES[person.get_resource_name()]))
                    self.persons_set.add(target[1:-1])
                except:
                    # Ignore bad person links
                    pass

    # Record when this description was last generated
    graph.add((self.get_named_graph(), DCT['modified'], Literal(datetime.now())))

    # Keep the serialised graph
    self.rdf_data = graph.serialize()