def __init__(self, triple_store_url, data_directory): ''' Constructor ''' self.triple_store = TripleStore(triple_store_url) self.data_directory = data_directory
class Harvester(object): ''' The Harvester get the list of events from EventSeer using the JSON API and save the data into the triple store ''' def __init__(self, triple_store_url, data_directory): ''' Constructor ''' self.triple_store = TripleStore(triple_store_url) self.data_directory = data_directory def process_events(self): ''' Call the main API and process all the events, page by page ''' # Number of events to ask at once PAGE_SIZE = 100 # Iterate over all the pages of events pages = (self._get_events(0, 1)['totalRecords'] / PAGE_SIZE) + 1 for i in range(0, pages): print 'Get page %d' % i page = self._get_events(i * PAGE_SIZE, PAGE_SIZE) for record in page['records']: self._process_event(record) def _process_event(self, record): # Get the event id, its name and its last modification date event_name = re.sub(r'<a [^>]*>([^<]*)</a>', '\g<1>', record['Event'], flags=re.IGNORECASE).replace('\n', '') event_id = re.search('href="([^"]*)"', record['Event']).group(1)[1:-1] event_last_update_date = datetime.strptime(record['Date'], "%d %b %Y") event = Event(event_id) # Get the last modification date from the end point server_version_date = self.triple_store.get_last_version_date(event) # Check the status of the event in the triple store if server_version_date == None or (event_last_update_date - server_version_date).days > 0: # The event is not in the triple store or needs to be updated print '\t[UPD] %s - %s' % (event_id, event_name) # Add the topics not already existing for t in event.get_topics(): topic = Topic(t) if self.triple_store.get_last_version_date(topic) == None: try: print '\t\t[UPD-TOPIC] %s' % t topic.load_data() self.triple_store.save_rdf_data(topic) except: # It's ok if we miss one pass # Update the data about all the persons concerned for p in event.get_persons(): try: print '\t\t[UPD-PERSON] %s' % p person = Person(p) person.load_data() self.triple_store.save_rdf_data(person) except: # It's ok if we miss one pass # Save the RDF data of the event event.load_data() self.triple_store.save_rdf_data(event) # Save the CFP from the call file = open(self.data_directory + '/' + event.get_resource_name() + '_cfp.txt', 'w') file.write(event.get_cfp_data()) file.close() else: # The server version is up to date print '\t[OK] %s - %s' % (event_id, event_name) def _get_events(self, start, size): ''' Return a list of events as a JSON object @param start: start index @param size: number of events to retrieve ''' url = "http://eventseer.net/e/_category_list_json/?category=e&sort=Date&dir=desc&type=cat&filter=all&deadlines=all&search=text&start=%d&results=%d" % (start, size) return json.loads(urllib2.urlopen(url).read())