class FunctionHarvester():
    """
    Harvest the details of agencies associated with a particular function.

    Results are fetched page by page through an ``RSAgencySearchClient`` and
    written to two MongoDB collections:

    * ``agencies`` -- the full search result for every agency, with a unique
      index on ``agency_id`` so an agency is only stored once.
    * ``functions`` -- one document per function whose ``agencies`` array
      holds a short summary of each agency harvested for that function.
    """

    def __init__(self, function):
        """Create the search client, size the harvest and open the database.

        Args:
            function: Identifier of the function to harvest; it is passed to
                the search client and compared against each result's
                ``functions[...]['identifier']``.
        """
        self.function = function
        self.total_pages = None
        self.pages_complete = 0
        self.client = RSAgencySearchClient()
        # Runs an initial search so ``total_pages`` is known up front.
        self.prepare_harvest()
        db = self.get_db()
        self.functions = db.functions
        self.agencies = db.agencies
        self.agencies.create_index('agency_id', unique=True)

    def get_db(self):
        """Return the default database of the ``MONGOLAB_URL`` connection."""
        dbclient = MongoClient(MONGOLAB_URL)
        return dbclient.get_default_database()

    def get_total(self):
        """Return the total number of results reported by the client."""
        return self.client.total_results

    def prepare_harvest(self):
        """Run a zero-result search and work out how many pages to fetch.

        Sets ``self.total_pages`` from the client's ``total_results`` and
        ``results_per_page`` as they stand after the search.

        Raises:
            ZeroDivisionError: If the client reports a page size of 0.
        """
        self.client.search_agencies(results_per_page=0, function=self.function)
        total_results = self.client.total_results
        print('{} agencies'.format(total_results))
        per_page = self.client.results_per_page
        # Ceiling division: ``total / per_page + 1`` asked for one extra,
        # empty page whenever the total was an exact multiple of the page
        # size.  At least one page is still requested so that a function
        # with no results gets its (empty) record upserted as before.
        pages = (int(total_results) + per_page - 1) // per_page
        self.total_pages = max(1, pages)
        print('{} pages'.format(self.total_pages))

    def _summarise_agency(self, result):
        """Build the summary stored in the function record for one result."""
        agency = {
            'agency_id': result['agency_id'],
            'title': result['title'],
            'agency_status': result['agency_status'],
            'location': result['location'],
            'start_date': result['dates']['start_date'],
            'end_date': result['dates']['end_date'],
        }
        # Record when the agency was responsible for *this* function; the
        # keys are simply absent if the function is not listed.
        for entry in result['functions']:
            if entry['identifier'] == self.function:
                agency['function_start'] = entry['start_date']
                agency['function_end'] = entry['end_date']
        return agency

    def start_harvest(self, page=None, delay=1):
        """Harvest every remaining page of results into the database.

        Args:
            page: 1-based page to (re)start from.  When falsy the harvest
                continues from the page after ``self.pages_complete``.
            delay: Seconds to pause after each page (default 1, the
                previously hard-coded value).
        """
        if not page:
            page = self.pages_complete + 1
        else:
            self.pages_complete = page - 1
        while self.pages_complete < self.total_pages:
            response = self.client.search_agencies(
                page=page, function=self.function)
            agencies = []
            for result in response['results']:
                print(result['agency_id'])
                agencies.append(self._summarise_agency(result))
                try:
                    # ``insert_one`` matches the ``update_one`` API used
                    # below; the legacy ``insert`` is gone in PyMongo 4.
                    self.agencies.insert_one(result)
                except DuplicateKeyError:
                    # Already stored (unique ``agency_id`` index) -- the
                    # summary is still added to this function's record.
                    pass
            self.functions.update_one(
                {'function': self.function},
                {'$push': {'agencies': {'$each': agencies}}},
                upsert=True)
            self.pages_complete += 1
            page += 1
            print('{} pages complete'.format(self.pages_complete))
            if delay:
                time.sleep(delay)
def __init__(self, function):
    """Set up a harvester for ``function``.

    Builds the agency search client, calls ``prepare_harvest()`` and then
    binds the ``functions`` and ``agencies`` collections of the database
    returned by ``get_db()``, making sure ``agencies`` has a unique index
    on ``agency_id``.
    """
    # Progress counters start empty.
    self.pages_complete = 0
    self.total_pages = None
    self.function = function

    self.client = RSAgencySearchClient()
    self.prepare_harvest()

    database = self.get_db()
    self.functions = database.functions
    self.agencies = database.agencies
    self.agencies.create_index('agency_id', unique=True)
class FunctionHarvester():
    """
    Harvest the details of agencies associated with a particular function.

    Results are fetched page by page through an ``RSAgencySearchClient`` and
    written to two MongoDB collections:

    * ``agencies`` -- the full search result for every agency, with a unique
      index on ``agency_id`` so an agency is only stored once.
    * ``functions`` -- one document per function whose ``agencies`` array
      holds a short summary of each agency harvested for that function.
    """

    def __init__(self, function):
        """Create the search client, size the harvest and open the database.

        Args:
            function: Identifier of the function to harvest; it is passed to
                the search client and compared against each result's
                ``functions[...]['identifier']``.
        """
        self.function = function
        self.total_pages = None
        self.pages_complete = 0
        self.client = RSAgencySearchClient()
        # Runs an initial search so ``total_pages`` is known up front.
        self.prepare_harvest()
        db = self.get_db()
        self.functions = db.functions
        self.agencies = db.agencies
        self.agencies.create_index('agency_id', unique=True)

    def get_db(self):
        """Return the default database of the ``MONGOLAB_URL`` connection."""
        dbclient = MongoClient(MONGOLAB_URL)
        return dbclient.get_default_database()

    def get_total(self):
        """Return the total number of results reported by the client."""
        return self.client.total_results

    def prepare_harvest(self):
        """Run a zero-result search and work out how many pages to fetch.

        Sets ``self.total_pages`` from the client's ``total_results`` and
        ``results_per_page`` as they stand after the search.

        Raises:
            ZeroDivisionError: If the client reports a page size of 0.
        """
        self.client.search_agencies(results_per_page=0, function=self.function)
        total_results = self.client.total_results
        print('{} agencies'.format(total_results))
        per_page = self.client.results_per_page
        # Ceiling division: ``total / per_page + 1`` asked for one extra,
        # empty page whenever the total was an exact multiple of the page
        # size.  At least one page is still requested so that a function
        # with no results gets its (empty) record upserted as before.
        pages = (int(total_results) + per_page - 1) // per_page
        self.total_pages = max(1, pages)
        print('{} pages'.format(self.total_pages))

    def _summarise_agency(self, result):
        """Build the summary stored in the function record for one result."""
        agency = {
            'agency_id': result['agency_id'],
            'title': result['title'],
            'agency_status': result['agency_status'],
            'location': result['location'],
            'start_date': result['dates']['start_date'],
            'end_date': result['dates']['end_date'],
        }
        # Record when the agency was responsible for *this* function; the
        # keys are simply absent if the function is not listed.
        for entry in result['functions']:
            if entry['identifier'] == self.function:
                agency['function_start'] = entry['start_date']
                agency['function_end'] = entry['end_date']
        return agency

    def start_harvest(self, page=None, delay=1):
        """Harvest every remaining page of results into the database.

        Args:
            page: 1-based page to (re)start from.  When falsy the harvest
                continues from the page after ``self.pages_complete``.
            delay: Seconds to pause after each page (default 1, the
                previously hard-coded value).
        """
        if not page:
            page = self.pages_complete + 1
        else:
            self.pages_complete = page - 1
        while self.pages_complete < self.total_pages:
            response = self.client.search_agencies(
                page=page, function=self.function)
            agencies = []
            for result in response['results']:
                print(result['agency_id'])
                agencies.append(self._summarise_agency(result))
                try:
                    # ``insert_one`` matches the ``update_one`` API used
                    # below; the legacy ``insert`` is gone in PyMongo 4.
                    self.agencies.insert_one(result)
                except DuplicateKeyError:
                    # Already stored (unique ``agency_id`` index) -- the
                    # summary is still added to this function's record.
                    pass
            self.functions.update_one(
                {'function': self.function},
                {'$push': {'agencies': {'$each': agencies}}},
                upsert=True)
            self.pages_complete += 1
            page += 1
            print('{} pages complete'.format(self.pages_complete))
            if delay:
                time.sleep(delay)