import json

# Names used below that are expected to come from the surrounding project
# (their exact module paths are not shown in this file and are assumed):
# BaseScraper, ScraperSource, MainScraperListStorage, LobbyistsIndexScraper,
# LobbyistScraper, LobbyistsCommiteeMeetingsScraper,
# LobbyistCorporationsCommitteeMeetingsScraper, and the Django models
# Lobbyist, LobbyistHistory, LobbyistData, LobbyistCorporationData,
# LobbyistsChange.
class MainScraper(BaseScraper):
    """The main scraper - drives the full pipeline by delegating to the other scrapers."""

    def __init__(self):
        super(MainScraper, self).__init__(self)
        self.source = ScraperSource(LobbyistsIndexScraper())
        self.storage = MainScraperListStorage()

    def _update_lobbyists_changes(self):
        chgs = []
        self._getLogger().info('processing lobbyist changes - deleting all existing changes and re-creating from scratch')
        LobbyistsChange.objects.all().delete()
        self._getLogger().info('lobbyist history (added / deleted lobbyists)')
        prev_lh = None
        for lh in LobbyistHistory.objects.order_by('scrape_time'):
            # look for added / deleted lobbyists
            if prev_lh is None:
                self._getLogger().debug(lh.scrape_time)
                self._getLogger().debug('first history - all lobbyists considered as added')
                added_lobbyist_ids = [l.pk for l in lh.lobbyists.all()]
                deleted_lobbyist_ids = []
            else:
                prev_lobbyist_ids = set([l.pk for l in prev_lh.lobbyists.all()])
                cur_lobbyist_ids = set([l.pk for l in lh.lobbyists.all()])
                deleted_lobbyist_ids = list(prev_lobbyist_ids.difference(cur_lobbyist_ids))
                added_lobbyist_ids = list(cur_lobbyist_ids.difference(prev_lobbyist_ids))
                if len(deleted_lobbyist_ids) > 0 or len(added_lobbyist_ids) > 0:
                    self._getLogger().debug(lh.scrape_time)
                    self._getLogger().debug('%s deleted lobbyists, %s added lobbyists' % (len(deleted_lobbyist_ids), len(added_lobbyist_ids)))
            for lid in added_lobbyist_ids:
                chgs.append(LobbyistsChange(date=lh.scrape_time, content_object=Lobbyist.objects.get(pk=lid), type='added'))
            for lid in deleted_lobbyist_ids:
                chgs.append(LobbyistsChange(date=lh.scrape_time, content_object=Lobbyist.objects.get(pk=lid), type='deleted'))
            prev_lh = lh
        self._getLogger().info('lobbyist data (lobbyist metadata changes)')
        lds = {}
        for ld in LobbyistData.objects.order_by('scrape_time'):
            lid = ld.lobbyist.pk
            if lid in lds:
                # no event for a lobbyist's first data record - the 'added' event
                # is already produced by the lobbyist history pass above
                changeset = []
                prev_ld = lds[lid]
                # look for changes in the lobbyist data fields
                for field in ['source_id', 'first_name', 'family_name', 'profession',
                              'corporation_name', 'corporation_id', 'faction_member',
                              'faction_name', 'permit_type']:
                    if getattr(prev_ld, field) != getattr(ld, field):
                        changeset.append((field, getattr(prev_ld, field), getattr(ld, field)))
                # look for changes in the lobbyist represents
                # we compare only by name because there are problems with the other represents fields
                prev_represents = set([r.name for r in prev_ld.represents.all()])
                cur_represents = set([r.name for r in ld.represents.all()])
                deleted_represent_names = list(prev_represents.difference(cur_represents))
                if len(deleted_represent_names) > 0:
                    changeset.append(('represent_names', 'deleted', deleted_represent_names))
                added_represent_names = list(cur_represents.difference(prev_represents))
                if len(added_represent_names) > 0:
                    changeset.append(('represent_names', 'added', added_represent_names))
                if len(changeset) > 0:
                    self._getLogger().debug('%s: got %s changes' % (ld.scrape_time, len(changeset)))
                    chgs.append(LobbyistsChange(date=ld.scrape_time, content_object=ld.lobbyist, type='modified', extra_data=json.dumps(changeset)))
            lds[lid] = ld
        self._getLogger().info('lobbyist corporation data')
        lcds = {}
        for lcd in LobbyistCorporationData.objects.order_by('scrape_time'):
            lc = lcd.corporation
            lcid = lc.pk
            if lcid in lcds:
                # existing corporation - need to check for changes
                changeset = []
                prev_lcd = lcds[lcid]
                for field in ['name', 'source_id']:
                    if getattr(prev_lcd, field) != getattr(lcd, field):
                        changeset.append((field, getattr(prev_lcd, field), getattr(lcd, field)))
                prev_lobbyists = set([l.pk for l in prev_lcd.lobbyists.all()])
                cur_lobbyists = set([l.pk for l in lcd.lobbyists.all()])
                deleted_lobbyists = list(prev_lobbyists.difference(cur_lobbyists))
                if len(deleted_lobbyists) > 0:
                    changeset.append(('lobbyists', 'deleted', deleted_lobbyists))
                added_lobbyists = list(cur_lobbyists.difference(prev_lobbyists))
                if len(added_lobbyists) > 0:
                    changeset.append(('lobbyists', 'added', added_lobbyists))
                if len(changeset) > 0:
                    self._getLogger().debug('%s: got %s changes' % (lcd.scrape_time, len(changeset)))
                    chgs.append(LobbyistsChange(date=lcd.scrape_time, content_object=lc, type='modified', extra_data=json.dumps(changeset)))
            else:
                # new corporation
                chgs.append(LobbyistsChange(date=lcd.scrape_time, content_object=lc, type='added'))
            lcds[lcid] = lcd
        self._getLogger().info('bulk creating %s changes' % len(chgs))
        LobbyistsChange.objects.bulk_create(chgs)

    def _scrape(self):
        lobbyist_ids = self.source.fetch()
        i = 0
        for lobbyist_id in lobbyist_ids:
            lobbyist = LobbyistScraper().scrape(lobbyist_id)
            self.storage.store(lobbyist)
            i += 1
            # if i > 0: break  # debug aid: uncomment to stop after the first lobbyist
        self._getLogger().info('looking for mentions of lobbyists in committee meetings')
        LobbyistsCommiteeMeetingsScraper().scrape()
        LobbyistCorporationsCommitteeMeetingsScraper().scrape()
        self._update_lobbyists_changes()
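# Illustrative sketch only (not called by the scraper): every added/deleted
# detection in _update_lobbyists_changes reduces to two set differences over
# primary keys between consecutive snapshots. The helper name below is
# hypothetical and exists purely to make that pattern explicit.
def _diff_snapshot_ids(prev_ids, cur_ids):
    """Return (added, deleted) id lists between two snapshots."""
    prev_set, cur_set = set(prev_ids), set(cur_ids)
    return sorted(cur_set.difference(prev_set)), sorted(prev_set.difference(cur_set))

# Example: _diff_snapshot_ids([1, 2, 3], [2, 3, 4]) == ([4], [1])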
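# Usage sketch (assumptions: BaseScraper exposes a public scrape() wrapper
# around _scrape(), as the LobbyistScraper().scrape(lobbyist_id) calls above
# suggest, and a configured Django environment is already in place, since the
# code above touches Django models):
#
#     MainScraper().scrape()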