import logging

import attr

# SeleniumNavigator, FileProcessor, CSVLinkParser, ContributionsViewParser,
# and the ORM models (Report, ScrapeLog, Candidate) are defined elsewhere in
# this project.


# First revision: crawls a single hard-coded report URL.
@attr.s
class SecondPassCrawler:
    session = attr.ib()
    navigator = attr.ib(init=False)
    file_processor = attr.ib(init=False)

    def __attrs_post_init__(self):
        self.navigator = SeleniumNavigator(loading_strategy='none')
        self.file_processor = FileProcessor()

    def exit(self):
        self.navigator.close_browser()
        self.session.close()

    def get_urls(self):
        # results = self.session.query(Report).all()
        # return (report.url for report in results)
        # Hard-coded to a single report page for now.
        return ['http://media.ethics.ga.gov/search/Campaign/Campaign_ReportOptions.aspx?NameID=16067&FilerID=C2012000744&CDRID=59991']

    def add_scrapelog_to_db(self, url, content, dtime):
        slog = ScrapeLog(scrape_date=dtime, raw_data=content, page_url=url)
        try:
            self.session.add(slog)
            self.session.commit()
        except Exception as e:
            # Roll back so one failed insert doesn't poison the session.
            self.session.rollback()
            logging.info(e)

    def crawl_download_link(self):
        parser = CSVLinkParser(self.navigator.page_source())
        parsed_link = parser.parse()
        if parsed_link is not None:
            logging.info(f'Parsed link: {parsed_link}')
            url = self.navigator.get_current_url()
            self.navigator.click_link(parsed_link)
            logging.info('Clicking download link for csv file.')
            content, dtime = self.file_processor.process()
            self.add_scrapelog_to_db(url, content, dtime)
            self.file_processor.delete_csv()

    def crawl_view_contributions_ids(self):
        logging.info(f'Current page: {self.navigator.get_current_url()}')
        parser = ContributionsViewParser(self.navigator.page_source())
        parsed_link = parser.parse()
        if parsed_link is not None:
            logging.info(f'Parsed link: {parsed_link}')
            self.navigator.click_link(parsed_link)
            self.navigator.wait_for_csv_link()
            self.crawl_download_link()

    def crawl(self):
        urls = self.get_urls()
        for url in urls:
            logging.info(f'Current url: {url}')
            self.navigator.navigate(url)
            self.navigator.wait_for_contributions_id()
            self.crawl_view_contributions_ids()
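# Usage sketch (not from the original source): one way to drive the crawler
# above with a SQLAlchemy session. The engine URL and session setup are
# assumptions; only SecondPassCrawler itself comes from the code above.
import logging

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    engine = create_engine('sqlite:///campaign_finance.db')  # assumed DB path
    Session = sessionmaker(bind=engine)
    crawler = SecondPassCrawler(session=Session())
    try:
        crawler.crawl()
    finally:
        # Always release the browser and the DB session, even on failure.
        crawler.exit()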
# Revised crawler: takes a letter, pulls (CandidateId, Url) pairs from the
# database, and records the candidate id with each scrape log.
# (Same imports and project-local dependencies as above.)
@attr.s
class SecondPassCrawler:
    session = attr.ib()
    letter = attr.ib()
    navigator = attr.ib(init=False)
    file_processor = attr.ib(init=False)

    def __attrs_post_init__(self):
        logging.info('attrs post init called')
        self.navigator = SeleniumNavigator(loading_strategy='none',
                                           letter=self.letter)
        self.file_processor = FileProcessor(letter=self.letter)

    def exit(self):
        self.navigator.close_browser()
        self.session.close()

    def get_urls(self):
        # Hard-coded to a single candidate (last name 'zorn') for now; the
        # commented-out query below is the broader substring match.
        _ids = self.session.query(Candidate).filter(
            Candidate.Lastname.ilike('zorn')).all()
        # ids_ = self.session.query(Candidate).filter(
        #     Candidate.Lastname.like("%z%")).all()
        reports = []
        for _id in _ids:
            results = self.session.query(Report).filter_by(
                CandidateId=_id.CandidateId).all()
            logging.info(results)
            for result in results:
                reports.append((result.CandidateId, result.Url))
        return reports
        # return ['http://media.ethics.ga.gov/search/Campaign/Campaign_ReportOptions.aspx?NameID=16067&FilerID=C2012000744&CDRID=59991']

    def add_scrapelog_to_db(self, _id, url, content, dtime):
        slog = ScrapeLog(CandidateId=_id, ScrapeDate=dtime,
                         RawData=content, PageURL=url)
        try:
            self.session.add(slog)
            self.session.commit()
        except Exception as e:
            self.session.rollback()
            logging.info(e)

    def crawl_download_link(self, _id):
        parser = CSVLinkParser(self.navigator.page_source())
        parsed_link = parser.parse()
        if parsed_link is not None:
            logging.info(f'Parsed link: {parsed_link}')
            url = self.navigator.get_current_url()
            self.navigator.click_link(parsed_link)
            logging.info('Clicking download link for csv file.')
            content, dtime = self.file_processor.process()
            logging.info('Adding scrapelog to database')
            self.add_scrapelog_to_db(_id, url, content, dtime)
            self.file_processor.delete_csv()

    def crawl_view_contributions_ids(self, _id):
        logging.info(f'Current page: {self.navigator.get_current_url()}')
        parser = ContributionsViewParser(self.navigator.page_source())
        parsed_link = parser.parse()
        if parsed_link is not None:
            logging.info(f'Parsed link: {parsed_link}')
            self.navigator.click_link(parsed_link)
            self.navigator.wait_for_csv_link()
            self.crawl_download_link(_id)

    def crawl(self):
        for _id, url in self.get_urls():
            logging.info(f'Current url: {url}')
            self.navigator.navigate(url)
            self.navigator.wait_for_contributions_id()
            self.crawl_view_contributions_ids(_id)
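# Usage sketch (assumption, not from the original source): the added `letter`
# attribute suggests running one crawler instance per last-name initial so the
# navigator and file processor can keep per-letter download directories. A
# minimal driver under that assumption:
import logging
import string

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    engine = create_engine('sqlite:///campaign_finance.db')  # assumed DB path
    Session = sessionmaker(bind=engine)
    for letter in string.ascii_lowercase:
        crawler = SecondPassCrawler(session=Session(), letter=letter)
        try:
            crawler.crawl()
        finally:
            crawler.exit()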