Code example #1
    def __attrs_post_init__(self):
        # This is the default string that is formatted to contain the given
        # letter for this specific parser. This will (well, should) throw
        # an error if self.candidate_letter is improperly defined or
        # undefined after the class object is initialized.
        self.url = f'http://media.ethics.ga.gov/search/Campaign/Campaign_Namesearchresults.aspx?CommitteeName=&LastName={self.candidate_letter}&FirstName=&Method=0'
        self.navigator = SeleniumNavigator()
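
The same query string can also be assembled from a parameter dict with urllib.parse.urlencode, which keeps the long URL readable. A minimal sketch; build_search_url is a hypothetical helper, not part of the project:

from urllib.parse import urlencode

def build_search_url(letter):
    # Parameter names are taken from the hardcoded URL above.
    params = {'CommitteeName': '', 'LastName': letter,
              'FirstName': '', 'Method': 0}
    return ('http://media.ethics.ga.gov/search/Campaign/'
            'Campaign_Namesearchresults.aspx?' + urlencode(params))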
Code example #2
    def __attrs_post_init__(self):
        self.search_results_urls = [
            'http://media.ethics.ga.gov/search/Campaign/Campaign_Namesearchresults.aspx?CommitteeName=&LastName=a&FirstName=&Method=0'
        ]
        # self.search_results_urls = \
        #     (f'http://media.ethics.ga.gov/search/Campaign/Campaign_Namesearchresults.aspx?CommitteeName=&LastName={character}&FirstName=&Method=0'
        #      for character in string.ascii_lowercase)
        self.navigator = SeleniumNavigator()
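
The commented-out generator shows the intended end state: one search-results URL per letter of the alphabet. A minimal sketch of that expansion, reusing the template above:

import string

SEARCH_URL = ('http://media.ethics.ga.gov/search/Campaign/'
              'Campaign_Namesearchresults.aspx'
              '?CommitteeName=&LastName={letter}&FirstName=&Method=0')

# One results page per initial letter, a-z.
search_results_urls = [SEARCH_URL.format(letter=c)
                       for c in string.ascii_lowercase]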
Code example #3
    def __attrs_post_init__(self):
        self.search_results_urls = \
            [f'http://media.ethics.ga.gov/search/Campaign/Campaign_Namesearchresults.aspx?CommitteeName=&LastName={self.letter}&FirstName=&Method=0']
        # ['http://media.ethics.ga.gov/search/Campaign/Campaign_Namesearchresults.aspx?CommitteeName=&LastName=x&FirstName=&Method=0',
        #  'http://media.ethics.ga.gov/search/Campaign/Campaign_Namesearchresults.aspx?CommitteeName=&LastName=z&FirstName=&Method=0']
        # self.search_results_urls = \
        #     (f'http://media.ethics.ga.gov/search/Campaign/Campaign_Namesearchresults.aspx?CommitteeName=&LastName={character}&FirstName=&Method=0'
        #      for character in string.ascii_lowercase)
        logging.info(self.letter)
        logging.info(self.search_results_urls)
        self.navigator = SeleniumNavigator(letter=self.letter)
Code example #4
@attr.s
class FirstPassCrawler:
    session = attr.ib()
    candidate_letter = attr.ib()
    navigator = attr.ib(init=False)

    def __attrs_post_init__(self):
        # This is the default string that is formatted to contain the given
        # letter for this specific parser. This will (well, should) throw
        # an error if self.candidate_letter is improperly defined or
        # undefined after the class object is initialized.
        self.url = f'http://media.ethics.ga.gov/search/Campaign/Campaign_Namesearchresults.aspx?CommitteeName=&LastName={self.candidate_letter}&FirstName=&Method=0'
        self.navigator = SeleniumNavigator()

    def exit(self):
        self.session.close()

# 3-1.1: Same thing as the report and office get_or_add functions.

    def get_or_add_candidate(self, candidate):
        try:
            query_result = self.session.query(Candidate).filter_by(
                FilerId=candidate['FilerId'],
                Firstname=candidate['Firstname'],
                Lastname=candidate['Lastname']).first()
            if query_result:
                return query_result.CandidateId
            candidate = Candidate(**candidate)
            self.session.add(candidate)
            self.session.commit()
            return candidate.CandidateId
        except Exception as e:
            self.session.rollback()
            logging.info(e)

# 4.1: Same as get_or_add_office, but for reports.

    def get_or_add_report(self, report):
        try:
            query_result = self.session.query(Report).filter_by(
                Url=report['Url']).first()
            if query_result:
                return query_result.ReportId
            report = Report(**report)
            self.session.add(report)
            self.session.commit()
            return report.ReportId
        except Exception as e:
            logging.info(e)
            self.session.rollback()

# 2.1: Either adds the office for which a candidate has run/is running or
# finds that it already exists and then returns the ID of said office.

    def get_or_add_office(self, office):
        try:
            query_result = self.session.query(Office).filter_by(
                Name=office.Name).first()
            if query_result:
                return query_result.OfficeId
            self.session.add(office)
            self.session.commit()
            return office.OfficeId
        except Exception as e:
            self.session.rollback()
            logging.info(e)
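
# Sketch (not in the original code): the get_or_add_* methods above share
# one query-then-insert shape, so a generic helper could consolidate them.
# `get_or_add` is a hypothetical name; callers would read the ID off the
# returned row, e.g. self.get_or_add(Report, Url=...).ReportId.

    def get_or_add(self, model, extra=None, **filters):
        # Look the row up by the filter columns; if it is missing, insert
        # a new one built from the filters plus any non-key columns passed
        # in `extra`.
        try:
            query_result = self.session.query(model).filter_by(
                **filters).first()
            if query_result:
                return query_result
            instance = model(**{**filters, **(extra or {})})
            self.session.add(instance)
            self.session.commit()
            return instance
        except Exception as e:
            self.session.rollback()
            logging.info(e)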

# 3-2: Crawls campaign finance reports tables and controls the interaction
# with the dropdowns and buttons that expose the table to allow Selenium
# to effectively scrape the tables and navigate to Reference Pages.

    def crawl_reports_table(self, candidate_id):
        dropdown = DropdownParser(self.navigator.page_source())
        if dropdown.parse() is not None:
            try:
                self.navigator.click_dropdown()
                parser = ReportsTableParser(self.navigator.page_source())
                for report_link, report in parser.parse():
                    if report_link is None:
                        logging.info('No report found.')
                        continue
                    try:
                        self.navigator.wait_for_it(report_link)
                        self.navigator.click_link(report_link)
                        self.navigator.wait_for_contributions_id()
                        report['CandidateId'] = candidate_id
                        report['Url'] = self.navigator.get_current_url()
                        self.get_or_add_report(report)
                        self.navigator.back()
                        self.navigator.click_dropdown()
                    except Exception as e:
                        logging.info(e)
                        logging.info(f'Report link id: {report_link}')
            except Exception as e:
                logging.info(e)

# 3-1: Essentially encapsulates the parser for the campaign registration
# information.

    def crawl_registration_info(self, candidate):
        parser = CandidateRegistrationParser(self.navigator.page_source())
        ret_candidate = parser.parse(candidate)
        return self.get_or_add_candidate(ret_candidate)

# 2: This crawls basic data from the candidate profiles. Information like
# the offices for which they are running, dropdown links associated with
# each office that exposes the campaign finance reports, as well as basic
# registration info like address, political party, etc.

    def crawl_candidate_profile(self, url, candidate):
        parser = CandidateProfileParser(self.navigator.page_source())
        for dropdown, office, current_candidate in parser.parse(candidate):
            if dropdown is None:
                logging.info(
                    f"No dropdown for {current_candidate['Firstname']} "
                    f"{current_candidate['Lastname']}")
                office_id = self.get_or_add_office(office)
                current_candidate['OfficeId'] = office_id
                self.crawl_registration_info(current_candidate)
                continue
            office_id = self.get_or_add_office(office)
            current_candidate['OfficeId'] = office_id
            self.navigator.expose_dropdown(dropdown)
            candidate_id = self.crawl_registration_info(current_candidate)
            try:
                self.crawl_reports_table(candidate_id)
            except Exception as e:
                logging.info(e)
        self.navigator.navigate(url)

# 1: This crawls the candidate profile links for the given url parameter.
# The SearchResultsParser parses out each candidate's name, bundling it as
# a dictionary; the crawler then navigates to each candidate's individual
# profile page.

    def crawl_candidate_profile_links(self, url):
        self.navigator.navigate(url)
        parser = SearchResultsParser(self.navigator.page_source())
        for candidate, current_link in parser.parse():
            if current_link is None:
                continue
            logging.info(f"Navigating to {candidate['Firstname']} "
                         f"{candidate['Lastname']}")
            try:
                self.navigator.wait_for_it(current_link)
                self.navigator.click_link(current_link)
                self.crawl_candidate_profile(url, candidate)
            except Exception as e:
                # Maybe include a self.navigator.navigate(url) call here in
                # case the page doesn't load correctly.
                logging.info(e)

# 0: The method called in app.py that actually executes the process of
# running the crawler and scraping the website.

    def crawl(self):
        try:
            self.crawl_candidate_profile_links(self.url)
        except Exception as e:
            logging.info(e)
        self.navigator.close_browser()
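
The crawl method above is described as the one called in app.py, but that driver is not part of these excerpts. A plausible minimal sketch, assuming a make_session factory for the SQLAlchemy session (make_session and the per-letter loop are assumptions, not the project's actual app.py):

import string

def main():
    for letter in string.ascii_lowercase:
        session = make_session()  # assumed session factory
        crawler = FirstPassCrawler(session=session, candidate_letter=letter)
        try:
            crawler.crawl()   # closes the browser itself when done
        finally:
            crawler.exit()    # closes the database session

if __name__ == '__main__':
    main()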
Code example #5
    def __attrs_post_init__(self):
        logging.info('attrs post init called')
        self.navigator = SeleniumNavigator(loading_strategy='none',
                                           letter=self.letter)
        self.file_processor = FileProcessor(letter=self.letter)
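
SeleniumNavigator is defined elsewhere in the project, so loading_strategy='none' is opaque here; presumably it maps onto Selenium's pageLoadStrategy capability, under which the driver returns control before the page finishes loading and the caller waits for specific elements itself, matching the wait_for_* calls in these crawlers. A sketch of that mapping under that assumption (the internals below are guesses, not the project's actual class):

import attr
from selenium import webdriver

@attr.s
class SeleniumNavigator:
    loading_strategy = attr.ib(default='normal')
    letter = attr.ib(default=None)
    driver = attr.ib(init=False)

    def __attrs_post_init__(self):
        # Selenium 4 exposes pageLoadStrategy via options; 'none' hands
        # control back as soon as the initial document is received.
        options = webdriver.ChromeOptions()
        options.page_load_strategy = self.loading_strategy
        self.driver = webdriver.Chrome(options=options)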
Code example #6
@attr.s
class SecondPassCrawler:
    session = attr.ib()
    navigator = attr.ib(init=False)
    file_processor = attr.ib(init=False)
    letter = attr.ib()

    def __attrs_post_init__(self):
        logging.info('attrs post init called')
        self.navigator = SeleniumNavigator(loading_strategy='none',
                                           letter=self.letter)
        self.file_processor = FileProcessor(letter=self.letter)

    def exit(self):
        self.navigator.close_browser()
        self.session.close()

    def get_urls(self):
        # NOTE: currently filtered to a single last name while testing.
        candidates = self.session.query(Candidate).filter(
            Candidate.Lastname.ilike('zorn')).all()
        # candidates = self.session.query(Candidate).filter(
        #     Candidate.Lastname.like("%z%")).all()
        reports = []
        for candidate in candidates:
            results = self.session.query(Report).filter_by(
                CandidateId=candidate.CandidateId).all()
            logging.info(results)
            for result in results:
                reports.append((result.CandidateId, result.Url))
        return reports
        # return ['http://media.ethics.ga.gov/search/Campaign/Campaign_ReportOptions.aspx?NameID=16067&FilerID=C2012000744&CDRID=59991']

    def add_scrapelog_to_db(self, _id, url, content, dtime):
        slog = ScrapeLog(CandidateId=_id,
                         ScrapeDate=dtime,
                         RawData=content,
                         PageURL=url)
        try:
            self.session.add(slog)
            self.session.commit()
        except Exception as e:
            self.session.rollback()
            logging.info(e)

    def crawl_download_link(self, _id):
        parser = CSVLinkParser(self.navigator.page_source())
        parsed_link = parser.parse()
        if parsed_link is not None:
            logging.info(f'Parsed link: {parsed_link}')
            url = self.navigator.get_current_url()
            self.navigator.click_link(parsed_link)
            logging.info('Clicking download link for csv file.')
            content, dtime = self.file_processor.process()
            logging.info('Adding scrapelog to database')
            self.add_scrapelog_to_db(_id, url, content, dtime)
            self.file_processor.delete_csv()

    def crawl_view_contributions_ids(self, _id):
        logging.info(f'Current page: {self.navigator.get_current_url()}')
        parser = ContributionsViewParser(self.navigator.page_source())
        parsed_link = parser.parse()
        if parsed_link is not None:
            logging.info(f'Parsed link: {parsed_link}')
            self.navigator.click_link(parsed_link)
            self.navigator.wait_for_csv_link()
            self.crawl_download_link(_id)

    def crawl(self):
        for _id, url in self.get_urls():
            logging.info(f'Current url: {url}')
            self.navigator.navigate(url)
            self.navigator.wait_for_contributions_id()
            self.crawl_view_contributions_ids(_id)
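
get_urls above issues one Report query per candidate; a single join could return the same (CandidateId, Url) pairs in one round trip. A sketch, assuming Report.CandidateId is a foreign key to Candidate.CandidateId:

    def get_urls(self):
        # One joined query instead of a query per candidate.
        rows = (self.session.query(Report.CandidateId, Report.Url)
                .join(Candidate,
                      Report.CandidateId == Candidate.CandidateId)
                .filter(Candidate.Lastname.ilike('zorn'))
                .all())
        return [(row.CandidateId, row.Url) for row in rows]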
Code example #7
    def __attrs_post_init__(self):
        self.navigator = SeleniumNavigator(loading_strategy='none')
        self.file_processor = FileProcessor()
Code example #8
@attr.s
class SecondPassCrawler:
    session = attr.ib()
    navigator = attr.ib(init=False)
    file_processor = attr.ib(init=False)

    def __attrs_post_init__(self):
        self.navigator = SeleniumNavigator(loading_strategy='none')
        self.file_processor = FileProcessor()

    def exit(self):
        self.navigator.close_browser()
        self.session.close()

    def get_urls(self):
        # results = self.session.query(Report).all()
        # return (report.url for report in results)
        return ['http://media.ethics.ga.gov/search/Campaign/Campaign_ReportOptions.aspx?NameID=16067&FilerID=C2012000744&CDRID=59991']

    def add_scrapelog_to_db(self, url, content, dtime):
        slog = ScrapeLog(scrape_date=dtime,
                         raw_data=content,
                         page_url=url)
        try:
            self.session.add(slog)
            self.session.commit()
        except Exception as e:
            self.session.rollback()
            logging.info(e)

    def crawl_download_link(self):
        parser = CSVLinkParser(self.navigator.page_source())
        parsed_link = parser.parse()
        if parsed_link is not None:
            logging.info(f'Parsed link: {parsed_link}')
            url = self.navigator.get_current_url()
            self.navigator.click_link(parsed_link)
            logging.info('Clicking download link for csv file.')
            content, dtime = self.file_processor.process()
            self.add_scrapelog_to_db(url, content, dtime)
            self.file_processor.delete_csv()

    def crawl_view_contributions_ids(self):
        logging.info(f'Current page: {self.navigator.get_current_url()}')
        parser = ContributionsViewParser(self.navigator.page_source())
        parsed_link = parser.parse()
        if parsed_link is not None:
            logging.info(f'Parsed link: {parsed_link}')
            self.navigator.click_link(parsed_link)
            self.navigator.wait_for_csv_link()
            self.crawl_download_link()

    def crawl(self):
        urls = self.get_urls()
        for url in urls:
            logging.info(f'Current url: {url}')
            self.navigator.navigate(url)
            self.navigator.wait_for_contributions_id()
            self.crawl_view_contributions_ids()
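
The commented-out lines in get_urls above point at the eventual goal: iterate over every report URL stored by the first pass rather than one hardcoded page. Uncommented, and assuming the Report model exposes a url column as those lines imply, it would read:

    def get_urls(self):
        # Fetch every report URL saved by the first-pass crawler.
        results = self.session.query(Report).all()
        return (report.url for report in results)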
Code example #9
@attr.s
class FirstPassCrawler:
    session = attr.ib()
    candidate_list = attr.ib(init=False)
    navigator = attr.ib(init=False)
    letter = attr.ib()

    def __attrs_post_init__(self):
        self.search_results_urls = \
            [f'http://media.ethics.ga.gov/search/Campaign/Campaign_Namesearchresults.aspx?CommitteeName=&LastName={self.letter}&FirstName=&Method=0']
        # ['http://media.ethics.ga.gov/search/Campaign/Campaign_Namesearchresults.aspx?CommitteeName=&LastName=x&FirstName=&Method=0',
        #  'http://media.ethics.ga.gov/search/Campaign/Campaign_Namesearchresults.aspx?CommitteeName=&LastName=z&FirstName=&Method=0']
        # self.search_results_urls = \
        #     (f'http://media.ethics.ga.gov/search/Campaign/Campaign_Namesearchresults.aspx?CommitteeName=&LastName={character}&FirstName=&Method=0'
        #      for character in string.ascii_lowercase)
        logging.info(self.letter)
        logging.info(self.search_results_urls)
        self.navigator = SeleniumNavigator(letter=self.letter)

    def exit(self):
        self.session.close()

# 3-1.1

    def get_or_add_candidate(self, candidate):
        try:
            query_result = self.session.query(Candidate).filter_by(
                FilerId=candidate['FilerId'],
                Firstname=candidate['Firstname'],
                Lastname=candidate['Lastname']).first()
            if query_result:
                return query_result.CandidateId
            candidate = Candidate(**candidate)
            self.session.add(candidate)
            self.session.commit()
            return candidate.CandidateId
        except Exception as e:
            self.session.rollback()
            logging.info(e)

# 4.1

    def get_or_add_report(self, report):
        try:
            query_result = self.session.query(Report).filter_by(
                Url=report['Url']).first()
            if query_result:
                return query_result.ReportId
            report = Report(**report)
            self.session.add(report)
            self.session.commit()
            return report.ReportId
        except Exception as e:
            logging.info(e)
            self.session.rollback()

# 2.1

    def get_or_add_office(self, office):
        try:
            query_result = self.session.query(Office).filter_by(
                Name=office.Name).first()
            if query_result:
                return query_result.OfficeId
            self.session.add(office)
            self.session.commit()
            return office.OfficeId
        except Exception as e:
            self.session.rollback()
            logging.info(e)

# 3-2

    def crawl_reports_table(self, candidate_id):
        dropdown = DropdownParser(self.navigator.page_source())
        if dropdown.parse() is not None:
            try:
                self.navigator.click_dropdown()
                parser = ReportsTableParser(self.navigator.page_source())
                for report_link, report in parser.parse():
                    if report_link is None:
                        logging.info('No report found.')
                        continue
                    try:
                        self.navigator.wait_for_it(report_link)
                        self.navigator.click_link(report_link)
                        self.navigator.wait_for_contributions_id()
                        report['CandidateId'] = candidate_id
                        report['Url'] = self.navigator.get_current_url()
                        self.get_or_add_report(report)
                        self.navigator.back()
                        self.navigator.click_dropdown()
                    except Exception as e:
                        logging.info(e)
                        logging.info(f'Report link id: {report_link}')
            except Exception as e:
                logging.info(e)

# 3-1

    def crawl_registration_info(self, candidate):
        parser = CandidateRegistrationParser(self.navigator.page_source())
        ret_candidate = parser.parse(candidate)
        return self.get_or_add_candidate(ret_candidate)

# 2

    def crawl_candidate_profile(self, url, candidate):
        parser = CandidateProfileParser(self.navigator.page_source())
        for dropdown, office, current_candidate in parser.parse(candidate):
            if dropdown is None:
                logging.info(
                    f"No dropdown for {current_candidate['Firstname']} "
                    f"{current_candidate['Lastname']}")
                office_id = self.get_or_add_office(office)
                current_candidate['OfficeId'] = office_id
                self.crawl_registration_info(current_candidate)
                continue
            office_id = self.get_or_add_office(office)
            current_candidate['OfficeId'] = office_id
            self.navigator.expose_dropdown(dropdown)
            candidate_id = self.crawl_registration_info(current_candidate)
            try:
                self.crawl_reports_table(candidate_id)
            except Exception as e:
                logging.info(e)
        self.navigator.navigate(url)

# 1

    def crawl_candidate_profile_links(self, url):
        self.navigator.navigate(url)
        parser = SearchResultsParser(self.navigator.page_source())
        for candidate, current_link in parser.parse():
            if current_link is None:
                continue
            logging.info(f"Navigating to {candidate['Firstname']} "
                         f"{candidate['Lastname']}")
            try:
                self.navigator.wait_for_it(current_link)
                self.navigator.click_link(current_link)
                self.crawl_candidate_profile(url, candidate)
            except Exception as e:
                logging.info(e)

# 0

    def crawl(self):
        for url in self.search_results_urls:
            try:
                self.crawl_candidate_profile_links(url)
            except Exception as e:
                logging.info(e)
        self.navigator.close_browser()
Code example #10
@attr.s
class Crawler:
    session = attr.ib()
    candidate_list = attr.ib(init=False)
    navigator = attr.ib(init=False)

    def __attrs_post_init__(self):
        self.search_results_urls = [
            'http://media.ethics.ga.gov/search/Campaign/Campaign_Namesearchresults.aspx?CommitteeName=&LastName=a&FirstName=&Method=0'
        ]
        # self.search_results_urls = \
        #     (f'http://media.ethics.ga.gov/search/Campaign/Campaign_Namesearchresults.aspx?CommitteeName=&LastName={character}&FirstName=&Method=0'
        #      for character in string.ascii_lowercase)
        self.navigator = SeleniumNavigator()

    def exit(self):
        self.session.close()

    def add_candidate_to_db(self, candidate):
        try:
            self.session.add(candidate)
            self.session.commit()
        except Exception as e:
            logging.info(e)
            self.session.rollback()
        return candidate.id

    def add_office_to_db(self, office):
        try:
            self.session.add(office)
            self.session.commit()
        except Exception as e:
            logging.info(e)
            self.session.rollback()
        return office.id

    def add_report_to_db(self, report):
        try:
            self.session.add(report)
            self.session.commit()
        except Exception as e:
            logging.info(e)
            self.session.rollback()
        return report.id

    def crawl_reports_table(self, office_id):
        dropdown = DropdownParser(self.navigator.page_source())
        if dropdown.parse() is not None:
            try:
                self.navigator.click_dropdown_initial()
                parser = ReportsTableParser(self.navigator.page_source())
                res = parser.parse()
                for report_link, report in res:
                    try:
                        self.navigator.click_link(report_link)
                        self.navigator.wait_for_contributions_id()
                        report.url = self.navigator.get_current_url()
                        report.office_id = office_id
                        self.add_report_to_db(report)
                        self.navigator.back()
                        self.navigator.click_dropdown_subsequent()
                    except Exception as e:
                        logging.info(e)
            except Exception as e:
                logging.info(e)

    def crawl_candidate_profile(self, url, candidate):
        parser = CandidateProfileParser(self.navigator.page_source())
        logging.info(
            f'Crawling page for {candidate.firstname} {candidate.lastname}')
        for dropdown_link, office in parser.parse():
            if dropdown_link is None:
                continue
            candidate_id = self.add_candidate_to_db(candidate)
            office.candidate_id = candidate_id
            office_id = self.add_office_to_db(office)
            self.navigator.expose_dropdown(dropdown_link)
            try:
                self.crawl_reports_table(office_id)
            except Exception as e:
                logging.info(e)
        self.navigator.navigate(url)

    def crawl_candidate_profile_links(self, url):
        self.navigator.navigate(url)
        parser = SearchResultsParser(self.navigator.page_source())
        for candidate, current_link in parser.parse():
            self.navigator.click_link(current_link)
            try:
                self.crawl_candidate_profile(url, candidate)
            except Exception as e:
                logging.info(e)

    def crawl(self):
        for url in self.search_results_urls:
            logging.info(f'Crawling {url}')
            try:
                self.crawl_candidate_profile_links(url)
            except Exception as e:
                logging.info(e)
        self.navigator.close_browser()