Example #1
import os
from time import sleep

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# SjcWebsite, DirManager, and PreProcessing are project-local modules and are
# assumed to be importable from the surrounding package.

class Scraper:
    def __init__(self):
        self.DEFAULT_SLEEP_TIME = 5
        self.SEARCH_FORM_ADDRESS = "https://www.southtechhosting.com/SanJoseCity/CampaignDocsWebRetrieval/Search/SearchByElection.aspx"

        # create data folder in current directory to store files
        self.website = SjcWebsite()
        self.new_dir = DirManager(["data"])
        self.new_dir.createFolder()
        self.download_dir = self.new_dir.getDirectory()

        self.website.preprocessing = PreProcessing(self.download_dir)

        options = webdriver.ChromeOptions()

        # Enable headless data retrieval unless the HEADLESS environment
        # variable is explicitly set to a false-y value. Environment variables
        # are strings, so the raw value must be parsed; os.environ.get('HEADLESS', True)
        # would be truthy even for HEADLESS=false.
        isHeadless = os.environ.get('HEADLESS', 'true').lower() not in ('0', 'false', 'no')
        if isHeadless:
            options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--window-size=1280,800")

        options.add_argument("--ignore-certificate-errors")
        options.add_argument("--test_type")
        options.add_argument("--no-sandbox")
        options.add_argument("start-maximized")
        options.add_argument("disable-infobars")
        options.add_argument("--disable-extensions")
        # Disable the built-in PDF viewer so PDFs download instead of opening
        # in the browser.
        plugs = {"enabled": False, "name": "Chrome PDF Viewer"}
        # Send downloads to the data folder without prompting, and keep Safe
        # Browsing from blocking the exported files.
        prefs = {
            "download.default_directory": self.download_dir,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": False,
            "safebrowsing.disable_download_protection": True,
            "plugins.plugins_list": [plugs],
        }
        options.add_experimental_option("prefs", prefs)
        # Selenium 4 removed the positional executable-path argument; the
        # driver path must be wrapped in a Service object.
        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()), options=options
        )


    def scrape(self, election_cycle=None):
        # Navigate to the search-by-election page, optionally filtered to a
        # single election cycle, and wait for the results table to load.
        self.website.navigateToSearchPage(self.driver, self.SEARCH_FORM_ADDRESS, election_cycle=election_cycle)
        self.website.verifySearchTableLoadComplete(self.driver)

        # Running count of downloaded files; downloadExcel() threads it
        # through so each download can be tracked.
        countFile = 0

        for search_page_num in range(1, self.website.numPages(self.driver) + 1):
            print('PAGE {}'.format(search_page_num))
            # Need to navigate to the page upfront so that when we get the number of entries on the page it is accurate.
            self.website.navigateToPage(self.driver, search_page_num)

            # numTableEntries appears to yield the global entry indices for
            # this results page.
            for entry_index in self.website.numTableEntries(
                self.driver, search_page_num
            ):
                print('INDEX {}'.format(entry_index))
                # Opening an entry sends the site back to page 1, so
                # re-navigate to the current search page on every iteration.
                self.website.navigateToPage(self.driver, search_page_num)
                self.website.extractTableData(self.driver, entry_index)
                # Each page shows 10 rows, so the global index modulo 10
                # gives the row position within the current page.
                self.website.clickEntryIndex(self.driver, entry_index % 10)

                # Give the site time to either raise the error dialog or load
                # the forms page.
                sleep(self.DEFAULT_SLEEP_TIME)

                if self.website.errorDialogExists(self.driver):
                    # If there are no forms for a specific entry, we get an error message.
                    self.website.closeErrorDialog(self.driver)
                else:
                    # If there are forms, then we will be brought to the "forms" page.
                    self.website.verifyDownloadFormTableLoadComplete(self.driver)
                    countFile = self.website.downloadExcel(self.driver, countFile)

                    self.website.clickBackButton(self.driver)
                    self.website.verifySearchTableLoadComplete(self.driver)

        # Close browser once scrape is complete
        self.driver.quit()

        # Custom module to aggregate data into single CSV
        self.website.preprocessing.aggregateData()
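
For reference, a minimal sketch of how this scraper might be driven, assuming the class above lives in a module named scraper; the module path and the election-cycle label are assumptions, not part of the original project:

# Hypothetical driver script; the scraper module path and the cycle label
# below are placeholders.
from scraper import Scraper

if __name__ == "__main__":
    s = Scraper()
    # Passing election_cycle=None presumably leaves the site's election
    # filter at its default.
    s.scrape(election_cycle="June 2018")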
Example #2
import csv
import glob
from os import devnull, listdir, path, sep
from time import sleep

import pandas as pd
import xlrd

# DirManager is a project-local module assumed to be importable. FILE_DIVIDER
# is a module-level constant in the original code; it is assumed here to be
# the platform path separator.
FILE_DIVIDER = sep


class PreProcessing():
    def __init__(self, scraper_download_dir):
        download_file_dir_wildcard = '{}/*.xls'.format(scraper_download_dir)
        self.filenames = glob.glob(download_file_dir_wildcard)
        self.download_dir = scraper_download_dir

    def aggregateData(self):
        # Create a new directory for the aggregated output. Note this method
        # assumes insertColumns() has already run, since that is what sets
        # self.insertCandidateFolder.
        aggregateFolder = DirManager(['aggregated_data'])
        aggregateFolder.createFolder()
        new_folder = aggregateFolder.getDirectory()
        new_csv_file = '{}/data.csv'.format(new_folder)

        insertColumnsFolder = self.insertCandidateFolder.getDirectory()
        # Process the files oldest-to-newest by modification time.
        filenames = sorted(
            (path.join(insertColumnsFolder, f)
             for f in listdir(insertColumnsFolder)),
            key=path.getmtime,
        )

        with open(new_csv_file, 'w', newline='') as new_aggregate_csv:
            new_worksheet = csv.writer(new_aggregate_csv,
                                       quoting=csv.QUOTE_ALL)

            # Loop through all Excel workbooks, appending their rows to the
            # CSV. Only the first file contributes its header row, so the
            # header is not duplicated in the aggregate.
            header_written = False
            for filename in filenames:
                wb = xlrd.open_workbook(filename)
                sheet = wb.sheet_by_index(0)

                if header_written:
                    # Skip row 0 (the header) for every file after the first.
                    for rownum in range(1, sheet.nrows):
                        new_worksheet.writerow(sheet.row_values(rownum))
                else:
                    for rownum in range(sheet.nrows):
                        new_worksheet.writerow(sheet.row_values(rownum))
                    header_written = True

    def insertColumns(self, numDownloads, CandidateName, ElectionDate,
                      BallotItem):
        print('Processing {} download(s) for {}'.format(numDownloads,
                                                        CandidateName))

        # Nothing was downloaded for this candidate, so there is nothing to
        # tag.
        if numDownloads == 0:
            return

        self.insertCandidateFolder = DirManager(['insertedData'])
        self.insertCandidateFolder.createFolder()
        new_folder = self.insertCandidateFolder.getDirectory()
        # Wait for any in-progress downloads to finish, then list the files.
        filenames = self.insertColumnsHelper()

        candidateHeader = "CandidateControlledName"
        electionDateHeader = "Election Date"
        ballotItemHeader = "Ballot Item"

        print(filenames)
        # Only the newest numDownloads files belong to this candidate.
        for fullfilepathname in filenames[-numDownloads:]:
            filename = path.basename(fullfilepathname)
            print(filename)

            # Open with xlrd first so its warnings can be silenced via
            # logfile; pandas accepts the xlrd Book object directly.
            wb = xlrd.open_workbook(fullfilepathname,
                                    logfile=open(devnull, 'w'))
            # Columns that must be read as strings so mixed values do not
            # trip up type inference.
            stringColumns = [
                'Cmte_ID', 'Intr_NamL', 'Intr_City', 'Intr_ST', 'Off_S_H_Cd',
                'XRef_Match'
            ]
            data = pd.read_excel(
                wb, dtype={column: str for column in stringColumns})

            if CandidateName == "   ":
                data.insert(0, candidateHeader, "Independent")
            else:
                data.insert(0, candidateHeader, CandidateName)

            data.insert(0, electionDateHeader, ElectionDate)
            data.insert(0, ballotItemHeader, BallotItem)

            data.to_excel('{}/{}'.format(new_folder, filename), index=False)

    def insertColumnsHelper(self):
        # List the download directory oldest-to-newest by modification time.
        filenames = sorted(
            (self.download_dir + FILE_DIVIDER + f
             for f in listdir(self.download_dir)),
            key=path.getmtime,
        )

        # Chrome gives in-progress downloads a .crdownload suffix, so poll
        # until the newest file is a completed export.
        while True:
            filename = path.basename(filenames[-1])
            print(filename)
            if "transactionExportGrid" in filename and "crdownload" not in filename:
                break
            sleep(3)
            filenames = sorted(
                (self.download_dir + FILE_DIVIDER + f
                 for f in listdir(self.download_dir)),
                key=path.getmtime,
            )

        return filenames
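
Since aggregateData() depends on the insertedData folder created by insertColumns(), a minimal sketch of the expected call order follows; the directory and candidate values are made-up placeholders:

# Hypothetical usage; every literal value below is a placeholder.
pre = PreProcessing("/path/to/data")

# Tag each candidate's downloads first; this sets self.insertCandidateFolder.
pre.insertColumns(2, "Jane Doe", "6/5/2018", "Mayor")
pre.insertColumns(1, "   ", "6/5/2018", "City Council District 3")

# Then merge everything into aggregated_data/data.csv.
pre.aggregateData()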