コード例 #1
0
    print('')

    # Create Dataframes
    cr = pd.DataFrame(contactRecords)
    dr = pd.DataFrame(orgRecords)
    print('DATAFRAMES READY')

    ## //////////////////  Initialize Contact Checker Classes with Fresh Data  \\\\\\\\\\\\\\\\\\\

    # Setup Contact Record Output
    cc.ContactSheetOutput.change_output_sheet_name('Scraper Output')   ## Overide output sheet name for Production
    cc.ContactSheetOutput.set_output(contactKeys)

    # For this scrape session Give the Verification Handler class an Orgsession with Organization Records
    dm.OrgSession.set_browser_path()                                 ## IMPORTANT STEP: The browser path must be set to the current working directory which varies for different machines
    cc.VerificationHandler.set_orgRecords(dm.HeadlessOrgSession(orgRecords))

    # For this scrape session Give the Verification Handler class the contact record data
    cc.VerificationHandler.set_contactRecords(cr)
    print('CONTACT CHECKER READY')

    ## //////////////////        Scrape Base Case and Turn Off Browser         \\\\\\\\\\\\\\\\\\\

    a = cc.ScrapeAll(orgRecords)

    try:
        cc.VerificationHandler.close_browser()
    except:
        print("Browser Closed")

    print('\nSCRAPE SESSION COMPLETE')
コード例 #2
0
    def run(self):
        """Load sheet data, configure the contact checker, then serve commands.

        The value returned by ``smgs.modelInit()`` is passed to every sheet
        getter.  Header keys are fetched first, then the contact and agency
        directory rows, each wrapped in a ``sheetRecord`` with its keys.  The
        results are handed to the class-level setters on ``cc`` and ``dm``.
        Afterwards the method loops on ``self.listen_for_cmd()`` until that
        call returns a falsy value.

        Startup progress is reported as dicts with ``'progress'`` and/or
        ``'message'`` keys put on ``self.startupQueue``.

        Side effects visible here: sets ``self.orgRecords`` and
        ``self.commandLoop``, prints status lines to stdout, and calls
        ``cc.VerificationHandler.close_browser()`` when the loop ends.
        """
        get_credentials_method = smgs.modelInit()

        # Get headers from Google Sheets
        print('KEYS')
        self.startupQueue.put({'progress': 1})
        self.startupQueue.put({'message': 'KEYS'})
        contactKeys = getContactKeys(get_credentials_method)
        self.startupQueue.put({'progress': 2})
        directoryKeys = getAgencyDirKeys(get_credentials_method)
        self.startupQueue.put({'progress': 3})
        print('')

        # Get contact and organization website data and structure it with the collected headings
        print('RECORDS')
        self.startupQueue.put({'message': 'RECORDS'})
        contactRecords = [
            sheetRecord(row, contactKeys)
            for row in getContacts(get_credentials_method)
        ]
        self.startupQueue.put({'progress': 4})
        self.orgRecords = [
            sheetRecord(row, directoryKeys)
            for row in getAgencyDir(get_credentials_method)
        ]
        self.startupQueue.put({'progress': 5})
        print('')

        # Only the contact records are needed as a DataFrame; the organization
        # records are passed on below as the plain list.
        cr = pd.DataFrame(contactRecords)
        print('DATAFRAMES READY')
        self.startupQueue.put({'message': 'DATAFRAMES READY', 'progress': 6})
        ## //////////////////  Initialize Contact Checker Classes with Fresh Data  \\\\\\\\\\\\\\\\\\\

        # Set up contact record output
        cc.ContactSheetOutput.set_output(contactKeys)
        self.startupQueue.put({'progress': 7})
        # For this scrape session give the VerificationHandler class an org session with the organization records
        dm.OrgSession.set_browser_path(
        )  ## IMPORTANT STEP: The browser path must be set to the current working directory which varies for different machines
        cc.VerificationHandler.set_orgRecords(
            dm.HeadlessOrgSession(self.orgRecords))
        # For this scrape session give the VerificationHandler class the contact record data
        cc.VerificationHandler.set_contactRecords(cr)

        cc.ScrapeSession.set_app_scraper_queue(self.scraperQueue)
        cc.ScrapeSession.set_app_command_queue(self.commandQueue)
        print('CONTACT CHECKER READY')

        print('SCRAPE SESSION OPEN')
        print('')
        # NOTE(review): 'FINNISHED' is misspelled but is a runtime value that
        # queue consumers presumably match on -- do not "fix" it here alone.
        self.startupQueue.put({
            'message': 'SCRAPE SESSION OPEN',
            'progress': 'FINNISHED'
        })

        ## //////////////////        Begin Scraper Loop         \\\\\\\\\\\\\\\\\\\
        self.commandLoop = True

        try:
            # listen_for_cmd() returns the next loop flag; a falsy value ends the loop.
            while self.commandLoop:
                self.commandLoop = self.listen_for_cmd()
        finally:
            # Close the browser even when listen_for_cmd() raises, so a
            # failed command does not leave the browser running.
            cc.VerificationHandler.close_browser()
        print('SCRAPER THREAD FINNISHED')
コード例 #3
0
    def start_scraper(self):
        """Load sheet data and configure the contact checker for a scrape session.

        The value returned by ``smgs.modelInit()`` is passed to every sheet
        getter.  Header keys are fetched first, then the contact and agency
        directory rows, each wrapped in a ``sheetRecord`` with its keys.  The
        results, together with ``self.scraperQueue`` and ``self.commandQueue``,
        are handed to the class-level setters on ``cc`` and ``dm``.

        Startup progress is reported as dicts put on ``self.startupQueue``.
        Besides ``'progress'`` and ``'message'``, a dict may carry a
        ``'__waiting'`` entry (sent before a step starts) and a ``'__ready'``
        entry (sent after that step completes); both hold one of the
        ``ScraperThread.*Val`` constants.  A ``'rowCounts'`` dict is sent
        before the final message.

        Side effects visible here: sets ``self.orgRecords`` and prints status
        lines to stdout.
        """
        get_credentials_method = smgs.modelInit()

        # Get headers from Google Sheets
        print('KEYS')
        self.startupQueue.put({'progress': 'START'})
        self.startupQueue.put({'message': 'KEYS',
                               '__waiting': ScraperThread.ContactKeysVal})
        contactKeys = getContactKeys(get_credentials_method)
        self.startupQueue.put({'progress': 1,
                               '__ready': ScraperThread.ContactKeysVal,
                               '__waiting': ScraperThread.DirectoryKeysVal})
        directoryKeys = getAgencyDirKeys(get_credentials_method)
        self.startupQueue.put({'progress': 2,
                               '__ready': ScraperThread.DirectoryKeysVal})
        print('')

        # Get contact and organization website data and structure it with the collected headings
        print('RECORDS')
        self.startupQueue.put({'message': 'RECORDS',
                               '__waiting': ScraperThread.ContactRecordsVal})
        contactRecords = [sheetRecord(row, contactKeys) for row in getContacts(get_credentials_method)]
        self.startupQueue.put({'progress': 3,
                               '__ready': ScraperThread.ContactRecordsVal,
                               '__waiting': ScraperThread.AgencyDirectoryVal})
        self.orgRecords = [sheetRecord(row, directoryKeys) for row in getAgencyDir(get_credentials_method)]
        self.startupQueue.put({'progress': 4,
                               '__ready': ScraperThread.AgencyDirectoryVal})
        print('')

        # Only the contact records are needed as a DataFrame; the organization
        # records are passed on below as the plain list.
        self.startupQueue.put({'__waiting': ScraperThread.DataVal})
        cr = pd.DataFrame(contactRecords)
        print('DATAFRAMES READY')
        self.startupQueue.put({'message': 'DATAFRAMES READY',
                               'progress': 5,
                               '__ready': ScraperThread.DataVal})
        ## //////////////////  Initialize Contact Checker Classes with Fresh Data  \\\\\\\\\\\\\\\\\\\

        # Set up contact record output
        self.startupQueue.put({'__waiting': ScraperThread.OutputVal})
        cc.ContactSheetOutput.set_output(contactKeys)
        self.startupQueue.put({'progress': 6,
                               '__ready': ScraperThread.OutputVal,
                               '__waiting': ScraperThread.BrowserDriverVal})
        # For this scrape session give the VerificationHandler class an org session with the organization records
        dm.OrgSession.set_browser_path()                                 ## IMPORTANT STEP: The browser path must be set to the current working directory which varies for different machines
        cc.VerificationHandler.set_orgRecords(dm.HeadlessOrgSession(self.orgRecords))
        self.startupQueue.put({'progress': 7,
                               '__ready': ScraperThread.BrowserDriverVal,
                               '__waiting': ScraperThread.ContactCheckerVal})
        # For this scrape session give the VerificationHandler class the contact record data
        cc.VerificationHandler.set_contactRecords(cr)
        # Every class below receives the same scraper queue.
        cc.ScrapeSession.set_app_scraper_queue(self.scraperQueue)
        cc.ContactSheetOutput.set_app_scraper_queue(self.scraperQueue)
        cc.ContactCollector.set_app_scraper_queue(self.scraperQueue)
        dm.DirectoryManager.set_app_scraper_queue(self.scraperQueue)
        dm.OrgQuery.set_app_scraper_queue(self.scraperQueue)
        cc.ScrapeSession.set_app_command_queue(self.commandQueue)

        ## Report row counts and finish up
        self.startupQueue.put({'rowCounts': {'contact counts': cc.ContactSheetOutput.count_contacts_rows(),
                                             'output counts': cc.ContactSheetOutput.count_scraper_output_rows()}})
        print('CONTACT CHECKER READY')
        print('SCRAPE SESSION OPEN')
        print('')
        # NOTE(review): 'FINNISHED' is misspelled but is a runtime value that
        # queue consumers presumably match on -- do not "fix" it here alone.
        self.startupQueue.put({'message': 'SCRAPE SESSION OPEN',
                               'progress': 'FINNISHED',
                               '__ready': ScraperThread.ContactCheckerVal})