print('')

# Create Dataframes
cr = pd.DataFrame(contactRecords)
dr = pd.DataFrame(orgRecords)
print('DATAFRAMES READY')

## ////////////////// Initialize Contact Checker Classes with Fresh Data //////////////////
# Setup Contact Record Output
# Override the output sheet name for Production before the output is set up.
cc.ContactSheetOutput.change_output_sheet_name('Scraper Output')
cc.ContactSheetOutput.set_output(contactKeys)

# For this scrape session give the Verification Handler class an OrgSession
# with Organization Records.
# IMPORTANT STEP: The browser path must be set to the current working
# directory, which varies for different machines.
dm.OrgSession.set_browser_path()
cc.VerificationHandler.set_orgRecords(dm.HeadlessOrgSession(orgRecords))

# For this scrape session give the Verification Handler class the contact
# record data.
cc.VerificationHandler.set_contactRecords(cr)
print('CONTACT CHECKER READY')

## ////////////////// Scrape Base Case and Turn Off Browser //////////////////
a = cc.ScrapeAll(orgRecords)
try:
    cc.VerificationHandler.close_browser()
except Exception:
    # Best-effort shutdown: a failure to close is reported as "already
    # closed" rather than aborting the script. Only ordinary errors are
    # caught, so Ctrl-C / SystemExit still propagate.
    print("Browser Closed")

print('\nSCRAPE SESSION COMPLETE')
def run(self):
    """Initialise the scrape session, then process commands until stopped.

    Startup progress is reported on ``self.startupQueue`` as dicts carrying
    a ``'progress'`` step (1-7, then ``'FINNISHED'``) and/or a ``'message'``
    label. After startup the method calls ``self.listen_for_cmd()``
    repeatedly, storing each result in ``self.commandLoop``, and stops as
    soon as a result is falsy.

    The browser held by ``cc.VerificationHandler`` is closed when the
    command loop ends, including when ``listen_for_cmd`` raises; in that
    case the exception is re-raised after the browser has been closed.
    """
    get_credentials_method = smgs.modelInit()

    # Get Headers from google sheets
    print('KEYS')
    self.startupQueue.put({'progress': 1})
    self.startupQueue.put({'message': 'KEYS'})
    contactKeys = getContactKeys(get_credentials_method)
    self.startupQueue.put({'progress': 2})
    directoryKeys = getAgencyDirKeys(get_credentials_method)
    self.startupQueue.put({'progress': 3})
    print('')

    # Get contact and organization website data and structure it with the
    # collected headings
    print('RECORDS')
    self.startupQueue.put({'message': 'RECORDS'})
    contactRecords = [
        sheetRecord(row, contactKeys)
        for row in getContacts(get_credentials_method)
    ]
    self.startupQueue.put({'progress': 4})
    self.orgRecords = [
        sheetRecord(row, directoryKeys)
        for row in getAgencyDir(get_credentials_method)
    ]
    self.startupQueue.put({'progress': 5})
    print('')

    # Create the contact DataFrame (the organization records are passed on
    # as a plain list, so no DataFrame is built for them)
    cr = pd.DataFrame(contactRecords)
    print('DATAFRAMES READY')
    self.startupQueue.put({'message': 'DATAFRAMES READY', 'progress': 6})

    ## ////////////////// Initialize Contact Checker Classes with Fresh Data //////////////////
    # Setup Contact Record Output
    cc.ContactSheetOutput.set_output(contactKeys)
    self.startupQueue.put({'progress': 7})

    # For this scrape session give the Verification Handler class an
    # OrgSession with Organization Records.
    # IMPORTANT STEP: The browser path must be set to the current working
    # directory, which varies for different machines.
    dm.OrgSession.set_browser_path()
    cc.VerificationHandler.set_orgRecords(
        dm.HeadlessOrgSession(self.orgRecords))

    # For this scrape session give the Verification Handler class the
    # contact record data
    cc.VerificationHandler.set_contactRecords(cr)
    cc.ScrapeSession.set_app_scraper_queue(self.scraperQueue)
    cc.ScrapeSession.set_app_command_queue(self.commandQueue)

    print('CONTACT CHECKER READY')
    print('SCRAPE SESSION OPEN')
    print('')
    self.startupQueue.put({
        'message': 'SCRAPE SESSION OPEN',
        'progress': 'FINNISHED'
    })

    ## ////////////////// Begin Scraper Loop //////////////////
    self.commandLoop = True
    try:
        while self.commandLoop:
            self.commandLoop = self.listen_for_cmd()
    finally:
        # Always release the browser, even if a command handler raised.
        cc.VerificationHandler.close_browser()
    print('SCRAPER THREAD FINNISHED')
def start_scraper(self):
    """Load sheet data and wire up the contact checker for a scrape session.

    Progress is reported on ``self.startupQueue`` as dicts that may carry:

    * ``'progress'`` -- ``'START'``, then steps 1-7, then ``'FINNISHED'``;
    * ``'message'`` -- a label for the current phase;
    * ``'__waiting'`` / ``'__ready'`` -- a ``ScraperThread.*Val`` marker
      posted before a stage starts and after it completes;
    * ``'rowCounts'`` -- the row counts read from
      ``cc.ContactSheetOutput`` once everything is set up.

    Side effects: stores the organization records on ``self.orgRecords``
    and hands ``self.scraperQueue`` / ``self.commandQueue`` to the ``cc``
    and ``dm`` classes listed below.
    """
    get_credentials_method = smgs.modelInit()

    # Get Headers from google sheets
    print('KEYS')
    self.startupQueue.put({'progress': 'START'})
    self.startupQueue.put({'message': 'KEYS',
                           '__waiting': ScraperThread.ContactKeysVal})
    contactKeys = getContactKeys(get_credentials_method)
    self.startupQueue.put({'progress': 1,
                           '__ready': ScraperThread.ContactKeysVal,
                           '__waiting': ScraperThread.DirectoryKeysVal})
    directoryKeys = getAgencyDirKeys(get_credentials_method)
    self.startupQueue.put({'progress': 2,
                           '__ready': ScraperThread.DirectoryKeysVal})
    print('')

    # Get contact and organization website data and structure it with the
    # collected headings
    print('RECORDS')
    self.startupQueue.put({'message': 'RECORDS',
                           '__waiting': ScraperThread.ContactRecordsVal})
    contactRecords = [sheetRecord(row, contactKeys)
                      for row in getContacts(get_credentials_method)]
    self.startupQueue.put({'progress': 3,
                           '__ready': ScraperThread.ContactRecordsVal,
                           '__waiting': ScraperThread.AgencyDirectoryVal})
    self.orgRecords = [sheetRecord(row, directoryKeys)
                       for row in getAgencyDir(get_credentials_method)]
    self.startupQueue.put({'progress': 4,
                           '__ready': ScraperThread.AgencyDirectoryVal})
    print('')

    # Create the contact DataFrame (the organization records are passed on
    # as a plain list, so no DataFrame is built for them)
    self.startupQueue.put({'__waiting': ScraperThread.DataVal})
    cr = pd.DataFrame(contactRecords)
    print('DATAFRAMES READY')
    self.startupQueue.put({'message': 'DATAFRAMES READY',
                           'progress': 5,
                           '__ready': ScraperThread.DataVal})

    ## ////////////////// Initialize Contact Checker Classes with Fresh Data //////////////////
    # Setup Contact Record Output
    self.startupQueue.put({'__waiting': ScraperThread.OutputVal})
    cc.ContactSheetOutput.set_output(contactKeys)
    self.startupQueue.put({'progress': 6,
                           '__ready': ScraperThread.OutputVal,
                           '__waiting': ScraperThread.BrowserDriverVal})

    # For this scrape session give the Verification Handler class an
    # OrgSession with Organization Records.
    # IMPORTANT STEP: The browser path must be set to the current working
    # directory, which varies for different machines.
    dm.OrgSession.set_browser_path()
    cc.VerificationHandler.set_orgRecords(
        dm.HeadlessOrgSession(self.orgRecords))
    self.startupQueue.put({'progress': 7,
                           '__ready': ScraperThread.BrowserDriverVal,
                           '__waiting': ScraperThread.ContactCheckerVal})

    # For this scrape session give the Verification Handler class the
    # contact record data, then share the app queues with every class
    # that reports through them
    cc.VerificationHandler.set_contactRecords(cr)
    cc.ScrapeSession.set_app_scraper_queue(self.scraperQueue)
    cc.ContactSheetOutput.set_app_scraper_queue(self.scraperQueue)
    cc.ContactCollector.set_app_scraper_queue(self.scraperQueue)
    dm.DirectoryManager.set_app_scraper_queue(self.scraperQueue)
    dm.OrgQuery.set_app_scraper_queue(self.scraperQueue)
    cc.ScrapeSession.set_app_command_queue(self.commandQueue)

    ## Count rows and finish up
    self.startupQueue.put({'rowCounts': {
        'contact counts': cc.ContactSheetOutput.count_contacts_rows(),
        'output counts': cc.ContactSheetOutput.count_scraper_output_rows(),
    }})

    print('CONTACT CHECKER READY')
    print('SCRAPE SESSION OPEN')
    print('')
    self.startupQueue.put({'message': 'SCRAPE SESSION OPEN',
                           'progress': 'FINNISHED',
                           '__ready': ScraperThread.ContactCheckerVal})