def getGoogleLinks(searchKey, startPage, filters):
    print(' >> Starting Google Search for Links << \n')
    # Collect links across all 15 result pages. Initialising the list
    # outside the loop keeps links from earlier pages and guarantees it
    # exists even if every page request fails.
    scrapeLinks = []
    for loop in range(15):
        try:
            result = google.search(searchKey, startPage)
            for link in result:
                # Skip results whose URL could not be resolved.
                if str(link.link) == NOT_FOUND:
                    continue
                # Drop links that match any of the filter keywords.
                if any(filterKey in link.link for filterKey in filters):
                    continue
                scrapeLinks.append(link.link)
        except Exception as e:
            error_logger.logError(format(e))
        # Advance to the next result page whether or not this one succeeded,
        # starting from the stored page rather than skipping it.
        startPage += 1
    return startPage, scrapeLinks
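# Usage sketch (hypothetical keyword and filter list; `google.search` is the
# repo's own wrapper, assumed to take a search key and a page number):
#
#   nextPage, links = getGoogleLinks('web design agency', 1, ['facebook.com'])
#   # links holds the filtered result URLs, nextPage the page to resume from.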
def listOptions():
    while True:
        try:
            print('** Google Scraper ** ')
            print('** ============== ** ')
            print('=> Options: ')
            print('    1. Start Scraping of Existing Search Keyword')
            print('    2. Enter New Keyword and Deprecate Old One')
            print('    3. Enter New Filter Keyword')
            print('    4. Show Details of Existing Keyword')
            print('    5. Delete Filter Keyword')
            print('    6. Clear Screen')
            print('    7. Exit System')
            usrInput = int(input('\n\n => Enter Option Number : '))
            print('\n\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n\n')
            if usrInput == 1:
                content_extractor.startScraping()
            elif usrInput == 2:
                dictionary.enterNewKeyword()
            elif usrInput == 3:
                dictionary.enterNewFilter()
            elif usrInput == 4:
                dictionary.displayDictionaryDetail()
            elif usrInput == 5:
                dictionary.deleteFilter()
            elif usrInput == 6:
                clearScreen()
            elif usrInput == 7:
                break
            else:
                print(' >> Wrong Input <<')
            print('\n\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n\n')
        except Exception as e:
            # A non-numeric menu entry raises ValueError, which lands here.
            error_logger.logError(format(e))
def getHtml(url):
    try:
        driver.get(url)
        # Force the browser to evaluate the fully rendered DOM before the
        # page source is read (the script's return value itself is unused).
        driver.execute_script('return document.documentElement.outerHTML')
        return BeautifulSoup(driver.page_source, 'lxml')
    except Exception as e:
        error_logger.logError('Error in Fetching HTML == ' + format(e), url)
        return False
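# Usage sketch (hypothetical URL): getHtml returns a BeautifulSoup tree on
# success and False on failure, so callers should test the result before use:
#
#   html = getHtml('https://example.com')
#   if html:
#       print(html.title)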
def writeCSV(data, url=''):
    try:
        # Append one row per call; newline='' prevents the csv module from
        # inserting blank lines on Windows.
        fileName = 'Contact-Lists-' + datetime.now().strftime('%d-%b-%y') + '.csv'
        with open(fileName, 'a', encoding='utf-8', newline='') as fh:
            csvWriter = csv.writer(fh)
            csvWriter.writerow(data)
    except Exception as e:
        error_logger.logError(
            'Error in Writing Data into the file == ' + format(e), url)
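# Usage sketch (hypothetical row): each call appends one line to the day's
# contact-list CSV, e.g.
#
#   writeCSV(['https://example.com', 'mailto:info@example.com'])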
def checkResponsive(html):
    try:
        # Heuristics for a responsive site: a viewport meta tag, an element
        # with a 'responsive' class, any inline SVG, or the word
        # 'responsive' anywhere in the markup.
        meta = html.find('meta', {'name': 'viewport'})
        responsive = html.find('div', {'class': 'responsive'})
        svg = html.find('svg')
        if (str(meta) == NOT_FOUND and str(responsive) == NOT_FOUND
                and str(svg) == NOT_FOUND and 'responsive' not in str(html)):
            return False
        return True
    except Exception as e:
        error_logger.logError(format(e))
        # Fail safe: report the site as responsive so the caller skips it.
        return True
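# Note: BeautifulSoup's find() returns None when nothing matches, so the
# NOT_FOUND constant compared against above is assumed to be the string
# 'None' (i.e. str(None)), defined elsewhere in the repo.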
def extractEmails(html, baseUrl):
    try:
        mails = []
        hrefs = html.find_all('a')
        flag = False
        for href in hrefs:
            # Collect mailto: links and any anchor text that looks like an
            # email address.
            if 'mailto' in str(href.get('href')):
                mails.append(str(href.get('href')))
                flag = True
            if '@' in str(href.get_text()):
                mails.append(str(href.get_text()))
                flag = True
        if flag:
            mails.insert(0, baseUrl)
            writeCSV(mails, baseUrl)
            print(' => Contact Extracted Successfully for => ' + baseUrl)
    except Exception as e:
        error_logger.logError(format(e))
from urllib.parse import urljoin  # stdlib; builds contact-page URLs safely


def visitWebsites(links):
    for link in links:
        try:
            # Reduce each result URL to its scheme + host.
            parts = link.split('://')
            baseUrl = parts[0] + '://' + parts[1].split('/')[0]
            html = getHtml(baseUrl)
            # getHtml returns False on failure; skip such sites.
            if not html:
                continue
            if checkResponsive(html):
                print(' => website is responsive => ' + baseUrl)
                continue
            print(' => website is not responsive => ' + baseUrl)
            extractEmails(html, baseUrl)
            # Also try the usual German contact pages (Impressum, Kontakt,
            # Über uns) linked from the landing page.
            for href in html.find_all('a'):
                target = str(href.get('href'))
                if 'impressum' in target or 'kontakt' in target or 'uber' in target:
                    # urljoin handles both relative and absolute hrefs, where
                    # plain concatenation would mangle absolute URLs.
                    subHtml = getHtml(urljoin(baseUrl + '/', target))
                    if subHtml:
                        extractEmails(subHtml, baseUrl)
        except Exception as e:
            error_logger.logError(format(e), link)
def startScraping():
    try:
        dictionaryDetail = dictionary.readDict()
        searchKey = dictionaryDetail['search_key']
        filters = dictionaryDetail['filters']
        startPage = dictionaryDetail['next_page']
        timeNow = datetime.now()
        # Comparing strings with 'is' is unreliable; test for emptiness instead.
        if not searchKey:
            print(' There is no search key to scrape. Please select option 2 and enter a search key.')
            driver.quit()
            return
        if not startPage:
            startPage = 1
        # Enforce one run per 24 hours based on the stored timestamp.
        if dictionaryDetail['last_executed'] != '' and (
                timeNow - datetime.strptime(dictionaryDetail['last_executed'],
                                            '%Y-%m-%d %H:%M:%S.%f')) < timedelta(1):
            print(' =>> 24 hrs have not passed since the last execution. The script cannot run now. <<=\n\n')
            driver.quit()
            exit()
        dictionaryDetail['last_executed'] = timeNow.strftime('%Y-%m-%d %H:%M:%S.%f')
        finalPage, scrapedLinks = getGoogleLinks(searchKey, startPage, filters)
        # Persist the resume point before visiting sites so progress is not
        # lost if scraping crashes midway.
        dictionaryDetail['next_page'] = finalPage
        dictionary.writeDict(dictionaryDetail)
        visitWebsites(scrapedLinks)
    except Exception as e:
        error_logger.logError(format(e))
    driver.quit()
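# The functions above depend on module-level objects created elsewhere in
# the repo. A minimal sketch of the assumed setup follows; every name and
# detail here is an assumption for illustration, not the repo's actual code:
#
#   import csv
#   from datetime import datetime, timedelta
#
#   from bs4 import BeautifulSoup
#   from selenium import webdriver
#
#   import dictionary         # keyword/filter/next_page persistence helpers
#   import error_logger       # exposes logError(message, url='')
#   import google             # wrapper exposing search(searchKey, page)
#   import content_extractor  # module containing startScraping()
#
#   NOT_FOUND = 'None'        # str(None), i.e. a missing tag/attribute
#   driver = webdriver.Chrome()
#
#   if __name__ == '__main__':
#       listOptions()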