def getGoogleLinks(searchKey, startPage, filters):
    print(' >> Starting Google Search for Links << \n')
    # Collect links across all pages; initialising the list once keeps results
    # from earlier pages instead of overwriting them on every iteration.
    scrapeLinks = []
    for loop in range(15):
        try:
            startPage += 1
            result = google.search(searchKey, startPage)

            for link in result:
                # Skip results whose link could not be resolved.
                if str(link.link) == NOT_FOUND:
                    continue

                # Drop any link that contains one of the filter keywords.
                flag = False
                for filterKey in filters:
                    if filterKey in link.link:
                        flag = True
                        break

                if not flag:
                    scrapeLinks.append(link.link)

        except Exception as e:
            error_logger.logError(format(e))

    return startPage, scrapeLinks
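
A minimal usage sketch (illustrative only, not part of the original example; it assumes dictionary.readDict() returns the same record that startScraping() below reads):

detail = dictionary.readDict()
nextPage, links = getGoogleLinks(detail['search_key'],
                                 detail['next_page'] or 1,
                                 detail['filters'])
print('Collected {} links, next start page is {}'.format(len(links), nextPage))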
Code Example #2
def listOptions():
    while True:
        try:
            print('** Google Scraper ** ')
            print('** ============== ** ')
            print('=> Options: ')
            print(' 1. Start Scraping of Existing Search Keyword')
            print(' 2. Enter new Keyword and Deprecate the Old one')
            print(' 3. Enter new Filter Key Word')
            print(' 4. Show Details of Existing Keyword')
            print(' 5. Delete Filter Key Word')
            print(' 6. Clear Screen')
            print(' 7. Exit System')
            usrInput = int(input('\n\n  => Enter Option Number : '))
            print('\n\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n\n')
            if usrInput == 1:
                content_extractor.startScraping()
            elif usrInput == 2:
                dictionary.enterNewKeyword()
            elif usrInput == 3:
                dictionary.enterNewFilter()
            elif usrInput == 4:
                dictionary.displayDictionaryDetail()
            elif usrInput == 5:
                dictionary.deleteFilter()
            elif usrInput == 6:
                clearScreen()
            elif usrInput == 7:
                break
            else:
                print(' >> Wrong Input <<')

            print('\n\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n\n')
        except Exception as e:
            error_logger.logError(format(e))
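
Option 6 calls clearScreen(), which is not shown in these examples. A minimal cross-platform sketch (an assumption, not necessarily the project's own helper):

import os

def clearScreen():
    # 'cls' clears the console on Windows, 'clear' on Linux/macOS.
    os.system('cls' if os.name == 'nt' else 'clear')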
def getHtml(url):
    try:
        driver.get(url)
        # Evaluate the live DOM once before reading page_source
        # (the return value of the script itself is not used).
        driver.execute_script('return document.documentElement.outerHTML')
        return BeautifulSoup(driver.page_source, 'lxml')

    except Exception as e:
        error_logger.logError('Error in Fetching HTML == ' + format(e), url)

    return False
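
getHtml() above depends on a module-level Selenium driver whose setup is not shown here. A headless Chrome configuration along these lines would work (the exact options are an assumption):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')          # run without opening a browser window
driver = webdriver.Chrome(options=options)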
def writeCSV(data, url=''):
    try:
        # Append one row to a date-stamped CSV; newline='' avoids the blank
        # rows the csv module otherwise produces on Windows.
        with open('Contact-Lists-' + datetime.now().strftime('%d-%b-%y') +
                  '.csv',
                  'a',
                  newline='',
                  encoding="utf-8") as fh:
            csvWriter = csv.writer(fh)
            csvWriter.writerow(data)
    except Exception as e:
        error_logger.logError(
            'Error in Writing Data into the file == ' + format(e), url)
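
For example, a contact row could be appended like this (the sample data is made up):

writeCSV(['https://example.com', 'mailto:info@example.com'], 'https://example.com')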
Code Example #5
def startScraping():
    try:
        dictionaryDetail = dictionary.readDict()
        searchKey = dictionaryDetail['search_key']
        filters = dictionaryDetail['filters']
        startPage = dictionaryDetail['next_page']
        timeNow = datetime.now()

        if searchKey == '':
            print(
                ' There is no search key to scrape. Please select option 2 and enter a search key.'
            )
            return

        if not startPage:
            startPage = 1

        if dictionaryDetail['last_executed'] != '' and (
                timeNow -
                datetime.strptime(dictionaryDetail['last_executed'],
                                  '%Y-%m-%d %H:%M:%S.%f')) < timedelta(1):
            print(
                ' =>> 24 hours have not passed since the last execution. The script cannot run now. <<=\n\n'
            )
            driver.quit()
            exit()

        # Scrape up to 50 result pages, visiting each page's filtered links.
        for loop in range(50):
            print(' >> Starting Google Search for Links << \n')
            scrapeLinks = []
            result = google.search(searchKey, startPage)
            for link in result:
                flag = False
                for filterKey in filters:
                    if filterKey in link.link:
                        flag = True
                        break

                if flag:
                    continue

                scrapeLinks.append(link.link)
            startPage += 1
            visitWebsites(scrapeLinks)

        dictionaryDetail['next_page'] = startPage
        dictionaryDetail['last_executed'] = timeNow.strftime(
            '%Y-%m-%d %H:%M:%S.%f')
        dictionary.writeDict(dictionaryDetail)
        driver.quit()
    except Exception as e:
        error_logger.logError(format(e))
def checkResponsive(html):
    try:
        # Treat a page as responsive if it has a viewport meta tag, a div with
        # class 'responsive', an inline SVG, or the word 'responsive' anywhere
        # in the markup.
        meta = html.find('meta', {'name': 'viewport'})
        responsive = html.find('div', {'class': 'responsive'})
        svg = html.find('svg')
        if str(meta) == NOT_FOUND and str(responsive) == NOT_FOUND and str(
                svg) == NOT_FOUND and ('responsive' not in str(html)):
            return False

        return True
    except Exception as e:
        error_logger.logError(format(e))
        return True
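
A quick illustration of checkResponsive() with an inline document (this assumes NOT_FOUND is the string 'None', i.e. what str() returns for a missing tag):

from bs4 import BeautifulSoup

page = BeautifulSoup(
    '<html><head><meta name="viewport" content="width=device-width"></head></html>',
    'lxml')
print(checkResponsive(page))   # True: the viewport meta tag is present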
Code Example #7
def extractEmails(html, baseUrl):
    try:
        # Collect mailto: links and any anchor text containing '@'.
        mails = []
        hrefs = html.find_all('a')
        flag = False
        for href in hrefs:
            if 'mailto' in str(href.get('href')):
                mails.append(str(href.get('href')))
                flag = True

            if '@' in str(href.get_text()):
                mails.append(str(href.get_text()))
                flag = True

        if flag:
            mails.insert(0, baseUrl)
            writeFile(mails, baseUrl)
            print(' => Contact extracted successfully for => ' + baseUrl)
    except Exception as e:
        error_logger.logError(format(e))
def visitWebsites(links):
    for link in links:
        try:
            # Reduce the link to its scheme and host,
            # e.g. https://example.com/a/b -> https://example.com
            baseUrl = link.split('://')
            baseUrl = baseUrl[0] + '://' + baseUrl[1].split('/')[0]
            html = getHtml(baseUrl)
            if not html:
                continue
            if checkResponsive(html):
                print(' => website is responsive => ' + baseUrl)
                continue
            print(' => website is not responsive => ' + baseUrl)

            extractEmails(html, baseUrl)
            allLinks = html.find_all('a')
            for href in allLinks:
                # Also check German imprint/contact/about pages; the href is
                # assumed to be relative to the site root (see the urljoin
                # sketch after this function).
                if 'impressum' in str(href.get('href')) or 'kontakt' in str(
                        href.get('href')) or 'uber' in str(href.get('href')):
                    extractEmails(
                        getHtml(baseUrl + '/' + str(href.get('href'))),
                        baseUrl)
        except Exception as e:
            error_logger.logError(format(e), link)
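
The imprint/contact links above are joined to baseUrl by plain string concatenation, which only works for relative hrefs. urllib.parse.urljoin is a safer alternative, sketched here:

from urllib.parse import urljoin

print(urljoin('https://example.com/', 'kontakt'))             # https://example.com/kontakt
print(urljoin('https://example.com/', '/impressum'))          # https://example.com/impressum
print(urljoin('https://example.com/', 'https://other.de/x'))  # absolute hrefs pass through unchanged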
def startScraping():
    try:
        dictionaryDetail = dictionary.readDict()
        searchKey = dictionaryDetail['search_key']
        filters = dictionaryDetail['filters']
        startPage = dictionaryDetail['next_page']
        timeNow = datetime.now()

        if searchKey == '':
            print(
                ' There is no search key to scrape. Please select option 2 and enter a search key.'
            )
            driver.quit()
            return

        if not startPage:
            startPage = 1

        if dictionaryDetail['last_executed'] != '' and (
                timeNow -
                datetime.strptime(dictionaryDetail['last_executed'],
                                  '%Y-%m-%d %H:%M:%S.%f')) < timedelta(1):
            print(
                ' =>> 24 hours have not passed since the last execution. The script cannot run now. <<=\n\n'
            )
            driver.quit()
            exit()

        # Record this run and the next start page before visiting the links,
        # so the progress survives even if the visit loop is interrupted.
        dictionaryDetail['last_executed'] = timeNow.strftime(
            '%Y-%m-%d %H:%M:%S.%f')
        finalPage, scrapedLinks = getGoogleLinks(searchKey, startPage, filters)
        dictionaryDetail['next_page'] = finalPage
        dictionary.writeDict(dictionaryDetail)

        visitWebsites(scrapedLinks)
    except Exception as e:
        error_logger.logError(format(e))

    driver.quit()
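
Both versions of startScraping() assume dictionary.readDict()/writeDict() persist a small keyword record with the keys search_key, filters, next_page and last_executed. A minimal JSON-backed sketch of such a module (the file name and defaults are assumptions):

import json

DICT_FILE = 'keyword_dictionary.json'   # assumed file name

def readDict():
    # Return the stored keyword record, or an empty one if nothing is saved yet.
    try:
        with open(DICT_FILE, encoding='utf-8') as fh:
            return json.load(fh)
    except FileNotFoundError:
        return {'search_key': '', 'filters': [], 'next_page': 0, 'last_executed': ''}

def writeDict(detail):
    # Persist the updated record.
    with open(DICT_FILE, 'w', encoding='utf-8') as fh:
        json.dump(detail, fh, indent=2)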