    def getGoogleLinks(self):
        """Return cached links, fetching them on the first call.

        A random draw decides between the paid GoogleSearchEngine and the
        free GoogleScraper, weighted by settings.GOOGLE_SCRAPER_PROBABILITY.
        """
        if not self.links:
            usePaidVersion = random.randint(
                0, 99) >= settings.GOOGLE_SCRAPER_PROBABILITY

            try:
                if usePaidVersion:
                    searchEngine = GoogleSearchEngine(
                        self.query,
                        self.language,
                        self.country,
                        getGoogleHost(self.country),
                        max_results=self.downloadLimit)
                    self.links = searchEngine.search()
                else:
                    googleScraper = GoogleScraper(
                        query=self.query,
                        language=self.language,
                        country=self.country,
                        googleHost=getGoogleHost(self.country),
                        max_results=self.downloadLimit)
                    self.links = googleScraper.search()
            except Exception as e:
                app_logger.info('%s' % e)
                app_logger.info('Google Scraper and API both failed')

        return self.links
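# Hedged sketch (not part of the original module): the draw in
# getGoogleLinks() sends roughly GOOGLE_SCRAPER_PROBABILITY percent of
# queries to the free scraper and the rest to the paid API. The standalone
# check below assumes a hypothetical setting of 80; the real value lives in
# the project's settings module.
import random

GOOGLE_SCRAPER_PROBABILITY = 80  # assumed value, not the project's setting

paidDraws = sum(random.randint(0, 99) >= GOOGLE_SCRAPER_PROBABILITY
                for _ in range(10000))
# Draws 80..99 select the paid API, so roughly 20% of queries should use it.
print('paid ratio: %.3f' % (paidDraws / 10000.0))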
    def test_google_scraper_search(self):
        googleScraper = GoogleScraper(query=GoogleSearchTest.query,
                                      language=GoogleSearchTest.language,
                                      country=GoogleSearchTest.country,
                                      googleHost=GoogleSearchTest.googleHost,
                                      max_results=20)

        items = googleScraper.search()
        # Scraping is flaky, so accept more than half of the 20 requested URLs.
        self.assertTrue(
            len(items) > 10, 'GoogleScraper: problem obtaining 20 URLs')
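# Hedged sketch (assumption, not shown in the original): the GoogleSearchTest
# fixture referenced above would define class-level attributes along these
# lines; the actual query and host values are hypothetical.
class GoogleSearchTestSketch(object):
    query = u'seo audit'       # hypothetical test query
    language = u'es'
    country = u'ES'
    googleHost = 'www.google.es'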
def _getGoogleLinks(query, language, country, downloadLimit):
    try:
        print('Language: %s Country: %s Query: %s' % (language, country, query))
        googleScraper = GoogleScraper(query=query,
                                      language=language,
                                      country=country,
                                      googleHost=getGoogleHost(country),
                                      max_results=downloadLimit)
        return googleScraper.search(jump=True)
    except Exception as ex:
        print(ex)
        return []
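# Hedged sketch (assumption): getGoogleHost() is imported elsewhere in this
# file from data_mining.search_engines.google; a minimal stand-in mapping
# country codes to Google domains could look like this. The real
# implementation may differ.
def getGoogleHost_sketch(country):
    hosts = {'ES': 'www.google.es',
             'EC': 'www.google.com.ec',
             'US': 'www.google.com'}
    return hosts.get(country.upper(), 'www.google.com')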
def getDomainCompetence(siteDomain, language, country):

    query = u'related:%s' % siteDomain

    googleScraper = GoogleScraper(query=query,
                                  language=language,
                                  country=country,
                                  googleHost=getGoogleHost(country),
                                  max_results=20)
    links = googleScraper.search()

    # A plain set() would lose the ranking order, so de-duplicate the domains
    # with an OrderedDict instead. list() keeps the result sliceable on
    # Python 3, where keys() returns a non-sliceable view.
    results = list(OrderedDict([(getDomainFromUrl(link), 0)
                                for link in links]))

    return results[:MAX_COMPETENCE_URLS]
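# Hedged sketch (assumption): getDomainFromUrl() is imported from
# seo.audits.site.google_ranking elsewhere in this file; a minimal stand-in
# could look like this. The real helper may normalize differently.
try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse      # Python 2

def getDomainFromUrl_sketch(url):
    # Strip the scheme and any leading 'www.' prefix.
    netloc = urlparse(url).netloc or url
    return netloc[4:] if netloc.startswith('www.') else netloc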
def getQueryRanking(query, language, country):
    googleScraper = GoogleScraper(query=query,
                                  language=language,
                                  country=country,
                                  googleHost=getGoogleHost(country),
                                  max_results=150)

    # Download up to 150 results so they end up cached for the text audit
    # later on, but keep only the first 50 here.
    links = googleScraper.search()[0:50]

    results = {}

    # Keep only the first (best-ranked) URL seen for each domain.
    for order, link in enumerate(links):
        domain = getDomainFromUrl(link)
        if domain not in results:
            results[domain] = GoogleRankedUrl(link, order + 1)

    return results
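# Hedged sketch (assumption): GoogleRankedUrl is not defined in this file.
# A minimal value object consistent with the call above, pairing a URL with
# its 1-based position in the results, could be:
from collections import namedtuple

GoogleRankedUrl_sketch = namedtuple('GoogleRankedUrl', ['url', 'position'])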
    def search(self, jump=True, exactSearch=False):
        fileStorage = FileStorageFactory.getFileStorage(
            GoogleSearchEngine.CACHE_PATH)
        key = '%s.%s.%s.%s' % (self.query, self.language, self.country,
                               self.max_results)
        links = fileStorage.get(key)
        if not links or not settings.CACHE:
            app_error_logger.error(80 * '-')
            app_error_logger.error(
                'Heads up: we are using the paid method $$$$$$')
            app_error_logger.error(80 * '-')
            try:
                self._search(self.dateRestrict, 1)
                links = [item.link for item in self.items]
            except Exception as ex:
                app_error_logger.error('%s' % ex)

            if not links and jump:
                app_error_logger.error(
                    u"GoogleSearchEngine failed. Trying with GoogleScraper")
                from data_mining.search_engines.google.google_scraper import GoogleScraper
                googleScraper = GoogleScraper(query=self.query,
                                              language=self.language,
                                              country=self.country,
                                              googleHost=self.googleHost,
                                              max_results=self.max_results)
                # jump=False so the scraper cannot bounce back here.
                links = googleScraper.search(jump=False,
                                             exactSearch=exactSearch)

            if not links:
                raise Exception('Google Download Error')

            # De-duplicate while preserving the original ranking order.
            uniqueLinks = []
            for link in links:
                if link not in uniqueLinks:
                    uniqueLinks.append(link)
            links = uniqueLinks

            fileStorage.set(key, links)

        return links
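# Hedged sketch (assumption): FileStorageFactory is not shown in this file.
# A minimal file-backed key/value store compatible with the get()/set()
# calls above could look like this; the real implementation may differ.
import os
import pickle
import hashlib

class FileStorageSketch(object):
    def __init__(self, path):
        self.path = path
        if not os.path.isdir(path):
            os.makedirs(path)

    def _file(self, key):
        # Hash the key so arbitrary query strings map to safe file names.
        name = hashlib.md5(key.encode('utf-8')).hexdigest()
        return os.path.join(self.path, name)

    def get(self, key):
        try:
            with open(self._file(key), 'rb') as fh:
                return pickle.load(fh)
        except (IOError, OSError):
            return None

    def set(self, key, value):
        with open(self._file(key), 'wb') as fh:
            pickle.dump(value, fh)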
from data_mining.search_engines.google.google_scraper import GoogleScraper
from data_mining.search_engines.google import getGoogleHost
from seo.audits.site.google_ranking import getDomainFromUrl
from data_mining.web_pages.scraper import Scraper
from bs4 import BeautifulSoup


if __name__ == '__main__':

    siteDomain = u'http://www.dinersclub.com.ec'
    language = u'es'
    country = u'ES'
    max_results = 600

    # Collect the site's indexed pages with a site: query.
    query = u'site:%s' % getDomainFromUrl(siteDomain)
    googleScraper = GoogleScraper(query=query,
                                  language=language,
                                  country=country,
                                  googleHost=getGoogleHost(country),
                                  max_results=max_results)
    internalLinks = googleScraper.search()

    queries = []
    queriesTemplates = [u'%s', u'link:%s', u'"%s"', u'"* %s"']

    for qTemplate in queriesTemplates:
        queries.append(qTemplate % getDomainFromUrl(siteDomain))

    for link in internalLinks:
        try:
            scraper = Scraper(link)
            dataDocument = scraper.getDataDocument()
        except Exception:
            # If the page cannot be scraped, fall back to querying Google
            # for the link itself with each template.
            for qTemplate in queriesTemplates:
                queries.append(qTemplate % link)