def getTextTokens(self, removeSplitter=True, lemmatize=True):
        '''
        Get the word tokens from the document text.
        '''
        if not self.tokens:
            fileStorage = FileStorageFactory.getFileStorage(
                SeoDocument.CACHE_PATH)
            key = u'tokens_%s_%s_%s' % (self.link, self.language, self.country)
            self.tokens = fileStorage.get(key)
            if not self.tokens or not self.cache or settings.SCRAPER_RELOAD_CONTENT:
                self.tokens = self._getTextRawTokens()
                if self.cache:
                    fileStorage.set(key, self.tokens)

        if lemmatize:
            tokens = self._getTextLemmaTokens(self.tokens, ngrade=1)
        else:
            tokens = self.tokens

        if removeSplitter:
            try:
                return [token for token in tokens if token not in SPLITTER]
            except Exception:
                # A non-iterable tokens value is a bug: surface it instead
                # of silently returning None.
                print(tokens)
                raise
        return tokens
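Every snippet in this listing repeats the same cache-aside shape: build a key, try fileStorage.get(key), and on a miss (or when a settings flag forces a reload) recompute the value and write it back with fileStorage.set(key, value). A minimal sketch of that shared pattern as a standalone helper; the name getOrCompute and its signature are hypothetical, and it assumes only the FileStorage behaviour visible in these snippets (get returns a falsy value on a miss, set stores a value):

def getOrCompute(fileStorage, key, compute, useCache=True, forceReload=False):
    # Return the cached value for key, recomputing on a miss, when caching
    # is disabled, or when a reload is forced.
    value = fileStorage.get(key)
    if not value or not useCache or forceReload:
        value = compute()
        if useCache:
            fileStorage.set(key, value)
    return value

With such a helper, getProxies below would reduce to roughly getOrCompute(fileStorage, u'_proxy', self._getProxies, useCache=settings.CACHE).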
 def textReadabilityScore(self, text):
     md5Text = hashlib.md5(force_bytes(text)).hexdigest()
     fileStorage = FileStorageFactory.getFileStorage(ReadabilityText.CACHE_PATH)
     key = u'textReadabilityScore_%s_%s' % (self.language, md5Text)
     result = fileStorage.get(key)
     if not result or not settings.CACHE:
         result = self._textReadabilityScore(text)
         fileStorage.set(key, result)  # store the score so later calls hit the cache
     return result
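Both textReadabilityScore and getBadWords below key their cache entries on an md5 digest of the text itself, so identical inputs always map to the same entry no matter how long the text is. A small illustration of that keying; the sample text and language are made up, and force_bytes is assumed to be Django's django.utils.encoding.force_bytes:

import hashlib

text = u'Some text to score'
digest = hashlib.md5(text.encode('utf-8')).hexdigest()  # stable for identical text
key = u'textReadabilityScore_%s_%s' % ('es', digest)
# A second call with the same text builds the same key and hits the cache.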
 def getProxies(self):
     fileStorage = FileStorageFactory.getFileStorage(
         ProxyBuyProxies.CACHE_PATH)
     key = u'_proxy'
     self.proxies = fileStorage.get(key)
     if not self.proxies or not settings.CACHE:
         self.proxies = self._getProxies()
         random.shuffle(self.proxies)
         self._saveProxies()
     return self.proxies
 def getTokens(self, contextLimit=5, display=DISPLAY):
     if not self.details:
         fileStorage = FileStorageFactory.getFileStorage(
             DetailedTerms.CACHE_PATH)
         key = u'detailedTerms_%s_%s_%s' % (self.query, self.language,
                                            self.country)
         self.details = fileStorage.get(key)
         if not self.details or not settings.CACHE:
             self.details = self._getTerms(contextLimit)
             fileStorage.set(key, self.details)
     return self.details
 def getBadWords(self, text):
     ''' Return the LanguageTool check result for the given text, cached by text hash and language. '''
     md5Text = hashlib.md5(force_bytes(text)).hexdigest()
     fileStorage = FileStorageFactory.getFileStorage(
         LaguageToolChecker.LANGUAGETOOLCHECKER_CACHE_PATH)
     key = u'languageToolChecker__%s_%s' % (md5Text, self.language_country)
     result = fileStorage.get(key)
     if not result or not settings.CACHE:
         result = self._getBadWords(text)
         fileStorage.set(key, result)
     return result
 def getDataDocument(self):
     fileCache = FileStorageFactory.getFileStorage(Scraper.CACHE_PATH)
     dataDocument = fileCache.get(self.url)
     if dataDocument and settings.SCRAPER_RELOAD_CONTENT:
         # Keep the cached raw HTML but rebuild the parsed document.
         self.rawHtml = dataDocument.rawHtml
         dataDocument = self._getDataDocument()
         fileCache.set(self.url, dataDocument)
     elif not dataDocument or not settings.CACHE:
         # Cache miss (or caching disabled): fetch and store from scratch.
         dataDocument = self._getDataDocument()
         fileCache.set(self.url, dataDocument)
     return dataDocument
 def getSentences(self):
     if not self.sentences:
         fileStorage = FileStorageFactory.getFileStorage(
             SeoDocument.CACHE_PATH)
         key = u'sentences_%s_%s_%s' % (self.link, self.language,
                                        self.country)
         self.sentences = fileStorage.get(key)
         if not self.sentences or not self.cache:
             self.sentences = nltk_utils.sentenceTokenizer(
                 self.dataDocument.text.replace(SPLITTER_TAG, '.'),
                 self.language)
             if self.cache:
                 fileStorage.set(key, self.sentences)
     return self.sentences
 def getTokens(self, window=2, minCount=5, ntotal=30, display=False):
     if not self.terms:
         fileStorage = FileStorageFactory.getFileStorage(
             ScoredTerms.CACHE_PATH)
         key = u'scoredTerms_%s_%s_%s' % (self.seoLibrary.query,
                                          self.seoLibrary.language,
                                          self.seoLibrary.country)
         self.terms = fileStorage.get(key)
         if not self.terms or not settings.CACHE:
             self.terms = self._getTerms(window, minCount, ntotal)
             fileStorage.set(key, self.terms)
     if display:
         for word, metric in self.terms.items():
             app_logger.debug(u'%s --> %s' % (word, metric))
     return self.terms
 def search(self):
     fileStorage = FileStorageFactory.getFileStorage(GoogleScraperRelatedSelenium.CACHE_PATH)
     key = '%s.%s.%s.%s' % (self.query, self.language, self.country, self.max_results)
     related = fileStorage.get(key)
     if not related:
         related = []
         try:
             related.extend(self._search(0))
         except Exception as ex:
             app_logger.error(u"%s" % ex)
         
         if not related:
             raise Exception('Google Selenium Related Error')
         
         related = list(set(related))
         fileStorage.set(key, related)
     return related
    def snapshot(self):

        browser = None
        result = {}

        initTime = time.time()

        try:
            # ------------------------------------------------------
            key = u'%s.%s.%s' % (self.url, self.width, self.height)
            imageUrl = self.imageFileStorage.get(key)
            if not imageUrl:
                screen_path = self.imageFileStorage._key_to_file(key)
                screen_path = screen_path.replace('djcache', 'jpg')
                self.imageFileStorage._createSubFolder()  # create the cache subfolder

                ret, browser = self._snapshot(screen_path)
                if ret:
                    imageUrl = screen_path.replace(
                        settings.SCREENCAPTURE_PATH,
                        settings.SCREENCAPTURE_DOMAIN)
                    # store in cache
                    self.imageFileStorage.set(key, imageUrl)

            # ------------------------------------------------------
            if self.processSelenium:
                fileStorage = FileStorageFactory.getFileStorage(
                    ScreenCapture.SELENIUM_CACHE)
                key = u'seleniumCache_%s' % (self.url, )
                result = fileStorage.get(key)
                if not result or not settings.CACHE:
                    result, browser = self._processSelenium(browser)
                    fileStorage.set(key, result)

            elapsedTime = time.time() - initTime
            result.update({
                'imageUrl': imageUrl,
                'elapsedTime': elapsedTime,
            })

            return result

        finally:
            if browser:
                browser.close()
def getData(topics, initLevel=settings.TRAINER_INIT_LEVEL, language='es', country='ES'):
    fileStorage = FileStorageFactory.getFileStorage(CLASSIFIER_DATA_PATH)
    key = 'trainerData_%s_%s_%s_%s_%s_%s_%s' % (
        language,
        country,
        initLevel,
        settings.TRAINER_DOWNLOAD_PERCENTAGE,
        settings.TRAINER_DOWNLOADER_INTERVAL,
        settings.TRAINER_DOWNLOADER_PARTS,
        settings.TRAINER_TREE_TYPE,
    )
    result = fileStorage.get(key)
    if not result or not settings.CACHE or not settings.TRAINER_DOWNLOAD_DOCUMENTS or settings.SCRAPER_RELOAD_CONTENT:
        print('NO CACHE --- Generating trainer data... %s' % key)
        result = _getData(topics, initLevel, language, country)
        if settings.TRAINER_DOWNLOAD_DOCUMENTS:
            fileStorage.set(key, result)
    return result
    def search(self, jump=True):
        fileStorage = FileStorageFactory.getFileStorage(
            GoogleSeleniumPlus.CACHE_PATH)
        key = '%s.%s.%s.%s' % (self.query, self.language, self.country,
                               self.max_results)
        links = fileStorage.get(key)
        if not links:
            pages = int(
                math.ceil(self.max_results * 1.0 /
                          GoogleSeleniumPlus.PAGE_LIMIT))
            links = []

            try:
                for start in range(pages):
                    links.extend(
                        self._search(start * GoogleSeleniumPlus.PAGE_LIMIT))
            except Exception as ex:
                app_logger.error(u"%s" % ex)

            if not links and jump:
                from data_mining.search_engines.google.google_api_search import GoogleSearchEngine
                app_logger.error(
                    u"Google Selenium Failed. Trying with SearchEngine")
                searchEngine = GoogleSearchEngine(self.query,
                                                  self.language,
                                                  self.country,
                                                  self.googleHost,
                                                  max_results=self.max_results)
                links = searchEngine.search(jump=False)

            if not links:
                raise Exception('Google Selenium Error')

            uniqueLinks = []
            forbidden_regex = re.compile(settings.FORBIDDEN_URLS)
            for link in links:
                if link not in uniqueLinks:
                    if not forbidden_regex.search(link):
                        uniqueLinks.append(link)

            links = uniqueLinks[0:self.max_results]
            fileStorage.set(key, links)
        return links
    def search(self):
        fileStorage = FileStorageFactory.getFileStorage(
            GoogleScraperRelated.CACHE_PATH)
        key = '%s.%s.%s' % (self.query, self.language, self.country)
        related = fileStorage.get(key)
        if not related:
            related = []

            try:
                related = self._search(retries=settings.GOOGLE_SCRAPER_RETRIES)
            except Exception as ex:
                app_logger.error(u"_googleRelated %s" % ex)

            if not related:
                raise Exception('Google Scraper Related Empty')

            fileStorage.set(key, related)

        return related
    def search(self, jump=True, exactSearch=False):
        fileStorage = FileStorageFactory.getFileStorage(
            GoogleSearchEngine.CACHE_PATH)
        key = '%s.%s.%s.%s' % (self.query, self.language, self.country,
                               self.max_results)
        links = fileStorage.get(key)
        if not links or not settings.CACHE:
            app_error_logger.error(80 * '-')
            app_error_logger.error(
                'Heads up: we are using the paid $$$$$$ method')
            app_error_logger.error(80 * '-')
            try:
                self._search(self.dateRestrict, 1)
                links = [item.link for item in self.items]
            except Exception as ex:
                app_error_logger.error('%s' % ex)

            if not links and jump:
                app_error_logger.error(
                    u"GoogleSearchEngine failed. Trying with Google Scraper")
                from data_mining.search_engines.google.google_scraper import GoogleScraper
                googleScrapper = GoogleScraper(query=self.query,
                                               language=self.language,
                                               country=self.country,
                                               googleHost=self.googleHost,
                                               max_results=self.max_results)
                links = googleScrapper.search(jump=False,
                                              exactSearch=exactSearch)

            if not links:
                raise Exception('Google Download Error')

            uniqueLinks = []
            for link in links:
                if link not in uniqueLinks:
                    uniqueLinks.append(link)
            links = uniqueLinks

            fileStorage.set(key, links)

        return links
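The jump flag implements a one-shot mutual fallback: GoogleSearchEngine can fall back to GoogleScraper and vice versa, but every fallback call passes jump=False, so the two engines can never bounce a failing query back and forth indefinitely. A stripped-down sketch of that pattern; the engine bodies here are placeholders, not the real search logic:

def search_a(query, jump=True):
    try:
        return [u'a:%s' % query]  # stand-in for the real engine call
    except Exception:
        if jump:
            return search_b(query, jump=False)  # fall back exactly once
        raise

def search_b(query, jump=True):
    try:
        return [u'b:%s' % query]
    except Exception:
        if jump:
            return search_a(query, jump=False)
        raise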
 def getTokens(self,
               window=3,
               minCount=5,
               lowerLimit=0.45,
               positiveQueries=None,
               numTotal=40,
               display=False):
     # None avoids sharing a mutable default list between calls.
     positiveQueries = positiveQueries or []
     if not self.terms:
         fileStorage = FileStorageFactory.getFileStorage(
             RelatedTerms.CACHE_PATH)
         key = u'relatedTerms_%s_%s_%s' % (self.seoLibrary.query,
                                           self.seoLibrary.language,
                                           self.seoLibrary.country)
         self.terms = fileStorage.get(key)
         if not self.terms or not settings.CACHE:
             self.terms = self._getTerms(window, minCount, lowerLimit,
                                         positiveQueries, numTotal)
             fileStorage.set(key, self.terms)
     if display:
         for word, metric in self.terms.items():
             app_logger.debug(u'%s --> %s' % (word, metric))
     return self.terms
 def _saveProxies(self):
     fileStorage = FileStorageFactory.getFileStorage(
         ProxyBuyProxies.CACHE_PATH)
     key = u'_proxy'
     fileStorage.set(key, self.proxies, timeout=24 * 60 * 60)  # expire after 24 hours
    def getSeoDocuments(self):
        '''
        seoDocuments is a dictionary so that links that keep failing can be
        dropped without waiting for the pool timeout to fire.

        https://docs.python.org/2/library/multiprocessing.html
        http://stackoverflow.com/questions/3160909/how-do-i-deal-with-certificates-using-curl-while-trying-to-access-an-https-url
        '''
        fileStorage = FileStorageFactory.getFileStorage(
            SeoDocumentDownloader.CACHE_PATH)
        key = 'seoDocumentDownloader_%s_%s_%s_%s' % (
            self.query, self.language, self.country, self.downloadLimit)
        seoDocumentDict = fileStorage.get(key, default={})
        if not seoDocumentDict or not settings.CACHE or settings.SCRAPER_RELOAD_CONTENT:
            self.getLinks()

            downloadPool = WorkersPoolFactory.getPool()
            app_download_logger.info('Urls to download: ')
            app_download_logger.info(self.links)

            results = []

            regex = re.compile(settings.FORBIDDEN_URLS)

            for order, link in enumerate(self.links):
                if not regex.search(link):
                    result = downloadPool.apply_async(
                        getSeoDocumentConcurrence,
                        args=(link, order, self.language, self.country,
                              self.sameOrigin, self.useProxy))
                    results.append(result)

            seoDocumentDict = {}

            for result in results:
                try:
                    seoDocument = result.get(
                        timeout=settings.SEO_TERMS_DOWNLOADER_TIMEOUT)
                    if seoDocument:
                        seoDocumentDict[seoDocument.link] = seoDocument
                    else:
                        app_download_logger.error(u"No seoDocument or timeout")
                except Exception as ex:
                    app_download_logger.error(u"%s" % ex)
            # Keying the dict by link guarantees there are no duplicate urls.

            app_download_logger.info(
                'Number of documents downloaded (AFTER): %s' %
                len(seoDocumentDict))
            app_download_logger.info('Max to download: %s' % len(self.links))

            fileStorage.set(key, seoDocumentDict)

        return sorted(seoDocumentDict.values(),
                      key=lambda x: x.order,
                      reverse=False)[0:self.downloadLimit]
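The docstring above carries the key design note: collecting results in a dict keyed by link means a link that fails or times out is simply skipped, and duplicate urls collapse into a single entry, instead of one bad link stalling the whole batch. A self-contained sketch of that pattern; ThreadPool, the fetch stand-in, and the sample links are illustrative, not the real downloader:

from multiprocessing.pool import ThreadPool

def fetch(link, order):
    # stand-in for getSeoDocumentConcurrence
    return {'link': link, 'order': order}

links = ['http://a.example', 'http://b.example', 'http://a.example']
pool = ThreadPool(4)
pending = [pool.apply_async(fetch, args=(link, order))
           for order, link in enumerate(links)]

documents = {}
for result in pending:
    try:
        doc = result.get(timeout=5)   # per-result timeout, as above
        documents[doc['link']] = doc  # keyed by link: duplicates collapse
    except Exception:
        pass                          # a failing link is skipped, not fatal

pool.close()
pool.join()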