def predictLink(self, link):
    googleScraper = Scraper(link, sameOrigin=True)
    dataDocument = googleScraper.getDataDocument()
    seoDocument = SeoDocument(link, dataDocument, 1, self.language,
                              self.country)
    return self.predictDocument(seoDocument)
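A hedged usage sketch for the method above; the wrapper class name and its constructor are assumptions, since the fragment shows only the method itself:

predictor = TopicPredictor(language=u'es', country=u'ES')  # hypothetical class holding this method
print(predictor.predictLink(u'http://www.publico.es'))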
def getSeoDocumentConcurrence(link, order, language, country, sameOrigin,
                              useProxy):
    try:
        googleScraper = Scraper(link, sameOrigin=sameOrigin, useProxy=useProxy)
        dataDocument = googleScraper.getDataDocument()
        if len(dataDocument.text) > settings.DOCUMENT_MIN_CHARACTERS:
            return SeoDocument(googleScraper.redirectedUrl, dataDocument,
                               order, language, country)
        else:
            app_download_logger.error(
                u"%s length %s <= %s chars" % (link, len(
                    dataDocument.text), settings.DOCUMENT_MIN_CHARACTERS))
    except Exception as ex:
        app_download_logger.error(u"%s --> %s" % (link, ex))
    return None
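Because the helper returns None on any failure, it fans out cleanly over a pool. A minimal sketch, assuming getSeoDocumentConcurrence is in scope and that the Scraper class is safe to drive from threads (the original codebase may use multiprocessing instead); the function name and pool size are illustrative:

from concurrent.futures import ThreadPoolExecutor

def getSeoDocuments(links, language, country, sameOrigin=False,
                    useProxy=False, maxWorkers=8):
    # order is the 1-based position of each link, matching the signature above
    with ThreadPoolExecutor(max_workers=maxWorkers) as pool:
        futures = [pool.submit(getSeoDocumentConcurrence, link, order,
                               language, country, sameOrigin, useProxy)
                   for order, link in enumerate(links, start=1)]
        documents = [future.result() for future in futures]
    # drop the failures that the helper already logged and returned as None
    return [doc for doc in documents if doc is not None]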
def predictLink(model, link, language, country):
    googleScraper = Scraper(link, sameOrigin=True)
    dataDocument = googleScraper.getDataDocument()
    seoDocument = SeoDocument(link, dataDocument, 1, language, country)
    document = ' '.join(seoDocument.getTextTokens(lemmatize=True))
    print('%s --> %s' % (model.predict([document])[0], link))
    try:
        probability = model.predict_proba([document])[0]
        # classes_ lives on the final estimator of the sklearn Pipeline
        classes = model.steps[-1][-1].classes_
        results = [(topic, int(prob * 100))
                   for topic, prob in zip(classes, probability)]
        print(link)
        # show the two most probable topics
        for topic, prob in sorted(results,
                                  key=lambda tup: tup[1],
                                  reverse=True)[:2]:
            print('-------    %s -->\t%s' % (topic, prob))
    except Exception as ex:
        print(ex)
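A hedged usage sketch: model.steps[-1][-1] implies a scikit-learn Pipeline, so a pickled pipeline loaded with joblib is assumed here; the file name and the example URL are illustrative only:

import joblib

model = joblib.load('topic_classifier.joblib')  # hypothetical model file
predictLink(model, u'http://www.publico.es', language=u'es', country=u'ES')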
Example #4
#!/usr/bin/python
# -*- coding: utf-8 -*-
from data_mining.web_pages.scraper import Scraper

if __name__ == '__main__':
    url = 'https://2msoffice-downloads.phpnuke.org/en/c09262/microsoft-office-2010'
    scraper = Scraper(url)
    print(scraper.getDataDocument().text)
    
Example #5
 @classmethod
 def setUpClass(cls):
     super(ScrappingTestCase, cls).setUpClass()
     cls.googleScraper = Scraper(
         'https://docs.python.org/2/library/urlparse.html')
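A hedged sketch of a test that could follow this fixture; it assumes the DataDocument returned by getDataDocument() exposes a .text attribute, as the snippets above do:

 def test_getDataDocument_returns_text(self):
     dataDocument = self.googleScraper.getDataDocument()
     self.assertTrue(len(dataDocument.text) > 0)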
 country = u'ES'
 max_results = 600
 
 query = u'site:%s' % getDomainFromUrl(siteDomain)
 googleScraper = GoogleScraper(query=query, language=language, country=country, googleHost=getGoogleHost(country), max_results=max_results)
 internalLinks = googleScraper.search()
 
 queries = []
 queriesTemplates = [u'%s', u'link:%s', u'"%s"', u'"* %s"']
 
 for qTemplate in queriesTemplates:
     queries.append(qTemplate % getDomainFromUrl(siteDomain))
 
 for link in internalLinks:
     try:
         scraper = Scraper(link)
         dataDocument = scraper.getDataDocument()
     except Exception:
         for qTemplate in queriesTemplates:
             queries.append(qTemplate % link)
 
 
 backLinks = []
 
 for query in queries:
     googleScraper = GoogleScraper(query=query, language=language, country=country, googleHost=getGoogleHost(country), max_results=max_results)
     backLinks.extend(googleScraper.search())
     
 backLinks = list(set(backLinks))
 
if __name__ == '__main__':
    from data_mining.web_pages.scraper import Scraper
    from data_mining.web_pages.scrapers.readability import Readability
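    # SeoDocument is assumed to be defined or imported elsewhere in this
    # module; the snippet does not show where it comes from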
    # earlier test URLs, superseded by the final assignment:
    # url = u'http://www.animalclan.com/es/16739-scalibor-65cm-royal-canin-club-adult-special-performance.html'
    # url = u'http://www.publico.es'
    url = u'http://www.animalclan.com/es/15295-royal-canin-gatos-norweian-forest.html?%20search_query=norw&results=1'
    language = u'es'
    country = u'ES'

    scraper = Scraper(url, scrapingFilterClass=Readability)
    dataDocument = scraper._getDataDocument()
    seoDocument = SeoDocument(url,
                              order=1,
                              language=language,
                              country=country,
                              dataDocument=dataDocument,
                              cache=False)

    print(seoDocument.getTitleTokens(unique=False))
    print(80 * '-')
    print(seoDocument._getTextRawTokens())
    print(80 * '-')
    for sentence in seoDocument.getSentences():
        print(sentence)
    print(80 * '-')