def predictLink(self, link):
    googleScraper = Scraper(link, sameOrigin=True)
    dataDocument = googleScraper.getDataDocument()
    seoDocument = SeoDocument(link, dataDocument, 1, self.language, self.country)
    return self.predictDocument(seoDocument)
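# Usage sketch (assumption): the method above belongs to a predictor class that
# also defines predictDocument() and is constructed with language/country
# settings. The class name TopicPredictor below is hypothetical:
#
#     predictor = TopicPredictor(language=u'es', country=u'ES')
#     prediction = predictor.predictLink(u'https://example.com/some-page')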
def getSeoDocumentConcurrence(link, order, language, country, sameOrigin, useProxy):
    try:
        googleScraper = Scraper(link, sameOrigin=sameOrigin, useProxy=useProxy)
        dataDocument = googleScraper.getDataDocument()
        if len(dataDocument.text) > settings.DOCUMENT_MIN_CHARACTERS:
            return SeoDocument(googleScraper.redirectedUrl, dataDocument,
                               order, language, country)
        else:
            app_download_logger.error(
                u"%s length %s < %s chars" % (link, len(dataDocument.text),
                                              settings.DOCUMENT_MIN_CHARACTERS))
    except Exception as ex:
        app_download_logger.error(u"%s --> %s" % (link, ex))
    return None
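# Concurrency sketch (assumption): the picklable-arguments signature above
# suggests this function is meant to be fanned out across worker processes.
# A minimal example with concurrent.futures that drops the None results
# returned for failed or too-short downloads:
from concurrent.futures import ProcessPoolExecutor
from functools import partial

def downloadSeoDocuments(links, language, country, sameOrigin=False, useProxy=False):
    worker = partial(getSeoDocumentConcurrence, language=language, country=country,
                     sameOrigin=sameOrigin, useProxy=useProxy)
    with ProcessPoolExecutor() as executor:
        # Each link gets its 1-based result order alongside the shared settings.
        results = executor.map(worker, links, range(1, len(links) + 1))
    return [seoDocument for seoDocument in results if seoDocument is not None]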
def predictLink(model, link, language, country):
    googleScraper = Scraper(link, sameOrigin=True)
    dataDocument = googleScraper.getDataDocument()
    seoDocument = SeoDocument(link, dataDocument, 1, language, country)
    document = ' '.join(seoDocument.getTextTokens(lemmatize=True))
    print('%s --> %s' % (model.predict([document])[0], link))
    try:
        probability = model.predict_proba([document])[0]
        # Pair each class label of the pipeline's final estimator with its
        # probability, expressed as an integer percentage.
        results = [(model.steps[-1][-1].classes_[i], int(probability[i] * 100))
                   for i in range(len(probability))]
        print(link)
        # Show the two most likely topics.
        for topic, prob in sorted(results, key=lambda tup: tup[1], reverse=True)[:2]:
            print('------- %s -->\t%s' % (topic, prob))
    except Exception as ex:
        print(ex)
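# Model sketch (assumption): predictLink() indexes model.steps[-1][-1], which
# matches a scikit-learn Pipeline whose final step is the classifier. A minimal
# compatible pipeline (the estimator choice is illustrative) could be built as:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

def buildTopicModel(documents, topics):
    model = Pipeline([
        (u'tfidf', TfidfVectorizer()),
        (u'clf', LogisticRegression()),  # any estimator exposing predict_proba
    ])
    model.fit(documents, topics)
    return model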
#!/usr/bin/python
# -*- coding: utf-8 -*-
from data_mining.web_pages.scraper import Scraper

if __name__ == '__main__':
    url = 'https://2msoffice-downloads.phpnuke.org/en/c09262/microsoft-office-2010'
    scraper = Scraper(url)
    print(scraper.getDataDocument().text)
@classmethod
def setUpClass(cls):
    super(ScrappingTestCase, cls).setUpClass()
    cls.googleScraper = Scraper(
        'https://docs.python.org/2/library/urlparse.html')
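# Test sketch (assumption): a companion test method for the fixture above,
# following standard unittest conventions; the assertion is illustrative only.
def test_getDataDocument_returns_text(self):
    dataDocument = self.googleScraper.getDataDocument()
    self.assertTrue(len(dataDocument.text) > 0)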
country = u'ES'
max_results = 600

# Collect the site's indexed pages via a site: query.
query = u'site:%s' % getDomainFromUrl(siteDomain)
googleScraper = GoogleScraper(query=query, language=language, country=country,
                              googleHost=getGoogleHost(country),
                              max_results=max_results)
internalLinks = googleScraper.search()

# Build backlink queries for the domain itself...
queries = []
queriesTemplates = [u'%s', u'link:%s', u'"%s"', u'"* %s"']
for qTemplate in queriesTemplates:
    queries.append(qTemplate % getDomainFromUrl(siteDomain))

# ...and for every internal link that can no longer be scraped.
for link in internalLinks:
    try:
        scraper = Scraper(link)
        dataDocument = scraper.getDataDocument()
    except Exception:
        for qTemplate in queriesTemplates:
            queries.append(qTemplate % link)

backLinks = []
for query in queries:
    googleScraper = GoogleScraper(query=query, language=language, country=country,
                                  googleHost=getGoogleHost(country),
                                  max_results=max_results)
    backLinks.extend(googleScraper.search())
backLinks = list(set(backLinks))
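# Helper sketch (assumption): getDomainFromUrl() and getGoogleHost() are defined
# elsewhere in the project. Plausible minimal versions, assuming Python 3's
# urllib.parse and a country-code-to-host mapping (the table below is an
# illustrative subset, not the project's actual data):
from urllib.parse import urlparse

GOOGLE_HOSTS = {u'ES': u'www.google.es', u'US': u'www.google.com'}

def getDomainFromUrl(url):
    return urlparse(url).netloc

def getGoogleHost(country):
    return GOOGLE_HOSTS.get(country.upper(), u'www.google.com')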
# Attribute initialisation fragment (apparently from an __init__ method elided here):
self.rawHtml = ''
# ------------
self.bodyWords = 0


if __name__ == '__main__':
    from data_mining.web_pages.scraper import Scraper
    from data_mining.web_pages.scrapers.readability import Readability

    # Only the last assignment is used; the earlier URLs are kept as spare test cases.
    url = u'http://www.animalclan.com/es/16739-scalibor-65cm-royal-canin-club-adult-special-performance.html'
    url = u'http://www.publico.es'
    url = u'http://www.animalclan.com/es/15295-royal-canin-gatos-norweian-forest.html?%20search_query=norw&results=1'
    language = u'es'
    country = u'ES'
    scraper = Scraper(url, scrapingFilterClass=Readability)
    dataDocument = scraper.getDataDocument()
    seoDocument = SeoDocument(url, order=1, language=language, country=country,
                              dataDocument=dataDocument, cache=False)
    print(seoDocument.getTitleTokens(unique=False))
    print(80 * '-')
    print(seoDocument._getTextRawTokens())
    print(80 * '-')
    for sentence in seoDocument.getSentences():
        print(sentence)
    print(80 * '-')