def crawl(self, crawlCandidate):
    """Run the full extraction pipeline for one crawl candidate.

    Fetches the raw HTML, extracts metadata, locates the main content
    node, formats its text, optionally fetches the best image, then
    releases temporary resources.  Returns an empty Article when the
    HTML fetch fails.
    """
    article = Article()
    candidate = URLHelper.getCleanedUrl(crawlCandidate.url)
    html = self.getHTML(crawlCandidate, candidate)
    if html is None:
        return article

    document = self.getDocument(candidate.url, html)
    content_extractor = self.getExtractor()
    cleaner = self.getDocCleaner()
    formatter = self.getOutputFormatter()

    # record raw inputs; rawDoc keeps an untouched copy for image lookup
    article.finalUrl = candidate.url
    article.linkhash = candidate.linkhash
    article.rawHtml = html
    article.doc = document
    article.rawDoc = deepcopy(document)

    # metadata is read from the document before it gets cleaned
    article.title = content_extractor.getTitle(article)
    # TODO
    # article.publishDate = config.publishDateExtractor.extract(doc)
    # article.additionalData = config.getAdditionalDataExtractor.extract(doc)
    article.metaLang = content_extractor.getMetaLang(article)
    article.metaFavicon = content_extractor.getMetaFavicon(article)
    article.metaDescription = content_extractor.getMetaDescription(article)
    article.metaKeywords = content_extractor.getMetaKeywords(article)
    article.canonicalLink = content_extractor.getCanonicalLink(article)
    article.domain = content_extractor.getDomain(article.finalUrl)
    article.tags = content_extractor.extractTags(article)

    # before we do any calcs on the body itself let's clean up the document
    article.doc = cleaner.clean(article)

    # big stuff: pick the most content-dense node
    article.topNode = content_extractor.calculateBestNodeBasedOnClustering(article)
    if article.topNode is not None:
        # TODO: movies
        # article.movies = extractor.extractVideos(article.topNode)
        if self.config.enableImageFetching:
            image_extractor = self.getImageExtractor(article)
            article.topImage = image_extractor.getBestImage(
                article.rawDoc, article.topNode)
        article.topNode = content_extractor.postExtractionCleanup(article.topNode)
        article.cleanedArticleText = formatter.getFormattedText(article)
        article.topNode.attrib['rel'] = 'topnode'  # mark html element
        article.h1 = content_extractor.getH1(article)

    # cleanup tmp file
    self.releaseResources(article)
    return article
def crawl(self, crawlCandidate):
    """Crawl one candidate URL and build an Article from it.

    Returns an empty Article when no raw HTML can be fetched; otherwise
    populates metadata, the top content node and the formatted text,
    fetching the best image when the config enables it.
    """
    article = Article()
    parse_candidate = URLHelper.getCleanedUrl(crawlCandidate.url)
    raw_html = self.getHTML(crawlCandidate, parse_candidate)
    if raw_html is None:
        return article

    doc = self.getDocument(parse_candidate.url, raw_html)
    extractor = self.getExtractor()
    doc_cleaner = self.getDocCleaner()
    output_formatter = self.getOutputFormatter()

    # seed the article with the fetch results; rawDoc is an untouched
    # copy used later for image extraction
    article.finalUrl = parse_candidate.url
    article.linkhash = parse_candidate.linkhash
    article.rawHtml = raw_html
    article.doc = doc
    article.rawDoc = deepcopy(doc)

    # metadata is collected before the cleaner mutates the document
    article.title = extractor.getTitle(article)
    # TODO
    # article.publishDate = config.publishDateExtractor.extract(doc)
    # article.additionalData = config.getAdditionalDataExtractor.extract(doc)
    article.metaLang = extractor.getMetaLang(article)
    article.metaFavicon = extractor.getMetaFavicon(article)
    article.metaDescription = extractor.getMetaDescription(article)
    article.metaKeywords = extractor.getMetaKeywords(article)
    article.canonicalLink = extractor.getCanonicalLink(article)
    article.domain = extractor.getDomain(article.finalUrl)
    article.tags = extractor.extractTags(article)

    # strip boilerplate before scoring candidate nodes
    article.doc = doc_cleaner.clean(article)

    # big stuff
    article.topNode = extractor.calculateBestNodeBasedOnClustering(article)
    if article.topNode is not None:
        # TODO: movies
        # article.movies = extractor.extractVideos(article.topNode)
        if self.config.enableImageFetching:
            image_extractor = self.getImageExtractor(article)
            article.topImage = image_extractor.getBestImage(
                article.rawDoc, article.topNode)
        article.topNode = extractor.postExtractionCleanup(article.topNode)
        article.cleanedArticleText = output_formatter.getFormattedText(article)
    return article
def crawl(self, crawlCandidate):
    """Extract an Article from the given crawl candidate.

    Image fetching and body analysis run only when the corresponding
    config flags are enabled; a failed HTML fetch yields an empty
    Article.
    """
    article = Article()
    cleaned = URLHelper.getCleanedUrl(crawlCandidate.url)
    markup = self.getHTML(crawlCandidate, cleaned)
    if markup is None:
        return article

    parsed_doc = self.getDocument(cleaned.url, markup)
    extractor = self.getExtractor()
    cleaner = self.getDocCleaner()
    formatter = self.getOutputFormatter()
    cfg = self.config

    # base fields; rawDoc is a pristine copy kept for image extraction
    article.finalUrl = cleaned.url
    article.linkhash = cleaned.linkhash
    article.rawHtml = markup
    article.doc = parsed_doc
    article.rawDoc = deepcopy(parsed_doc)

    # metadata comes from the document before cleaning
    article.title = extractor.getTitle(article)
    article.metaLang = extractor.getMetaLang(article)
    article.metaFavicon = extractor.getMetaFavicon(article)
    article.metaDescription = extractor.getMetaDescription(article)
    article.metaKeywords = extractor.getMetaKeywords(article)
    article.canonicalLink = extractor.getCanonicalLink(article)
    article.domain = extractor.getDomain(article.finalUrl)
    article.tags = extractor.extractTags(article)

    # if the user requested a full body response
    article.doc = cleaner.clean(article)

    # big stuff
    article.topNode = extractor.calculateBestNodeBasedOnClustering(article)
    if article.topNode is not None and any(
            (cfg.enableImageFetching, cfg.enableBodyAnalysis)):
        if cfg.enableImageFetching:
            image_extractor = self.getImageExtractor(article)
            article.topImage = image_extractor.getBestImage(
                article.rawDoc, article.topNode)
        if cfg.enableBodyAnalysis:
            article.topNode = extractor.postExtractionCleanup(article.topNode)
            article.cleanedArticleText = formatter.getFormattedText(
                article.topNode)
    return article
def crawl(self, crawlCandidate):
    """Fetch and parse a candidate URL into an Article.

    :param crawlCandidate: object carrying the ``url`` to fetch.
    :return: a populated ``Article``; empty when the HTML fetch fails.
    """
    article = Article()
    parseCandidate = URLHelper.getCleanedUrl(crawlCandidate.url)
    rawHtml = self.getHTML(crawlCandidate, parseCandidate)
    # bail out early with an empty article when nothing was fetched
    if rawHtml is None:
        return article

    doc = self.getDocument(parseCandidate.url, rawHtml)
    extractor = self.getExtractor()
    docCleaner = self.getDocCleaner()
    outputFormatter = self.getOutputFormatter()

    # article bookkeeping; rawDoc keeps an uncleaned copy for image lookup
    article.finalUrl = parseCandidate.url
    article.linkhash = parseCandidate.linkhash
    article.rawHtml = rawHtml
    article.doc = doc
    article.rawDoc = deepcopy(doc)

    # metadata is extracted before the cleaner mutates the document
    article.title = extractor.getTitle(article)
    article.metaLang = extractor.getMetaLang(article)
    article.metaFavicon = extractor.getMetaFavicon(article)
    article.metaDescription = extractor.getMetaDescription(article)
    article.metaKeywords = extractor.getMetaKeywords(article)
    article.canonicalLink = extractor.getCanonicalLink(article)
    article.domain = extractor.getDomain(article.finalUrl)
    article.tags = extractor.extractTags(article)

    # if the user requested a full body response
    article.doc = docCleaner.clean(article)

    # big stuff: locate the most content-dense node
    article.topNode = extractor.calculateBestNodeBasedOnClustering(article)
    # plain boolean short-circuit instead of any([...]) — same truth
    # value without building a throwaway list
    if article.topNode is not None and (self.config.enableImageFetching
                                        or self.config.enableBodyAnalysis):
        if self.config.enableImageFetching:
            imageExtractor = self.getImageExtractor(article)
            article.topImage = imageExtractor.getBestImage(
                article.rawDoc, article.topNode)
        if self.config.enableBodyAnalysis:
            article.topNode = extractor.postExtractionCleanup(article.topNode)
            article.cleanedArticleText = outputFormatter.getFormattedText(
                article.topNode)
    return article