Beispiel #1
0
    def crawl(self, crawlCandidate):
        """Fetch the candidate's page and build an ``Article`` from it.

        An empty ``Article`` is returned when ``getHTML`` yields ``None``.
        Otherwise the article receives the cleaned URL data, the raw HTML,
        the parsed document (plus a deep copy taken before cleaning), the
        extracted metadata and the top node.  If a top node was found, the
        best image and the cleaned body text are added, each only when the
        matching ``config`` switch (``enableImageFetching`` /
        ``enableBodyAnalysis``) is set.
        """
        result = Article()

        cleanedUrl = URLHelper.getCleanedUrl(crawlCandidate.url)
        html = self.getHTML(crawlCandidate, cleanedUrl)
        if html is None:
            # nothing to parse: hand back the empty article
            return result

        parsedDoc = self.getDocument(cleanedUrl.url, html)

        extractor = self.getExtractor()
        cleaner = self.getDocCleaner()
        formatter = self.getOutputFormatter()

        # identity and raw payload
        result.finalUrl = cleanedUrl.url
        result.linkhash = cleanedUrl.linkhash
        result.rawHtml = html
        result.doc = parsedDoc
        # copy taken before ``doc`` is replaced by the cleaned document below
        result.rawDoc = deepcopy(parsedDoc)

        # metadata
        result.title = extractor.getTitle(result)
        result.metaLang = extractor.getMetaLang(result)
        result.metaFavicon = extractor.getMetaFavicon(result)
        result.metaDescription = extractor.getMetaDescription(result)
        result.metaKeywords = extractor.getMetaKeywords(result)
        result.canonicalLink = extractor.getCanonicalLink(result)
        result.domain = extractor.getDomain(result.finalUrl)
        result.tags = extractor.extractTags(result)

        # the document is always cleaned before the top node is calculated
        result.doc = cleaner.clean(result)

        result.topNode = extractor.calculateBestNodeBasedOnClustering(result)

        settings = self.config
        if result.topNode is None or not (
                settings.enableImageFetching or settings.enableBodyAnalysis):
            # no top node, or neither optional step was requested
            return result

        if settings.enableImageFetching:
            imageExtractor = self.getImageExtractor(result)
            result.topImage = imageExtractor.getBestImage(
                result.rawDoc, result.topNode)

        if settings.enableBodyAnalysis:
            result.topNode = extractor.postExtractionCleanup(result.topNode)
            result.cleanedArticleText = formatter.getFormattedText(
                result.topNode)

        return result
Beispiel #2
0
    def crawl(self, crawlCandidate):
        """Build an ``Article`` for ``crawlCandidate``.

        An empty ``Article`` is returned when ``getHTML`` yields ``None``.
        Otherwise the article is filled with the cleaned URL data, the raw
        HTML, the parsed document (and a deep copy taken before cleaning),
        the extracted metadata and the top node.  When a top node exists the
        best image is looked up (only if ``config.enableImageFetching`` is
        set), the node goes through ``postExtractionCleanup`` and the
        formatted text is stored in ``cleanedArticleText``.
        """
        page = Article()

        target = URLHelper.getCleanedUrl(crawlCandidate.url)
        html = self.getHTML(crawlCandidate, target)
        if html is None:
            return page

        tree = self.getDocument(target.url, html)

        extractor = self.getExtractor()
        cleaner = self.getDocCleaner()
        formatter = self.getOutputFormatter()

        # identity and raw payload
        page.finalUrl = target.url
        page.linkhash = target.linkhash
        page.rawHtml = html
        page.doc = tree
        page.rawDoc = deepcopy(tree)

        # metadata
        # TODO: publishDate and additionalData extraction are not wired in yet
        page.title = extractor.getTitle(page)
        page.metaLang = extractor.getMetaLang(page)
        page.metaFavicon = extractor.getMetaFavicon(page)
        page.metaDescription = extractor.getMetaDescription(page)
        page.metaKeywords = extractor.getMetaKeywords(page)
        page.canonicalLink = extractor.getCanonicalLink(page)
        page.domain = extractor.getDomain(page.finalUrl)
        page.tags = extractor.extractTags(page)

        # clean the document before any calculation on the body itself
        page.doc = cleaner.clean(page)

        page.topNode = extractor.calculateBestNodeBasedOnClustering(page)
        if page.topNode is None:
            return page

        # TODO: video extraction (article.movies) is not wired in yet
        if self.config.enableImageFetching:
            imageExtractor = self.getImageExtractor(page)
            page.topImage = imageExtractor.getBestImage(
                page.rawDoc, page.topNode)

        page.topNode = extractor.postExtractionCleanup(page.topNode)
        # this variant hands the whole article to the formatter
        page.cleanedArticleText = formatter.getFormattedText(page)

        return page
Beispiel #3
0
 def crawl(self, crawlCandidate):
     """Turn ``crawlCandidate`` into a populated ``Article``.

     Returns an empty ``Article`` when ``getHTML`` yields ``None``.
     Otherwise the article gets the cleaned URL data, the raw HTML, the
     parsed document (and a deep copy taken before cleaning), the extracted
     metadata and the top node.  If a top node exists, the best image is
     fetched when ``config.enableImageFetching`` is set, the node is passed
     through ``postExtractionCleanup`` and its formatted text is stored in
     ``cleanedArticleText``.
     """
     extracted = Article()

     candidate = URLHelper.getCleanedUrl(crawlCandidate.url)
     markup = self.getHTML(crawlCandidate, candidate)
     if markup is None:
         return extracted

     document = self.getDocument(candidate.url, markup)

     extractor = self.getExtractor()
     cleaner = self.getDocCleaner()
     formatter = self.getOutputFormatter()

     # identity and raw payload
     extracted.finalUrl = candidate.url
     extracted.linkhash = candidate.linkhash
     extracted.rawHtml = markup
     extracted.doc = document
     extracted.rawDoc = deepcopy(document)

     # metadata
     # TODO: publishDate and additionalData extraction are not wired in yet
     extracted.title = extractor.getTitle(extracted)
     extracted.metaLang = extractor.getMetaLang(extracted)
     extracted.metaFavicon = extractor.getMetaFavicon(extracted)
     extracted.metaDescription = extractor.getMetaDescription(extracted)
     extracted.metaKeywords = extractor.getMetaKeywords(extracted)
     extracted.canonicalLink = extractor.getCanonicalLink(extracted)
     extracted.domain = extractor.getDomain(extracted.finalUrl)
     extracted.tags = extractor.extractTags(extracted)

     # clean the document before any calculation on the body itself
     extracted.doc = cleaner.clean(extracted)

     extracted.topNode = extractor.calculateBestNodeBasedOnClustering(
         extracted)
     if extracted.topNode is None:
         return extracted

     # TODO: video extraction (article.movies) is not wired in yet
     if self.config.enableImageFetching:
         imageExtractor = self.getImageExtractor(extracted)
         extracted.topImage = imageExtractor.getBestImage(
             extracted.rawDoc, extracted.topNode)

     extracted.topNode = extractor.postExtractionCleanup(extracted.topNode)
     extracted.cleanedArticleText = formatter.getFormattedText(
         extracted.topNode)

     return extracted
Beispiel #4
0
    def crawl(self, crawlCandidate):
        """Fetch, parse and analyse the page behind ``crawlCandidate``.

        Returns an ``Article``.  It is returned empty when ``getHTML`` or
        ``getDocument`` yields ``None``.  Otherwise it carries the cleaned
        URL data, the raw HTML, the parsed document (and a deep copy taken
        before cleaning), the extracted metadata, ``h1`` (``''`` unless a top
        node is found) and, when a top node exists, the optional top image
        and the formatted article text.

        Side effects on ``self.config.targetLanguage``: it is coerced to a
        list and extended in place with the page's meta language and with
        the languages detected in the cleaned document; after each update
        the list is handed to the extractor via ``setLanguage``.  Entries
        already present are not appended again, so repeated crawls do not
        fill the shared list with duplicates.

        ``releaseResources`` runs in a ``finally`` clause once the article's
        identity fields are set, so it is also called when an extraction
        step raises; the exception still propagates to the caller.
        """
        article = Article()

        parseCandidate = URLHelper.getCleanedUrl(crawlCandidate.url)
        rawHtml = self.getHTML(crawlCandidate, parseCandidate)
        if rawHtml is None:
            return article

        doc = self.getDocument(parseCandidate.url, rawHtml)
        if doc is None:
            return article

        extractor = self.getExtractor()
        docCleaner = self.getDocCleaner()
        outputFormatter = self.getOutputFormatter()

        def addTargetLanguages(languages):
            # Iterates ``languages`` exactly like list.extend() would, but
            # skips entries the shared config list already contains.
            # NOTE(review): languages found for one article stay in the
            # shared config for later crawls - confirm this is intended.
            known = self.config.targetLanguage
            for language in languages:
                if language not in known:
                    known.append(language)

        # article
        article.finalUrl = parseCandidate.url
        article.linkhash = parseCandidate.linkhash
        article.rawHtml = rawHtml
        article.doc = doc
        article.rawDoc = deepcopy(doc)

        try:
            article.title = extractor.getTitle(article)
            # TODO: publishDate and additionalData extraction are not wired
            # in yet
            article.metaLang = extractor.getMetaLang(article)
            if not isinstance(self.config.targetLanguage, list):
                self.config.targetLanguage = [self.config.targetLanguage]
            if article.metaLang:
                addTargetLanguages(article.metaLang)
                extractor.setLanguage(self.config.targetLanguage)
            article.metaFavicon = extractor.getMetaFavicon(article)
            article.metaDescription = extractor.getMetaDescription(article)
            article.metaKeywords = extractor.getMetaKeywords(article)
            article.canonicalLink = extractor.getCanonicalLink(article)
            article.domain = extractor.getDomain(article.finalUrl)
            article.tags = extractor.extractTags(article)
            # before we do any calcs on the body itself let's clean up the
            # document
            article.doc = docCleaner.clean(article)

            # detects languages by unicode range
            langs = get_languages(Parser.getText(article.doc))
            addTargetLanguages(langs)
            extractor.setLanguage(self.config.targetLanguage)

            # big stuff
            article.h1 = ''
            article.topNode = extractor.calculateBestNodeBasedOnClustering(
                article)
            if article.topNode is not None:
                # TODO: video extraction (article.movies) is not wired in yet
                if self.config.enableImageFetching:
                    imageExtractor = self.getImageExtractor(article)
                    article.topImage = imageExtractor.getBestImage(
                        article.rawDoc, article.topNode)

                # mark html element
                article.topNode.attrib['rel'] = 'topnode'
                article.h1 = extractor.getH1(article)
                article.topNode = extractor.postExtractionCleanup(
                    article.topNode)
                Parser.removeTitle(article.topNode, article.title, article.h1)
                article.cleanedArticleText = outputFormatter.getFormattedText(
                    article)
        finally:
            # cleanup tmp file, even when one of the steps above raised
            self.releaseResources(article)

        return article
Beispiel #5
0
 def test_instance(self):
     """A freshly constructed ``Article`` is an instance of ``Article``."""
     self.assertIsInstance(Article(), Article)
Beispiel #6
0
    def crawl(self, crawlCandidate):
        """Download the candidate's page and extract an ``Article`` from it.

        When ``getHTML`` yields ``None`` the empty ``Article`` is returned
        as is.  Otherwise the article is given the cleaned URL data, the raw
        HTML, the parsed document (and a deep copy taken before cleaning),
        the extracted metadata and the top node.  With a top node present,
        ``config.enableImageFetching`` controls the best-image lookup and
        ``config.enableBodyAnalysis`` controls the post-extraction cleanup
        plus text formatting.
        """
        story = Article()

        cleaned = URLHelper.getCleanedUrl(crawlCandidate.url)
        source = self.getHTML(crawlCandidate, cleaned)
        if source is None:
            return story

        parsed = self.getDocument(cleaned.url, source)

        extractor = self.getExtractor()
        cleaner = self.getDocCleaner()
        formatter = self.getOutputFormatter()

        # identity and raw payload
        story.finalUrl = cleaned.url
        story.linkhash = cleaned.linkhash
        story.rawHtml = source
        story.doc = parsed
        story.rawDoc = deepcopy(parsed)

        # metadata
        story.title = extractor.getTitle(story)
        story.metaLang = extractor.getMetaLang(story)
        story.metaFavicon = extractor.getMetaFavicon(story)
        story.metaDescription = extractor.getMetaDescription(story)
        story.metaKeywords = extractor.getMetaKeywords(story)
        story.canonicalLink = extractor.getCanonicalLink(story)
        story.domain = extractor.getDomain(story.finalUrl)
        story.tags = extractor.extractTags(story)

        # the document is always cleaned before the top node is calculated
        story.doc = cleaner.clean(story)

        story.topNode = extractor.calculateBestNodeBasedOnClustering(story)

        options = self.config
        if story.topNode is not None and (
                options.enableImageFetching or options.enableBodyAnalysis):
            if options.enableImageFetching:
                imageExtractor = self.getImageExtractor(story)
                story.topImage = imageExtractor.getBestImage(
                    story.rawDoc, story.topNode)

            if options.enableBodyAnalysis:
                story.topNode = extractor.postExtractionCleanup(story.topNode)
                story.cleanedArticleText = formatter.getFormattedText(
                    story.topNode)

        return story