Esempio n. 1
0
    def crawl(self, crawlCandidate):
        article = Article()

        parseCandidate = URLHelper.getCleanedUrl(crawlCandidate.url)
        rawHtml = self.getHTML(crawlCandidate, parseCandidate)

        if rawHtml is None:
            return article

        doc = self.getDocument(parseCandidate.url, rawHtml)

        extractor = self.getExtractor()
        docCleaner = self.getDocCleaner()
        outputFormatter = self.getOutputFormatter()

        # article
        article.finalUrl = parseCandidate.url
        article.linkhash = parseCandidate.linkhash
        article.rawHtml = rawHtml
        article.doc = doc
        article.rawDoc = deepcopy(doc)
        article.title = extractor.getTitle(article)
        # TODO
        # article.publishDate = config.publishDateExtractor.extract(doc)
        # article.additionalData = config.getAdditionalDataExtractor.extract(doc)
        article.metaLang = extractor.getMetaLang(article)
        article.metaFavicon = extractor.getMetaFavicon(article)
        article.metaDescription = extractor.getMetaDescription(article)
        article.metaKeywords = extractor.getMetaKeywords(article)
        article.canonicalLink = extractor.getCanonicalLink(article)
        article.domain = extractor.getDomain(article.finalUrl)
        article.tags = extractor.extractTags(article)
        # # before we do any calcs on the body itself let's clean up the document
        article.doc = docCleaner.clean(article)

        # big stuff
        article.topNode = extractor.calculateBestNodeBasedOnClustering(article)
        if article.topNode is not None:
            # TODO
            # movies and images
            # article.movies = extractor.extractVideos(article.topNode)
            if self.config.enableImageFetching:
                imageExtractor = self.getImageExtractor(article)
                article.topImage = imageExtractor.getBestImage(article.rawDoc, article.topNode)

            article.topNode = extractor.postExtractionCleanup(article.topNode)
            article.cleanedArticleText = outputFormatter.getFormattedText(article)
            article.topNode.attrib['rel'] = 'topnode' # mark html element
        article.h1 = extractor.getH1(article)
        # cleanup tmp file
        self.releaseResources(article)

        return article
Esempio n. 2
0
    def crawl(self, crawlCandidate):
        article = Article()

        parseCandidate = URLHelper.getCleanedUrl(crawlCandidate.url)
        rawHtml = self.getHTML(crawlCandidate, parseCandidate)

        if rawHtml is None:
            return article

        doc = self.getDocument(parseCandidate.url, rawHtml)

        extractor = self.getExtractor()
        docCleaner = self.getDocCleaner()
        outputFormatter = self.getOutputFormatter()

        # article
        article.finalUrl = parseCandidate.url
        article.linkhash = parseCandidate.linkhash
        article.rawHtml = rawHtml
        article.doc = doc
        article.rawDoc = deepcopy(doc)
        article.title = extractor.getTitle(article)
        # TODO
        # article.publishDate = config.publishDateExtractor.extract(doc)
        # article.additionalData = config.getAdditionalDataExtractor.extract(doc)
        article.metaLang = extractor.getMetaLang(article)
        article.metaFavicon = extractor.getMetaFavicon(article)
        article.metaDescription = extractor.getMetaDescription(article)
        article.metaKeywords = extractor.getMetaKeywords(article)
        article.canonicalLink = extractor.getCanonicalLink(article)
        article.domain = extractor.getDomain(article.finalUrl)
        article.tags = extractor.extractTags(article)
        # # before we do any calcs on the body itself let's clean up the document
        article.doc = docCleaner.clean(article)

        # big stuff
        article.topNode = extractor.calculateBestNodeBasedOnClustering(article)
        if article.topNode is not None:
            # TODO
            # movies and images
            # article.movies = extractor.extractVideos(article.topNode)
            if self.config.enableImageFetching:
                imageExtractor = self.getImageExtractor(article)
                article.topImage = imageExtractor.getBestImage(
                    article.rawDoc, article.topNode)

            article.topNode = extractor.postExtractionCleanup(article.topNode)
            article.cleanedArticleText = outputFormatter.getFormattedText(
                article)

        return article
Esempio n. 3
0
    def crawl(self, crawlCandidate):
        article = Article()

        parseCandidate = URLHelper.getCleanedUrl(crawlCandidate.url)
        rawHtml = self.getHTML(crawlCandidate, parseCandidate)

        if rawHtml is None:
            return article

        doc = self.getDocument(parseCandidate.url, rawHtml)

        extractor = self.getExtractor()
        docCleaner = self.getDocCleaner()
        outputFormatter = self.getOutputFormatter()

        # article
        article.finalUrl = parseCandidate.url
        article.linkhash = parseCandidate.linkhash
        article.rawHtml = rawHtml
        article.doc = doc
        article.rawDoc = deepcopy(doc)
        article.title = extractor.getTitle(article)
        article.metaLang = extractor.getMetaLang(article)
        article.metaFavicon = extractor.getMetaFavicon(article)
        article.metaDescription = extractor.getMetaDescription(article)
        article.metaKeywords = extractor.getMetaKeywords(article)
        article.canonicalLink = extractor.getCanonicalLink(article)
        article.domain = extractor.getDomain(article.finalUrl)
        article.tags = extractor.extractTags(article)

        # if the user requested a full body response
        article.doc = docCleaner.clean(article)

        # big stuff
        article.topNode = extractor.calculateBestNodeBasedOnClustering(article)
        if article.topNode is not None and any(
            [self.config.enableImageFetching, self.config.enableBodyAnalysis]):
            if self.config.enableImageFetching:
                imageExtractor = self.getImageExtractor(article)
                article.topImage = imageExtractor.getBestImage(
                    article.rawDoc, article.topNode)

            if self.config.enableBodyAnalysis:
                article.topNode = extractor.postExtractionCleanup(
                    article.topNode)
                article.cleanedArticleText = outputFormatter.getFormattedText(
                    article.topNode)

        return article
Esempio n. 4
0
    def crawl(self, crawlCandidate):
        article = Article()
        
        parseCandidate = URLHelper.getCleanedUrl(crawlCandidate.url)
        rawHtml = self.getHTML(crawlCandidate, parseCandidate)
        
        if rawHtml is None:
            return article
        
        doc = self.getDocument(parseCandidate.url, rawHtml)
        
        
        extractor = self.getExtractor()
        docCleaner = self.getDocCleaner()
        outputFormatter = self.getOutputFormatter()
        
        # article
        article.finalUrl = parseCandidate.url
        article.linkhash = parseCandidate.linkhash
        article.rawHtml = rawHtml
        article.doc = doc
        article.rawDoc = deepcopy(doc)
        article.title = extractor.getTitle(article)
        article.metaLang = extractor.getMetaLang(article)
        article.metaFavicon = extractor.getMetaFavicon(article)
        article.metaDescription = extractor.getMetaDescription(article)
        article.metaKeywords = extractor.getMetaKeywords(article)
        article.canonicalLink = extractor.getCanonicalLink(article)
        article.domain = extractor.getDomain(article.finalUrl)
        article.tags = extractor.extractTags(article)

        # if the user requested a full body response
        article.doc = docCleaner.clean(article)

        # big stuff
        article.topNode = extractor.calculateBestNodeBasedOnClustering(article)
        if article.topNode is not None and any([self.config.enableImageFetching, self.config.enableBodyAnalysis]):
            if self.config.enableImageFetching:
                imageExtractor = self.getImageExtractor(article)
                article.topImage = imageExtractor.getBestImage(article.rawDoc, article.topNode)

            if self.config.enableBodyAnalysis:
                article.topNode = extractor.postExtractionCleanup(article.topNode)
                article.cleanedArticleText = outputFormatter.getFormattedText(article.topNode)

        return article