Exemple #1
0
    def __init__(self, config):
        """Wire up the crawler from ``config``.

        Stores the configuration, asks it for a parser, creates a new
        ``Article`` and then builds every helper through the matching
        ``get_*`` method of this object.

        :param config: configuration object; it must provide
            ``get_parser()`` and is also handed to ``HtmlFetcher``.
        """
        # configuration shared by the helpers created below
        self.config = config
        # parser provided by the configuration
        self.parser = self.config.get_parser()

        # new Article instance owned by this crawler
        self.article = Article()

        # NOTE(review): the get_* factories below presumably read
        # self.config / self.parser / self.article, so the assignments
        # above must stay first -- confirm before reordering.

        # content extractor
        self.extractor = self.get_extractor()

        # document cleaner
        self.cleaner = self.get_cleaner()

        # output formatter
        self.formatter = self.get_formatter()

        # video extractor
        self.video_extractor = self.get_video_extractor()

        # image extractor
        self.image_extractor = self.get_image_extractor()

        # HTML fetcher, constructed with this crawler's configuration
        self.htmlfetcher = HtmlFetcher(self.config)

        # TODO: the log prefix is hard-coded
        self.logPrefix = "crawler:"
Exemple #2
0
 def getHTML(self, crawlCandidate, parsingCandidate):
     """Return the HTML to work on for the given candidate.

     When ``crawlCandidate.rawHTML`` is truthy it is returned as is.
     Otherwise a new ``HtmlFetcher`` is created and its ``getHtml`` is
     called with ``self.config`` and ``parsingCandidate.url``; whatever
     that call returns is returned unchanged.
     """
     provided = crawlCandidate.rawHTML
     if not provided:
         # no raw HTML supplied: fetch HTML
         return HtmlFetcher().getHtml(self.config, parsingCandidate.url)
     return provided
    def __init__(self, config):
        """Wire up the crawler from ``config``.

        Stores the configuration, asks it for a parser, creates a new
        ``Article`` and then builds every helper (cleaner, formatter and
        the individual extractors) through the matching ``get_*`` method
        of this object.

        :param config: configuration object; it must provide
            ``get_parser()`` and is also handed to ``HtmlFetcher``.
        """
        # configuration shared by the helpers created below
        self.config = config
        # parser provided by the configuration
        self.parser = self.config.get_parser()

        # new Article instance owned by this crawler
        self.article = Article()

        # NOTE(review): the get_* factories below presumably read
        # self.config / self.parser / self.article, so the assignments
        # above must stay first -- confirm before reordering.

        # content extractor
        self.extractor = self.get_extractor()

        # document cleaner
        self.cleaner = self.get_cleaner()

        # output formatter
        self.formatter = self.get_formatter()

        # metas extractor
        self.metas_extractor = self.get_metas_extractor()

        # publish date extractor
        self.publishdate_extractor = self.get_publishdate_extractor()

        # opengraph extractor
        self.opengraph_extractor = self.get_opengraph_extractor()

        # tags extractor
        self.tags_extractor = self.get_tags_extractor()

        # authors extractor
        self.authors_extractor = self.get_authors_extractor()

        # tweets extractor
        self.tweets_extractor = self.get_tweets_extractor()

        # links extractor
        self.links_extractor = self.get_links_extractor()

        # video extractor
        self.video_extractor = self.get_video_extractor()

        # title extractor
        self.title_extractor = self.get_title_extractor()

        # image extractor
        self.image_extractor = self.get_image_extractor()

        # HTML fetcher, constructed with this crawler's configuration
        self.htmlfetcher = HtmlFetcher(self.config)

        # TODO: the log prefix is hard-coded
        self.logPrefix = "crawler:"
Exemple #4
0
 def get_html(self, crawl_candidate, parsing_candidate):
     """Return the HTML to work on for the given candidate.

     A truthy ``crawl_candidate.raw_html`` is returned directly. When it
     is falsy, a new ``HtmlFetcher`` is created and the result of its
     ``get_html(self.config, parsing_candidate.url)`` call is returned.
     """
     # short-circuit: the fetch only runs when no raw HTML was supplied
     return crawl_candidate.raw_html or HtmlFetcher().get_html(
         self.config, parsing_candidate.url
     )