def __init__(self, config):
    """Wire up the crawler with its configuration and helper objects.

    The assignments run in a fixed order: the configuration and parser
    come first, then a fresh ``Article``, then each helper returned by
    the matching ``self.get_*()`` method, and finally the HTML fetcher.

    Args:
        config: Configuration object. It must provide ``get_parser()``
            and is also handed to ``HtmlFetcher``.
    """
    # Configuration, and the parser it supplies.
    self.config = config
    self.parser = self.config.get_parser()

    # Article instance created for this crawler.
    self.article = Article()

    # Content pipeline: extractor, document cleaner, output formatter.
    self.extractor = self.get_extractor()
    self.cleaner = self.get_cleaner()
    self.formatter = self.get_formatter()

    # Media extractors (video, image).
    self.video_extractor = self.get_video_extractor()
    self.image_extractor = self.get_image_extractor()

    # HTML fetcher built from the same configuration.
    self.htmlfetcher = HtmlFetcher(self.config)

    # TODO: log prefix
    self.logPrefix = "crawler:"
def getHTML(self, crawlCandidate, parsingCandidate):
    """Return the HTML to work on for a crawl candidate.

    Args:
        crawlCandidate: Object exposing ``rawHTML``; when that value is
            truthy it is returned unchanged and nothing is fetched.
        parsingCandidate: Object exposing ``url``, used as the fetch
            target when no raw HTML was supplied.

    Returns:
        ``crawlCandidate.rawHTML`` if truthy, otherwise whatever
        ``HtmlFetcher().getHtml(self.config, url)`` returns.
    """
    # NOTE(review): a new HtmlFetcher is created on every fetch even though
    # __init__ stores one on self.htmlfetcher - confirm this is intended.
    if not crawlCandidate.rawHTML:
        return HtmlFetcher().getHtml(self.config, parsingCandidate.url)
    return crawlCandidate.rawHTML
def __init__(self, config):
    """Wire up the crawler with its configuration and helper objects.

    The assignments run in a fixed order: the configuration and parser
    come first, then a fresh ``Article``, then each helper returned by
    the matching ``self.get_*()`` method, and finally the HTML fetcher.

    Args:
        config: Configuration object. It must provide ``get_parser()``
            and is also handed to ``HtmlFetcher``.
    """
    # Configuration, and the parser it supplies.
    self.config = config
    self.parser = self.config.get_parser()

    # Article instance created for this crawler.
    self.article = Article()

    # Content pipeline: extractor, document cleaner, output formatter.
    self.extractor = self.get_extractor()
    self.cleaner = self.get_cleaner()
    self.formatter = self.get_formatter()

    # Metadata extractors: metas, publish date, opengraph, tags, authors.
    self.metas_extractor = self.get_metas_extractor()
    self.publishdate_extractor = self.get_publishdate_extractor()
    self.opengraph_extractor = self.get_opengraph_extractor()
    self.tags_extractor = self.get_tags_extractor()
    self.authors_extractor = self.get_authors_extractor()

    # Embedded-content extractors: tweets, links, video.
    self.tweets_extractor = self.get_tweets_extractor()
    self.links_extractor = self.get_links_extractor()
    self.video_extractor = self.get_video_extractor()

    # Title and image extractors.
    self.title_extractor = self.get_title_extractor()
    self.image_extractor = self.get_image_extractor()

    # HTML fetcher built from the same configuration.
    self.htmlfetcher = HtmlFetcher(self.config)

    # TODO: log prefix
    self.logPrefix = "crawler:"
def get_html(self, crawl_candidate, parsing_candidate):
    """Return the HTML to work on for a crawl candidate.

    Args:
        crawl_candidate: Object exposing ``raw_html``; when that value is
            truthy it is returned unchanged and nothing is fetched.
        parsing_candidate: Object exposing ``url``, used as the fetch
            target when no raw HTML was supplied.

    Returns:
        ``crawl_candidate.raw_html`` if truthy, otherwise whatever
        ``HtmlFetcher().get_html(self.config, url)`` returns.
    """
    if not crawl_candidate.raw_html:
        # Nothing was supplied up front, so download the page.
        # NOTE(review): a new HtmlFetcher is created here even though
        # __init__ stores one on self.htmlfetcher - confirm this is intended.
        fetcher = HtmlFetcher()
        return fetcher.get_html(self.config, parsing_candidate.url)
    return crawl_candidate.raw_html