def get_html(self, crawl_candidate, parsing_candidate):
    # we got raw_html passed in; no need to fetch remote content
    if crawl_candidate.raw_html:
        return crawl_candidate.raw_html
    # fetch HTML
    fetcher = HtmlFetcher(self.config, parsing_candidate.url)
    html = fetcher.get_html()
    return html
def getHTML(self, crawlCandidate, parsingCandidate):
    if crawlCandidate.rawHTML:
        return crawlCandidate.rawHTML
    else:
        # fetch HTML
        html = HtmlFetcher().getHtml(self.config, parsingCandidate.url)
        return html
class Crawler(object):

    def __init__(self, config):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()
        # article
        self.article = Article()
        # init the extractor
        self.extractor = self.get_extractor()
        # init the document cleaner
        self.cleaner = self.get_cleaner()
        # init the output formatter
        self.formatter = self.get_formatter()
        # metas extractor
        self.metas_extractor = self.get_metas_extractor()
        # publishdate extractor
        self.publishdate_extractor = self.get_publishdate_extractor()
        # opengraph extractor
        self.opengraph_extractor = self.get_opengraph_extractor()
        # tags extractor
        self.tags_extractor = self.get_tags_extractor()
        # authors extractor
        self.authors_extractor = self.get_authors_extractor()
        # tweets extractor
        self.tweets_extractor = self.get_tweets_extractor()
        # links extractor
        self.links_extractor = self.get_links_extractor()
        # video extractor
        self.video_extractor = self.get_video_extractor()
        # title extractor
        self.title_extractor = self.get_title_extractor()
        # image extractor
        self.image_extractor = self.get_image_extractor()
        # html fetcher
        self.htmlfetcher = HtmlFetcher(self.config)
        # TODO: log prefix
        self.logPrefix = "crawler:"

    def crawl(self, crawl_candidate):
        # parse candidate
        parse_candidate = self.get_parse_candidate(crawl_candidate)
        # raw html
        raw_html = self.get_html(crawl_candidate, parse_candidate)
        if raw_html is None:
            return self.article
        # create document
        doc = self.get_document(raw_html)
        # article
        self.article.final_url = parse_candidate.url
        self.article.link_hash = parse_candidate.link_hash
        self.article.raw_html = raw_html
        self.article.doc = doc
        self.article.raw_doc = deepcopy(doc)
        # open graph
        self.article.opengraph = self.opengraph_extractor.extract()
        # publish date
        self.article.publish_date = self.publishdate_extractor.extract()
        # meta
        metas = self.metas_extractor.extract()
        self.article.meta_lang = metas['lang']
        self.article.meta_favicon = metas['favicon']
        self.article.meta_description = metas['description']
        self.article.meta_keywords = metas['keywords']
        self.article.canonical_link = metas['canonical']
        self.article.domain = metas['domain']
        # tags
        self.article.tags = self.tags_extractor.extract()
        # authors
        self.article.authors = self.authors_extractor.extract()
        # title
        self.article.title = self.title_extractor.extract()
        # check for a known node as the content body;
        # if we find one, force article.doc to be that node,
        # which keeps the cleaner from stripping text content we want
        article_body = self.extractor.get_known_article_tags()
        if article_body is not None:
            self.article.doc = article_body
        # before we do any calcs on the body itself, let's clean up the document
        self.article.doc = self.cleaner.clean()
        # big stuff
        self.article.top_node = self.extractor.calculate_best_node()
        # if we have a top node, let's process it
        if self.article.top_node is not None:
            # article links
            self.article.links = self.links_extractor.extract()
            # tweets
            self.article.tweets = self.tweets_extractor.extract()
            # video handling
            self.video_extractor.get_videos()
            # image handling
            if self.config.enable_image_fetching:
                self.get_image()
            # post cleanup
            self.article.top_node = self.extractor.post_cleanup()
            # cleaned text
            self.article.cleaned_text = self.formatter.get_formatted_text()
        # clean up tmp files
        self.relase_resources()
        # return the article
        return self.article

    def get_parse_candidate(self, crawl_candidate):
        if crawl_candidate.raw_html:
            return RawHelper.get_parsing_candidate(crawl_candidate.url,
                                                   crawl_candidate.raw_html)
        return URLHelper.get_parsing_candidate(crawl_candidate.url)

    def get_image(self):
        doc = self.article.raw_doc
        top_node = self.article.top_node
        self.article.top_image = self.image_extractor.get_best_image(doc, top_node)

    def get_html(self, crawl_candidate, parsing_candidate):
        # we got raw_html passed in; no need to fetch remote content
        if crawl_candidate.raw_html:
            return crawl_candidate.raw_html
        # fetch HTML
        html = self.htmlfetcher.get_html(parsing_candidate.url)
        self.article.additional_data.update({
            'request': self.htmlfetcher.request,
            'result': self.htmlfetcher.result,
        })
        return html

    def get_metas_extractor(self):
        return MetasExtractor(self.config, self.article)

    def get_publishdate_extractor(self):
        return PublishDateExtractor(self.config, self.article)

    def get_opengraph_extractor(self):
        return OpenGraphExtractor(self.config, self.article)

    def get_tags_extractor(self):
        return TagsExtractor(self.config, self.article)

    def get_authors_extractor(self):
        return AuthorsExtractor(self.config, self.article)

    def get_tweets_extractor(self):
        return TweetsExtractor(self.config, self.article)

    def get_links_extractor(self):
        return LinksExtractor(self.config, self.article)

    def get_title_extractor(self):
        return TitleExtractor(self.config, self.article)

    def get_image_extractor(self):
        return ImageExtractor(self.config, self.article)

    def get_video_extractor(self):
        return VideoExtractor(self.config, self.article)

    def get_formatter(self):
        return StandardOutputFormatter(self.config, self.article)

    def get_cleaner(self):
        return StandardDocumentCleaner(self.config, self.article)

    def get_document(self, raw_html):
        doc = self.parser.fromstring(raw_html)
        return doc

    def get_extractor(self):
        return StandardContentExtractor(self.config, self.article)

    def relase_resources(self):
        path = os.path.join(self.config.local_storage_path,
                            '%s_*' % self.article.link_hash)
        for fname in glob.glob(path):
            try:
                os.remove(fname)
            except OSError:
                # TODO: better log handling
                pass
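For context, a minimal usage sketch of the class above. Callers normally do not construct Crawler directly: the Goose facade builds the CrawlCandidate and invokes crawl() internally. This assumes the goose3-style packaging implied by the extractors above; the URL is hypothetical.

from goose3 import Goose

# Goose() wires up Configuration and Crawler; extract() returns the
# populated Article produced by Crawler.crawl()
with Goose() as g:
    article = g.extract(url='http://example.com/some-article')  # hypothetical URL
    print(article.title)
    print(article.cleaned_text[:200])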
class Crawler(object):

    def __init__(self, config):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()
        # article
        self.article = Article()
        # init the extractor
        self.extractor = self.get_extractor()
        # init the document cleaner
        self.cleaner = self.get_cleaner()
        # init the output formatter
        self.formatter = self.get_formatter()
        # video extractor
        self.video_extractor = self.get_video_extractor()
        # image extractor
        self.image_extractor = self.get_image_extractor()
        # html fetcher
        self.htmlfetcher = HtmlFetcher(self.config)
        # TODO: log prefix
        self.logPrefix = "crawler:"

    def crawl(self, crawl_candidate):
        # parse candidate
        parse_candidate = self.get_parse_candidate(crawl_candidate)
        # raw html
        raw_html = self.get_html(crawl_candidate, parse_candidate)
        if raw_html is None:
            return self.article
        # create document
        doc = self.get_document(raw_html)
        # article
        self.article.final_url = parse_candidate.url
        self.article.link_hash = parse_candidate.link_hash
        self.article.raw_html = raw_html
        self.article.doc = doc
        self.article.raw_doc = deepcopy(doc)
        # TODO
        # self.article.publish_date = config.publishDateExtractor.extract(doc)
        # self.article.additional_data = config.get_additionaldata_extractor.extract(doc)
        self.article.title = self.extractor.get_title()
        self.article.meta_lang = self.extractor.get_meta_lang()
        self.article.meta_favicon = self.extractor.get_favicon()
        self.article.meta_description = self.extractor.get_meta_description()
        self.article.meta_keywords = self.extractor.get_meta_keywords()
        self.article.canonical_link = self.extractor.get_canonical_link()
        self.article.domain = self.extractor.get_domain()
        self.article.tags = self.extractor.extract_tags()
        # before we do any calcs on the body itself, let's clean up the document
        self.article.doc = self.cleaner.clean()
        # big stuff
        self.article.top_node = self.extractor.calculate_best_node()
        # if we have a top node, let's process it
        if self.article.top_node is not None:
            # video handling
            self.video_extractor.get_videos()
            # image handling
            if self.config.enable_image_fetching:
                self.get_image()
            self.get_all_images()
            # post cleanup
            self.article.top_node = self.extractor.post_cleanup()
            # cleaned text
            self.article.cleaned_text = self.formatter.get_formatted_text()
        # clean up tmp files
        self.relase_resources()
        # return the article
        return self.article

    def get_parse_candidate(self, crawl_candidate):
        if crawl_candidate.raw_html:
            return RawHelper.get_parsing_candidate(crawl_candidate.url,
                                                   crawl_candidate.raw_html)
        return URLHelper.get_parsing_candidate(crawl_candidate.url)

    def get_image(self):
        doc = self.article.raw_doc
        top_node = self.article.top_node
        self.article.top_image = self.image_extractor.get_best_image(doc, top_node)

    def get_all_images(self):
        top_node = self.article.top_node
        self.article.images = self.image_extractor.get_all_images(top_node)

    def get_html(self, crawl_candidate, parsing_candidate):
        # we got raw_html passed in; no need to fetch remote content
        if crawl_candidate.raw_html:
            return crawl_candidate.raw_html
        # fetch HTML
        html = self.htmlfetcher.get_html(parsing_candidate.url)
        self.article.additional_data.update({
            'request': self.htmlfetcher.request,
            'result': self.htmlfetcher.result,
        })
        return html

    def get_image_extractor(self):
        return UpgradedImageIExtractor(self.config, self.article)

    def get_video_extractor(self):
        return VideoExtractor(self.config, self.article)

    def get_formatter(self):
        return StandardOutputFormatter(self.config, self.article)

    def get_cleaner(self):
        return StandardDocumentCleaner(self.config, self.article)

    def get_document(self, raw_html):
        doc = self.parser.fromstring(raw_html)
        return doc

    def get_extractor(self):
        return StandardContentExtractor(self.config, self.article)

    def relase_resources(self):
        path = os.path.join(self.config.local_storage_path,
                            '%s_*' % self.article.link_hash)
        for fname in glob.glob(path):
            try:
                os.remove(fname)
            except OSError:
                # TODO: better log handling
                pass
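A note on relase_resources() (the typo is in the original identifier): it globs for temp files keyed by the article's link_hash under local_storage_path and deletes them. A self-contained sketch of that cleanup pattern; the storage directory and hash here are illustrative, not part of the library.

import glob
import os
import tempfile

# illustrative stand-ins for config.local_storage_path and article.link_hash
storage_path = tempfile.mkdtemp()
link_hash = 'deadbeef'  # hypothetical link hash

# create a couple of fake '<link_hash>_*' temp files
for suffix in ('img0', 'img1'):
    open(os.path.join(storage_path, '%s_%s' % (link_hash, suffix)), 'w').close()

# the cleanup loop, mirroring relase_resources()
pattern = os.path.join(storage_path, '%s_*' % link_hash)
for fname in glob.glob(pattern):
    try:
        os.remove(fname)
    except OSError:
        pass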
class Crawler(object):

    def __init__(self, config):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()
        # article
        self.article = Article()
        # init the extractor
        self.extractor = self.get_extractor()
        # init the document cleaner
        self.cleaner = self.get_cleaner()
        # init the output formatter
        self.formatter = self.get_formatter()
        # metas extractor
        self.metas_extractor = self.get_metas_extractor()
        # publishdate extractor
        self.publishdate_extractor = self.get_publishdate_extractor()
        # opengraph extractor
        self.opengraph_extractor = self.get_opengraph_extractor()
        # tags extractor
        self.tags_extractor = self.get_tags_extractor()
        # authors extractor
        self.authors_extractor = self.get_authors_extractor()
        # tweets extractor
        self.tweets_extractor = self.get_tweets_extractor()
        # links extractor
        self.links_extractor = self.get_links_extractor()
        # video extractor
        self.video_extractor = self.get_video_extractor()
        # title extractor
        self.title_extractor = self.get_title_extractor()
        # image extractor
        self.image_extractor = self.get_image_extractor()
        # html fetcher
        self.htmlfetcher = HtmlFetcher(self.config)
        # TODO: log prefix
        self.logPrefix = "crawler:"

    def crawl(self, crawl_candidate):
        # parse candidate
        parse_candidate = self.get_parse_candidate(crawl_candidate)
        # raw html
        raw_html = self.get_html(crawl_candidate, parse_candidate)
        if raw_html is None:
            return self.article
        # create document
        doc = self.get_document(raw_html)
        # article
        self.article.final_url = parse_candidate.url
        self.article.link_hash = parse_candidate.link_hash
        self.article.raw_html = raw_html
        self.article.doc = doc
        self.article.raw_doc = deepcopy(doc)
        # open graph
        self.article.opengraph = self.opengraph_extractor.extract()
        # publish date
        self.article.publish_date = self.publishdate_extractor.extract()
        # meta
        metas = self.metas_extractor.extract()
        self.article.meta_lang = metas['lang']
        self.article.meta_favicon = metas['favicon']
        self.article.meta_description = metas['description']
        self.article.meta_keywords = metas['keywords']
        self.article.canonical_link = metas['canonical']
        self.article.domain = metas['domain']
        # tags
        self.article.tags = self.tags_extractor.extract()
        # authors
        self.article.authors = self.authors_extractor.extract()
        # title
        self.article.title = self.title_extractor.extract()
        # debug trace for encoding issues
        print("goose: crawler: crawl: self.article.title : ", self.article.title)
        # check for a known node as the content body;
        # if we find one, force article.doc to be that node,
        # which keeps the cleaner from stripping text content we want
        article_body = self.extractor.get_known_article_tags()
        print("goose: crawler: crawl: article_body : ", article_body)
        if article_body is not None:
            self.article.doc = article_body
        # before we do any calcs on the body itself, let's clean up the document
        # TODO: improve
        self.article.doc = self.cleaner.clean()
        print("goose: crawler: crawl: self.article.doc : ", self.article.doc,
              " self.article.top_node :", self.article.top_node)
        # big stuff
        self.article.top_node = self.extractor.calculate_best_node()
        # if we have a top node, let's process it
        if self.article.top_node is not None:
            # article links
            self.article.links = self.links_extractor.extract()
            # tweets
            self.article.tweets = self.tweets_extractor.extract()
            # video handling
            self.video_extractor.get_videos()
            # image handling
            if self.config.enable_image_fetching:
                self.get_image()
            # post cleanup
            self.article.top_node = self.extractor.post_cleanup()
            # cleaned text
            self.article.cleaned_text = self.formatter.get_formatted_text()
        # clean up tmp files
        self.relase_resources()
        self.ascii_only()
        # return the article
        print("\ngoose: crawler: crawl: self.article.title : ", self.article.title)
        return self.article

    def ascii_only(self):
        # strip non-ASCII characters, then punctuation, from text and title
        try:
            self.article.cleaned_text = self.article.cleaned_text.encode(
                'ascii', errors='ignore').decode('ascii', errors='ignore')
            self.article.cleaned_text = self.remove_punctuation(
                self.article.cleaned_text)
            self.article.title = self.article.title.encode(
                'ascii', errors='ignore').decode('ascii', errors='ignore')
            self.article.title = self.remove_punctuation(self.article.title)
        except Exception as e:
            print("goose: crawler: crawl: ascii_only : Exception :", e)
            return

    def remove_punctuation(self, content):
        # adapted from
        # http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
        for c in string.punctuation:
            content = content.replace(c, "")
        return content

    def get_parse_candidate(self, crawl_candidate):
        if crawl_candidate.raw_html:
            return RawHelper.get_parsing_candidate(crawl_candidate.url,
                                                   crawl_candidate.raw_html)
        return URLHelper.get_parsing_candidate(crawl_candidate.url)

    def get_image(self):
        doc = self.article.raw_doc
        top_node = self.article.top_node
        self.article.top_image = self.image_extractor.get_best_image(doc, top_node)

    def get_html(self, crawl_candidate, parsing_candidate):
        # we got raw_html passed in; no need to fetch remote content
        if crawl_candidate.raw_html:
            return crawl_candidate.raw_html
        # fetch HTML
        html = self.htmlfetcher.get_html(parsing_candidate.url)
        self.article.additional_data.update({
            'request': self.htmlfetcher.request,
            'result': self.htmlfetcher.result,
        })
        return html

    def get_metas_extractor(self):
        return MetasExtractor(self.config, self.article)

    def get_publishdate_extractor(self):
        return PublishDateExtractor(self.config, self.article)

    def get_opengraph_extractor(self):
        return OpenGraphExtractor(self.config, self.article)

    def get_tags_extractor(self):
        return TagsExtractor(self.config, self.article)

    def get_authors_extractor(self):
        return AuthorsExtractor(self.config, self.article)

    def get_tweets_extractor(self):
        return TweetsExtractor(self.config, self.article)

    def get_links_extractor(self):
        return LinksExtractor(self.config, self.article)

    def get_title_extractor(self):
        return TitleExtractor(self.config, self.article)

    def get_image_extractor(self):
        return ImageExtractor(self.config, self.article)

    def get_video_extractor(self):
        return VideoExtractor(self.config, self.article)

    def get_formatter(self):
        return StandardOutputFormatter(self.config, self.article)

    def get_cleaner(self):
        return StandardDocumentCleaner(self.config, self.article)

    def get_document(self, raw_html):
        doc = self.parser.fromstring(raw_html)
        return doc

    def get_extractor(self):
        return StandardContentExtractor(self.config, self.article)

    def relase_resources(self):
        path = os.path.join(self.config.local_storage_path,
                            '%s_*' % self.article.link_hash)
        for fname in glob.glob(path):
            try:
                os.remove(fname)
            except OSError:
                # TODO: better log handling
                pass
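remove_punctuation() above rescans the content once per punctuation character (string.punctuation has 32 entries). A hedged alternative sketch with the same "drop all ASCII punctuation" semantics, using a single translate() pass; not part of the library, just an equivalent one-liner:

import string

# one-pass equivalent of remove_punctuation(); the table maps every
# ASCII punctuation character to None (i.e. deletes it)
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

def remove_punctuation(content):
    return content.translate(_PUNCT_TABLE)

assert remove_punctuation("Hello, world!") == "Hello world"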
def get_html(self, crawl_candidate, parsing_candidate):
    if crawl_candidate.raw_html:
        return crawl_candidate.raw_html
    # fetch HTML
    html = HtmlFetcher().get_html(self.config, parsing_candidate.url)
    return html
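Every get_html() variant above short-circuits when the candidate already carries raw_html, so no network fetch happens. A sketch of how that path is exercised through the goose3-style facade, assuming extract() accepts a raw_html keyword as in goose3:

from goose3 import Goose

html = ('<html><head><title>Prefetched</title></head>'
        '<body><p>Already fetched content.</p></body></html>')

with Goose() as g:
    # crawl_candidate.raw_html is truthy, so get_html() returns it
    # directly instead of calling the HtmlFetcher
    article = g.extract(raw_html=html)
    print(article.title)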
class Crawler(object):

    def __init__(self, config):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()
        # article
        self.article = Article()
        # init the extractor
        self.extractor = self.get_extractor()
        # init the document cleaner
        self.cleaner = self.get_cleaner()
        # init the output formatter
        self.formatter = self.get_formatter()
        # metas extractor
        self.metas_extractor = self.get_metas_extractor()
        # publishdate extractor
        self.publishdate_extractor = self.get_publishdate_extractor()
        # opengraph extractor
        self.opengraph_extractor = self.get_opengraph_extractor()
        # tags extractor
        self.tags_extractor = self.get_tags_extractor()
        # authors extractor
        self.authors_extractor = self.get_authors_extractor()
        # tweets extractor
        self.tweets_extractor = self.get_tweets_extractor()
        # links extractor
        self.links_extractor = self.get_links_extractor()
        # video extractor
        self.video_extractor = self.get_video_extractor()
        # title extractor
        self.title_extractor = self.get_title_extractor()
        # image extractor
        self.image_extractor = self.get_image_extractor()
        # html fetcher
        self.htmlfetcher = HtmlFetcher(self.config)
        # TODO: log prefix
        self.logPrefix = "crawler:"

    def crawl(self, crawl_candidate):
        # parse candidate
        parse_candidate = self.get_parse_candidate(crawl_candidate)
        # raw html
        raw_html = self.get_html(crawl_candidate, parse_candidate)
        if raw_html is None:
            return self.article
        # create document
        doc = self.get_document(raw_html)
        # article
        self.article.final_url = parse_candidate.url
        self.article.link_hash = parse_candidate.link_hash
        self.article.raw_html = raw_html
        self.article.doc = doc
        self.article.raw_doc = deepcopy(doc)
        # open graph
        self.article.opengraph = self.opengraph_extractor.extract()
        # publish date
        self.article.publish_date = self.publishdate_extractor.extract()
        # meta
        metas = self.metas_extractor.extract()
        self.article.meta_lang = metas['lang']
        self.article.meta_favicon = metas['favicon']
        self.article.meta_description = metas['description']
        self.article.meta_keywords = metas['keywords']
        self.article.canonical_link = metas['canonical']
        self.article.domain = metas['domain']
        # tags
        self.article.tags = self.tags_extractor.extract()
        # authors
        self.article.authors = self.authors_extractor.extract()
        # title
        self.article.title = self.title_extractor.extract()
        # check for a known node as the content body;
        # if we find one, force article.doc to be that node,
        # which keeps the cleaner from stripping text content we want
        article_body = self.extractor.get_known_article_tags()
        if article_body is not None:
            self.article.doc = article_body
        # before we do any calcs on the body itself, let's clean up the document
        self.article.doc = self.cleaner.clean()
        # big stuff
        self.article.top_node = self.extractor.calculate_best_node()
        # if we have a top node, let's process it
        if self.article.top_node is not None:
            # article links
            self.article.links = self.links_extractor.extract()
            # tweets
            self.article.tweets = self.tweets_extractor.extract()
            # video handling
            self.video_extractor.get_videos()
            # image handling: the image extractor is (re)built lazily,
            # only when fetching is enabled
            if self.config.enable_image_fetching:
                self.image_extractor = self.get_image_extractor()
                self.get_image()
            # post cleanup
            self.article.top_node = self.extractor.post_cleanup()
            # cleaned text
            self.article.cleaned_text = self.formatter.get_formatted_text()
        # clean up tmp files
        self.relase_resources()
        # return the article
        return self.article

    def get_parse_candidate(self, crawl_candidate):
        if crawl_candidate.raw_html:
            return RawHelper.get_parsing_candidate(crawl_candidate.url,
                                                   crawl_candidate.raw_html)
        return URLHelper.get_parsing_candidate(crawl_candidate.url)

    def get_image(self):
        doc = self.article.raw_doc
        top_node = self.article.top_node
        self.article.top_image = self.image_extractor.get_best_image(doc, top_node)

    def get_html(self, crawl_candidate, parsing_candidate):
        # we got raw_html passed in; no need to fetch remote content
        if crawl_candidate.raw_html:
            return crawl_candidate.raw_html
        # fetch HTML
        html = self.htmlfetcher.get_html(parsing_candidate.url)
        self.article.additional_data.update({
            'request': self.htmlfetcher.request,
            'result': self.htmlfetcher.result,
        })
        return html

    def get_metas_extractor(self):
        return MetasExtractor(self.config, self.article)

    def get_publishdate_extractor(self):
        return PublishDateExtractor(self.config, self.article)

    def get_opengraph_extractor(self):
        return OpenGraphExtractor(self.config, self.article)

    def get_tags_extractor(self):
        return TagsExtractor(self.config, self.article)

    def get_authors_extractor(self):
        return AuthorsExtractor(self.config, self.article)

    def get_tweets_extractor(self):
        return TweetsExtractor(self.config, self.article)

    def get_links_extractor(self):
        return LinksExtractor(self.config, self.article)

    def get_title_extractor(self):
        return TitleExtractor(self.config, self.article)

    def get_image_extractor(self):
        return ImageExtractor(self.config, self.article)

    def get_video_extractor(self):
        return VideoExtractor(self.config, self.article)

    def get_formatter(self):
        return StandardOutputFormatter(self.config, self.article)

    def get_cleaner(self):
        return StandardDocumentCleaner(self.config, self.article)

    def get_document(self, raw_html):
        doc = self.parser.fromstring(raw_html)
        return doc

    def get_extractor(self):
        return StandardContentExtractor(self.config, self.article)

    def relase_resources(self):
        path = os.path.join(self.config.local_storage_path,
                            '%s_*' % self.article.link_hash)
        for fname in glob.glob(path):
            try:
                os.remove(fname)
            except OSError:
                # TODO: better log handling
                pass