def extract(cls, html, html_formated):
    """Collect candidate titles for a page.

    Gathers candidates from (1) the <title> tag, trimmed at the first known
    separator character, (2) all h1/h2 headings, and (3) goose's own title
    extractor run against *html_formated*.

    :param html: raw HTML string
    :param html_formated: pre-parsed document used as goose's working doc
    """
    potential_titles = []
    soup = BeautifulSoup(html, 'html.parser')

    # Page <title>, cut at the first separator (e.g. "Headline - Site Name").
    if soup.title:
        page_title = TitleExtractor.extract_text(soup.title)
        for split_char in TitleExtractor.SPLIT_CHARS:
            if split_char in page_title:
                page_title = page_title.split(split_char)[0].strip()
        potential_titles.append(page_title)

    # h1/h2 headings are strong title candidates.
    for heading_tag in (soup.find_all('h1') + soup.find_all('h2')):
        potential_title = TitleExtractor.extract_text(heading_tag)
        if potential_title:
            potential_titles.append(potential_title)

    # Ask goose for its title as well; build a minimal Article for it.
    article = Article()
    article.raw_html = html
    article.raw_doc = html_formated
    article.doc = article.raw_doc
    try:
        goose_title = TitleExtractorGoose(Configuration(), article).get_title()
    except AttributeError:
        # Goose may raise on documents missing expected nodes; treat as
        # "no title found".  (Fixed: `except AttributeError, e:` is
        # Python-2-only syntax and `e` was never used.)
        goose_title = None
def __init__(self, config):
    """Wire up this crawler's collaborators from *config*.

    :param config: crawler configuration; supplies the parser and the
        factory methods' tuning options
    """
    self.config = config
    self.parser = self.config.get_parser()
    self.article = Article()

    # Core pipeline: extract -> clean -> format.
    self.extractor = self.get_extractor()
    self.cleaner = self.get_cleaner()
    self.formatter = self.get_formatter()

    # Media extractors.
    self.video_extractor = self.get_video_extractor()
    self.image_extractor = self.get_image_extractor()

    # Network fetcher.
    self.htmlfetcher = HtmlFetcher(self.config)

    # TODO: log prefix
    self.logPrefix = "crawler:"
def _goose_cleaned_text(cls, html, page_html):
    """Run goose's clean/extract/format pipeline over a page.

    Populates a goose Article: cleans the document, picks the best content
    node, and renders the cleaned text.  On badly-encoded pages the top node
    is reset to None instead of propagating the decode error.

    :param html: raw HTML string
    :param page_html: pre-parsed document used as goose's working doc
    """
    article = Article()
    article.raw_html = html
    article.raw_doc = page_html
    article.doc = article.raw_doc

    goose_extractor = ContentExtractor(Configuration(), article)
    goose_cleaner = DocumentCleaner(Configuration(), article)
    goose_formatter = OutputFormatter(Configuration(), article)
    # goose_image_extractor = ImageExtractor(Configuration(), article)

    try:
        article.doc = goose_cleaner.clean()
        article.top_node = goose_extractor.calculate_best_node()
        if article.top_node is not None:
            article.top_node = goose_extractor.post_cleanup()
            article.cleaned_text = goose_formatter.get_formatted_text()
    except UnicodeDecodeError:
        # Badly-encoded pages can blow up inside goose; give up on the body.
        # (Fixed: `except UnicodeDecodeError, e:` is Python-2-only syntax
        # and `e` was never used.)
        article.top_node = None
def __init__(self, config):
    """Build the crawler, instantiating every extractor it delegates to.

    :param config: crawler configuration; supplies the parser and the
        factory methods' tuning options
    """
    self.config = config
    self.parser = self.config.get_parser()
    self.article = Article()

    # Core pipeline: extract -> clean -> format.
    self.extractor = self.get_extractor()
    self.cleaner = self.get_cleaner()
    self.formatter = self.get_formatter()

    # Metadata extractors.
    self.metas_extractor = self.get_metas_extractor()
    self.publishdate_extractor = self.get_publishdate_extractor()
    self.opengraph_extractor = self.get_opengraph_extractor()
    self.tags_extractor = self.get_tags_extractor()
    self.authors_extractor = self.get_authors_extractor()
    self.tweets_extractor = self.get_tweets_extractor()
    self.links_extractor = self.get_links_extractor()

    # Media / title extractors.
    self.video_extractor = self.get_video_extractor()
    self.title_extractor = self.get_title_extractor()
    self.image_extractor = self.get_image_extractor()

    # Network fetcher.
    self.htmlfetcher = HtmlFetcher(self.config)

    # TODO: log prefix
    self.logPrefix = "crawler:"
def crawl(self, crawl_candidate):
    """Fetch, clean, and extract one page into a populated Article.

    :param crawl_candidate: carries the URL (and optional raw HTML) to process
    :return: Article; returned with default fields when fetching fails
    """
    article = Article()
    parse_candidate = self.get_parse_candidate(crawl_candidate)
    raw_html = self.get_html(crawl_candidate, parse_candidate)
    if raw_html is None:
        # Nothing fetched: hand back the empty article rather than failing.
        return article

    doc = self.get_document(raw_html)
    extractor = self.get_extractor()
    document_cleaner = self.get_document_cleaner()
    output_formatter = self.get_output_formatter()

    # Identity / source bookkeeping.
    article.final_url = parse_candidate.url
    article.link_hash = parse_candidate.link_hash
    article.raw_html = raw_html
    article.doc = doc
    article.raw_doc = deepcopy(doc)  # pristine copy; cleaning mutates article.doc

    # Metadata is taken from the not-yet-cleaned document.
    article.title = extractor.get_title(article)
    # TODO
    # article.publish_date = config.publishDateExtractor.extract(doc)
    # article.additional_data = config.get_additionaldata_extractor.extract(doc)
    article.meta_lang = extractor.get_meta_lang(article)
    article.meta_favicon = extractor.get_favicon(article)
    article.meta_description = extractor.get_meta_description(article)
    article.meta_keywords = extractor.get_meta_keywords(article)
    article.canonical_link = extractor.get_canonical_link(article)
    article.domain = extractor.get_domain(article.final_url)
    article.tags = extractor.extract_tags(article)

    # Clean the document before any body calculations.
    article.doc = document_cleaner.clean(article)

    # Score the document and pick the most likely content node.
    article.top_node = extractor.calculate_best_node(article)
    if article.top_node is not None:
        # Video handling.
        video_extractor = self.get_video_extractor(article)
        video_extractor.get_videos()
        # Image handling.
        if self.config.enable_image_fetching:
            image_extractor = self.get_image_extractor(article)
            article.top_image = image_extractor.get_best_image(
                article.raw_doc, article.top_node)
        # Post cleanup, then render the final text.
        article.top_node = extractor.post_cleanup(article.top_node)
        article.cleaned_text = output_formatter.get_formatted_text(article)

    # Cleanup tmp file.  NOTE(review): method is spelled "relase_resources"
    # in this crawler variant -- that is the name it actually defines.
    self.relase_resources(article)

    # Attach video info when the extractor found any.
    video_info = self.get_video_info_extractor(article).get_video_info()
    if video_info:
        article.additional_data['video_info'] = video_info
    return article
def test_instance(self):
    """Article() should construct an instance of Article."""
    article = Article()
    # assertIsInstance reports the actual type on failure, unlike
    # assertEqual(isinstance(...), True) which only says "False != True".
    self.assertIsInstance(article, Article)
def crawl(self, crawl_candidate):
    """Fetch a page, normalize its charset, then run the extraction pipeline.

    This variant adds Chinese-specific handling: it switches the stopwords
    class when the charset or content looks Chinese, and strips ad/script/
    style tags and rewrites charset metadata before parsing.

    :param crawl_candidate: carries the URL (and optional raw HTML) to process
    :return: Article; returned with default fields when fetching fails
    """
    article = Article()
    parse_candidate = self.get_parse_candidate(crawl_candidate)
    raw_html = self.get_html(crawl_candidate, parse_candidate)
    if raw_html is None:
        # Nothing fetched: return the empty article.
        return article
    # Decode with the detected charset; undecodable bytes are dropped.
    charset = get_charset(raw_html)
    raw_html = raw_html.decode(charset, 'ignore')
    # Chinese detection: a GB-family charset, or any character in the CJK
    # unified-ideographs range.  NOTE(review): the pattern relies on "\u4e00"
    # being a unicode escape -- true on Python 3; on Python 2 this literal
    # would need a u"" prefix.  Confirm the target interpreter.
    pattern = re.compile("[\u4e00-\u9fa5]")
    if 'GB2312 GBK GB18030'.find(charset.upper()) != -1 \
            or pattern.search(raw_html) is not None:
        self.config.stopwords_class = StopWordsChinese
        print("中文")  # debug trace ("Chinese")
    # Strip ad containers and active content before parsing.
    raw_html = clean_tags(raw_html, ['SOHUADCODE', 'script', 'style'])
    if charset != 'utf-8':
        raw_html = replace_meta_charset(raw_html)
    raw_html = force_meta(raw_html)
    doc = self.get_document(parse_candidate.url, raw_html)
    extractor = self.get_extractor()
    document_cleaner = self.get_document_cleaner()
    output_formatter = self.get_output_formatter()
    # article identity / source fields
    article.final_url = parse_candidate.url
    article.link_hash = parse_candidate.link_hash
    article.raw_html = raw_html
    article.doc = doc
    article.raw_doc = deepcopy(doc)  # pristine copy; cleaning mutates article.doc
    # Metadata is taken from the not-yet-cleaned document.
    article.title = extractor.get_title(article)
    # TODO
    # article.publish_date = config.publishDateExtractor.extract(doc)
    # article.additional_data = config.get_additionaldata_extractor.extract(doc)
    article.meta_lang = extractor.get_meta_lang(article)
    article.meta_favicon = extractor.get_favicon(article)
    article.meta_description = extractor.get_meta_description(article)
    article.meta_keywords = extractor.get_meta_keywords(article)
    article.canonical_link = extractor.get_canonical_link(article)
    article.domain = extractor.get_domain(article.final_url)
    article.tags = extractor.extract_tags(article)
    # before we do any calcs on the body itself let's clean up the document
    article.doc = document_cleaner.clean(article)
    # import lxml.html
    # lxml.html.open_in_browser(article.doc)
    # big stuff
    article.top_node = extractor.calculate_best_node(article)
    if article.top_node is None:
        # Fall back to the whole document, so the branch below always runs.
        article.top_node = doc
    if article.top_node is not None:
        # video handeling
        video_extractor = self.get_video_extractor(article)
        video_extractor.get_videos()
        # image handeling
        if self.config.enable_image_fetching:
            image_extractor = self.get_image_extractor(article)
            article.top_image = image_extractor.get_best_image(
                article.raw_doc, article.top_node)
        # post cleanup
        # article.top_node = extractor.post_cleanup(article.top_node)
        # clean_text
        article.cleaned_text = output_formatter.get_formatted_text(article)
        # import lxml.html
        # lxml.html.open_in_browser(article.top_node)
        # article.cleaned_text = self.parser.nodeToString(article.top_node)
        if article.meta_description is None:
            # Derive a short description from the extracted body text.
            article.meta_description = text_content(
                article.cleaned_text)[:150]
    # cleanup tmp file (method spelled "relase_resources" in this variant)
    self.relase_resources(article)
    return article
def crawl(self, crawl_candidate):
    """Fetch, clean, and extract one page into a populated Article.

    :param crawl_candidate: carries the URL (and optional raw HTML) to process
    :return: Article; returned with default fields when fetching fails
    """
    article = Article()
    parse_candidate = self.get_parse_candidate(crawl_candidate)
    raw_html = self.get_html(crawl_candidate, parse_candidate)
    if raw_html is None:
        # Nothing fetched: hand back the empty article rather than failing.
        return article

    doc = self.get_document(raw_html)
    extractor = self.get_extractor()
    document_cleaner = self.get_document_cleaner()
    output_formatter = self.get_output_formatter()

    # Identity / source bookkeeping.
    article.final_url = parse_candidate.url
    article.link_hash = parse_candidate.link_hash
    article.raw_html = raw_html
    article.doc = doc
    article.raw_doc = deepcopy(doc)  # pristine copy; cleaning mutates article.doc

    # Metadata is taken from the not-yet-cleaned document.
    article.title = extractor.get_title(article)
    # TODO
    # article.publish_date = config.publishDateExtractor.extract(doc)
    # article.additional_data = config.get_additionaldata_extractor.extract(doc)
    article.meta_lang = extractor.get_meta_lang(article)
    article.meta_favicon = extractor.get_favicon(article)
    article.meta_description = extractor.get_meta_description(article)
    article.meta_keywords = extractor.get_meta_keywords(article)
    article.canonical_link = extractor.get_canonical_link(article)
    article.domain = extractor.get_domain(article.final_url)
    article.tags = extractor.extract_tags(article)

    # Clean the document before any body calculations.
    article.doc = document_cleaner.clean(article)

    # Score the document and pick the most likely content node.
    article.top_node = extractor.calculate_best_node(article)
    if article.top_node is not None:
        # Video handling.
        video_extractor = self.get_video_extractor(article)
        video_extractor.get_videos()
        # Image handling.
        if self.config.enable_image_fetching:
            image_extractor = self.get_image_extractor(article)
            article.top_image = image_extractor.get_best_image(
                article.raw_doc, article.top_node)
        # Post cleanup, then render the final text.
        article.top_node = extractor.post_cleanup(article.top_node)
        article.cleaned_text = output_formatter.get_formatted_text(article)

    # Cleanup tmp file.
    self.release_resources(article)
    return article