def crawl(self, crawl_candidate):
    """Turn a crawl candidate into a populated Article instance.

    Fetches the page, extracts metadata, scores the DOM for the most
    content-rich node, and renders that node to cleaned text. Returns an
    empty Article when the page could not be fetched.
    """
    article = Article()
    candidate = self.get_parse_candidate(crawl_candidate)
    raw_html = self.get_html(crawl_candidate, candidate)
    # No HTML means nothing to parse — hand back the blank article.
    if raw_html is None:
        return article

    dom = self.get_document(raw_html)
    content_extractor = self.get_extractor()
    cleaner = self.get_document_cleaner()
    formatter = self.get_output_formatter()

    # Basic bookkeeping: provenance and both raw and working DOM copies.
    article.final_url = candidate.url
    article.link_hash = candidate.link_hash
    article.raw_html = raw_html
    article.doc = dom
    article.raw_doc = deepcopy(dom)

    # Metadata pulled from the still-uncleaned document.
    article.title = content_extractor.get_title(article)
    # TODO
    # article.publish_date = config.publishDateExtractor.extract(doc)
    # article.additional_data = config.get_additionaldata_extractor.extract(doc)
    article.meta_lang = content_extractor.get_meta_lang(article)
    article.meta_favicon = content_extractor.get_favicon(article)
    article.meta_description = content_extractor.get_meta_description(article)
    article.meta_keywords = content_extractor.get_meta_keywords(article)
    article.canonical_link = content_extractor.get_canonical_link(article)
    article.domain = content_extractor.get_domain(article.final_url)
    article.tags = content_extractor.extract_tags(article)

    # Clean the document before any body-content scoring.
    article.doc = cleaner.clean(article)

    # Score the document and pick the most content-rich node.
    article.top_node = content_extractor.calculate_best_node(article)
    if article.top_node is not None:
        # Videos embedded in the content.
        self.get_video_extractor(article).get_videos()
        # Lead image, only when the feature flag is on.
        if self.config.enable_image_fetching:
            img_extractor = self.get_image_extractor(article)
            article.top_image = img_extractor.get_best_image(
                article.raw_doc, article.top_node)
        # Final trim of the chosen node, then render it to text.
        article.top_node = content_extractor.post_cleanup(article.top_node)
        article.cleaned_text = formatter.get_formatted_text(article)

    # Release temporary resources created during the fetch.
    self.relase_resources(article)

    # Attach extracted video metadata when available.
    vid_info = self.get_video_info_extractor(article).get_video_info()
    if vid_info:
        article.additional_data['video_info'] = vid_info
    return article
def crawl(self, crawl_candidate):
    """Fetch, clean and extract a single article from *crawl_candidate*.

    Returns an empty Article if the page body could not be retrieved.
    """
    article = Article()
    parse_candidate = self.get_parse_candidate(crawl_candidate)
    raw_html = self.get_html(crawl_candidate, parse_candidate)
    if raw_html is None:
        # Fetch failed: return the untouched, empty article.
        return article

    doc = self.get_document(raw_html)
    extractor = self.get_extractor()
    document_cleaner = self.get_document_cleaner()
    output_formatter = self.get_output_formatter()

    # Record provenance and keep a pristine copy of the parsed DOM.
    article.final_url = parse_candidate.url
    article.link_hash = parse_candidate.link_hash
    article.raw_html = raw_html
    article.doc = doc
    article.raw_doc = deepcopy(doc)

    article.title = extractor.get_title(article)
    # TODO
    # article.publish_date = config.publishDateExtractor.extract(doc)
    # article.additional_data = config.get_additionaldata_extractor.extract(doc)

    # Page-level metadata, assigned in a fixed order.
    for attr, getter in (
            ('meta_lang', extractor.get_meta_lang),
            ('meta_favicon', extractor.get_favicon),
            ('meta_description', extractor.get_meta_description),
            ('meta_keywords', extractor.get_meta_keywords),
            ('canonical_link', extractor.get_canonical_link),
    ):
        setattr(article, attr, getter(article))
    article.domain = extractor.get_domain(article.final_url)
    article.tags = extractor.extract_tags(article)

    # Strip boilerplate before scoring the body content.
    article.doc = document_cleaner.clean(article)

    article.top_node = extractor.calculate_best_node(article)
    if article.top_node is not None:
        # Embedded videos.
        video_extractor = self.get_video_extractor(article)
        video_extractor.get_videos()
        # Lead image (optional feature flag).
        if self.config.enable_image_fetching:
            image_extractor = self.get_image_extractor(article)
            article.top_image = image_extractor.get_best_image(
                article.raw_doc, article.top_node)
        # Tidy the winning node and serialize it to plain text.
        article.top_node = extractor.post_cleanup(article.top_node)
        article.cleaned_text = output_formatter.get_formatted_text(article)

    # Drop temporary files created during the fetch.
    self.relase_resources(article)

    # Optional video metadata goes into additional_data.
    video_info = self.get_video_info_extractor(article).get_video_info()
    if video_info:
        article.additional_data['video_info'] = video_info
    return article
def crawl(self, crawl_candidate):
    """Fetch and parse one candidate URL into an Article.

    Charset-aware variant: decodes the raw bytes, switches to Chinese
    stopwords when the page looks Chinese, strips ad/script/style tags,
    and normalizes the declared meta charset before parsing.

    NOTE(review): this is the third `crawl` definition in this file; if the
    three share one class body, this definition overrides the earlier two.
    """
    article = Article()
    parse_candidate = self.get_parse_candidate(crawl_candidate)
    raw_html = self.get_html(crawl_candidate, parse_candidate)
    # Bail out early with an empty Article when fetching failed.
    if raw_html is None:
        return article
    # Detect the page encoding and decode, dropping undecodable bytes.
    charset = get_charset(raw_html)
    raw_html = raw_html.decode(charset, 'ignore')
    # CJK Unified Ideographs range — used to sniff Chinese body text.
    pattern = re.compile("[\u4e00-\u9fa5]")
    # Chinese charset declared, or Chinese characters present in the body:
    # use the Chinese stopword list for content scoring.
    # NOTE(review): the substring `find` check also matches partial tokens
    # such as "GB" — presumably intentional shorthand; confirm.
    if 'GB2312 GBK GB18030'.find(charset.upper()) != -1 \
            or pattern.search(raw_html) is not None:
        self.config.stopwords_class = StopWordsChinese
        print("中文")
    # Remove ad containers and non-content tags before DOM parsing.
    raw_html = clean_tags(raw_html, ['SOHUADCODE', 'script', 'style'])
    if charset != 'utf-8':
        # Rewrite/force the declared meta charset so the parser sees the
        # decoded encoding. NOTE(review): nesting of force_meta under this
        # branch is inferred from statement order — confirm against VCS.
        raw_html = replace_meta_charset(raw_html)
        raw_html = force_meta(raw_html)
    doc = self.get_document(parse_candidate.url, raw_html)
    extractor = self.get_extractor()
    document_cleaner = self.get_document_cleaner()
    output_formatter = self.get_output_formatter()
    # article: provenance plus raw and working DOM copies.
    article.final_url = parse_candidate.url
    article.link_hash = parse_candidate.link_hash
    article.raw_html = raw_html
    article.doc = doc
    article.raw_doc = deepcopy(doc)
    article.title = extractor.get_title(article)
    # TODO
    # article.publish_date = config.publishDateExtractor.extract(doc)
    # article.additional_data = config.get_additionaldata_extractor.extract(doc)
    # Page-level metadata extracted before cleaning.
    article.meta_lang = extractor.get_meta_lang(article)
    article.meta_favicon = extractor.get_favicon(article)
    article.meta_description = extractor.get_meta_description(article)
    article.meta_keywords = extractor.get_meta_keywords(article)
    article.canonical_link = extractor.get_canonical_link(article)
    article.domain = extractor.get_domain(article.final_url)
    article.tags = extractor.extract_tags(article)
    # before we do any calcs on the body itself let's clean up the document
    article.doc = document_cleaner.clean(article)
    # import lxml.html
    # lxml.html.open_in_browser(article.doc)
    # big stuff: score the DOM for the most content-rich node.
    article.top_node = extractor.calculate_best_node(article)
    # Fall back to the whole document when no best node was scored, so the
    # formatting path below always runs.
    if article.top_node is None:
        article.top_node = doc
    if article.top_node is not None:
        # video handling
        video_extractor = self.get_video_extractor(article)
        video_extractor.get_videos()
        # image handling (feature-flagged)
        if self.config.enable_image_fetching:
            image_extractor = self.get_image_extractor(article)
            article.top_image = image_extractor.get_best_image(
                article.raw_doc, article.top_node)
        # post cleanup — disabled in this variant
        # article.top_node = extractor.post_cleanup(article.top_node)
        # clean_text
        article.cleaned_text = output_formatter.get_formatted_text(article)
        # import lxml.html
        # lxml.html.open_in_browser(article.top_node)
        # article.cleaned_text = self.parser.nodeToString(article.top_node)
    # Fallback meta description: first 150 chars of the cleaned text.
    if article.meta_description is None:
        article.meta_description = text_content(
            article.cleaned_text)[:150]
    # cleanup tmp file
    self.relase_resources(article)
    return article