def extract(self, link, entity_description=False, sentiment=False, data_path='./data/'):
    """Extract article content and metadata (title, authors, date, summary) from a URL.

    Parameters:
        link: URL of the article to fetch and parse.
        entity_description: unused in the visible portion of this block.
        sentiment: unused in the visible portion of this block.
        data_path: data directory handed to ``KeywordsExtractor``.

    Raises:
        NoMainTextException: when no parsable document or main text is found.

    NOTE(review): the original source was collapsed onto one physical line and
    appears truncated (no return statement is visible); this reconstruction
    covers only the statements that were present. Fixed: Python-2-only
    ``except Exception, e:`` syntax (``e`` was unused).
    """
    errors, summaries, categories, entities, keywords = [], [], [], [], []
    article = Goose().extract(link)
    # Sniff the first 100 chars: if BeautifulSoup finds no tag there, Goose
    # probably did not receive real HTML, so re-fetch the page ourselves.
    valid_html = bool(
        BeautifulSoup(article.raw_html[0:100], "html.parser").find())
    if not valid_html:
        r = requests.get(link)
        article.raw_html = r.text
        article.raw_doc = html.fromstring(r.text)
    if article.raw_doc is None:
        raise NoMainTextException
    authors = AuthorExtractor.extract(link, article.raw_html)
    publish_date = article.publish_date if article.publish_date else None
    if not article.title:
        # Fall back to our own title extractor when Goose found no title.
        article.title = TitleExtractor.extract(article.raw_html,
                                               article.raw_doc)[0]
    # NOTE: 'num_kewyords' is (mis)spelled this way in KeywordsExtractor's API.
    k = KeywordsExtractor(num_kewyords=20, verbose=True, data_path=data_path)
    if article.top_node is not None:
        main_body = etree.tostring(article.top_node)
    else:
        # Goose found no main node; try our own extractor's suggestions
        # (index 1 first, then index 2 as a fallback).
        cleant_text_suggestions = MainTextExtractor.extract(
            article.raw_html, article.raw_doc)
        article.cleaned_text = cleant_text_suggestions[1]
        if not article.cleaned_text:
            article.cleaned_text = cleant_text_suggestions[2]
        if not article.cleaned_text:
            raise NoMainTextException
        main_body = 'Sorry, we could not detect the main HTML content for this article'
    try:
        summaries = Summarize(
            article.title, article.cleaned_text.encode('utf-8', 'ignore'))
    except Exception:
        # Best-effort: summarization failure must not abort the extraction.
        summaries.append('We could not make summaries at this time.')
def extract(self, link, entity_description=False, sentiment=False, data_path='./data/'):
    """Extract article content and metadata from a URL, with a non-HTML fallback.

    If the response's Content-Type is not HTML (e.g. PDF / octet-stream), the
    document is run through ``Url2Text`` and a result dict is returned
    immediately; otherwise the HTML path (Goose + local extractors) is taken.

    Parameters:
        link: URL of the document to fetch and parse.
        entity_description: forwarded to ``Entities.extract`` on the non-HTML path.
        sentiment: unused in the visible portion of this block.
        data_path: data directory handed to ``KeywordsExtractor``/``Classifier``.

    Raises:
        NoMainTextException: when no parsable document or main text is found.

    NOTE(review): the original source was collapsed/split across physical
    lines and appears truncated on the HTML path (no return is visible);
    this reconstruction covers only the statements that were present.
    Fixed: Python-2-only ``except Exception, e:`` syntax, a dead ``pass``
    directly after ``return``, and ``\/`` escapes in non-raw regex literals
    (``/`` needs no escaping in a regex, so matching is unchanged).
    """
    errors, summaries, categories, entities, keywords = [], [], [], [], []
    pdf_pattern = re.compile(r'.*application/pdf.*|.*application/octet-stream.*')
    html_pattern = re.compile(r'.*text/html.*')
    article = Goose().extract(link)
    content_type = article.__dict__['additional_data']['result'].info(
    )['content-type']
    matches_html = len(re.findall(html_pattern, content_type))
    # NOTE(review): matches_pdf is unused in the visible code; kept in case
    # the truncated remainder of the method references it.
    matches_pdf = len(re.findall(pdf_pattern, content_type))
    if matches_html == 0:
        # Non-HTML document: extract plain text via Textract (Url2Text).
        url2text = Url2Text()
        texts = url2text.extract(link)
        # NOTE: 'num_kewyords' is (mis)spelled this way in KeywordsExtractor's API.
        k = KeywordsExtractor(num_kewyords=20, verbose=True,
                              data_path=data_path)
        ent = Entities()
        clf = Classifier(data_path=data_path)
        return {
            "title": os.path.basename(link),
            "link": link,
            "author": [],
            "cleaned_text": texts[0],
            "text_sentiment": getSentimentText(texts[0]),
            "main_body": None,
            "images": None,
            "image": None,
            "date": article.__dict__['additional_data']['result'].info()
            ['last-modified'],
            "tags": k.extract([texts[0]], None, None, 'news')[0],
            "entities": ent.extract(texts[0], entity_description),
            "language": langid.classify(texts[0])[0],
            "summary": Summarize(None, texts[0]),
            "categories": clf.predict(texts[0])
        }
    else:
        # Sniff the first 100 chars: if BeautifulSoup finds no tag there,
        # Goose probably did not receive real HTML; re-fetch with a browser
        # User-Agent (some sites block the default client).
        valid_html = bool(
            BeautifulSoup(article.raw_html[0:100], "html.parser").find())
        if not valid_html:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'
            }
            r = requests.get(link, headers=headers)
            article.raw_html = r.text
            article.raw_doc = html.fromstring(r.text)
        if article.raw_doc is None:
            raise NoMainTextException
        authors = AuthorExtractor.extract(link, article.raw_html)
        publish_date = article.publish_date if article.publish_date else None
        if not article.title:
            # Fall back to our own title extractor when Goose found no title.
            article.title = TitleExtractor.extract(article.raw_html,
                                                   article.raw_doc)[0]
        k = KeywordsExtractor(num_kewyords=20, verbose=True,
                              data_path=data_path)
        if article.top_node is not None:
            main_body = etree.tostring(article.top_node)
        else:
            # Goose found no main node; try our own extractor's suggestions
            # (index 1 first, then index 2 as a fallback).
            cleant_text_suggestions = MainTextExtractor.extract(
                article.raw_html, article.raw_doc)
            article.cleaned_text = cleant_text_suggestions[1]
            if not article.cleaned_text:
                article.cleaned_text = cleant_text_suggestions[2]
            if not article.cleaned_text:
                raise NoMainTextException
            main_body = 'Sorry, we could not detect the main HTML content for this article'
        try:
            summaries = Summarize(
                article.title, article.cleaned_text.encode('utf-8', 'ignore'))
        except Exception:
            # Best-effort: summarization failure must not abort extraction.
            summaries.append('We could not make summaries at this time.')
        try:
            text_sentiment = getSentimentText(article.cleaned_text)
        except Exception:
            # Best-effort: sentiment is optional metadata.
            text_sentiment = None
def extract(self, link, entity_description=False, sentiment=False, data_path='./data/'):
    """Extract article content and metadata from a URL, with a non-HTML fallback.

    Near-duplicate of the preceding ``extract`` variant: non-HTML documents
    (PDF / octet-stream Content-Type) go through ``Url2Text`` and return a
    result dict immediately; HTML goes through Goose plus local extractors.

    Parameters:
        link: URL of the document to fetch and parse.
        entity_description: forwarded to ``Entities.extract`` on the non-HTML path.
        sentiment: unused in the visible portion of this block.
        data_path: data directory handed to ``KeywordsExtractor``/``Classifier``.

    Raises:
        NoMainTextException: when no parsable document or main text is found.

    NOTE(review): the original source was collapsed/split across physical
    lines and appears truncated on the HTML path (no return is visible);
    this reconstruction covers only the statements that were present.
    Fixed: Python-2-only ``except Exception, e:`` syntax, a dead ``pass``
    directly after ``return``, and ``\/`` escapes in non-raw regex literals
    (``/`` needs no escaping in a regex, so matching is unchanged).
    """
    errors, summaries, categories, entities, keywords = [], [], [], [], []
    pdf_pattern = re.compile(r'.*application/pdf.*|.*application/octet-stream.*')
    html_pattern = re.compile(r'.*text/html.*')
    article = Goose().extract(link)
    content_type = article.__dict__['additional_data']['result'].info()['content-type']
    matches_html = len(re.findall(html_pattern, content_type))
    # NOTE(review): matches_pdf is unused in the visible code; kept in case
    # the truncated remainder of the method references it.
    matches_pdf = len(re.findall(pdf_pattern, content_type))
    if matches_html == 0:
        # Non-HTML document: extract plain text via Textract (Url2Text).
        url2text = Url2Text()
        texts = url2text.extract(link)
        # NOTE: 'num_kewyords' is (mis)spelled this way in KeywordsExtractor's API.
        k = KeywordsExtractor(num_kewyords=20, verbose=True,
                              data_path=data_path)
        ent = Entities()
        clf = Classifier(data_path=data_path)
        return {
            "title": os.path.basename(link),
            "link": link,
            "author": [],
            "cleaned_text": texts[0],
            "text_sentiment": getSentimentText(texts[0]),
            "main_body": None,
            "images": None,
            "image": None,
            "date": article.__dict__['additional_data']['result'].info()['last-modified'],
            "tags": k.extract([texts[0]], None, None, 'news')[0],
            "entities": ent.extract(texts[0], entity_description),
            "language": langid.classify(texts[0])[0],
            "summary": Summarize(None, texts[0]),
            "categories": clf.predict(texts[0])
        }
    else:
        # Sniff the first 100 chars: if BeautifulSoup finds no tag there,
        # Goose probably did not receive real HTML; re-fetch with a browser
        # User-Agent (some sites block the default client).
        valid_html = bool(
            BeautifulSoup(article.raw_html[0:100], "html.parser").find())
        if not valid_html:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'
            }
            r = requests.get(link, headers=headers)
            article.raw_html = r.text
            article.raw_doc = html.fromstring(r.text)
        if article.raw_doc is None:
            raise NoMainTextException
        authors = AuthorExtractor.extract(link, article.raw_html)
        publish_date = article.publish_date if article.publish_date else None
        if not article.title:
            # Fall back to our own title extractor when Goose found no title.
            article.title = TitleExtractor.extract(
                article.raw_html, article.raw_doc)[0]
        k = KeywordsExtractor(num_kewyords=20, verbose=True,
                              data_path=data_path)
        if article.top_node is not None:
            main_body = etree.tostring(article.top_node)
        else:
            # Goose found no main node; try our own extractor's suggestions
            # (index 1 first, then index 2 as a fallback).
            cleant_text_suggestions = MainTextExtractor.extract(
                article.raw_html, article.raw_doc)
            article.cleaned_text = cleant_text_suggestions[1]
            if not article.cleaned_text:
                article.cleaned_text = cleant_text_suggestions[2]
            if not article.cleaned_text:
                raise NoMainTextException
            main_body = 'Sorry, we could not detect the main HTML content for this article'
        try:
            summaries = Summarize(
                article.title, article.cleaned_text.encode('utf-8', 'ignore'))
        except Exception:
            # Best-effort: summarization failure must not abort extraction.
            summaries.append('We could not make summaries at this time.')
        try:
            text_sentiment = getSentimentText(article.cleaned_text)
        except Exception:
            # Best-effort: sentiment is optional metadata.
            text_sentiment = None