Example 1
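All three examples below appear to be variants of the same extract method on an article-extraction class. They assume module-level imports of os, re, requests, langid, Goose (from goose), BeautifulSoup (from bs4), and etree and html (from lxml), plus project-internal helpers not shown here: KeywordsExtractor, Entities, Classifier, Summarize, getSentimentText, Url2Text, AuthorExtractor, TitleExtractor, MainTextExtractor, and NoMainTextException.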
    def extract(self,
                link,
                entity_description=False,
                sentiment=False,
                data_path='./data/'):
        errors, summaries, categories, entities, keywords = [], [], [], [], []
        article = Goose().extract(link)

        valid_html = bool(
            BeautifulSoup(article.raw_html[0:100], "html.parser").find())

        if not valid_html:
            r = requests.get(link)
            article.raw_html = r.text
            article.raw_doc = html.fromstring(r.text)

        if article.raw_doc is None:
            raise NoMainTextException

        authors = AuthorExtractor.extract(link, article.raw_html)
        publish_date = article.publish_date if article.publish_date else None

        if not article.title:
            article.title = TitleExtractor.extract(article.raw_html,
                                                   article.raw_doc)[0]

        k = KeywordsExtractor(num_kewyords=20,
                              verbose=True,
                              data_path=data_path)

        if article.top_node is not None:
            main_body = etree.tostring(article.top_node)
        else:
            cleaned_text_suggestions = MainTextExtractor.extract(
                article.raw_html, article.raw_doc)
            article.cleaned_text = cleaned_text_suggestions[1]
            if not article.cleaned_text:
                article.cleaned_text = cleaned_text_suggestions[2]
            if not article.cleaned_text:
                raise NoMainTextException
            main_body = 'Sorry, we could not detect the main HTML content for this article'

        try:
            summaries = Summarize(
                article.title, article.cleaned_text.encode('utf-8', 'ignore'))
        except Exception:
            # Summarization is best-effort; fall back to a placeholder message.
            summaries.append('We could not make summaries at this time.')
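The valid_html check near the top is a cheap heuristic: parse only the first 100 characters of the raw response and see whether BeautifulSoup finds at least one tag. A minimal, self-contained sketch of that heuristic (the standalone function is hypothetical, not part of the original class):

    from bs4 import BeautifulSoup

    # Hypothetical helper mirroring the inline check above.
    def looks_like_html(raw, probe_len=100):
        # True when the first probe_len characters parse to at least one tag.
        return bool(BeautifulSoup(raw[:probe_len], "html.parser").find())

    looks_like_html('<html><head><title>Hi</title>')  # True
    looks_like_html('%PDF-1.4 binary payload ...')    # False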
Example 2
    def extract(self,
                link,
                entity_description=False,
                sentiment=False,
                data_path='./data/'):
        errors, summaries, categories, entities, keywords = [], [], [], [], []
        pdf_pattern = re.compile(
            r'.*application/pdf.*|.*application/octet-stream.*')
        html_pattern = re.compile(r'.*text/html.*')

        article = Goose().extract(link)

        result = article.__dict__['additional_data']['result']
        content_type = result.info()['content-type']
        matches_html = len(re.findall(html_pattern, content_type))
        matches_pdf = len(re.findall(pdf_pattern, content_type))

        if matches_html == 0:
            # Textract
            url2text = Url2Text()
            texts = url2text.extract(link)

            k = KeywordsExtractor(num_kewyords=20,
                                  verbose=True,
                                  data_path=data_path)
            ent = Entities()
            clf = Classifier(data_path=data_path)

            return {
                "title": os.path.basename(link),
                "link": link,
                "author": [],
                "cleaned_text": texts[0],
                "text_sentiment": getSentimentText(texts[0]),
                "main_body": None,
                "images": None,
                "image": None,
                "date": result.info()['last-modified'],
                "tags": k.extract([texts[0]], None, None, 'news')[0],
                "entities": ent.extract(texts[0], entity_description),
                "language": langid.classify(texts[0])[0],
                "summary": Summarize(None, texts[0]),
                "categories": clf.predict(texts[0])
            }
        else:

            valid_html = bool(
                BeautifulSoup(article.raw_html[0:100], "html.parser").find())

            if not valid_html:
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) '
                                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                                  'Chrome/60.0.3112.90 Safari/537.36'
                }
                r = requests.get(link, headers=headers)
                article.raw_html = r.text
                article.raw_doc = html.fromstring(r.text)

            if article.raw_doc is None:
                raise NoMainTextException

            authors = AuthorExtractor.extract(link, article.raw_html)
            publish_date = article.publish_date if article.publish_date else None

            if not article.title:
                article.title = TitleExtractor.extract(article.raw_html,
                                                       article.raw_doc)[0]

            k = KeywordsExtractor(num_kewyords=20,
                                  verbose=True,
                                  data_path=data_path)

            if article.top_node is not None:
                main_body = etree.tostring(article.top_node)
            else:
                cleaned_text_suggestions = MainTextExtractor.extract(
                    article.raw_html, article.raw_doc)
                article.cleaned_text = cleaned_text_suggestions[1]
                if not article.cleaned_text:
                    article.cleaned_text = cleaned_text_suggestions[2]
                if not article.cleaned_text:
                    raise NoMainTextException
                main_body = 'Sorry, we could not detect the main HTML content for this article'

            try:
                summaries = Summarize(
                    article.title,
                    article.cleaned_text.encode('utf-8', 'ignore'))
            except Exception:
                # Summarization is best-effort; fall back to a placeholder.
                summaries.append('We could not make summaries at this time.')

            try:
                text_sentiment = getSentimentText(article.cleaned_text)
            except Exception:
                # Sentiment scoring is optional; None means "unavailable".
                text_sentiment = None
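Example 2 (and Example 3 below) first branches on the response Content-Type header: only a text/html match takes the full Goose pipeline, and everything else falls through to the Url2Text/textract path. Note that pdf_pattern is compiled and matches_pdf counted, but the branch never consults them. A stripped-down sketch of the dispatch, with the function name and return labels being our own:

    import re

    HTML_RE = re.compile(r'.*text/html.*')

    # Hypothetical helper mirroring the matches_html == 0 branch above.
    def choose_pipeline(content_type):
        if HTML_RE.search(content_type):
            return 'goose'     # full HTML pipeline
        return 'url2text'      # PDF, binary, or unknown payloads

    choose_pipeline('text/html; charset=utf-8')  # 'goose'
    choose_pipeline('application/pdf')           # 'url2text'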
Example 3
	def extract(self, link, entity_description=False, sentiment=False, data_path='./data/'):
		errors, summaries, categories, entities, keywords = [], [], [], [], []
		pdf_pattern = re.compile(r'.*application/pdf.*|.*application/octet-stream.*')
		html_pattern = re.compile(r'.*text/html.*')

		article = Goose().extract(link)

		result = article.__dict__['additional_data']['result']
		content_type = result.info()['content-type']
		matches_html = len(re.findall(html_pattern, content_type))
		matches_pdf = len(re.findall(pdf_pattern, content_type))

		if matches_html == 0:
			# Textract
			url2text = Url2Text()
			texts = url2text.extract(link)

			k = KeywordsExtractor(num_kewyords=20, verbose=True, data_path=data_path)
			ent = Entities()
			clf = Classifier(data_path=data_path)

			return {
				"title": os.path.basename(link),
				"link": link,
				"author": [],
				"cleaned_text": texts[0],
				"text_sentiment": getSentimentText(texts[0]),
				"main_body": None,
				"images": None,
				"image": None,
				"date": article.__dict__['additional_data']['result'].info()['last-modified'],
				"tags": k.extract([texts[0]], None, None, 'news')[0],
				"entities": ent.extract(texts[0], entity_description),
				"language": langid.classify(texts[0])[0],
				"summary": Summarize(None, texts[0]),
				"categories": clf.predict(texts[0])
			}
		else:

			valid_html = bool(BeautifulSoup(article.raw_html[0:100], "html.parser").find())

			if not valid_html:
				headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'}
				r = requests.get(link, headers=headers)
				article.raw_html = r.text
				article.raw_doc = html.fromstring(r.text)

			if article.raw_doc is None:
				raise NoMainTextException

			authors = AuthorExtractor.extract(link, article.raw_html)
			publish_date = article.publish_date if article.publish_date else None

			if not article.title:
				article.title = TitleExtractor.extract(
					article.raw_html, article.raw_doc)[0]

			k = KeywordsExtractor(num_kewyords=20, verbose=True, data_path=data_path)

			if article.top_node is not None:
				main_body = etree.tostring(article.top_node)
			else:
			cleaned_text_suggestions = MainTextExtractor.extract(article.raw_html, article.raw_doc)
			article.cleaned_text = cleaned_text_suggestions[1]
			if not article.cleaned_text:
				article.cleaned_text = cleaned_text_suggestions[2]
				if not article.cleaned_text:
					raise NoMainTextException
				main_body = 'Sorry, we could not detect the main HTML content for this article'

			try:
				summaries = Summarize(
					article.title, article.cleaned_text.encode('utf-8', 'ignore'))
			except Exception:
				summaries.append('We could not make summaries at this time.')

			try:
				text_sentiment = getSentimentText(article.cleaned_text)
			except Exception:
				text_sentiment = None
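When Goose's own fetch returns a payload that does not parse as HTML (some sites serve different content to non-browser clients), Examples 2 and 3 re-fetch the page with a desktop-browser User-Agent and rebuild raw_html and raw_doc from the new response. A self-contained sketch of that fallback (the helper function is hypothetical):

    import requests
    from bs4 import BeautifulSoup
    from lxml import html

    BROWSER_UA = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/60.0.3112.90 Safari/537.36'
    }

    # Hypothetical helper mirroring the re-fetch fallback above.
    def ensure_html(link, raw_html):
        # Re-fetch with a browser User-Agent when the payload does not look
        # like HTML; returns (raw_html, parsed lxml document).
        if not BeautifulSoup(raw_html[:100], 'html.parser').find():
            raw_html = requests.get(link, headers=BROWSER_UA).text
        return raw_html, html.fromstring(raw_html)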