from Giveme5W1H.extractor.document import Document
from Giveme5W1H.extractor.extractor import MasterExtractor


def get_answer_from_doc(rep_doc):
    extractor = MasterExtractor()
    # The record keys are read exactly as they appear in the input data,
    # including the leading space in ' description', ' body', and ' time'.
    doc = Document(rep_doc['title'], rep_doc[' description'], rep_doc[' body'],
                   rep_doc[' time'])
    doc = extractor.parse(doc)

    # Collect the top answer for each of the 5W1H questions, falling back to
    # 'unknown' when no answer could be extracted for a question.
    answers = []
    for question in ('who', 'what', 'when', 'where', 'why', 'how'):
        try:
            answers.append(doc.get_top_answer(question).get_parts_as_text())
        except Exception:
            answers.append('unknown')

    return tuple(answers)


def __main(offset):
    df = get_articles('./data', load=True)
    clsfyd = list()
    for i, yeardf in split_by_year(df):
        if int(i.year) == 2018:
            continue
        model = get_LDA_model('./saves', int(i.year))
        clsfyd += classify_docs(yeardf, model)
    print(len(clsfyd))
    with open('res_on_issue_month.txt', 'w') as resf:
        extractor = MasterExtractor()
        for idx, quarter in enumerate(clsfyd):
            resf.write('===Quarter %d===\n' % idx)
            for cat, docs in enumerate(quarter):
                if cat in TO_FIND[idx // 12]:
                    res = get_issue_stats(docs, extractor)

                    resf.write('===Category %d===\n' % cat)
                    for key in res:
                        resf.write('%s: %s\n' %
                                   (key, str(res[key].most_common(3))))
                    top_headline, top_res = get_top_headline(res, docs, extractor)
                    resf.write('Headline: %s\n' % top_headline)
                    for key in top_res:
                        resf.write('%s: %s\n' % (key, top_res[key]))
                    resf.flush()
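For illustration, a minimal sketch of how get_answer_from_doc above could be called. The record below is invented; only the key names (including the leading spaces) mirror the ones the function reads, and a Stanford CoreNLP server must be running for MasterExtractor to work.

# Hypothetical input record; the field values are made up for illustration.
sample_doc = {
    'title': 'City council approves new transit plan',
    ' description': 'The council voted on Tuesday to fund the expansion.',
    ' body': 'The city council approved the transit expansion plan on Tuesday '
             'after months of debate, citing rising congestion downtown.',
    ' time': '2017-06-13',
}

who, what, when, where, why, how = get_answer_from_doc(sample_doc)
print(who, what, when, where, why, how)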
Example 3
def extract_article(news_json):
    extractor = MasterExtractor()
    #doc = Document.from_text(sample["text"], sample["date_publish"])
    doc = Document(news_json["title"], news_json["description"],
                   news_json["text"], news_json["date_publish"])
    # or: doc = Document(title, lead, text, date_publish)
    doc = extractor.parse(doc)
    # Return the top answer for each question, or an empty string when no
    # candidate was extracted. Note that 'when' is not extracted here.
    answers = {}
    for question in ('who', 'what', 'where', 'why', 'how'):
        if len(doc.get_answers(question)) > 0:
            answers[question] = doc.get_top_answer(question).get_parts_as_text()
        else:
            answers[question] = ""
    return answers
Example 5
# -*- coding: utf-8 -*-
import itertools
import sys
import time

import requests
from fake_useragent import UserAgent
from googletrans import Translator

from Giveme5W1H.extractor.document import Document
from Giveme5W1H.extractor.extractor import MasterExtractor
from Giveme5W1H.extractor.preprocessors.preprocessor_core_nlp import Preprocessor

# Use a remote Stanford CoreNLP server for preprocessing.
preprocessor = Preprocessor('http://49.156.128.11:9000')
extractor = MasterExtractor(preprocessor=preprocessor)

ua = str(UserAgent().random)
#ua='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0'


def proxy():
    # Fetch a list of elite (anonymous) HTTP proxies located in India.
    response = requests.get(
        "https://www.proxy-list.download/api/v1/get?type=http&anon=elite&country=in"
    )
    proxy_list = response.text.split('\r\n')
    # Pair consecutive entries into a dict:
    # {proxy_list[0]: proxy_list[1], proxy_list[2]: proxy_list[3], ...}
    proxy_dict = dict(
        itertools.zip_longest(*[iter(proxy_list)] * 2, fillvalue=""))
    return proxy_dict


proxy_dict = proxy()
Example 6
import codecs
import json

from nltk.stem import WordNetLemmatizer

from Giveme5W1H.extractor.document import Document
from Giveme5W1H.extractor.extractor import MasterExtractor

# test file path
# huffPostDataFilePath = '../../5w1h-test-data/rawHuffPostData.json'
# keywordsTextFilePath = '../../5w1h-test-data/keywordsTextFile.txt'
# writingErrorIndexesFilePath = '../../5w1h-test-data/keywordMakingErrorIndexes.json'
# real file path
huffPostDataFilePath = '../../lda-ner-result-data/rawHuffPostData.json'
keywordsTextFilePath = '../../5w1h-result-data/keywordsTextFile.txt'
writingErrorIndexesFilePath = '../../5w1h-result-data/keywordMakingErrorIndexes.json'

huffPostData = json.load(codecs.open(huffPostDataFilePath, 'r', 'utf-8-sig'))
keywordsTextFile = codecs.open(keywordsTextFilePath, 'r', 'utf-8-sig')

# find the start index from the number of keyword lines already written
startTextFileLength = len(keywordsTextFile.readlines())

extractor = MasterExtractor()
wordNetLemmatizer = WordNetLemmatizer()

errorIndexes = []
print('process start')
with open(keywordsTextFilePath, 'a') as f:
    # f.write('\nasdf')
    for i in range(startTextFileLength, len(huffPostData)):
        # for i in range(startTextFileLength, 22552):
        try:
            print(i, 'th index start')
            huffPostDatum = huffPostData[i]
            keywords = []
            doc = Document(huffPostDatum['title'], huffPostDatum['subtitle'],
                           huffPostDatum['content'], huffPostDatum['date'])
            print(i, 'extractor.parse(doc) start')
Example 7
class WHPhrasesBatchProcessor(BatchProcessorBase):
    """
    Extracts the WH phrases (who, what, when, where, why, how) from text.

    This is intended to be run from within a Docker network, since
    access to a Stanford CoreNLP server API at http://corenlp-service:9000
    is required. Please see the readme file at https://github.com/stevengt/whatwhy
    for more information.
    """
    def __init__(self,
                 source,
                 dest,
                 id_col_name="ID",
                 source_col_name="Preprocessed Text",
                 dest_col_name=None,
                 include_cols=None):

        super().__init__(source=source,
                         dest=dest,
                         id_col_name=id_col_name,
                         source_col_name=source_col_name,
                         include_cols=include_cols)
        configure_nltk()
        sleep(60)  # Wait for Stanford CoreNLP server to start.
        extractor_preprocessor = Preprocessor("http://corenlp-service:9000")
        extractors = [
            action_extractor.ActionExtractor(),
            cause_extractor.CauseExtractor(),
            method_extractor.MethodExtractor()
        ]
        self.extractor = MasterExtractor(preprocessor=extractor_preprocessor,
                                         extractors=extractors)

    def get_top_wh_phrases(self, text_segment):
        top_phrases = {}
        for question_type in QUESTION_WORDS:
            top_phrases[question_type] = None

        if text_segment is not None and text_segment is not np.nan:
            try:
                doc = Document.from_text(text_segment)
                doc = self.extractor.parse(doc)
                for question_type in QUESTION_WORDS:
                    if question_type in ("where", "when"):
                        # 'where' and 'when' are intentionally skipped by
                        # this processor.
                        top_phrases[question_type] = "NOT PROCESSED"
                    else:
                        try:
                            top_phrases[question_type] = doc.get_top_answer(
                                question_type).get_parts_as_text()
                        except Exception:
                            # No answer extracted for this question; leave None.
                            continue
            except Exception:
                # Parsing failed; return the dict of None placeholders.
                pass

        return top_phrases

    def get_batch_results(self, batch):
        batch_as_df = get_df_from_csv_string(batch)
        for question_type in QUESTION_WORDS:
            batch_as_df[question_type] = None
        for i, row in batch_as_df.iterrows():
            top_wh_phrases = self.get_top_wh_phrases(row[self.source_col_name])
            for question_type in QUESTION_WORDS:
                batch_as_df.at[i, question_type] = top_wh_phrases.get(
                    question_type)

        results_df_cols = [self.id_col_name]
        results_df_cols.extend(QUESTION_WORDS)
        results_df_cols.extend(self.include_cols)
        results_df = batch_as_df[results_df_cols]
        results_csv_string = get_csv_string_from_df(results_df)

        results = {
            "target_results_file_name":
            f"batch{batch_as_df[self.id_col_name].iloc[0]}.csv",
            "file_content": results_csv_string
        }
        return results
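A minimal usage sketch, not taken from the original project: it assumes an already-constructed WHPhrasesBatchProcessor instance, which in turn requires valid source/dest handles and a reachable CoreNLP server at http://corenlp-service:9000 as described in the class docstring above.

def print_top_wh_phrases(processor, text):
    # 'processor' is assumed to be a constructed WHPhrasesBatchProcessor.
    # Questions the processor skips ('where', 'when') come back as
    # "NOT PROCESSED"; questions with no extracted answer stay None.
    phrases = processor.get_top_wh_phrases(text)
    for question, phrase in phrases.items():
        print('%s: %s' % (question, phrase))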
Example 8
"""
This is a simple example of how to use the extractor in combination with news-please, a news crawler and scraper (https://github.com/fhamborg/news-please).

- Nothing is cached.

"""

# don't forget to start up core_nlp_host
# giveme5w1h-corenlp

from newsplease import NewsPlease

from Giveme5W1H.extractor.document import Document
from Giveme5W1H.extractor.extractor import MasterExtractor

extractor = MasterExtractor()


def main():
    article = NewsPlease.from_url(
        'https://www.foxnews.com/politics/house-democrat-subpoenas-mnuchin-irs-for-trumps-tax-returns'
    )
    doc = Document.from_newsplease(article)
    doc = extractor.parse(doc)
    top_who_answer = doc.get_top_answer('who').get_parts_as_text()
    print(top_who_answer)


if __name__ == '__main__':
    main()
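The example above only reports the 'who' answer. To report all six answers while guarding against questions for which nothing was extracted, main() could be extended roughly as follows; this is a sketch that reuses the get_answers() length check from the extract_article example earlier on this page.

def print_all_answers(doc):
    # Print the top answer for each of the 5W1H questions, falling back to
    # 'unknown' when the extractor found no candidate for a question.
    for question in ('who', 'what', 'when', 'where', 'why', 'how'):
        if len(doc.get_answers(question)) > 0:
            print('%s: %s' % (question,
                              doc.get_top_answer(question).get_parts_as_text()))
        else:
            print('%s: unknown' % question)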