def get_answer_from_doc(rep_doc):
    extractor = MasterExtractor()
    # Note: the record keys ' description', ' body' and ' time' keep a leading
    # space, matching the column names of the source data.
    doc = Document(rep_doc['title'], rep_doc[' description'],
                   rep_doc[' body'], rep_doc[' time'])
    doc = extractor.parse(doc)

    # Fall back to 'unknown' whenever no candidate answer is available.
    answers = {}
    for question in ('who', 'what', 'when', 'where', 'why', 'how'):
        try:
            answers[question] = doc.get_top_answer(question).get_parts_as_text()
        except Exception:
            answers[question] = 'unknown'

    return (answers['who'], answers['what'], answers['when'],
            answers['where'], answers['why'], answers['how'])
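# --- Hedged usage sketch (not from the original source) ---
# Shows how get_answer_from_doc might be called. The record below is hypothetical,
# and the sketch assumes the same prerequisites as the function above: the
# Giveme5W1H imports for MasterExtractor and Document, and a running CoreNLP server.
# Note that the keys ' description', ' body' and ' time' keep their leading space,
# because that is how the function indexes them.
if __name__ == '__main__':
    sample_record = {
        'title': 'City council approves new transit plan',
        ' description': 'The plan passed after a lengthy debate.',
        ' body': 'The city council voted on Tuesday to approve the plan, '
                 'citing growing congestion in the downtown area.',
        ' time': '2017-06-13',
    }
    who, what, when, where, why, how = get_answer_from_doc(sample_record)
    print(who, what, when, where, why, how)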
def __main(offset):
    df = get_articles('./data', load=True)
    clsfyd = list()

    # Classify each year's articles with that year's LDA model (2018 is skipped).
    for i, yeardf in split_by_year(df):
        if int(i.year) == 2018:
            continue
        model = get_LDA_model('./saves', int(i.year))
        clsfyd += classify_docs(yeardf, model)
    print(len(clsfyd))

    resf = open('res_on_issue_month.txt', 'w')
    extractor = MasterExtractor()
    for idx, quarter in enumerate(clsfyd):
        resf.write('===Quarter %d===\n' % idx)
        for cat, docs in enumerate(quarter):
            if cat in TO_FIND[idx // 12]:
                res = get_issue_stats(docs, extractor)
                resf.write('===Category %d===\n' % cat)
                for key in res:
                    resf.write('%s: %s\n' % (key, str(res[key].most_common(3))))
                top_headline, top_res = get_top_headline(res, docs, extractor)
                resf.write('Headline: %s\n' % top_headline)
                for key in top_res:
                    resf.write('%s: %s\n' % (key, top_res[key]))
                resf.flush()
    resf.close()
def extract_article(news_json):
    extractor = MasterExtractor()
    # doc = Document.from_text(sample["text"], sample["date_publish"])
    doc = Document(news_json["title"], news_json["description"],
                   news_json["text"], news_json["date_publish"])
    # or: doc = Document(title, lead, text, date_publish)
    doc = extractor.parse(doc)

    who = (doc.get_top_answer('who').get_parts_as_text()
           if len(doc.get_answers('who')) > 0 else "")
    what = (doc.get_top_answer('what').get_parts_as_text()
            if len(doc.get_answers('what')) > 0 else "")
    where = (doc.get_top_answer('where').get_parts_as_text()
             if len(doc.get_answers('where')) > 0 else "")
    why = (doc.get_top_answer('why').get_parts_as_text()
           if len(doc.get_answers('why')) > 0 else "")
    how = (doc.get_top_answer('how').get_parts_as_text()
           if len(doc.get_answers('how')) > 0 else "")

    # Note: this snippet does not extract a 'when' answer.
    return {'who': who, 'what': what, 'where': where, 'why': why, 'how': how}
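# --- Hedged usage sketch (not part of the original snippet) ---
# extract_article expects a news-please style record; the field values below are
# made up for illustration, and a running CoreNLP server is still required by
# MasterExtractor.
if __name__ == '__main__':
    sample_json = {
        'title': 'Storm knocks out power across the region',
        'description': 'Thousands of homes were left without electricity overnight.',
        'text': 'A severe storm swept through the region on Monday night, downing '
                'power lines and forcing utility crews to work through the morning.',
        'date_publish': '2019-03-04 08:30:00',
    }
    print(extract_article(sample_json))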
# -*- coding: utf-8 -*-
from Giveme5W1H.extractor.preprocessors.preprocessor_core_nlp import Preprocessor

preprocessor = Preprocessor('http://49.156.128.11:9000')
# MasterExtractor(preprocessor=preprocessor)

from Giveme5W1H.extractor.document import Document
from Giveme5W1H.extractor.extractor import MasterExtractor
from fake_useragent import UserAgent
from googletrans import Translator
import requests
import itertools
import sys
import time

extractor = MasterExtractor(preprocessor=preprocessor)
ua = str(UserAgent().random)
# ua = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0'


def proxy():
    # Fetch a list of elite Indian HTTP proxies and pair consecutive entries
    # into a dict.
    response = requests.get(
        "https://www.proxy-list.download/api/v1/get?type=http&anon=elite&country=in"
    )
    proxy_list = response.text.split('\r\n')
    proxy_dict = dict(
        itertools.zip_longest(*[iter(proxy_list)] * 2, fillvalue=""))
    return proxy_dict


proxy_dict = proxy()
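# --- Hedged sketch of how the pieces above might be combined ---
# This is an assumption: the original script is cut off before it uses the
# Translator and the preprocessor-backed extractor together. The helper below
# is hypothetical; it translates a non-English article to English with
# googletrans and then runs the 5W1H extraction on the translated text.
def extract_translated(title, text, date_publish):
    translator = Translator()
    title_en = translator.translate(title, dest='en').text
    text_en = translator.translate(text, dest='en').text
    # Lead is left empty here; the original lead could be translated the same way.
    doc = Document(title_en, '', text_en, date_publish)
    doc = extractor.parse(doc)
    answers = {}
    for q in ('who', 'what', 'when', 'where', 'why', 'how'):
        answers[q] = (doc.get_top_answer(q).get_parts_as_text()
                      if len(doc.get_answers(q)) > 0 else '')
    return answers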
# test file paths
# huffPostDataFilePath = '../../5w1h-test-data/rawHuffPostData.json'
# keywordsTextFilePath = '../../5w1h-test-data/keywordsTextFile.txt'
# writingErrorIndexesFilePath = '../../5w1h-test-data/keywordMakingErrorIndexes.json'

# real file paths
huffPostDataFilePath = '../../lda-ner-result-data/rawHuffPostData.json'
keywordsTextFilePath = '../../5w1h-result-data/keywordsTextFile.txt'
writingErrorIndexesFilePath = '../../5w1h-result-data/keywordMakingErrorIndexes.json'

huffPostData = json.load(codecs.open(huffPostDataFilePath, 'r', 'utf-8-sig'))
keywordsTextFile = codecs.open(keywordsTextFilePath, 'r', 'utf-8-sig')

# Find the start index from the number of lines already written.
startTextFileLength = len(keywordsTextFile.readlines())

extractor = MasterExtractor()
wordNetLemmatizer = WordNetLemmatizer()
errorIndexes = []

print('process start')
with open(keywordsTextFilePath, 'a') as f:
    # f.write('\nasdf')
    for i in range(startTextFileLength, len(huffPostData)):
        # for i in range(startTextFileLength, 22552):
        try:
            print(i, 'th index start')
            huffPostDatum = huffPostData[i]
            keywords = []
            doc = Document(huffPostDatum['title'], huffPostDatum['subtitle'],
                           huffPostDatum['content'], huffPostDatum['date'])
            print(i, 'extractor.parse(doc) start')
class WHPhrasesBatchProcessor(BatchProcessorBase):
    """
    Extracts the WH phrases (who, what, when, where, why, how) from text.

    This is intended to be run from within a Docker network, since access to a
    Stanford CoreNLP server API at http://corenlp-service:9000 is required.
    Please see the readme file at https://github.com/stevengt/whatwhy for more
    information.
    """

    def __init__(self, source, dest,
                 id_col_name="ID",
                 source_col_name="Preprocessed Text",
                 dest_col_name=None,
                 include_cols=None):
        super().__init__(source=source,
                         dest=dest,
                         id_col_name=id_col_name,
                         source_col_name=source_col_name,
                         include_cols=include_cols)
        configure_nltk()
        sleep(60)  # Wait for the Stanford CoreNLP server to start.
        extractor_preprocessor = Preprocessor("http://corenlp-service:9000")
        extractors = [
            action_extractor.ActionExtractor(),
            cause_extractor.CauseExtractor(),
            method_extractor.MethodExtractor()
        ]
        self.extractor = MasterExtractor(preprocessor=extractor_preprocessor,
                                         extractors=extractors)

    def get_top_wh_phrases(self, text_segment):
        top_phrases = {}
        for question_type in QUESTION_WORDS:
            top_phrases[question_type] = None

        if text_segment is not None and text_segment is not np.nan:
            try:
                doc = Document.from_text(text_segment)
                doc = self.extractor.parse(doc)
                for question_type in QUESTION_WORDS:
                    if question_type == "where" or question_type == "when":
                        # The environment extractor (where/when) is not configured above.
                        top_phrases[question_type] = "NOT PROCESSED"
                    else:
                        try:
                            top_phrases[question_type] = doc.get_top_answer(
                                question_type).get_parts_as_text()
                        except:
                            continue
            except:
                pass

        return top_phrases

    def get_batch_results(self, batch):
        batch_as_df = get_df_from_csv_string(batch)

        for question_type in QUESTION_WORDS:
            batch_as_df[question_type] = None

        for i, row in batch_as_df.iterrows():
            top_wh_phrases = self.get_top_wh_phrases(row[self.source_col_name])
            for question_type in QUESTION_WORDS:
                batch_as_df.at[i, question_type] = top_wh_phrases.get(question_type)

        results_df_cols = [self.id_col_name]
        results_df_cols.extend(QUESTION_WORDS)
        results_df_cols.extend(self.include_cols)
        results_df = batch_as_df[results_df_cols]
        results_csv_string = get_csv_string_from_df(results_df)

        results = {
            "target_results_file_name": f"batch{batch_as_df[self.id_col_name].iloc[0]}.csv",
            "file_content": results_csv_string
        }
        return results
""" This is a simple example how to use the extractor in combination with news-please, a news crawler and scraper (https://github.com/fhamborg/news-please). - Nothing is cached """ # don`t forget to start up core_nlp_host # giveme5w1h-corenlp from newsplease import NewsPlease from Giveme5W1H.extractor.document import Document from Giveme5W1H.extractor.extractor import MasterExtractor extractor = MasterExtractor() def main(): article = NewsPlease.from_url( 'https://www.foxnews.com/politics/house-democrat-subpoenas-mnuchin-irs-for-trumps-tax-returns' ) doc = Document.from_newsplease(article) doc = extractor.parse(doc) answers = doc.get_top_answer('who').get_parts_as_text() if __name__ == '__main__': main()