# Giveme5W1H imports (assumed package layout, as used in the library's own examples).
from Giveme5W1H.extractor.document import Document
from Giveme5W1H.extractor.extractor import MasterExtractor


def get_answer_from_doc(rep_doc):
    # Extract the top 5W1H answers from a record, falling back to 'unknown'
    # when the extractor finds no answer for a question.
    extractor = MasterExtractor()
    doc = Document(rep_doc['title'], rep_doc[' description'], rep_doc[' body'], rep_doc[' time'])
    doc = extractor.parse(doc)

    try:
        top_who_answer = doc.get_top_answer('who').get_parts_as_text()
    except Exception:
        top_who_answer = 'unknown'
    try:
        top_what_answer = doc.get_top_answer('what').get_parts_as_text()
    except Exception:
        top_what_answer = 'unknown'
    try:
        top_when_answer = doc.get_top_answer('when').get_parts_as_text()
    except Exception:
        top_when_answer = 'unknown'
    try:
        top_where_answer = doc.get_top_answer('where').get_parts_as_text()
    except Exception:
        top_where_answer = 'unknown'
    try:
        top_why_answer = doc.get_top_answer('why').get_parts_as_text()
    except Exception:
        top_why_answer = 'unknown'
    try:
        top_how_answer = doc.get_top_answer('how').get_parts_as_text()
    except Exception:
        top_how_answer = 'unknown'

    return (top_who_answer, top_what_answer, top_when_answer,
            top_where_answer, top_why_answer, top_how_answer)
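# Usage sketch for get_answer_from_doc. The record below is hypothetical; its keys
# (including the leading spaces in ' description', ' body', ' time') simply mirror the
# columns the function expects, e.g. from a CSV with padded headers. A running CoreNLP
# server is assumed, as Giveme5W1H requires one.
sample_record = {
    'title': 'Flood warning issued for the river valley',
    ' description': 'Authorities warned residents to move to higher ground.',
    ' body': 'Heavy rain on Tuesday caused the river to rise rapidly ...',
    ' time': '2019-10-01 08:00:00',
}

who, what, when, where, why, how = get_answer_from_doc(sample_record)
print(who, what, when, where, why, how)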
def extract_article(news_json):
    extractor = MasterExtractor()
    # doc = Document.from_text(sample["text"], sample["date_publish"])
    doc = Document(news_json["title"], news_json["description"],
                   news_json["text"], news_json["date_publish"])
    # or: doc = Document(title, lead, text, date_publish)
    doc = extractor.parse(doc)

    who = doc.get_top_answer('who').get_parts_as_text() if len(doc.get_answers('who')) > 0 else ""
    what = doc.get_top_answer('what').get_parts_as_text() if len(doc.get_answers('what')) > 0 else ""
    where = doc.get_top_answer('where').get_parts_as_text() if len(doc.get_answers('where')) > 0 else ""
    why = doc.get_top_answer('why').get_parts_as_text() if len(doc.get_answers('why')) > 0 else ""
    how = doc.get_top_answer('how').get_parts_as_text() if len(doc.get_answers('how')) > 0 else ""

    return {'who': who, 'what': what, 'where': where, 'why': why, 'how': how}
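# Usage sketch for extract_article. The dictionary mimics the fields of a news-please
# style article record; the values are made up for illustration, and a reachable
# CoreNLP server is again assumed.
news_json = {
    "title": "Mayor opens new bridge",
    "description": "The bridge connects the two halves of the city.",
    "text": "The mayor opened the new bridge on Friday to ease congestion ...",
    "date_publish": "2020-05-01 10:00:00",
}

answers = extract_article(news_json)
print(answers["who"], "|", answers["why"])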
wordNetLemmatizer = WordNetLemmatizer()
errorIndexes = []
print('process start')

with open(keywordsTextFilePath, 'a') as f:
    # f.write('\nasdf')
    for i in range(startTextFileLength, len(huffPostData)):
    # for i in range(startTextFileLength, 22552):
        try:
            print(i, 'th index start')
            huffPostDatum = huffPostData[i]
            keywords = []
            doc = Document(huffPostDatum['title'], huffPostDatum['subtitle'],
                           huffPostDatum['content'], huffPostDatum['date'])
            print(i, 'extractor.parse(doc) start')
            doc = extractor.parse(doc)
            print(i, 'doc.get_answers() start')
            answers = doc.get_answers()
            for (fivew1h, answer) in answers.items():
                if len(answer) != 0:
                    text = answer[0].get_parts_as_text()
                    words = word_tokenize(text)
                    wordPosTuples = pos_tag(words)
                    for wordPosTuple in wordPosTuples:
                        if wordPosTuple[1].startswith('N'):
                            lemmatizedWord = wordNetLemmatizer.lemmatize(wordPosTuple[0].lower())
                            keywords.append({
                                'keyword': lemmatizedWord,
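# The loop above relies on NLTK tokenization, POS tagging and WordNet lemmatization
# (extractor, huffPostData, keywordsTextFilePath and startTextFileLength are defined
# elsewhere in that script). A minimal setup sketch; the resource names are the
# classic NLTK corpora/models, downloaded once per environment.
import nltk
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')                        # tokenizer models used by word_tokenize
nltk.download('averaged_perceptron_tagger')   # POS tagger used by pos_tag
nltk.download('wordnet')                      # lexical database used by WordNetLemmatizer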
class WHPhrasesBatchProcessor(BatchProcessorBase):
    """
    Extracts the WH phrases (who, what, when, where, why, how) from text.

    This is intended to be run from within a Docker network, since access to a
    Stanford CoreNLP server API at http://corenlp-service:9000 is required.
    Please see the readme file at https://github.com/stevengt/whatwhy for more information.
    """

    def __init__(self, source, dest,
                 id_col_name="ID",
                 source_col_name="Preprocessed Text",
                 dest_col_name=None,
                 include_cols=None):
        super().__init__(source=source,
                         dest=dest,
                         id_col_name=id_col_name,
                         source_col_name=source_col_name,
                         include_cols=include_cols)
        configure_nltk()
        sleep(60)  # Wait for the Stanford CoreNLP server to start.
        extractor_preprocessor = Preprocessor("http://corenlp-service:9000")
        extractors = [
            action_extractor.ActionExtractor(),
            cause_extractor.CauseExtractor(),
            method_extractor.MethodExtractor()
        ]
        self.extractor = MasterExtractor(preprocessor=extractor_preprocessor, extractors=extractors)

    def get_top_wh_phrases(self, text_segment):
        top_phrases = {}
        for question_type in QUESTION_WORDS:
            top_phrases[question_type] = None
        if text_segment is not None and text_segment is not np.nan:
            try:
                doc = Document.from_text(text_segment)
                doc = self.extractor.parse(doc)
                for question_type in QUESTION_WORDS:
                    if question_type == "where" or question_type == "when":
                        top_phrases[question_type] = "NOT PROCESSED"
                    else:
                        try:
                            top_phrases[question_type] = doc.get_top_answer(question_type).get_parts_as_text()
                        except Exception:
                            continue
            except Exception:
                pass
        return top_phrases

    def get_batch_results(self, batch):
        batch_as_df = get_df_from_csv_string(batch)
        for question_type in QUESTION_WORDS:
            batch_as_df[question_type] = None
        for i, row in batch_as_df.iterrows():
            top_wh_phrases = self.get_top_wh_phrases(row[self.source_col_name])
            for question_type in QUESTION_WORDS:
                batch_as_df.at[i, question_type] = top_wh_phrases.get(question_type)

        results_df_cols = [self.id_col_name]
        results_df_cols.extend(QUESTION_WORDS)
        results_df_cols.extend(self.include_cols)
        results_df = batch_as_df[results_df_cols]
        results_csv_string = get_csv_string_from_df(results_df)

        results = {
            "target_results_file_name": f"batch{batch_as_df[self.id_col_name].iloc[0]}.csv",
            "file_content": results_csv_string
        }
        return results
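# A minimal standalone sketch of the same idea as WHPhrasesBatchProcessor.get_top_wh_phrases,
# usable outside the Docker network. Assumptions: the import paths follow Giveme5W1H's package
# layout, QUESTION_WORDS is an illustrative stand-in for the constant used above, and a CoreNLP
# server is reachable at the given URL. 'where'/'when' come back as None because the
# environment extractor is deliberately omitted, mirroring the class above.
from Giveme5W1H.extractor.document import Document
from Giveme5W1H.extractor.extractor import MasterExtractor
from Giveme5W1H.extractor.extractors import action_extractor, cause_extractor, method_extractor
from Giveme5W1H.extractor.preprocessors.preprocessor_core_nlp import Preprocessor

QUESTION_WORDS = ["who", "what", "when", "where", "why", "how"]


def top_wh_phrases(text, corenlp_url="http://localhost:9000"):
    extractor = MasterExtractor(
        preprocessor=Preprocessor(corenlp_url),
        extractors=[
            action_extractor.ActionExtractor(),  # who / what
            cause_extractor.CauseExtractor(),    # why
            method_extractor.MethodExtractor(),  # how
        ],
    )
    doc = extractor.parse(Document.from_text(text))
    phrases = {}
    for question in QUESTION_WORDS:
        try:
            phrases[question] = doc.get_top_answer(question).get_parts_as_text()
        except Exception:
            phrases[question] = None
    return phrases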