def __stanford_openie(self, input, output, verbose=False):
    # Extract <subject, relation, object> triples with Stanford CoreNLP's OpenIE annotator.
    with open(input, 'r') as input_file:
        contents = input_file.read()

    if verbose:
        print('Searching for triples using Stanford OpenIE ...')

    nlp = CoreNLPWrapper()
    annotated = nlp.annotate(
        contents,
        properties={
            'annotators': 'tokenize, ssplit, pos, ner, depparse, parse, openie'
        })

    # Append one triple per OpenIE extraction to the output file.
    with open(output, 'a') as output_file:
        for sentence in annotated['sentences']:
            for openie in sentence['openie']:
                triple = Triple(sentence['index'], openie['subject'],
                                openie['relation'], openie['object'])
                if verbose:
                    print(triple.to_string())
                output_file.write(triple.to_string() + '\n')

    return output
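# A minimal sketch (an assumption, not copied from the repository) of the CoreNLP
# annotation structure that __stanford_openie relies on: each entry in 'sentences'
# carries an 'openie' list whose items expose 'subject', 'relation' and 'object'.
#
#   annotated = {
#       'sentences': [{
#           'index': 0,
#           'openie': [{'subject': 'Barack Obama',
#                       'relation': 'was born in',
#                       'object': 'Hawaii'}]
#       }]
#   }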
def __clausie(self, input, output, verbose=False):
    # Extract triples with ClausIE. ClausIE expects one sentence per line,
    # prefixed with a sentence identifier and a tab.
    with open(input, 'r') as input_file:
        contents = input_file.read()

    if verbose:
        print('Searching for triples using ClausIE ...')

    input_clausie = os.path.splitext(input)[0] + '_clausie_input.txt'
    open(input_clausie, 'w').close()  # create/truncate the intermediate file
    print('Preparing contents to be processed by ClausIE at {}'.format(
        input_clausie))

    nlp = CoreNLPWrapper()
    annotated = nlp.annotate(
        contents, properties={'annotators': 'tokenize, ssplit, pos'})

    with open(input_clausie, 'a') as clausie_file:
        for sentence in annotated['sentences']:
            sent_str = ''
            for token in sentence['tokens']:
                if token['pos'] == 'POS':
                    # Attach possessive markers ("'s") to the preceding word.
                    sent_str = sent_str.strip()
                sent_str += token['word'] + ' '
            clausie_file.write(
                str(sentence['index']) + '\t' + sent_str.strip() + '\n')

    clausie_out = ClausIEWrapper.run_clausie(input_clausie, output, verbose)
    os.remove(input_clausie)

    # We need to do some adjustments to the output: strip the quoting and
    # normalise the subject and object tokens.
    final_contents = ''
    with open(clausie_out, 'r') as clausie_out_file:
        for line in clausie_out_file:
            line = line.replace('"', '').split('\t')
            triple = Triple(line[0].strip(),
                            NLPUtils.adjust_tokens(line[1].strip()),
                            line[2].strip(),
                            NLPUtils.adjust_tokens(line[3].strip()))
            if verbose:
                print(triple.to_string())
            final_contents += triple.to_string() + '\n'

    # Rewrite the ClausIE output file in place and return its path.
    with open(clausie_out, 'w') as final_file:
        final_file.write(final_contents)
    return clausie_out
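# For reference, a hedged sketch of the ClausIE output line that the parsing loop
# above assumes: a sentence identifier followed by quoted subject, relation and
# object fields, all tab-separated (the sentence content is illustrative only).
#
#   0	"the engine"	"powers"	"the car"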
def __senna(self, input_filename, output_filename, verbose=False):
    # Extract <agent, predicate, patient> triples from SENNA's semantic role labels.
    if verbose:
        print('Performing Semantic Role Labeling with SENNA ...')

    senna = SennaWrapper()
    out_contents = ''
    with open(input_filename, 'r') as input_file:
        sentence_number = 0
        for line in input_file.readlines():
            if len(line.strip()) < 1:  # skip blank lines
                continue
            senna_output = senna.srl(NLPUtils.adjust_tokens(line), verbose=False)
            for predicate in senna_output.keys():
                dict_contents = senna_output[predicate]
                agent = None
                patient = None
                if 'A0' in dict_contents and 'A1' in dict_contents:
                    agent = dict_contents['A0']
                    patient = dict_contents['A1']
                elif 'A0' in dict_contents:  # No A1
                    agent = dict_contents['A0']
                    if 'A2' in dict_contents:
                        patient = dict_contents['A2']
                    else:
                        # Fall back to any remaining argument as the patient.
                        for key in dict_contents.keys():
                            if not key == 'A0':
                                patient = dict_contents[key]
                elif 'A1' in dict_contents:  # No A0
                    patient = dict_contents['A1']
                    if 'A2' in dict_contents:
                        agent = dict_contents['A2']
                    else:
                        # Fall back to any remaining argument as the agent.
                        for key in dict_contents.keys():
                            if not key == 'A1':
                                agent = dict_contents[key]
                else:  # Neither A0 nor A1
                    if 'A2' in dict_contents:
                        agent = dict_contents['A2']
                        for key in dict_contents.keys():
                            if not key == 'A2':
                                patient = dict_contents[key]
                    else:  # Very unlikely
                        # Sort the argument labels by string length and take the
                        # two shortest as agent and patient.
                        key_lst = sorted(dict_contents.keys(), key=len)
                        agent = dict_contents[key_lst[0]]
                        patient = dict_contents[key_lst[1]]

                if agent is None or patient is None:
                    print('-Warning: No agent or patient determined for predicate {}'.format(predicate))
                    print('-- agent: {}'.format(agent))
                    print('-- patient: {}'.format(patient))
                    continue

                triple = Triple(sentence_number, agent, predicate, patient)
                if verbose:
                    print(triple.to_string())
                out_contents += triple.to_string() + '\n'
            sentence_number += 1

    with open(output_filename, 'w') as output_file:
        output_file.write(out_contents)
    return output_filename
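# A sketch (an assumption inferred from the code, not verified SENNA output) of the
# SennaWrapper.srl() result that both SENNA-based methods expect: a dict keyed by
# predicate, mapping role labels to argument text, e.g.
#
#   {'acquired': {'A0': 'Google', 'A1': 'YouTube', 'AM-TMP': 'in 2006'}}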
def __senna(self, input_filename, output_filename, verbose=False):
    # Combine dependency-based term construction with SENNA semantic role labels.
    if verbose:
        print('Performing Semantic Role Labeling with SENNA ...')

    senna = SennaWrapper()
    out_contents = ''
    with open(input_filename, 'r') as input_file:
        sentence_number = 0
        for line in input_file.readlines():
            if len(line.strip()) < 1:  # skip blank lines
                continue

            dependency_list = NLPUtils.dependency_parse(
                line, deps_key='enhancedPlusPlusDependencies', verbose=verbose)
            previous_term = ''
            previous_compound = ''
            dict_basic_to_most_specific = {}
            connective_dependencies = []

            # Build multi-word terms from modifier-style dependencies
            # (compound, nmod:poss, aux, neg, *mod) and emit subclass triples.
            while len(dependency_list) > 0:
                elem = dependency_list.pop()
                if elem[1] in ['ROOT', 'punct', 'det'
                               ] or 'subj' in elem[1] or 'obj' in elem[1]:
                    continue
                if elem[1] in ['compound', 'nmod:poss', 'aux', 'neg'
                               ] or elem[1].endswith('mod'):
                    if previous_term == elem[0]:
                        updated_term = '{} {}'.format(elem[2], previous_compound)
                    else:
                        updated_term = '{} {}'.format(elem[2], elem[0])
                        previous_compound = elem[0]
                    dict_basic_to_most_specific[elem[0]] = updated_term
                    triple = Triple(sentence_number, updated_term,
                                    'rdfs:subClassOf', previous_compound)
                    previous_compound = updated_term
                    previous_term = elem[0]
                    if verbose:
                        print(triple.to_string())
                    out_contents += triple.to_string() + '\n'
                elif elem[1] in ['acl', 'appos'] or elem[1].startswith('nmod:'):
                    connective_dependencies.append(elem)

            # Link terms connected by clausal/nominal modifiers (acl, appos, nmod:*).
            while len(connective_dependencies) > 0:
                elem = connective_dependencies.pop()
                if elem[1] == 'nmod:poss':
                    continue
                if elem[1].find(':') > 0:  # e.g. 'nmod:of'
                    connector = elem[1][elem[1].find(':') + 1:]
                elif elem[1] in ['acl', 'appos']:
                    connector = ''
                else:
                    connector = elem[1]

                first = elem[0]
                if first in dict_basic_to_most_specific.keys():
                    first = dict_basic_to_most_specific[first]
                second = elem[2]
                if second in dict_basic_to_most_specific.keys():
                    second = dict_basic_to_most_specific[second]

                if connector == '':
                    full = '{} {}'.format(first, second)
                else:
                    full = '{} {} {}'.format(first, connector, second)

                triple = Triple(
                    sentence_number, full,
                    'local:{}_{}'.format(connector, second.replace(' ', '')),
                    first)
                if verbose:
                    print(triple.to_string())
                out_contents += triple.to_string() + '\n'

                triple = Triple(
                    sentence_number, full,
                    'local:{}_{}'.format(first.replace(' ', ''), connector),
                    second)
                if verbose:
                    print(triple.to_string())
                out_contents += triple.to_string() + '\n'

                dict_basic_to_most_specific[elem[0]] = full

            # Map SENNA's numbered arguments (A0, A1, ...) to VerbNet role names.
            senna_output = senna.srl(line, verbose=False)
            for predicate in senna_output.keys():
                pred_args = senna_output[predicate]
                pred_arg_names = NLPUtils.get_verbnet_args(predicate, verbose=True)
                if len(pred_arg_names) < 1:
                    print('WARNING -- Unable to retrieve predicate arg names for "{}"'
                          .format(predicate))
                if verbose:
                    print('predicate: {}, args: {}'.format(predicate, pred_args))

                for pred_arg in pred_args:
                    if 'AM-NEG' == pred_arg:
                        predicate = 'not {}'.format(predicate)
                    elif 'AM-MOD' == pred_arg:
                        predicate = ' '.join(
                            [pred_args['AM-MOD'].strip(), predicate])
                    elif pred_arg.startswith('AM-'):
                        # Remove initial stopwords (e.g. determiners)
                        s = pred_args[pred_arg].strip()
                        split = s.split(' ', 1)
                        if NLPUtils.is_stopword(split[0]) and len(split) > 1:
                            s = s.split(' ', 1)[1]
                        triple = Triple(sentence_number, predicate,
                                        'local:{}'.format(pred_arg), s)
                        if verbose:
                            print(triple.to_string())
                        out_contents += triple.to_string() + '\n'

                for i in range(len(pred_arg_names)):
                    pred_args_index = 'A{}'.format(i)
                    if pred_args_index in pred_args:
                        # Remove initial stopwords (e.g. determiners)
                        s = pred_args[pred_args_index].strip()
                        split = s.split(' ', 1)
                        if NLPUtils.is_stopword(split[0]) and len(split) > 1:
                            s = s.split(' ', 1)[1]
                        triple = Triple(sentence_number, predicate,
                                        'vn.role:{}'.format(pred_arg_names[i]), s)
                        if verbose:
                            print(triple.to_string())
                        out_contents += triple.to_string() + '\n'

            sentence_number += 1

    with open(output_filename, 'w') as output_file:
        output_file.write(out_contents)
    return output_filename
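# The dependency handling above assumes NLPUtils.dependency_parse() yields
# (governor, relation, dependent) tuples; that ordering is inferred from the
# indexing used (elem[0] = governor, elem[1] = relation, elem[2] = dependent),
# not confirmed against the helper itself. Under that assumption,
# ('engine', 'compound', 'search') would yield the term 'search engine' and the
# triple <search engine, rdfs:subClassOf, engine>.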