def convert_trees(input_file, output_file):
    import json
    import StanfordDependencies
    sd = StanfordDependencies.get_instance(backend='jpype')
    with open(input_file, 'r') as inpf, open(output_file, 'w') as outf:
        for line_no, line in enumerate(inpf):
            if line_no % 1000 == 0:
                print("Processing sentence pair {}.".format(line_no))
            sentence = json.loads(line)
            prem = sentence.get('sentence1_parse')
            hypo = sentence.get('sentence2_parse')
            # Serialize each token as index(form(head(deprel
            prem_dep = ' '.join([
                '{}({}({}({}'.format(x.index, x.form, x.head, x.deprel)
                for x in sd.convert_tree(prem)
            ])
            hypo_dep = ' '.join([
                '{}({}({}({}'.format(x.index, x.form, x.head, x.deprel)
                for x in sd.convert_tree(hypo)
            ])
            sentence.update({
                'sentence1_dependency_parse': prem_dep,
                'sentence2_dependency_parse': hypo_dep
            })
            outf.write(json.dumps(sentence) + "\n")
    print("Wrote file with dependency parse annotation to {}".format(
        output_file))
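# A minimal, hedged illustration of what convert_tree() returns for one
# bracketed parse: a list of CoNLL-X style Token objects. The toy tree below
# is a made-up example, not taken from the input files used above.
import StanfordDependencies

sd = StanfordDependencies.get_instance(backend='subprocess')
tokens = sd.convert_tree('(S (NP (DT The) (NN cat)) (VP (VBZ sleeps)))')
for tok in tokens:
    # Each Token carries, among other fields: index, form, head, deprel.
    print(tok.index, tok.form, tok.head, tok.deprel)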
import xlrd
import StanfordDependencies
from bllipparser import RerankingParser
from nltk.tokenize import sent_tokenize


def parse_reports(data_path, sheet_name, file_path):
    report_data_file = xlrd.open_workbook(data_path)
    sheet = report_data_file.sheet_by_name(sheet_name)
    rrp = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
    sd = StanfordDependencies.get_instance(backend='subprocess')
    # NOTE: the row range below is specific to the original spreadsheet.
    for i in range(910, 3852):
        finding = sheet.cell(i, 6).value
        with open(file_path, mode='a') as f:
            f.write('finding no.' + str(i))
            f.write('\n')
        sent_tokenize_list = sent_tokenize(finding)
        for j in range(len(sent_tokenize_list)):
            try:
                with open(file_path, mode='a') as f:
                    f.write('sentence no.' + str(j))
                    f.write('\n')
                sentence = sent_tokenize_list[j]
                tree = rrp.simple_parse(sentence)
                dependencies = sd.convert_tree(tree)
                for token in dependencies:
                    with open(file_path, mode='a') as f:
                        f.write(str(token))
                        f.write('\n')
            except Exception:
                print('error!')
                with open(file_path, mode='a') as f:
                    f.write('error!!!')
                    f.write('\n')
def __init__(self, lang={'spacy': 'en', 'benepar': 'benepar_en2'}, config=None):
    super().__init__()
    self.download = False
    # Checking if NLTK sentence and word tokenizers should be downloaded
    if not config_berkeley_nlp['benepar_sent_word_tok_downloaded']:
        spacy.load(lang['spacy'])
        config_global['config_benepar'][
            'benepar_sent_word_tok_downloaded'] = True
        self.download = True
    # Checking if parsing model should be downloaded
    if not config_berkeley_nlp['parsing_model_downloaded']:
        benepar.download(lang['benepar'])
        config_global['config_benepar']['parsing_model_downloaded'] = True
        self.download = True
    # Updating yaml file if necessary
    if self.download:
        with open("./config.yaml", "w") as f:
            yaml.dump(config_global, f)
    self.nlp = spacy.load(lang['spacy'])
    self.nlp.add_pipe(BeneparComponent(lang['benepar']))
    self.sd = StanfordDependencies.get_instance(
        backend='subprocess')  # to convert trees
    self.name_save = 'benepar'
def corenlpLemmaPOS_stanfordparserDependency_split_equalChecking():
    ## corenlp setting
    corenlp_dir = "stanford-corenlp-full-2014-08-27/"
    corenlp = StanfordCoreNLP(corenlp_dir)
    ## stanfordDependencies setting
    sd = StanfordDependencies.get_instance(backend="subprocess", version="3.4.1")
    os.environ["STANFORD_PARSER"] = "stanford-parser-full-2014-08-27/"
    os.environ["STANFORD_MODELS"] = "stanford-parser-full-2014-08-27/"
    parser = stanford.StanfordParser(
        model_path="stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    )
    with open("../../dataclean_Nov8_2015/train_afterdataclean_modifiedcleanedTupleNov8.json") as t:
        trainTup = json.load(t)
    for num, tup in enumerate(trainTup):
        ## after modifying col8 and saving, col8 may now be empty
        if not tup[8]:
            continue
        ## use corenlp to split the sentence
        print "No.", num
        print tup[8]
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        slist = par["sentences"][0]["words"]
        print slist
        temp = []
        for s in slist:
            temp.append(s[0])
        print temp
        ## use stanfordDependencies to split the sentence
        sentences = parser.raw_parse(tup[8])
        s = ""
        for line in sentences:
            for sentence in line:
                s += str(sentence)
        sent = sd.convert_tree(s)
        print sent
        detemp = []
        for t in sent:
            detemp.append(t[1])
        print detemp
        for di, ti in zip(detemp, temp):
            if di == ti:
                continue
            if ((ti == "(" and di == "-LRB-") or
                    (ti == ")" and di == "-RRB-") or
                    (ti == "[" and di == "-LSB-") or
                    (ti == "]" and di == "-RSB-")):
                print "diff in parenthesis"
            else:
                print "{", di, " ,", ti, " }"
def main():
    if len(sys.argv) < 3:
        print("Usage: path_to_genia path_to_conllu")
        return
    genia_path = sys.argv[1]
    conllu_path = sys.argv[2]
    sd = StanfordDependencies.get_instance()
    with open(conllu_path, mode='w', encoding='utf-8') as conllu:
        convert_genia(genia_path, conllu, sd)
def f30_usingCorenlpSplit(dataset_, f30file_):
    ## stanfordDependencies setting
    sd = StanfordDependencies.get_instance(backend="subprocess", version='3.4.1')
    os.environ['STANFORD_PARSER'] = 'stanford-parser-full-2014-08-27/'
    os.environ['STANFORD_MODELS'] = 'stanford-parser-full-2014-08-27/'
    parser = stanford.StanfordParser(
        model_path="stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    ## load dataset
    with open(dataset_) as f:
        data = json.load(f)
    ## record how many sentences split unequally
    sentSplitUnequal = 0
    ## structure to save {word: abc, grammatical relationship: xx}
    res = []
    for num, tupl in enumerate(data):
        if not tupl[3]:
            continue
        newStr = ' '.join(tupl[3])
        print num, newStr
        sentences = parser.raw_parse(newStr)
        s = ""
        for line in sentences:
            for sentence in line:
                s += str(sentence)
        sent = sd.convert_tree(s)
        tem = []        ## a list of the dependency-split words
        temp_gram = []  ## record grammatical relationships
        for t in sent:
            detemp = {}
            detemp['Word'] = t[1]
            detemp['Grammatical relation'] = t[7]
            temp_gram.append(detemp)
            tem.append(t[1])
        if tem != tupl[3]:
            print 'depen split:', tem
            print 'corenlp split:', tupl[3]
            sentSplitUnequal += 1
            ## record index
            print 'unequal sentence No.', num
            print 'No. of sentence:', num, 'begin index:', len(res), 'end index:', len(res) + len(tupl[3])
        else:
            res = res + temp_gram
    with open(f30file_, 'w') as l:
        json.dump(res, l)
    print sentSplitUnequal
def convert_to_sd(scored_parse, sd_converter=None, representation='CCprocessed'):
    """Converts a parse tree to Stanford Dependencies.

    :param scored_parse: ScoredParse object from RerankingParser
    :type scored_parse: ScoredParse
    :param sd_converter: StanfordDependencies converter
    :type sd_converter: StanfordDependencies
    :param representation: one of 'basic', 'collapsed', 'CCprocessed'
    :type representation: str
    :return: list of tokens
    :rtype: list(Token)
    """
    sd_converter = sd_converter or StanfordDependencies.get_instance()
    return sd_converter.convert_tree(str(scored_parse.ptb_parse),
                                     representation=representation)
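# Hedged usage sketch for convert_to_sd(), assuming the bllipparser package
# and its downloadable 'WSJ-PTB3' model; the sentence is illustrative only.
from bllipparser import RerankingParser

rrp = RerankingParser.fetch_and_load('WSJ-PTB3')
best = rrp.parse('The cat sleeps .')[0]  # top-scoring ScoredParse
for token in convert_to_sd(best, representation='basic'):
    print(token.index, token.form, token.deprel, token.head)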
def checkCoreNLPSplit_DependencySplit(file_):
    with open(file_) as f:
        tset = json.load(f)
    ## corenlp setting
    corenlp_dir = "stanford-corenlp-full-2014-08-27/"
    corenlp = StanfordCoreNLP(corenlp_dir)
    ## stanfordDependencies setting
    sd = StanfordDependencies.get_instance(backend="subprocess", version='3.4.1')
    os.environ['STANFORD_PARSER'] = 'stanford-parser-full-2014-08-27/'
    os.environ['STANFORD_MODELS'] = 'stanford-parser-full-2014-08-27/'
    parser = stanford.StanfordParser(
        model_path="stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    for num, tup in enumerate(tset):
        print num
        if not tup[8]:
            continue
        ## use corenlp to split the sentence
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        slist = par["sentences"][0]['words']
        temp = []
        for s in slist:
            temp.append(s[0])
        ## use stanfordDependencies to split the sentence
        sentences = parser.raw_parse(tup[8])
        s = ""
        for line in sentences:
            for sentence in line:
                s += str(sentence)
        sent = sd.convert_tree(s)
        detemp = []
        for t in sent:
            detemp.append(t[1])
        ## check whether the two tokenizations agree
        for di, ti in zip(detemp, temp):
            if di == ti:
                continue
            if ((ti == '(' and di == '-LRB-') or (ti == ')' and di == '-RRB-') or
                    (ti == '[' and di == '-LSB-') or (ti == ']' and di == '-RSB-')):
                print "diff in parenthesis"
            else:
                print "!!!"
                print "{", di, ' ,', ti, " }"
def __init__(self, document_as_string):
    """ Construct a document from a string representation.

    The format must follow the CoNLL format, see
    http://conll.cemantix.org/2012/data.html.

    Args:
        document_as_string (str): A representation of a document in
            the CoNLL format.
    """
    identifier = " ".join(document_as_string.split("\n")[0].split(" ")[2:])
    print(identifier)
    self.document_table = CoNLLDocument.__string_to_table(
        document_as_string)
    in_sentence_ids = [int(i) for i in self.__extract_from_column(2)]
    indexing_start = in_sentence_ids[0]
    if indexing_start != 0:
        logger.warning("Detected " + str(indexing_start) +
                       "-based indexing for tokens in sentences in input, "
                       "transformed to 0-based indexing.")
        in_sentence_ids = [i - indexing_start for i in in_sentence_ids]
    sentence_spans = CoNLLDocument.__extract_sentence_spans(in_sentence_ids)
    temp_tokens = self.__extract_from_column(3)
    temp_pos = self.__extract_from_column(4)
    temp_ner = self.__extract_ner()
    temp_speakers = self.__extract_from_column(9)
    coref = CoNLLDocument.__get_span_to_id(self.__extract_from_column(-1))
    parses = [CoNLLDocument.get_parse(span,
                                      self.__extract_from_column(5),
                                      temp_pos,
                                      temp_tokens)
              for span in sentence_spans]
    sd = StanfordDependencies.get_instance()
    dep_trees = sd.convert_trees(
        [parse.replace("NOPARSE", "S") for parse in parses],
        include_erased=True)
    sentences = []
    for i, span in enumerate(sentence_spans):
        sentences.append((temp_tokens[span.begin:span.end + 1],
                          temp_pos[span.begin:span.end + 1],
                          temp_ner[span.begin:span.end + 1],
                          temp_speakers[span.begin:span.end + 1],
                          parses[i],
                          dep_trees[i]))
    super(CoNLLDocument, self).__init__(identifier, sentences, coref)
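# Hedged mini-example of the batch conversion used above: convert_trees()
# takes a list of PTB bracketings and returns one Token list per tree. The
# toy trees are illustrative only.
import StanfordDependencies

sd = StanfordDependencies.get_instance()
dep_trees = sd.convert_trees(
    ['(S (NP (DT The) (NN cat)) (VP (VBZ sleeps)))',
     '(S (NP (PRP It)) (VP (VBZ purrs)))'],
    include_erased=True)  # keep tokens erased by the conversion, as above
for sent in dep_trees:
    print([(t.form, t.deprel, t.head) for t in sent])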
def __init__(self, document_as_string):
    """ Construct a document from a string representation.

    The format must follow the CoNLL format, see
    http://conll.cemantix.org/2012/data.html.

    Args:
        document_as_string (str): A representation of a document in
            the CoNLL format.
    """
    identifier = " ".join(document_as_string.split("\n")[0].split(" ")[2:])
    self.document_table = CoNLLDocument.__string_to_table(
        document_as_string)
    in_sentence_ids = [int(i) for i in self.__extract_from_column(2)]
    indexing_start = in_sentence_ids[0]
    if indexing_start != 0:
        logger.warning("Detected " + str(indexing_start) +
                       "-based indexing for tokens in sentences in input, "
                       "transformed to 0-based indexing.")
        in_sentence_ids = [i - indexing_start for i in in_sentence_ids]
    sentence_spans = CoNLLDocument.__extract_sentence_spans(in_sentence_ids)
    temp_tokens = self.__extract_from_column(3)
    temp_pos = self.__extract_from_column(4)
    temp_ner = self.__extract_ner()
    temp_speakers = self.__extract_from_column(9)
    coref = CoNLLDocument.__get_span_to_id(self.__extract_from_column(-1))
    parses = [CoNLLDocument.get_parse(span,
                                      self.__extract_from_column(5),
                                      temp_pos,
                                      temp_tokens)
              for span in sentence_spans]
    sd = StanfordDependencies.get_instance()
    dep_trees = sd.convert_trees(
        [parse.replace("NOPARSE", "S") for parse in parses])
    sentences = []
    for i, span in enumerate(sentence_spans):
        sentences.append((temp_tokens[span.begin:span.end + 1],
                          temp_pos[span.begin:span.end + 1],
                          temp_ner[span.begin:span.end + 1],
                          temp_speakers[span.begin:span.end + 1],
                          parses[i],
                          dep_trees[i]))
    super(CoNLLDocument, self).__init__(identifier, sentences, coref)
def __init__(self, representation='CCprocessed', universal=False):
    """
    Args:
        representation (str): Currently supported representations are
            'basic', 'collapsed', 'CCprocessed', and 'collapsedTree'.
        universal (bool): if True, use universal dependencies if they're
            available.
    """
    try:
        import jpype
        self._backend = 'jpype'
    except ImportError:
        self._backend = 'subprocess'
    self._sd = StanfordDependencies.get_instance(backend=self._backend)
    self.representation = representation
    self.universal = universal
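# Hedged sketch of how the jpype-first fallback above plays out at call time;
# 'MyConverter' is a hypothetical stand-in for whatever class owns __init__.
converter = MyConverter(representation='basic', universal=True)
tokens = converter._sd.convert_tree(
    '(S (NP (DT The) (NN dog)) (VP (VBZ barks)))',
    representation=converter.representation,
    universal=converter.universal)
print(converter._backend)  # 'jpype' if installed, otherwise 'subprocess'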
def sd_tokens(self, sd_converter=None, conversion_kwargs=None):
    """Convert this Tree to Stanford Dependencies (requires
    PyStanfordDependencies). Returns a list of
    StanfordDependencies.Token objects. This method caches the
    converted tokens.

    You may optionally specify a StanfordDependencies instance in
    sd_converter and keyword arguments to
    StanfordDependencies.convert_tree as a dictionary in
    conversion_kwargs."""
    if not self._sd_tokens:
        try:
            import StanfordDependencies
        except ImportError:
            raise ImportError("For sd_tokens(), you need to install "
                              "PyStanfordDependencies from PyPI")
        sd_converter = sd_converter or StanfordDependencies.get_instance()
        conversion_kwargs = conversion_kwargs or {}
        self._sd_tokens = sd_converter.convert_tree(str(self),
                                                    **conversion_kwargs)
    return self._sd_tokens
def convert_tree(line, entities, sent_id):
    print ' convert_tree with ' + line
    sd = StanfordDependencies.get_instance(
        jar_filename='/root/xhong/stanford/stanford-corenlp-full-2018-10-05/stanford-corenlp-3.9.2.jar',
        backend='subprocess')
    # ex = '(ROOT(S(NP (PRP$ My) (NN dog))(ADVP (RB also))(VP (VBZ likes)(S(VP (VBG eating)(NP (NN sausage)))))(. .)))'
    # dependencies = sd.convert_tree(ex, debug=True)
    idx = 0
    # convert_tree returns the list of Token objects for this sentence
    dependencies = sd.convert_tree(line, debug=True)
    for token in dependencies:
        print token
        if token.pos in nouns:
            print ' .. is a noun-' + token.pos
            grammatical_role = '-'
            if token.deprel in subject:
                grammatical_role = 'S'
            elif token.deprel in object:
                grammatical_role = 'O'
            else:
                grammatical_role = 'X'
            token_lemma = wnl.lemmatize(token.form, get_POS(token.pos))
            print token.form, token_lemma
            # If this entity has already occurred in the sentence, keep the
            # reference with the highest grammatical role, ranked S > O > X.
            if token_lemma in entities and entities[token_lemma][sent_id]:
                print str(entities[token_lemma][sent_id]) + ' comparing to ' + str(r2i[grammatical_role])
                if entities[token_lemma][sent_id] < r2i[grammatical_role]:
                    entities[token_lemma][sent_id] = r2i[grammatical_role]
            else:
                entities[token_lemma][sent_id] = r2i[grammatical_role]
        # entity -> list of: sentence_number -> grammatical_role
        idx += 1
    # print entities
    return entities, idx
def convert_tree(line, entities):
    print ' convert_tree with ' + line
    sd = StanfordDependencies.get_instance(
        jar_filename=r'C:\SMT\StanfordNLP\stanford-corenlp-full-2013-11-12\stanford-corenlp-full-2013-11-12\stanford-corenlp-3.3.0.jar',
        backend='subprocess')
    # ex = '(ROOT(S(NP (PRP$ My) (NN dog))(ADVP (RB also))(VP (VBZ likes)(S(VP (VBG eating)(NP (NN sausage)))))(. .)))'
    # dependencies = sd.convert_tree(ex, debug=True)
    idx = 0
    # convert_tree returns the list of Token objects for this sentence
    dependencies = sd.convert_tree(line, debug=True)
    for token in dependencies:
        print token
        if token.pos in nouns:
            print ' .. is a noun-' + token.pos
            grammatical_role = '-'
            if token.deprel in subject:
                grammatical_role = 'S'
            elif token.deprel in object:
                grammatical_role = 'O'
            else:
                grammatical_role = 'X'
            # If this entity has already occurred in the sentence, keep the
            # reference with the highest grammatical role, ranked S > O > X.
            if token.lemma in entities and entities[token.lemma][idx]:
                print str(entities[token.lemma][idx]) + ' comparing to ' + str(r2i[grammatical_role])
                if entities[token.lemma][idx] < r2i[grammatical_role]:
                    entities[token.lemma][idx] = r2i[grammatical_role]
            else:
                entities[token.lemma][idx] = r2i[grammatical_role]
        # entity -> list of: sentence_number -> grammatical_role
        idx += 1
    return entities, idx
def f30(file_, wfile_):
    ## stanfordDependencies setting
    sd = StanfordDependencies.get_instance(backend="subprocess", version='3.4.1')
    os.environ['STANFORD_PARSER'] = 'stanford-parser-full-2014-08-27/'
    os.environ['STANFORD_MODELS'] = 'stanford-parser-full-2014-08-27/'
    parser = stanford.StanfordParser(
        model_path="stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    resDepent = []
    ## load dataset
    with open(file_) as t:
        dataset = json.load(t)
    for num, tup in enumerate(dataset):
        print 'No.' + str(num) + ": " + tup[8]
        if not tup[8]:
            continue
        ## use stanfordDependencies to split the sentence
        sentences = parser.raw_parse(tup[8])
        s = ""
        for line in sentences:
            for sentence in line:
                s += str(sentence)
        sent = sd.convert_tree(s)
        for t in sent:
            detemp = {}
            detemp['Word'] = t[1]
            detemp['Grammatical relation'] = t[7]
            print detemp
            resDepent.append(detemp)
    with open(wfile_, 'w') as u:
        json.dump(resDepent, u)
    print len(resDepent)
# feature prepare
# f19; fill entail set
stop = stopwords.words('english')
# entailLst = entailfeaturePrepare('/Volumes/Seagate Backup Plus Drive/npov_paper_data/reverb_local_global/Resource0812/reverb_local_clsf_all.txt')
# HedgeLst = filter(None, [line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/bias_related_lexicons/hedges_hyland2005.txt', 'r', 'utf-8') if ('#' not in line)])
# FactiveLst = filter(None, [line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/bias_related_lexicons/factives_hooper1975.txt', 'r', 'utf-8') if ('#' not in line)])
# AssertiveLst = filter(None, [line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/bias_related_lexicons/assertives_hooper1975.txt', 'r', 'utf-8') if ('#' not in line)])
# ImplicativeLst = filter(None, [line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/bias_related_lexicons/implicatives_karttunen1971.txt', 'r', 'utf-8') if ('#' not in line)])
# ReportLst = filter(None, [line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/bias_related_lexicons/report_verbs.txt', 'r', 'utf-8') if ('#' not in line)])
# StrongSubjLst = subjectivePrepare('/Volumes/Seagate Backup Plus Drive/npov_paper_data/subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff', 'strongsubj')
# WeakSubjLst = subjectivePrepare('/Volumes/Seagate Backup Plus Drive/npov_paper_data/subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff', 'weaksubj')
# PolarityDict = polarityPrepare('/Volumes/Seagate Backup Plus Drive/npov_paper_data/subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff')
# PosiveLst = filter(None, [line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/opinion-lexicon-English/positive-words.txt', 'r', 'utf-8') if (';' not in line)])
# NegativeLst = filter(None, [line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/opinion-lexicon-English/negative-words.txt', 'r') if (';' not in line)])
sd = StanfordDependencies.get_instance(backend="subprocess", version='3.4.1')
os.environ['STANFORD_PARSER'] = '/Users/wxbks/Downloads/stanford-parser-full-2014-08-27/'
os.environ['STANFORD_MODELS'] = '/Users/wxbks/Downloads/stanford-parser-full-2014-08-27/'
parser = stanford.StanfordParser(model_path="/Users/wxbks/Downloads/stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
# os.environ['STANFORD_PARSER'] = '/Users/wxbks/Downloads/stanford-parser-2012-11-12/'
# os.environ['STANFORD_MODELS'] = '/Users/wxbks/Downloads/stanford-parser-2012-11-12/'
# parser = stanford.StanfordParser()
# parser = stanford.StanfordParser(model_path="/Users/wxbks/Downloads/stanford-parser-2012-11-12/stanford-parser-2.0.4-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
# npovdict = {}
# with open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/npov-edits/npov_words_lemma_2_downcase.json') as t:
#     npovdict = json.load(t)
# print HedgeLst
freqArtDict = {}
labels = []
def __init__(self, model_name=None):
    if model_name is None:
        model_name = 'GENIA+PubMed'
    self.model_name = model_name
    self.sd = StanfordDependencies.get_instance()
def analyser():
    sd = StanfordDependencies.get_instance(backend='subprocess')
    sent = sd.convert_tree(tree_maker('Hari was a great dancer.'))
    for token in sent:
        print(token)
def getDependency(dependencyInput):
    # Parse the sentence into Penn Treebank bracketed format.
    parsedtext = rrp.simple_parse(dependencyInput)
    # Convert the constituency tree to Stanford Dependencies.
    sd = StanfordDependencies.get_instance(backend='subprocess')
    sent = sd.convert_tree(parsedtext)
    return sent
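# Hedged usage sketch: getDependency() assumes a module-level 'rrp' already
# loaded via bllipparser, e.g. (model name illustrative):
#
#   from bllipparser import RerankingParser
#   rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=True)
#
# after which each returned Token exposes index/form/head/deprel:
for token in getDependency('The quick brown fox jumps .'):
    print(token.index, token.form, token.head, token.deprel)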
# test = f_test.readlines()
# cnt = Counter()
# relations = IndexDict()
# vol = IndexDict()
# os.environ['STANFORD_PARSER'] = STANFORD_DIR + "stanford-parser.jar"
# os.environ['STANFORD_MODELS'] = STANFORD_DIR + "stanford-parser-3.3.0-models.jar"
# parser = stanford.StanfordParser(model_path=STANFORD_DIR + "englishPCFG.ser.gz")
# st = (" ".join(nltk.word_tokenize("Hello, My (name) is Melroy."))).replace("(", "-LRB-").replace(")", "-RRB-")
# sentences = parse_tokenized_sentences(parser, [st])
# print st
# print sentences
sd = StanfordDependencies.get_instance(version='3.3.0')
# print [sd.convert_tree(s._pprint_flat(nodesep='', parens='()', quotes=False)) for s in sentences]
# arg1s = []
# arg2s = []
# ind = 0
# for line in train:
#     ind += 1
#     print ind
#     args = line.split("||||")
#     arg1s.append(((" ".join([w for w in nltk.word_tokenize(args[0].strip()) if len(w) > 0])).strip()).replace("(", "-LRB-").replace(")", "-RRB-").encode('utf-8'))
#     arg2s.append(((" ".join([w for w in nltk.word_tokenize(args[1].strip()) if len(w) > 0])).strip()).replace("(", "-LRB-").replace(")", "-RRB-").encode('utf-8'))
# print len(arg1s), len(arg2s)
def _main(args):
    if args.deps:
        import StanfordDependencies
        dep = StanfordDependencies.get_instance(backend='subprocess')
    delta = args.delta
    assert 0 <= delta <= 1
    T = {}
    P = {}
    check_llh = 1
    color = {name: c for name, c in zip(sorted(P), 'rgbym' * len(P))}
    marker = {name: 'o' for name in P}
    if args.experiment not in ('grammars', ):
        # benchmark default parser
        if args.grammar == 'medium':
            grammar_file = 'data/medium'
            g = Grammar.load('data/medium')
        elif args.grammar == 'big':
            grammar_file = 'data/bubs/wsj_6'
            chomsky = False
            g = Grammar.load(grammar_file)
    if args.experiment == 'default-parser':
        P['lchild'] = Parser(leftchild, g, chomsky=0)
    elif args.experiment == 'grammar-loops':
        # Experiment: grammar loops
        P['lcbptr'] = Parser(leftchild_bp, g, chomsky=chomsky)
        P['lchild'] = Parser(leftchild, g, chomsky=chomsky)
        #P['x-prod'] = Parser(xprod, g, chomsky=chomsky)
        #P['agenda'] = AgendaParser(g, chomsky=chomsky)
    elif args.experiment == 'grammars':
        #P, color, marker = _leftchild_v_dense_yj_on_many_grammars()
        P, color, marker = _many_grammars()
        check_llh = False
    else:
        raise ValueError('Fail to recognize experiment %r' % args.experiment)
    T = {x: Timer(x) for x in P}
    overall = []
    errors = []
    examples = ptb(args.fold, minlength=3, maxlength=args.maxlength,
                   n=args.examples)
    if 1:
        examples = list(examples)
        np.random.shuffle(examples)
    _evalb_gold = {}
    _evalb_pred = {}
    for k, p in enumerate(P):
        _evalb_gold[p] = open('tmp/evalb-%s.gold' % k, 'wb')
        _evalb_pred[p] = open('tmp/evalb-%s.pred' % k, 'wb')
    if args.policy:
        from ldp.prune.features import Features
        theta = np.load(args.policy)['coef']
        policy_grammar = Grammar.load(
            'data/bubs/wsj_6')  # FIXME: shouldn't be hardcoded!
        F = Features(policy_grammar, nfeatures=2**22)
    for i, (s, t) in enumerate(examples):
        print
        print green % 'Example: %s, length: %s' % (i, len(s.split()))
        print yellow % s
        e = Example(s, grammar=None, gold=t)
        sentence = e.tokens
        N = e.N
        if args.policy:
            e.tokens = policy_grammar.encode_sentence(e.sentence.split())
            keep = F.mask(e, theta)
        else:
            # don't prune anything
            keep = np.ones((N, N + 1), dtype=np.int)
            for x in e.nodes:
                keep[x] = np.random.uniform(0, 1) <= delta
            for x in e.gold_spans:
                keep[x] = 1
        data = []
        #ugold = Tree.fromstring(e.gold_unbinarized)
        if args.deps:
            dep_gold = dep.convert_tree(
                e.gold_unbinarized,
                universal=0)  # TODO: include function tags???
            dep_unlabel_gold = {(z.index, z.head) for z in dep_gold}
            dep_label_gold = {(z.index, z.deprel, z.head) for z in dep_gold}
        for parser in sorted(P):
            b4 = time()
            with T[parser]:
                state = P[parser](e, keep)
            wallclock = time() - b4
            s = state.likelihood
            d = state.derivation
            pops = state.pops
            pushes = state.pushes
            ucoarse = P[parser].decode(e, d)
            # print
            # print parser
            # print ucoarse
            # write gold and predicted trees to files so we can call evalb
            print >> _evalb_gold[parser], e.gold_unbinarized
            print >> _evalb_pred[parser], oneline(ucoarse)
            GW, G, W = evalb_unofficial(e.gold_unbinarized, binarize(ucoarse))
            h = cgw_f(GW, G, W)
            # h = evalb(e.gold_unbinarized, ucoarse)
            row = {
                'name': parser,
                'llh': s,
                'sentence': sentence,
                'N': N,
                #'tree': tree,
                'evalb': h,
                'GotWant': GW,
                'Got': G,
                'Want': W,
                'pops': pops,
                'pushes': pushes,
                'wallclock': wallclock
            }
            if args.deps:
                # TODO: include function tags? What is the correct way to get
                # target trees?
                dep_parse = dep.convert_tree(oneline(ucoarse), universal=0)
                dep_label = {(z.index, z.deprel, z.head) for z in dep_parse}
                dep_unlabel = {(z.index, z.head) for z in dep_parse}
                # TODO: Use the official eval.pl script from the CoNLL task.
                UAS = len(dep_unlabel & dep_unlabel_gold) / e.N
                LAS = len(dep_label & dep_label_gold) / e.N
                row['LAS'] = LAS
                row['UAS'] = UAS
            data.append(row)
            overall.append(row)
    df = DataFrame(overall).groupby('name').mean()
    #df['wallclock'] = sum_df.wallclock  # use total time
    df.sort_values('wallclock', inplace=1)
    df['speedup'] = df.wallclock.max() / df.wallclock
    df['wps'] = df['N'] / df['wallclock']  # ok to use avg instead of sum
    # Determine which columns to display given command-line options.
    show_cols = [
        'evalb_corpus', 'wallclock', 'wps', 'speedup', 'pushes', 'pops',
        'LAS', 'UAS'
    ]
    if len(P) == 1:
        show_cols.remove('speedup')
    if not args.deps:
        show_cols.remove('LAS')
        show_cols.remove('UAS')

    def foo(df):
        "Add column"
        s = DataFrame(overall).groupby(
            'name').sum()  # create separate sum dataframe.
        P = s.GotWant / s.Got
        R = s.GotWant / s.Want
        df['evalb_corpus'] = 2 * P * R / (P + R)
        df['evalb_avg'] = df.pop('evalb')  # get rid of old column.

    foo(df)
    print df[show_cols]
    if args.pareto:
        accuracy_name = 'evalb'
        with axman('speed-accuracy ($\delta= %g$)' % delta) as ax:
            df = DataFrame(overall).groupby('name').mean()
            runtime = df.wallclock / df.wallclock.max()
            for name, x, y in zip(df.index, runtime, df[accuracy_name]):
                c = color[name]
                ax.scatter([x], [y], alpha=0.75, lw=0, s=50, c=c, label=name,
                           marker=marker[name])
            ax.legend(loc=4)
            ax.set_xlim(-0.1, 1.1)
            ax.set_ylim(0, 1)
            ax.grid(True)
            ax.set_xlabel('runtime (relative to slowest)')
            ax.set_ylabel('accuracy (%s)' % accuracy_name)
            show_frontier(runtime, df[accuracy_name], ax=ax)
    if args.bylength:
        # Break down runtime differences of parsers by length.
        bylength = {name: [] for name in T}
        for length, df in DataFrame(overall).groupby('N'):
            df = df.groupby('name').mean()
            for name, v in df.wallclock.iteritems():
                bylength[name].append([length, v])
        with axman('benchmark') as ax:
            for name, d in sorted(bylength.items()):
                d.sort()
                xs, ys = np.array(d).T
                ax.plot(xs, ys, alpha=0.5, c=color[name], label=name)
                ax.scatter(xs, ys, alpha=0.5, lw=1, c=color[name])
            ax.legend(loc=2)
            ax.set_xlabel('sentence length')
            ax.set_ylabel('seconds / sentence')
    if check_llh:
        # Only run this test when it makes sense, e.g., when all parses come
        # from the same grammar.
        s0 = data[0]['llh']
        for x in data:
            s = x['llh']
            name = x['name']
            if abs(s0 - s) > 1e-10:
                errors.append({'parser': name, 'sentence': sentence})
                print '[%s]: name: %s expect: %g got: %g' % (red % 'error',
                                                             name, s0, s)
    Timer.compare_many(*T.values(), verbose=False)
    if errors:
        print red % 'errors: %s' % len(errors)
    print
    print green % '==============================='
    print green % 'DONE!'
    print
    print 'EVALB-unofficial:'
    print 2 * (df.GotWant / df.Got * df.GotWant / df.Want) / (
        df.GotWant / df.Got + df.GotWant / df.Want)
    print
    print 'EVALB-official:'
    import os
    for k, p in enumerate(P):
        _evalb_pred[p].close()
        _evalb_gold[p].close()
        out = 'tmp/evalb-%s.out' % k
        os.system('./bin/EVALB/evalb %s %s > %s' %
                  (_evalb_gold[p].name, _evalb_pred[p].name, out))
        with file(out) as f:
            for x in f:
                if x.startswith('Bracketing FMeasure'):
                    print p, float(x.strip().split()[-1])
                    break  # use the first one, which is for all lengths
""" source from: https://pypi.org/project/PyStanfordDependencies/ https://stackoverflow.com/questions/13883277/stanford-parser-and-nltk """ import StanfordDependencies, os.path, sys from nltk.parse.stanford import StanfordParser parser = StanfordParser( ) #be sure to have set environmental path to englishPCFG.ser.gz sd = StanfordDependencies.get_instance(backend='subprocess') def getTypeD(input): 'returns our the string with the dependency tags' sS = "" myList = list(parser.raw_parse(input)) for l in myList: sS += str(l) return sS def createDepData(tag_sent): 'method from the PyStanfordDependencies 0.3.1 package' data = sd.convert_tree(tag_sent) return data
import logging
import os
from collections import defaultdict
from glob import glob

import numpy as np
import StanfordDependencies
from docopt import docopt
from joblib import Parallel, delayed
from pycorenlp import StanfordCoreNLP
from tqdm import tqdm

logger = logging.getLogger(__name__)

class_paths = os.environ['CLASSPATH'].split(':')
corenlp_path = list(
    filter(lambda p: p.endswith('stanford-corenlp-3.9.2.jar'),
           class_paths))[0]
sd = StanfordDependencies.get_instance(jar_filename=corenlp_path,
                                       backend='jpype')

noun_tags = ['NNP', 'NP', 'NNS', 'NN', 'N', 'NE']
subject_tags = ['csubj', 'csubjpass', 'subj', 'nsubj', 'nsubjpass']
object_tags = ['pobj', 'dobj', 'iobj']
idx2role = ['-', 'X', 'O', 'S']
role2idx = {role: idx for idx, role in enumerate(idx2role)}


def get_deps(sentences):
    entities = defaultdict(lambda: defaultdict(dict))
    parse_trees = []
    for sentence in sentences:
        parse_trees.append(sentence['parse'])
import StanfordDependencies
import simplejson
import sys

dp = StanfordDependencies.get_instance(backend='subprocess')


def to_nltk_tree(sent):
    """Transforms a sentence to an NLTK tree.

    :param sent: (str) a bracketed parse
    :return: (nltk.Tree) an NLTK ``Tree'' instance
    """
    import nltk
    return nltk.Tree.fromstring(sent)


def to_cnf(sent):
    """Transforms a parsed sentence to a parsed sentence in CNF.

    :param sent: (str) a bracketed parse
    :return: (str) a bracketed parse in Chomsky Normal Form
    """
    import nltk
    tree = nltk.Tree.fromstring(sent)
    tree.chomsky_normal_form()
    return str(tree)


def to_deps(sent):
    """Transforms a parsed sentence to raw universal dependencies.

    :param sent: (str) a bracketed parse
    :return: (list[Token]) a list of Tokens representing a dependency tree
    """
    return dp.convert_tree(sent, universal=True)
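# Hedged usage sketch of the helpers above on a toy bracketed parse:
if __name__ == '__main__':
    parse = '(S (NP (DT The) (NN cat)) (VP (VBZ sleeps)))'
    print(to_cnf(parse))          # same tree, binarized to CNF
    for token in to_deps(parse):  # Token objects with form/head/deprel
        print(token.form, token.head, token.deprel)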
def main():
    """
    Imports a data set from the Penn Treebank. Splits the data set into
    training and test set. Calls the functions to learn a grammar and to
    perform parsing and evaluation.
    """
    fileids = treebank.fileids()
    # This file is removed for being no complete sentence:
    fileids.remove('wsj_0056.mrg')
    sd = StanfordDependencies.get_instance(backend='subprocess')
    # Toy sentence, not extracted from the Penn Treebank. No gold standard is
    # available for this sentence, therefore evaluation must be done manually.
    # In order to allow a better visual representation of the chart in the
    # attached report, the period at the end of the sentence is omitted.
    simple_sent = 'The cat sleeps in the garden'
    os.system('clear')
    while True:
        answer_1 = input('\nPlease select one of the following actions (1, 2):\n\n'
                         '1 - Use the same splitting in training and test set that was used '
                         'for the evaluation illustrated in the report. If you choose this '
                         'option, the parser will use an already existing grammar.\n\n'
                         '2 - Randomly split the data set into training and test set and '
                         'perform a new evaluation. If you choose this option, the parser will '
                         'have to learn a new grammar based on the training set. '
                         'This might take a few minutes.\n')
        if answer_1 in ['1', '2']:
            break
    if answer_1 == '1':
        test_set = ['wsj_0004.mrg', 'wsj_0008.mrg', 'wsj_0014.mrg', 'wsj_0050.mrg',
                    'wsj_0063.mrg', 'wsj_0065.mrg', 'wsj_0070.mrg', 'wsj_0073.mrg',
                    'wsj_0089.mrg', 'wsj_0096.mrg', 'wsj_0099.mrg', 'wsj_0118.mrg',
                    'wsj_0120.mrg', 'wsj_0137.mrg', 'wsj_0144.mrg', 'wsj_0165.mrg',
                    'wsj_0171.mrg', 'wsj_0181.mrg', 'wsj_0182.mrg', 'wsj_0199.mrg']
        training_set = [fileid for fileid in fileids if fileid not in test_set]
        print_test_training_set(test_set, training_set)
        with open('grammar_reduced.json', 'rb') as data_file:
            grammar = json.load(data_file)
    elif answer_1 == '2':
        test_set, training_set = split_data_set(fileids)
        print_test_training_set(test_set, training_set)
        while True:
            answer_2 = input(
                "\nPlease press 't' to proceed with the training.\n")
            if answer_2 == 't':
                break
        tic = time.time()
        # Call function to learn a grammar from the Penn Treebank
        grammar = learn_grammar(training_set, sd)
        toc = time.time() - tic
        print('\nIt took {0:.4f} seconds to learn a grammar'.format(toc))
    while True:
        answer_3 = input("\nPlease press 'p' to proceed with the parsing.\n")
        if answer_3 == 'p':
            break
    valid = []
    test_sentences = []
    for i, fileid in enumerate(test_set):
        # Call function to convert the conll format into a string
        input_sent_test = get_sentence(fileid, sd)
        test_sentences.append(input_sent_test)
        print()
        print('Converting sentence n. {} into string form...'.format(i + 1))
        valid.append(str(i + 1))
    quit_program = 'no'
    while True:
        print('\n\nThis is a list of the sentences in the test set:\n')
        for i, (sentence, fileid) in enumerate(zip(test_sentences, test_set)):
            print()
            print(str(i + 1) + ') id: ' + fileid +
                  '\tSentence length: ' + str(len(sentence.split())))
            print(sentence)
        print()
        print('-' * 40)
        print()
        while True:
            to_parse = input('\nPlease select the number of the sentence to be parsed.\n'
                             "Press 't' to parse the toy sentence that is described in the report.\n"
                             "Press 's' to type your own sentence.\n"
                             "Press 'q' to quit.\n")
            if to_parse in valid or to_parse in ['t', 's']:
                break
            elif to_parse == 'q':
                quit_program = 'yes'
                break
        if quit_program == 'yes':
            break
        if to_parse == 't':
            test_sent = simple_sent
        elif to_parse == 's':
            test_sent = input(
                'Please type the sentence you would like to parse.\n')
        else:
            test_sent = test_sentences[int(to_parse) - 1]
        # Call function to perform the parsing of the selected sentence
        dict_parse_final = parse_sentence(grammar, test_sent)
        if dict_parse_final == 0:
            continue
        if to_parse in ['t', 's']:
            print('\nNo gold standard available for the present sentence\n')
            while True:
                answer_9 = input(
                    "\nPlease press 's' to parse another sentence.\n")
                if answer_9 == 's':
                    break
        else:
            print()
            print('*' * 50)
            while True:
                answer_4 = input(
                    "\nPlease press 'e' to proceed with the evaluation.\n")
                if answer_4 == 'e':
                    ref_sentence = gold_standard(test_set[int(to_parse) - 1], sd)
                    # Call function to perform the evaluation
                    evaluation(ref_sentence, dict_parse_final)
                    break
def __init__(self):
    self.rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=True)
    self.sd = StanfordDependencies.get_instance(backend='subprocess')
                    action='store_false',
                    help="Don't use Universal Dependencies (if available)")
parser.add_argument('-V', '--corenlp-version',
                    dest='version', metavar='VERSION',
                    help="Version of CoreNLP (will use default if not set)")
parser.add_argument('-d', '--debug', action='store_true',
                    help="Enable debugging (subprocess only)")
args = parser.parse_args()
if args.debug:
    print('Args:', args)
conversion_args = dict(representation=args.representation,
                       universal=args.universal)
if args.debug:
    if args.backend == 'subprocess':
        conversion_args['debug'] = True
    else:
        print("Warning: Can only set debug flag in subprocess backend.",
              file=sys.stderr)
sd = StanfordDependencies.get_instance(backend=args.backend,
                                       version=args.version)
if not args.filenames:  # interactive mode
    print("Ready to read and convert trees (one per line)")
for tree in fileinput.input(args.filenames):
    print('Tree: %r' % tree)
    tokens = sd.convert_tree(tree, **conversion_args)
    for token in tokens:
        print(token)
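# Hedged example invocation for the CLI fragment above; the script name and
# the exact spellings of the --backend/--representation flags are assumptions
# inferred from the args.* attributes referenced in the code:
#
#   echo '(S (NP (DT The) (NN cat)) (VP (VBZ sleeps)))' \
#       | python convert_tree_cli.py --backend subprocess --representation basic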
def head_related(query, candidate):
    lmt = WordNetLemmatizer()
    sd = StanfordDependencies.get_instance(backend='subprocess')
    a = Annotator()
    synTree = a.getAnnotations(query)['syntax_tree']
    tokens = sd.convert_tree(synTree)
    queue = []
    for i, token in enumerate(tokens):
        if token[6] == 0:
            queue.append((i + 1, token))
    qHeadWords = []
    while queue != []:
        s = queue[0]
        queue.remove(s)
        flag = 0
        #print s[1][1], s[0]
        for i, word in enumerate(tokens):
            if word[6] == s[0]:
                flag = 1
                queue.append((i + 1, word))
        if flag == 1:
            qHeadWords.append(lmt.lemmatize(s[1][1], 'v'))
    synTree = a.getAnnotations(candidate)['syntax_tree']
    tokens = sd.convert_tree(synTree)
    queue = []
    for i, token in enumerate(tokens):
        if token[6] == 0:
            queue.append((i + 1, token))
    cHeadWords = []
    while queue != []:
        s = queue[0]
        queue.remove(s)
        flag = 0
        #print s[1][1], s[0]
        for i, word in enumerate(tokens):
            if word[6] == s[0]:
                flag = 1
                queue.append((i + 1, word))
        if flag == 1:
            cHeadWords.append(lmt.lemmatize(s[1][1], 'v'))
    queryRel = []
    for word in qHeadWords:
        for i, j in enumerate(wn.synsets(word)):
            for l in j.lemmas():
                queryRel.append(l.name())
            #queryRel.append(l.lemma_names() for l in j.hypernyms())
            for l in j.hypernyms():
                for k in l.lemma_names():
                    queryRel.append(k)
            for l in j.hyponyms():
                for k in l.lemma_names():
                    queryRel.append(k)
    candidateRel = []
    for word in cHeadWords:
        for i, j in enumerate(wn.synsets(word)):
            for l in j.lemmas():
                candidateRel.append(l.name())
            for l in j.hypernyms():
                for k in l.lemma_names():
                    candidateRel.append(k)
            for l in j.hyponyms():
                for k in l.lemma_names():
                    candidateRel.append(k)
    exactHeadScore = 0
    count = 0
    for j in cHeadWords:
        count = count + 1
        for i in qHeadWords:
            if i == j:
                exactHeadScore = exactHeadScore + 1
    try:
        # use true division so the score is a fraction, not floored to 0/1
        exactHeadScore = float(exactHeadScore) / count
    except ZeroDivisionError:
        exactHeadScore = 0
    relHeadScore = 0
    count = 0
    for j in candidateRel:
        count = count + 1
        if j in queryRel:
            relHeadScore = relHeadScore + 1
    try:
        relHeadScore = float(relHeadScore) / count
    except ZeroDivisionError:
        relHeadScore = 0
    return relHeadScore, exactHeadScore
# feature prepare
# f19; fill entail set
stop = stopwords.words('english')
# entailLst = entailfeaturePrepare('/Volumes/Seagate Backup Plus Drive/npov_paper_data/reverb_local_global/Resource0812/reverb_local_clsf_all.txt')
HedgeLst = filter(None, [line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/bias_related_lexicons/hedges_hyland2005.txt', 'r', 'utf-8') if ('#' not in line)])
# FactiveLst = filter(None, [line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/bias_related_lexicons/factives_hooper1975.txt', 'r', 'utf-8') if ('#' not in line)])
# AssertiveLst = filter(None, [line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/bias_related_lexicons/assertives_hooper1975.txt', 'r', 'utf-8') if ('#' not in line)])
# ImplicativeLst = filter(None, [line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/bias_related_lexicons/implicatives_karttunen1971.txt', 'r', 'utf-8') if ('#' not in line)])
# ReportLst = filter(None, [line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/bias_related_lexicons/report_verbs.txt', 'r', 'utf-8') if ('#' not in line)])
# StrongSubjLst = subjectivePrepare('/Volumes/Seagate Backup Plus Drive/npov_paper_data/subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff', 'strongsubj')
# WeakSubjLst = subjectivePrepare('/Volumes/Seagate Backup Plus Drive/npov_paper_data/subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff', 'weaksubj')
# PolarityDict = polarityPrepare('/Volumes/Seagate Backup Plus Drive/npov_paper_data/subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff')
# PosiveLst = filter(None, [line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/opinion-lexicon-English/positive-words.txt', 'r', 'utf-8') if (';' not in line)])
# NegativeLst = filter(None, [line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/opinion-lexicon-English/negative-words.txt', 'r') if (';' not in line)])
sd = StanfordDependencies.get_instance(backend="subprocess")
os.environ['STANFORD_PARSER'] = '/Users/wxbks/Downloads/stanford-parser-full-2014-08-27/'
os.environ['STANFORD_MODELS'] = '/Users/wxbks/Downloads/stanford-parser-full-2014-08-27/'
parser = stanford.StanfordParser(model_path="/Users/wxbks/Downloads/stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
npovdict = {}
labels = []
features = []
line_num = 0
# npovlist = []
start = timeit.timeit()
for line in gram5_train:
    line = line.decode('utf8')
    line = line.rstrip('\n')
    nline = line.split('\t')
def __init__(self, CACHE):
    Cached.__init__(self, CACHE)
    self.sd = StanfordDependencies.get_instance(jar_filename=STANFORD_JAR,
                                                backend='jpype')
st = StanfordNERTagger(
    '../StanfordNLP/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
    '../StanfordNLP/stanford-ner-2018-10-16/stanford-ner.jar',
    encoding='utf-8')

# for deprel
os.environ['STANFORD_PARSER'] = '../StanfordNLP/stanford-parser-full-2018-10-17/stanford-parser.jar'
os.environ['STANFORD_MODELS'] = '../StanfordNLP/stanford-parser-full-2018-10-17/stanford-parser-3.9.2-models.jar'
parser = stanford.StanfordParser(
    model_path="../StanfordNLP/stanford-parser-full-2018-10-17/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
sd = StanfordDependencies.get_instance(
    jar_filename='../StanfordNLP/stanford-parser-full-2018-10-17/stanford-parser.jar')

# for read_data
train_path = './SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT'
test_path = './SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT'

# for write_json
train_json = './ToTacredResult/train.json'
test_json = './ToTacredResult/test.json'


def most_common(words):
    user_counter = Counter(words)
    if len(user_counter.most_common(len(words))) == 1:
        return user_counter.most_common(1), False
    else: