def args_syn_wn(self, lemmas_t, lemmas_h, score, id, type):
    """Build the >arg_relsyn predicate for one argument pair.

    Both lemma strings are cleaned (punctuation, then stop words),
    their symmetric differences are expanded with WordNet synonyms,
    and the pair is flagged as related (result = 1) when the two
    expansions overlap (cosine > 0) or when one expansion is empty.
    """
    result = 0
    wn = WNTools()
    cleaner = RTETools()
    # Normalise the T side: drop punctuation, then stop words.
    cleaner.set_tokens(lemmas_t.split())
    cleaner.quit_punct()
    lemmas_t = cleaner.quit_sw()
    # Same normalisation for the H side.
    cleaner.set_tokens(lemmas_h.split())
    cleaner.quit_punct()
    lemmas_h = cleaner.quit_sw()
    (diff_ht, diff_th) = self.diff(lemmas_t, lemmas_h)
    expand_ht = wn.expand_bow_syns(list(diff_ht))
    expand_th = wn.expand_bow_syns(list(diff_th))
    if expand_ht and expand_th:
        overlap = SetMetrics(expand_ht, expand_th)
        if overlap.cosine() > 0:
            result = 1
    else:
        # NOTE(review): an empty expanded difference is treated as
        # trivially related -- confirm this is the intended reading.
        result = 1
    return '>arg_relsyn\n%s %s %s' % (id, type, result)
def args_syn_wn(self, lemmas_t, lemmas_h, score, id, type):
    """Build the ArgRelSyn(...) predicate for one argument pair.

    Cleans both lemma strings (punctuation, then stop words), expands
    the symmetric differences with WordNet synonyms, and reports the
    pair as related (1) when the expansions overlap (cosine > 0) or
    when one expansion is empty.
    """
    result = 0
    wn = WNTools()
    cleaner = RTETools()
    # Clean the T-side bag of lemmas.
    cleaner.set_tokens(lemmas_t.split())
    cleaner.quit_punct()
    lemmas_t = cleaner.quit_sw()
    # Clean the H-side bag of lemmas.
    cleaner.set_tokens(lemmas_h.split())
    cleaner.quit_punct()
    lemmas_h = cleaner.quit_sw()
    (diff_ht, diff_th) = self.diff(lemmas_t, lemmas_h)
    expand_ht = wn.expand_bow_syns(list(diff_ht))
    expand_th = wn.expand_bow_syns(list(diff_th))
    if expand_ht and expand_th:
        overlap = SetMetrics(expand_ht, expand_th)
        if overlap.cosine() > 0:
            result = 1
    else:
        # NOTE(review): nothing left to compare is treated as related.
        result = 1
    return 'ArgRelSyn(%s, %s, %s)' % (id, type, result)
def arg_proc(self, id, point, sep):
    """Emit per-token predicates (>token_word/lemma/pos/lin/wn) for every
    SRL argument of every verb pair in *point*, plus one >arg predicate
    per argument linking it to its verb pair and sentence id.
    """
    self.a_predicates = []
    n = 3  # number of WordNet hypernym-tree levels to keep per tree
    if 'verbs' not in point:
        return self.a_predicates
    for i, verb in point['verbs'].iteritems():
        i = '%s.%s' % (id, i)
        (vt, vh) = verb['tokens']
        if 'ARG' not in verb:
            continue
        for type, arg in verb['ARG'].items():
            # Tokenise every annotation layer for both sides.
            w_t = arg['wordform-t'].split()
            w_h = arg['wordform-h'].split()
            l_t = arg['lemma-t'].split()
            l_h = arg['lemma-h'].split()
            p_t = arg['pos-t'].split()
            p_h = arg['pos-h'].split()
            c_t = arg['chunk-t'].split()
            c_h = arg['chunk-h'].split()
            n_t = arg['ne-t'].split()
            n_h = arg['ne-h'].split()
            score = arg['score']
            # Append the H tokens after the T tokens (parallel layers).
            w_t.extend(w_h)
            l_t.extend(l_h)
            p_t.extend(p_h)
            # TODO quit stop words
            for j, word in enumerate(w_t):
                self.a_predicates.append(
                    '>token_word\n"%s" %s "%s"' % (type, j, self.clean_str(word)))
                self.a_predicates.append(
                    '>token_lemma\n"%s" %s "%s"' % (type, j, self.clean_str(l_t[j])))
                self.a_predicates.append(
                    '>token_pos\n"%s" %s "%s"' % (type, j, self.clean_str(p_t[j])))
                # Lin-thesaurus expansion of the surface word.
                lin = Lin()
                sim_words = lin.expand_w(word)
                # WordNet hypernym trees for (lemma, pos).
                wn = WNTools()
                hyps = wn.get_mfs_hypernyms((l_t[j], p_t[j]))
                for idx, sim_word in enumerate(sim_words):
                    self.a_predicates.append(
                        '>token_lin\n"%s" %s "%s"' % (type, idx, self.clean_str(sim_word)))
                for key, tree in hyps:
                    for depth, category in enumerate(tree[:n]):
                        self.a_predicates.append(
                            '>token_wn\n"%s" %s "%s"' % (type, depth, self.clean_str(category)))
            self.a_predicates.append(
                '>arg\n"%s" "%s%s%s" %s' % (type, self.clean_str(vt), sep,
                                            self.clean_str(vh), id))
    return self.a_predicates
def arg_proc(self, id, point, sep):
    """Emit Token(...) predicates for every SRL-argument token (surface
    form, lemma, POS, Lin expansions, up to three WordNet hypernym
    levels) plus one ARG(...) predicate per argument.
    """
    self.a_predicates = []
    if 'verbs' not in point:
        return self.a_predicates
    for i, verb in point['verbs'].iteritems():
        i = '%s.%s' % (id, i)
        (vt, vh) = verb['tokens']
        if 'ARG' not in verb:
            continue
        for type, arg in verb['ARG'].items():
            # Tokenise all annotation layers for both sides.
            w_t = arg['wordform-t'].split()
            w_h = arg['wordform-h'].split()
            l_t = arg['lemma-t'].split()
            l_h = arg['lemma-h'].split()
            p_t = arg['pos-t'].split()
            p_h = arg['pos-h'].split()
            c_t = arg['chunk-t'].split()
            c_h = arg['chunk-h'].split()
            n_t = arg['ne-t'].split()
            n_h = arg['ne-h'].split()
            score = arg['score']
            # Concatenate H tokens after T tokens, layer by layer.
            w_t.extend(w_h)
            l_t.extend(l_h)
            p_t.extend(p_h)
            # TODO quit stop words
            for j, word in enumerate(w_t):
                self.a_predicates.append(
                    'Token(%s, %s, "%s")' % (type, id, self.clean_str(word)))
                self.a_predicates.append(
                    'Token(%s, %s, "%s")' % (type, id, self.clean_str(l_t[j])))
                self.a_predicates.append(
                    'Token(%s, %s, "%s")' % (type, id, self.clean_str(p_t[j])))
                # Lin-thesaurus expansion of the surface word.
                lin = Lin()
                sim_words = lin.expand_w(word)
                # WordNet hypernym trees for (lemma, pos).
                wn = WNTools()
                hyps = wn.get_mfs_hypernyms((l_t[j], p_t[j]))
                for sim_word in sim_words:
                    self.a_predicates.append(
                        'Token(%s, %s, "%s")' % (type, id, self.clean_str(sim_word)))
                for key, tree in hyps:
                    # Keep only the first three hypernym levels per tree.
                    for category in tree[:3]:
                        self.a_predicates.append(
                            'Token(%s, %s, "%s")' % (type, id, self.clean_str(category)))
            self.a_predicates.append('ARG(%s, %s, %s)' % (type, i, id))
    return self.a_predicates
def __init__(self, frames_text=None, tokens_text=None, chunks_text=None,
             frames_hypo=None, tokens_hypo=None, chunks_hypo=None,
             e_type='simple', verbose=1, entailment=-1):
    """Initialise the edit-based entailment scorer.

    frames/tokens/chunks hold the T (text) and H (hypothesis)
    annotations; e_type selects the scoring variant, verbose enables
    tracing, entailment stores the gold label (-1 = unknown).

    None-sentinel defaults replace the original mutable-literal
    defaults ({} / []) so instances never share one default object.
    """
    self.e_type = e_type
    self.srl_t = SRLTools()
    self.srl_h = SRLTools()
    self.verb_net = VerbMetrics()
    self.arg_sim = SetMetrics()
    self.lin = Lin()
    self.wn = WNTools()
    # BUG FIX: the original stored frames_text in self.frames_hypo and
    # frames_hypo in self.frames_text (copy-paste swap).
    self.frames_text = frames_text if frames_text is not None else {}
    self.frames_hypo = frames_hypo if frames_hypo is not None else {}
    self.tokens_text = tokens_text if tokens_text is not None else []
    self.tokens_hypo = tokens_hypo if tokens_hypo is not None else []
    self.chunks_text = chunks_text if chunks_text is not None else []
    self.chunks_hypo = chunks_hypo if chunks_hypo is not None else []
    # Derived per-pair state, filled in by the scoring methods.
    self.args_text = {}
    self.args_hypo = {}
    self.verbs_text = []
    self.verbs_hypo = []
    self.edit_score = 0
    self.verb_score = 0
    self.chunk_score = 0
    # Counters for edit-operation types: deletions, insertions, substitutions.
    self.oper_type = {'del': 0, 'in': 0, 'sub': 0}
    self.verbose = verbose
    self.entailment = entailment
    return
def args_hyp_wn(self, lemmas_t, pos_t, lemmas_h, pos_h, score, id, type):
    """Build the HypRelSyn predicate for one argument pair.

    Cleans both lemma lists (punctuation, then stop words) and computes
    their symmetric differences; the hypernym-relation check itself is
    still TODO, so result is currently always 0.
    """
    result = 0
    wn = WNTools()
    tool = RTETools()
    tool.set_tokens(lemmas_t)
    tool.quit_punct()
    lemmas_t = tool.quit_sw()
    tool.set_tokens(lemmas_h)
    tool.quit_punct()
    lemmas_h = tool.quit_sw()
    (diff_ht, diff_th) = self.diff(lemmas_t, lemmas_h)
    # TODO: use diff_ht/diff_th (and pos_t/pos_h) to detect hypernym
    # relations via WordNet and set `result` accordingly.
    # BUG FIX: the format string had two placeholders for three
    # arguments, which raised TypeError at run time.
    predicate = 'HypRelSyn(%s, %s, %s)' % (id, type, result)
    return predicate
def __init__(self, frames_text=None, tokens_text=None,
             frames_hypo=None, tokens_hypo=None,
             sim_type='Lin', verbose=1):
    """Initialise the TINE scorer.

    sim_type selects the bag-of-words expansion ('Lin' thesaurus or
    'WN' WordNet trees); verbose=1 enables stderr tracing.

    None-sentinel defaults replace the original mutable-literal
    defaults ({} / []) so instances never share one default object.
    """
    self.srl_t = SRLTools()
    self.srl_h = SRLTools()
    self.verb_net = VerbMetrics()
    self.arg_sim = SetMetrics()
    self.lin = Lin()
    self.wn = WNTools()
    # BUG FIX: the original stored frames_text in self.frames_hypo and
    # frames_hypo in self.frames_text (copy-paste swap).
    self.frames_text = frames_text if frames_text is not None else {}
    self.frames_hypo = frames_hypo if frames_hypo is not None else {}
    self.tokens_text = tokens_text if tokens_text is not None else []
    self.tokens_hypo = tokens_hypo if tokens_hypo is not None else []
    # Derived per-pair state, filled in during scoring.
    self.args_text = {}
    self.args_hypo = {}
    self.verbs_text = []
    self.verbs_hypo = []
    self.tine_score = 0
    self.verb_score = 0
    self.arg_score = 0
    self.sim_type = sim_type
    self.verbose = verbose
    #self.pos_text = pos_text
    #self.pos_hypo = pos_hypo
    return
def main(args): pickle_file = args[0] print 'loading file:',pickle_file with open(pickle_file, 'r') as pf: pairs = pickle.load(pf) k = 0 for pair in pairs: print 'id:', pair.get_id() print 's1:', pair.get_text() print 's2:', pair.get_hypo() print 'features:', pair.get_features_text_type() print 'set-metrics, cos test' lemmas_text = pair.get_feature_text('lemmas') lemmas_hypo = pair.get_feature_hypo('lemmas') set_th = SetMetrics(lemmas_text, lemmas_hypo) cos = set_th.cosine() #print cos print 'SRL tools' frames_text = pair.get_feature_text('frames') print frames_text print '################' srl = SRLTools(lemmas_text, frames_text) word_to_frame = srl.get_words_frame() print word_to_frame print '################' print srl.get_verbs() print '################' #print 'verb-metrics, ' pos_text = pair.get_feature_text('pos') pos_hypo = pair.get_feature_hypo('pos') verbs = VerbMetrics() lin = Lin() vectors = VectorMetrics() hyper = WNTools() for i, pos_tuple_t in enumerate(pos_text): (token, pos_t) = pos_tuple_t if pos_t.startswith('V'): for j, pos_tuple_h in enumerate(pos_hypo): (token, pos_h) = pos_tuple_h if pos_h.startswith('V'): verbs.set_text_verb(lemmas_text[i]) verbs.set_hypo_verb(lemmas_hypo[j]) #print 'verbs test t:%s h:%s'%(lemmas_text[i], lemmas_hypo[j]) vn_isec = verbs.vn_isec() #print 'verb net isec: %d'%vn_isec #print 'lin(%s):'%lemmas_text[i], '\n', lin.n_similar_words(lemmas_text[i]) #print 'lin(%s):'%lemmas_hypo[j], '\n', lin.n_similar_words(lemmas_hypo[j]) t_sim = lin.n_similar_words(lemmas_text[i]) h_sim = lin.n_similar_words(lemmas_hypo[j]) t_score = [float(score) for word,score in t_sim] h_score = [float(score) for word,score in h_sim] vectors.set_vectors(t_score, h_score) #print 'cos_vect: ', vectors.cosine() elif pos_h.startswith('N'): #print 'wn test hypernyms' trees = hyper.get_mfs_hypernyms((lemmas_hypo[j], pos_h)) #print trees k += 1 if k >= 10: break pf.close return
def arg_proc(self, id, point, sep):
    """Emit one group of argument-level predicates (>token_arg, >sim_arg,
    >lemma_arg, >arg_relsyn, >cont_arg, >diff_arg, >pos_arg, >lin_arg,
    >wn_arg) for every SRL argument of every verb pair in *point*.
    """
    self.a_predicates = []
    if 'verbs' not in point:
        return self.a_predicates
    for i, verb in point['verbs'].iteritems():
        (vt, vh) = verb['tokens']
        if 'ARG' not in verb:
            continue
        for type, arg in verb['ARG'].items():
            w_t = arg['wordform-t']
            w_h = arg['wordform-h']
            l_t = arg['lemma-t']
            l_h = arg['lemma-h']
            p_t = arg['pos-t']
            p_h = arg['pos-h']
            c_t = arg['chunk-t']
            c_h = arg['chunk-h']
            n_t = arg['ne-t']
            n_h = arg['ne-h']
            score = arg['score']
            token_arg = '>token_arg\n%s %s "%s%s%s"' % (
                id, type, self.clean_str(w_t), sep, self.clean_str(w_h))
            lemma_arg = '>lemma_arg\n%s %s "%s%s%s"' % (
                id, type, self.clean_str(l_t), sep, self.clean_str(l_h))
            pos_arg = '>pos_arg\n%s %s "%s%s%s"' % (
                id, type, self.clean_str(p_t), sep, self.clean_str(p_h))
            # Content words only: drop stop words, then punctuation.
            sw_tool = RTETools()
            sw_tool.set_tokens(l_t.split())
            sw_tool.quit_sw()
            l_tsw = sw_tool.quit_punct()
            sw_tool.set_tokens(l_h.split())
            sw_tool.quit_sw()
            l_hsw = sw_tool.quit_punct()
            content_arg = '>cont_arg\n%s %s "%s%s%s"' % (
                id, type, self.clean_str(' '.join(l_tsw)), sep,
                self.clean_str(' '.join(l_hsw)))
            sim_diff = self.sim_dif(l_tsw, l_hsw)
            diff_arg = '>diff_arg\n%s %s "%s"' % (
                id, type, self.clean_str(' '.join(sim_diff)))
            # Binarise the alignment score.
            # TODO find threshold from data statistics mf
            if score >= 0.4:
                score_arg = '>sim_arg\n%s %s 1' % (id, type)
            else:
                score_arg = '>sim_arg\n%s %s 0' % (id, type)
            # Lin-thesaurus expansion of the content words.
            lin = Lin()
            bow_t = lin.expand_bow(l_tsw)
            bow_h = lin.expand_bow(l_hsw)
            lin_arg = '>lin_arg\n%s %s "%s%s%s"' % (
                id, type, self.clean_str(' '.join(bow_t)), sep,
                self.clean_str(' '.join(bow_h)))
            # WordNet hypernym-tree expansion over (lemma, pos) pairs.
            wn = WNTools()
            bow_wnt = wn.expand_bow_tree(zip(l_t.split(), p_t.split()))
            bow_wnh = wn.expand_bow_tree(zip(l_h.split(), p_h.split()))
            wn_arg = '>wn_arg\n%s %s "%s%s%s"' % (
                id, type, self.clean_str(' '.join(bow_wnt)), sep,
                self.clean_str(' '.join(bow_wnh)))
            rel_syn_args = self.args_syn_wn(l_t, l_h, score, id, type)
            # Keep the original emission order.
            for pred in (token_arg, score_arg, lemma_arg, rel_syn_args,
                         content_arg, diff_arg, pos_arg, lin_arg, wn_arg):
                self.a_predicates.append(pred)
    return self.a_predicates
def arg_proc(self, id, point, sep):
    """Emit one group of argument-level predicates (TokenArg, SimArg,
    LemmaArg, ArgRelSyn, ContArg, DiffArg, PosArg, LinArg, WnArg) for
    every SRL argument of every verb pair in *point*.
    """
    self.a_predicates = []
    if 'verbs' not in point:
        return self.a_predicates
    for i, verb in point['verbs'].iteritems():
        (vt, vh) = verb['tokens']
        if 'ARG' not in verb:
            continue
        for type, arg in verb['ARG'].items():
            w_t = arg['wordform-t']
            w_h = arg['wordform-h']
            l_t = arg['lemma-t']
            l_h = arg['lemma-h']
            p_t = arg['pos-t']
            p_h = arg['pos-h']
            c_t = arg['chunk-t']
            c_h = arg['chunk-h']
            n_t = arg['ne-t']
            n_h = arg['ne-h']
            score = arg['score']
            token_arg = 'TokenArg(%s, %s, "%s %s %s")' % (
                id, type, self.clean_str(w_t), sep, self.clean_str(w_h))
            lemma_arg = 'LemmaArg(%s, %s,"%s %s %s")' % (
                id, type, self.clean_str(l_t), sep, self.clean_str(l_h))
            pos_arg = 'PosArg(%s, %s, "%s %s %s")' % (
                id, type, self.clean_str(p_t), sep, self.clean_str(p_h))
            # Content words only: drop stop words, then punctuation.
            sw_tool = RTETools()
            sw_tool.set_tokens(l_t.split())
            sw_tool.quit_sw()
            l_tsw = sw_tool.quit_punct()
            sw_tool.set_tokens(l_h.split())
            sw_tool.quit_sw()
            l_hsw = sw_tool.quit_punct()
            content_arg = 'ContArg(%s, %s, "%s %s %s")' % (
                id, type, self.clean_str(' '.join(l_tsw)), sep,
                self.clean_str(' '.join(l_hsw)))
            sim_diff = self.sim_dif(l_tsw, l_hsw)
            diff_arg = 'DiffArg(%s, %s, "%s")' % (
                id, type, self.clean_str(' '.join(sim_diff)))
            # Binarise the alignment score.
            # TODO find threshold from data statistics mf
            if score >= 0.4:
                score_arg = 'SimArg(%s, %s, 1)' % (id, type)
            else:
                score_arg = 'SimArg(%s, %s, 0)' % (id, type)
            # Lin-thesaurus expansion of the content words.
            lin = Lin()
            bow_t = lin.expand_bow(l_tsw)
            bow_h = lin.expand_bow(l_hsw)
            lin_arg = 'LinArg(%s, %s, "%s %s %s")' % (
                id, type, self.clean_str(' '.join(bow_t)), sep,
                self.clean_str(' '.join(bow_h)))
            # WordNet hypernym-tree expansion over (lemma, pos) pairs.
            wn = WNTools()
            bow_wnt = wn.expand_bow_tree(zip(l_t.split(), p_t.split()))
            bow_wnh = wn.expand_bow_tree(zip(l_h.split(), p_h.split()))
            wn_arg = 'WnArg(%s, %s, "%s %s %s")' % (
                id, type, self.clean_str(' '.join(bow_wnt)), sep,
                self.clean_str(' '.join(bow_wnh)))
            rel_syn_args = self.args_syn_wn(l_t, l_h, score, id, type)
            # Keep the original emission order.
            for pred in (token_arg, score_arg, lemma_arg, rel_syn_args,
                         content_arg, diff_arg, pos_arg, lin_arg, wn_arg):
                self.a_predicates.append(pred)
    return self.a_predicates
def arg_proc(self, id, point, sep):
    """Produce per-token predicates for each SRL argument in *point*:
    >token_word / >token_lemma / >token_pos for every token, >token_lin
    for Lin expansions, >token_wn for up to n WordNet hypernym levels,
    and one >arg predicate per argument.
    """
    self.a_predicates = []
    n = 3  # levels of the WordNet hypernym tree to keep
    if 'verbs' not in point:
        return self.a_predicates
    for i, verb in point['verbs'].iteritems():
        i = '%s.%s' % (id, i)
        (vt, vh) = verb['tokens']
        if 'ARG' not in verb:
            continue
        for type, arg in verb['ARG'].items():
            # Tokenise every annotation layer on both sides.
            w_t = arg['wordform-t'].split()
            w_h = arg['wordform-h'].split()
            l_t = arg['lemma-t'].split()
            l_h = arg['lemma-h'].split()
            p_t = arg['pos-t'].split()
            p_h = arg['pos-h'].split()
            c_t = arg['chunk-t'].split()
            c_h = arg['chunk-h'].split()
            n_t = arg['ne-t'].split()
            n_h = arg['ne-h'].split()
            score = arg['score']
            # H tokens follow T tokens in one combined sequence.
            w_t.extend(w_h)
            l_t.extend(l_h)
            p_t.extend(p_h)
            # TODO quit stop words
            for j, word in enumerate(w_t):
                self.a_predicates.append('>token_word\n"%s" %s "%s"' % (
                    type, j, self.clean_str(word)))
                self.a_predicates.append('>token_lemma\n"%s" %s "%s"' % (
                    type, j, self.clean_str(l_t[j])))
                self.a_predicates.append('>token_pos\n"%s" %s "%s"' % (
                    type, j, self.clean_str(p_t[j])))
                # Lin-thesaurus neighbours of the surface word.
                lin = Lin()
                sim_words = lin.expand_w(word)
                # WordNet hypernym trees for (lemma, pos).
                wn = WNTools()
                hyps = wn.get_mfs_hypernyms((l_t[j], p_t[j]))
                for idx, sim_word in enumerate(sim_words):
                    self.a_predicates.append('>token_lin\n"%s" %s "%s"' % (
                        type, idx, self.clean_str(sim_word)))
                for key, tree in hyps:
                    for depth, category in enumerate(tree[:n]):
                        self.a_predicates.append('>token_wn\n"%s" %s "%s"' % (
                            type, depth, self.clean_str(category)))
            self.a_predicates.append('>arg\n"%s" "%s%s%s" %s' % (
                type, self.clean_str(vt), sep, self.clean_str(vh), id))
    return self.a_predicates
class TineVN:
    """TINE score with VerbNet verb matching.

    Aligns verbs between a text (T) and a hypothesis (H) through their
    SRL frames and averages the argument-similarity scores over the
    matched verb pairs.
    """

    def __init__(self, frames_text=None, tokens_text=None,
                 frames_hypo=None, tokens_hypo=None,
                 sim_type='Lin', verbose=1):
        """sim_type selects the bag-of-words expansion ('Lin' thesaurus
        or 'WN' WordNet trees); verbose=1 traces to stderr.

        None-sentinel defaults replace the original mutable-literal
        defaults so instances never share one default dict/list.
        """
        self.srl_t = SRLTools()
        self.srl_h = SRLTools()
        self.verb_net = VerbMetrics()
        self.arg_sim = SetMetrics()
        self.lin = Lin()
        self.wn = WNTools()
        # BUG FIX: the original swapped these assignments (frames_text
        # went to self.frames_hypo and vice versa), contradicting the
        # straight-through assignments in get_tine_score().
        self.frames_text = frames_text if frames_text is not None else {}
        self.frames_hypo = frames_hypo if frames_hypo is not None else {}
        self.tokens_text = tokens_text if tokens_text is not None else []
        self.tokens_hypo = tokens_hypo if tokens_hypo is not None else []
        self.args_text = {}
        self.args_hypo = {}
        self.verbs_text = []
        self.verbs_hypo = []
        self.tine_score = 0
        self.verb_score = 0
        self.arg_score = 0
        self.sim_type = sim_type
        self.verbose = verbose
        #self.pos_text = pos_text
        #self.pos_hypo = pos_hypo
        return

    def get_tine_score(self, frames_text=None, tokens_text=None,
                       frames_hypo=None, tokens_hypo=None):
        """Compute and return the TINE score for the stored (or newly
        supplied) T/H frames and tokens.

        Returns 0 when the text side yields no verb frames (the
        original raised ZeroDivisionError in that case).
        """
        if frames_text:
            self.frames_text = frames_text
        if frames_hypo:
            self.frames_hypo = frames_hypo
        if tokens_text:
            self.tokens_text = tokens_text
        if tokens_hypo:
            self.tokens_hypo = tokens_hypo
        self.srl_t.set_frames(self.frames_text)
        self.srl_h.set_frames(self.frames_hypo)
        self.srl_t.set_tokens(self.tokens_text)
        self.srl_h.set_tokens(self.tokens_hypo)
        self.args_text = self.srl_t.get_words_frame()
        self.args_hypo = self.srl_h.get_words_frame()
        sum_verb = 0
        # NOTE(review): the original named this num_verbs_h but counts
        # the *text*-side frames -- confirm which side should normalize.
        num_verbs = len(self.args_text)
        self.__p_stderr('TINE VerbNet\n')
        self.__p_stderr('T: %s \n H: %s\n' % (self.args_text, self.args_hypo))
        self.__p_stderr('T: %s \n H: %s\n' % (self.args_text.keys(),
                                              self.args_hypo.keys()))
        for verb_t, args_t in self.args_text.items():
            for verb_h, args_h in self.args_hypo.items():
                # Only exact matches (similarity == 1) contribute.
                if self.__simVerbs(verb_t, verb_h) == 1:
                    self.__p_stderr('verbs(%s, %s)\n' % (verb_t, verb_h))
                    sum_verb += self.__simArgs(args_t, args_h)
        # Guard against an empty text side (mirrors __simArgs' guard).
        if num_verbs == 0:
            self.tine_score = 0
        else:
            self.tine_score = float(sum_verb) / num_verbs
        self.__p_stderr('score:%s\n' % (self.tine_score))
        return self.tine_score

    def __simVerbs(self, verb_t='', verb_h=''):
        """Return 1 for identical verbs; otherwise the VerbNet class
        intersection, falling back to vo() when the intersection is 0.

        The original had an unreachable trailing `return isec`, removed.
        """
        if verb_t == verb_h:
            return 1
        self.verb_net.set_text_verb(verb_t)
        self.verb_net.set_hypo_verb(verb_h)
        isec = self.verb_net.vn_isec()
        if isec == 0:
            # No VerbNet overlap: fall back to the vo() relation score.
            return self.verb_net.vo()
        return isec

    def __simArgs(self, args_t=None, args_h=None):
        """Average the cosine similarity of expanded argument bags over
        the hypothesis arguments, for role tags shared between T and H.
        """
        args_t = args_t if args_t is not None else []
        args_h = args_h if args_h is not None else []
        sum_args = 0
        num_args_h = len(args_h)
        for tag_t, tokens_t in args_t:
            for tag_h, tokens_h in args_h:
                if tag_t != tag_h:
                    continue
                expand_t = []
                expand_h = []
                if self.sim_type == 'Lin':
                    expand_t = self.lin.expand_bow(tokens_t)
                    expand_h = self.lin.expand_bow(tokens_h)
                elif self.sim_type == 'WN':
                    expand_t = self.wn.expand_bow_tree(tokens_t)
                    expand_h = self.wn.expand_bow_tree(tokens_h)
                self.arg_sim.set_text(expand_t)
                self.arg_sim.set_hypo(expand_h)
                self.arg_score = self.arg_sim.cosine()
                self.__p_stderr('\t[%s|%s] %s %s\n' % (tag_t, self.arg_score,
                                                       expand_t, expand_h))
                sum_args += self.arg_score
        if num_args_h == 0:
            return 0
        self.verb_score = float(sum_args) / num_args_h
        return self.verb_score

    def __p_stderr(self, text=''):
        # Trace helper: writes only when verbose tracing is enabled.
        if self.verbose == 1:
            sys.stderr.write(text)
        return

    def get_verb_score(self):
        """Most recent per-verb argument-similarity average."""
        return self.verb_score

    def get_arg_score(self):
        """Most recent single-argument cosine score."""
        return self.arg_score