def extractFeatures(options):
    """Dump per-pair features from a pickled pair collection to a text file.

    Loads the pairs pickled in ``options.pickle_file`` and, for every pair,
    writes to ``options.output_file``:

    * a ``>>`` line that starts the record,
    * one ``>word_t`` / ``>word_h`` entry per lemma of the text / hypothesis
      (lemmas are first passed through ``RTETools.quit_sw()`` and
      ``RTETools.quit_punct()`` -- presumably stop-word and punctuation
      removal; confirm against RTETools),
    * one ``>pos_t`` / ``>pos_h`` entry per ``(lemma, pos)`` item of the
      ``'pos'`` feature,
    * a ``>cosine`` entry with ``SetMetrics.cosine()`` of the two lemma lists,
    * an ``>entailment`` entry with the pair value, followed by a blank line.

    Args:
        options: object exposing the ``pickle_file`` and ``output_file``
            path attributes.
    """
    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file; only feed this function pickle files from a trusted source.
    # Pickle data is binary, so the file is opened in 'rb' mode.
    with open(options.pickle_file, 'rb') as pf:
        pairs = pickle.load(pf)

    metrics = SetMetrics()
    tool = RTETools()

    def write_entries(out, header, pair_id, values):
        # One header line plus one '<id> <index> "<value>"' line per value.
        for index, value in enumerate(values):
            out.write('%s\n' % header)
            out.write('%s %s "%s"\n' % (pair_id, index, value))

    # The context manager guarantees the output is flushed and closed, even
    # when a pair raises half-way through the dump.
    with open(options.output_file, 'w') as out:
        for pair in pairs:
            pair_id = pair.get_id()
            entailment = pair.get_value()
            lemmas_text = pair.get_feature_text('lemmas')
            lemmas_hypo = pair.get_feature_hypo('lemmas')

            # Both filters are invoked; the result of the last one is kept.
            tool.set_tokens(lemmas_text)
            tool.quit_sw()
            lemmas_text = tool.quit_punct()

            pos_text = pair.get_feature_text('pos')
            pos_hypo = pair.get_feature_hypo('pos')

            tool.set_tokens(lemmas_hypo)
            tool.quit_sw()
            lemmas_hypo = tool.quit_punct()

            metrics.set_text(lemmas_text)
            metrics.set_hypo(lemmas_hypo)
            cos = metrics.cosine()

            out.write('>>\n')
            write_entries(out, '>word_t', pair_id, lemmas_text)
            write_entries(out, '>word_h', pair_id, lemmas_hypo)
            # The 'pos' feature holds (lemma, pos) items; only the tag is
            # written out.
            write_entries(out, '>pos_t', pair_id,
                          (tag for _, tag in pos_text))
            write_entries(out, '>pos_h', pair_id,
                          (tag for _, tag in pos_hypo))
            out.write('>cosine\n')
            out.write('%s %s\n' % (pair_id, cos))
            out.write('>entailment\n')
            # Trailing blank line separates consecutive pair records.
            out.write('%s "%s"\n\n' % (pair_id, entailment))
def extractFeatures(options):
    """Write the features of every pickled pair to a plain-text file.

    The pairs are unpickled from ``options.pickle_file``. For each pair the
    function writes a ``>>`` line, then ``>word_t`` / ``>word_h`` entries for
    the lemmas returned by ``RTETools`` after ``quit_sw()`` and
    ``quit_punct()``, ``>pos_t`` / ``>pos_h`` entries for the second element
    of every item of the ``'pos'`` feature, a ``>cosine`` entry holding
    ``SetMetrics.cosine()`` and an ``>entailment`` entry holding the pair
    value, followed by a blank line.

    Args:
        options: object exposing the ``pickle_file`` and ``output_file``
            path attributes.
    """
    # NOTE(review): unpickling untrusted data is unsafe (arbitrary code
    # execution); make sure the pickle file comes from a trusted source.
    # Binary mode is required for pickle data.
    with open(options.pickle_file, 'rb') as pf:
        pairs = pickle.load(pf)

    metrics = SetMetrics()
    tool = RTETools()

    # 'with' closes (and flushes) the output file on success and on error.
    with open(options.output_file, 'w') as o:
        for pair in pairs:
            pair_id = pair.get_id()
            entailment = pair.get_value()
            lemmas_text = pair.get_feature_text('lemmas')
            lemmas_hypo = pair.get_feature_hypo('lemmas')

            tool.set_tokens(lemmas_text)
            tool.quit_sw()
            lemmas_text = tool.quit_punct()

            pos_text = pair.get_feature_text('pos')
            pos_hypo = pair.get_feature_hypo('pos')

            tool.set_tokens(lemmas_hypo)
            tool.quit_sw()
            lemmas_hypo = tool.quit_punct()

            metrics.set_text(lemmas_text)
            metrics.set_hypo(lemmas_hypo)
            cos = metrics.cosine()

            o.write('>>\n')
            # Each section repeats its header before every indexed value.
            sections = (
                ('>word_t', lemmas_text),
                ('>word_h', lemmas_hypo),
                ('>pos_t', (tag for _, tag in pos_text)),
                ('>pos_h', (tag for _, tag in pos_hypo)),
            )
            for header, values in sections:
                for index, item in enumerate(values):
                    o.write('%s\n%s %s "%s"\n'
                            % (header, pair_id, index, item))
            o.write('>cosine\n%s %s\n' % (pair_id, cos))
            # The extra newline leaves a blank line after every pair.
            o.write('>entailment\n%s "%s"\n\n' % (pair_id, entailment))
def baseline(self, id_bo):
    """Build the baseline predicates for the pair stored under ``id_bo``.

    Resets ``self.baseline_predicates`` and, when ``id_bo`` is a key of
    ``self.pairs``, appends for every (text lemma, hypothesis lemma)
    combination a ``Combo(...)`` predicate with the two cleaned lemmas and a
    ``Direct(...)`` predicate with the result of ``NounTools.direct()``.
    Lemmas are taken after ``RTETools.quit_sw()`` / ``quit_punct()``.

    Args:
        id_bo: identifier of the pair; also written into the predicates.

    Returns:
        ``self.baseline_predicates`` (empty when ``id_bo`` is unknown).
    """
    self.baseline_predicates = []
    lin = Lin()
    tools = RTETools()
    if id_bo not in self.pairs:
        return self.baseline_predicates

    pair = self.pairs[id_bo]
    pair.get_value()  # value is fetched but not used by the baseline

    text_lemmas = pair.get_feature_text('lemmas')
    tools.set_tokens(text_lemmas)
    tools.quit_sw()
    text_lemmas = tools.quit_punct()

    hypo_lemmas = pair.get_feature_hypo('lemmas')
    tools.set_tokens(hypo_lemmas)
    tools.quit_sw()
    hypo_lemmas = tools.quit_punct()

    predicates = self.baseline_predicates
    for text_lemma in text_lemmas:
        for hypo_lemma in hypo_lemmas:
            word_combo = 'Combo(%s, "%s|||%s")' % (
                id_bo,
                self.clean_str(text_lemma),
                self.clean_str(hypo_lemma),
            )

            # Cosine between the 10 Lin neighbours of each lemma. The
            # resulting ComboLin predicate is built but not emitted.
            similar_t = lin.n_similar_words(text_lemma, 10)
            similar_h = lin.n_similar_words(hypo_lemma, 10)
            neighbours_t = [word for word, _ in similar_t]
            neighbours_h = [word for word, _ in similar_h]
            cos = SetMetrics(neighbours_t, neighbours_h).cosine()
            lin_combo = 'ComboLin(%s, %s)' % (id_bo, cos)

            direct = NounTools(text_lemma, hypo_lemma).direct()
            direct_combo = 'Direct(%s, %s)' % (id_bo, direct)

            predicates.append(word_combo)
            predicates.append(direct_combo)
    return self.baseline_predicates
def args_direct(self, lemmas_t, pos_t, lemmas_h, pos_h, score, id, type):
    """Placeholder for a direct-relation feature between two arguments.

    Filters both lemma lists through ``RTETools`` (``quit_punct()`` then
    ``quit_sw()``) and calls ``self.diff`` and ``self.sim`` on the filtered
    lists, but does not use their results yet.

    Returns:
        Always ``0`` in the current implementation.
    """
    outcome = 0
    NounTools()  # instantiated by the original code, not used yet
    filter_tool = RTETools()

    def content_tokens(tokens):
        # Punctuation filter first; the stop-word filter result is kept.
        filter_tool.set_tokens(tokens)
        filter_tool.quit_punct()
        return filter_tool.quit_sw()

    lemmas_t = content_tokens(lemmas_t)
    lemmas_h = content_tokens(lemmas_h)

    # Results are computed but not consumed yet.
    _diff_ht, _diff_th = self.diff(lemmas_t, lemmas_h)
    self.sim(lemmas_t, lemmas_h)
    return outcome
def args_hyp_wn(self, lemmas_t, pos_t, lemmas_h, pos_h, score, id, type):
    """Build the ``>hyp_relsyn`` predicate for one argument pair.

    The WordNet-based computation is still a TODO: both lemma lists are
    filtered through ``RTETools`` (``quit_punct()`` then ``quit_sw()``) and
    ``self.diff`` is called on them, but the reported result is always ``0``.

    Args:
        lemmas_t: lemma tokens of the text argument.
        pos_t: unused.
        lemmas_h: lemma tokens of the hypothesis argument.
        pos_h: unused.
        score: unused.
        id: pair identifier written into the predicate.
        type: argument label written into the predicate.

    Returns:
        The string ``'>hyp_relsyn\\n<id> <type> <result>'``.
    """
    result = 0
    wn = WNTools()  # kept for the pending WordNet implementation
    tool = RTETools()

    tool.set_tokens(lemmas_t)
    tool.quit_punct()
    lemmas_t = tool.quit_sw()

    tool.set_tokens(lemmas_h)
    tool.quit_punct()
    lemmas_h = tool.quit_sw()

    (diff_ht, diff_th) = self.diff(lemmas_t, lemmas_h)
    # TODO: derive `result` from the hypernym relation of the differences.

    # Three placeholders for three values: the previous two-placeholder
    # format raised "TypeError: not all arguments converted" on every call.
    predicate = '>hyp_relsyn\n%s %s %s' % (id, type, result)
    return predicate
def args_syn_wn(self, lemmas_t, lemmas_h, score, id, type):
    """Build the ``>arg_relsyn`` predicate for one argument pair.

    Both lemma strings are split on whitespace and filtered through
    ``RTETools`` (``quit_punct()`` then ``quit_sw()``). The two difference
    sets returned by ``self.diff`` are expanded with
    ``WNTools.expand_bow_syns``. The reported flag is ``1`` when either
    expanded set is empty or when the cosine of the two expanded sets is
    greater than zero, and ``0`` otherwise.

    NOTE(review): confirm that an empty difference set is meant to count as
    related.

    Returns:
        The string ``'>arg_relsyn\\n<id> <type> <flag>'``.
    """
    wordnet = WNTools()
    filter_tool = RTETools()

    def content_tokens(sentence):
        # Punctuation filter first; the stop-word filter result is kept.
        filter_tool.set_tokens(sentence.split())
        filter_tool.quit_punct()
        return filter_tool.quit_sw()

    lemmas_t = content_tokens(lemmas_t)
    lemmas_h = content_tokens(lemmas_h)

    only_ht, only_th = self.diff(lemmas_t, lemmas_h)
    expanded_ht = wordnet.expand_bow_syns(list(only_ht))
    expanded_th = wordnet.expand_bow_syns(list(only_th))

    related = 0
    if len(expanded_ht) == 0 or len(expanded_th) == 0:
        related = 1
    elif SetMetrics(expanded_ht, expanded_th).cosine() > 0:
        related = 1

    return '>arg_relsyn\n%s %s %s' % (id, type, related)
def arg_proc(self, id, point, sep):
    """Collect the argument-level predicates of every verb in ``point``.

    Resets ``self.a_predicates``. For each argument found under
    ``point['verbs'][*]['ARG']`` nine predicate strings are appended, in this
    order: ``>token_arg``, ``>sim_arg``, ``>lemma_arg``, the result of
    ``self.args_syn_wn``, ``>cont_arg``, ``>diff_arg``, ``>pos_arg``,
    ``>lin_arg`` and ``>wn_arg``.

    Args:
        id: pair identifier written into every predicate.
        point: mapping that may contain a ``'verbs'`` entry.
        sep: separator placed between the text and hypothesis parts.

    Returns:
        ``self.a_predicates`` (empty when ``point`` has no ``'verbs'``).
    """
    self.a_predicates = []
    if 'verbs' not in point:
        return self.a_predicates

    def cleaned(tokens):
        # Join a token list with spaces and sanitise it for output.
        return self.clean_str(' '.join(tokens))

    for _, verb in point['verbs'].iteritems():
        _vt, _vh = verb['tokens']  # must be a 2-item value; not used further
        if 'ARG' not in verb:
            continue

        for role, arg in verb['ARG'].items():
            word_t, word_h = arg['wordform-t'], arg['wordform-h']
            lemma_t, lemma_h = arg['lemma-t'], arg['lemma-h']
            pos_t, pos_h = arg['pos-t'], arg['pos-h']
            # Chunk and named-entity fields are read but not used; the
            # lookups are kept so a missing key still raises KeyError.
            _chunk_t, _chunk_h = arg['chunk-t'], arg['chunk-h']
            _ne_t, _ne_h = arg['ne-t'], arg['ne-h']
            score = arg['score']

            token_arg = '>token_arg\n%s %s "%s%s%s"' % (
                id, role, self.clean_str(word_t), sep, self.clean_str(word_h))
            lemma_arg = '>lemma_arg\n%s %s "%s%s%s"' % (
                id, role, self.clean_str(lemma_t), sep,
                self.clean_str(lemma_h))
            pos_arg = '>pos_arg\n%s %s "%s%s%s"' % (
                id, role, self.clean_str(pos_t), sep, self.clean_str(pos_h))

            # Lemmas after quit_sw() / quit_punct() filtering.
            filter_tool = RTETools()
            filter_tool.set_tokens(lemma_t.split())
            filter_tool.quit_sw()
            content_t = filter_tool.quit_punct()
            filter_tool.set_tokens(lemma_h.split())
            filter_tool.quit_sw()
            content_h = filter_tool.quit_punct()
            content_arg = '>cont_arg\n%s %s "%s%s%s"' % (
                id, role, cleaned(content_t), sep, cleaned(content_h))

            diff_arg = '>diff_arg\n%s %s "%s"' % (
                id, role, cleaned(self.sim_dif(content_t, content_h)))

            # TODO find threshold from data statistics
            score_arg = ('>sim_arg\n%s %s 1' if score >= 0.4
                         else '>sim_arg\n%s %s 0') % (id, role)

            # Expansion of the filtered lemmas through Lin.
            lin = Lin()
            lin_bow_t = lin.expand_bow(content_t)
            lin_bow_h = lin.expand_bow(content_h)
            lin_arg = '>lin_arg\n%s %s "%s%s%s"' % (
                id, role, cleaned(lin_bow_t), sep, cleaned(lin_bow_h))

            # WordNet expansion of the (lemma, pos) pairs.
            wn = WNTools()
            tagged_t = zip(lemma_t.split(), pos_t.split())
            tagged_h = zip(lemma_h.split(), pos_h.split())
            wn_bow_t = wn.expand_bow_tree(tagged_t)
            wn_bow_h = wn.expand_bow_tree(tagged_h)
            wn_arg = '>wn_arg\n%s %s "%s%s%s"' % (
                id, role, cleaned(wn_bow_t), sep, cleaned(wn_bow_h))

            rel_syn_arg = self.args_syn_wn(lemma_t, lemma_h, score, id, role)

            self.a_predicates.extend([
                token_arg,
                score_arg,
                lemma_arg,
                rel_syn_arg,
                content_arg,
                diff_arg,
                pos_arg,
                lin_arg,
                wn_arg,
            ])
    return self.a_predicates
def args_hyp_wn(self, lemmas_t, pos_t, lemmas_h, pos_h, score, id, type):
    """Build the ``HypRelSyn`` predicate for one argument pair.

    The WordNet-based computation is still a TODO: both lemma lists are
    filtered through ``RTETools`` (``quit_punct()`` then ``quit_sw()``) and
    ``self.diff`` is called on them, but the reported result is always ``0``.

    Args:
        lemmas_t: lemma tokens of the text argument.
        pos_t: unused.
        lemmas_h: lemma tokens of the hypothesis argument.
        pos_h: unused.
        score: unused.
        id: pair identifier written into the predicate.
        type: argument label written into the predicate.

    Returns:
        The string ``'HypRelSyn(<id>, <type>, <result>)'``.
    """
    result = 0
    wn = WNTools()  # kept for the pending WordNet implementation
    tool = RTETools()

    tool.set_tokens(lemmas_t)
    tool.quit_punct()
    lemmas_t = tool.quit_sw()

    tool.set_tokens(lemmas_h)
    tool.quit_punct()
    lemmas_h = tool.quit_sw()

    (diff_ht, diff_th) = self.diff(lemmas_t, lemmas_h)
    # TODO: derive `result` from the hypernym relation of the differences.

    # Three placeholders for three values: the previous two-placeholder
    # format raised "TypeError: not all arguments converted" on every call.
    predicate = 'HypRelSyn(%s, %s, %s)' % (id, type, result)
    return predicate
def args_syn_wn(self, lemmas_t, lemmas_h, score, id, type):
    """Build the ``ArgRelSyn`` predicate for one argument pair.

    Both lemma strings are split on whitespace and filtered through
    ``RTETools`` (``quit_punct()`` then ``quit_sw()``). The two difference
    sets returned by ``self.diff`` are expanded with
    ``WNTools.expand_bow_syns``. The reported flag is ``1`` when either
    expanded set is empty or when the cosine of the two expanded sets is
    greater than zero, and ``0`` otherwise.

    NOTE(review): confirm that an empty difference set is meant to count as
    related.

    Returns:
        The string ``'ArgRelSyn(<id>, <type>, <flag>)'``.
    """
    wordnet = WNTools()
    filter_tool = RTETools()

    def content_tokens(sentence):
        # Punctuation filter first; the stop-word filter result is kept.
        filter_tool.set_tokens(sentence.split())
        filter_tool.quit_punct()
        return filter_tool.quit_sw()

    lemmas_t = content_tokens(lemmas_t)
    lemmas_h = content_tokens(lemmas_h)

    only_ht, only_th = self.diff(lemmas_t, lemmas_h)
    expanded_ht = wordnet.expand_bow_syns(list(only_ht))
    expanded_th = wordnet.expand_bow_syns(list(only_th))

    related = 0
    if len(expanded_ht) == 0 or len(expanded_th) == 0:
        related = 1
    elif SetMetrics(expanded_ht, expanded_th).cosine() > 0:
        related = 1

    return 'ArgRelSyn(%s, %s, %s)' % (id, type, related)
def arg_proc(self, id, point, sep):
    """Collect the argument-level predicates of every verb in ``point``.

    Resets ``self.a_predicates``. For each argument found under
    ``point['verbs'][*]['ARG']`` nine predicate strings are appended, in this
    order: ``TokenArg``, ``SimArg``, ``LemmaArg``, the result of
    ``self.args_syn_wn``, ``ContArg``, ``DiffArg``, ``PosArg``, ``LinArg``
    and ``WnArg``.

    Args:
        id: pair identifier written into every predicate.
        point: mapping that may contain a ``'verbs'`` entry.
        sep: separator placed between the text and hypothesis parts.

    Returns:
        ``self.a_predicates`` (empty when ``point`` has no ``'verbs'``).
    """
    self.a_predicates = []
    if 'verbs' not in point:
        return self.a_predicates

    def cleaned(tokens):
        # Join a token list with spaces and sanitise it for output.
        return self.clean_str(' '.join(tokens))

    for _, verb in point['verbs'].iteritems():
        _vt, _vh = verb['tokens']  # must be a 2-item value; not used further
        if 'ARG' not in verb:
            continue

        for role, arg in verb['ARG'].items():
            word_t, word_h = arg['wordform-t'], arg['wordform-h']
            lemma_t, lemma_h = arg['lemma-t'], arg['lemma-h']
            pos_t, pos_h = arg['pos-t'], arg['pos-h']
            # Chunk and named-entity fields are read but not used; the
            # lookups are kept so a missing key still raises KeyError.
            _chunk_t, _chunk_h = arg['chunk-t'], arg['chunk-h']
            _ne_t, _ne_h = arg['ne-t'], arg['ne-h']
            score = arg['score']

            token_arg = 'TokenArg(%s, %s, "%s %s %s")' % (
                id, role, self.clean_str(word_t), sep, self.clean_str(word_h))
            lemma_arg = 'LemmaArg(%s, %s,"%s %s %s")' % (
                id, role, self.clean_str(lemma_t), sep,
                self.clean_str(lemma_h))
            pos_arg = 'PosArg(%s, %s, "%s %s %s")' % (
                id, role, self.clean_str(pos_t), sep, self.clean_str(pos_h))

            # Lemmas after quit_sw() / quit_punct() filtering.
            filter_tool = RTETools()
            filter_tool.set_tokens(lemma_t.split())
            filter_tool.quit_sw()
            content_t = filter_tool.quit_punct()
            filter_tool.set_tokens(lemma_h.split())
            filter_tool.quit_sw()
            content_h = filter_tool.quit_punct()
            content_arg = 'ContArg(%s, %s, "%s %s %s")' % (
                id, role, cleaned(content_t), sep, cleaned(content_h))

            diff_arg = 'DiffArg(%s, %s, "%s")' % (
                id, role, cleaned(self.sim_dif(content_t, content_h)))

            # TODO find threshold from data statistics
            score_arg = ('SimArg(%s, %s, 1)' if score >= 0.4
                         else 'SimArg(%s, %s, 0)') % (id, role)

            # Expansion of the filtered lemmas through Lin.
            lin = Lin()
            lin_bow_t = lin.expand_bow(content_t)
            lin_bow_h = lin.expand_bow(content_h)
            lin_arg = 'LinArg(%s, %s, "%s %s %s")' % (
                id, role, cleaned(lin_bow_t), sep, cleaned(lin_bow_h))

            # WordNet expansion of the (lemma, pos) pairs.
            wn = WNTools()
            tagged_t = zip(lemma_t.split(), pos_t.split())
            tagged_h = zip(lemma_h.split(), pos_h.split())
            wn_bow_t = wn.expand_bow_tree(tagged_t)
            wn_bow_h = wn.expand_bow_tree(tagged_h)
            wn_arg = 'WnArg(%s, %s, "%s %s %s")' % (
                id, role, cleaned(wn_bow_t), sep, cleaned(wn_bow_h))

            rel_syn_arg = self.args_syn_wn(lemma_t, lemma_h, score, id, role)

            self.a_predicates.extend([
                token_arg,
                score_arg,
                lemma_arg,
                rel_syn_arg,
                content_arg,
                diff_arg,
                pos_arg,
                lin_arg,
                wn_arg,
            ])
    return self.a_predicates