def process(self, text):
    """Parse *text*, build 4lang machines, expand them with the lexicon
    and print the resulting machine graph in dot format.

    :param text: raw input text to process.
    """
    preproc = TextTo4lang.preprocess_text(text)
    deps, corefs, parse_trees = self.parser_wrapper.parse_text(preproc)
    machines = self.dep_to_4lang.get_machines_from_deps_and_corefs(
        deps, corefs)
    self.dep_to_4lang.lexicon.expand(machines)
    graph = MachineGraph.create_from_machines(machines.values())
    # print as a function call so this parses on Python 3 as well
    # (other functions in this file already use print())
    print(graph.to_dot())
def calculate_output(mode):
    """Score every premise/hypothesis pair of the *mode* dataset and
    write one tab-separated line (premise, hypothesis, best similarity,
    gold score) per pair to ``<mode>.txt``.

    :param mode: dataset id, one of "en", "de", "it", "sherliic".
    """
    data_frame = read_data(mode)
    # sherliic data is English; every other mode doubles as a language code
    lang = mode if mode != "sherliic" else "en"
    text_to_4lang = TextTo4lang(lang=lang)
    lexicon = Lexicon(lang=lang)
    similarity = Similarity(with_embedding=False)
    blacklist_dict = {"en": ["in", "on", "of"],
                      "de": ["auf", "in"],
                      "it": ["nel", "su", "di"],
                      "sherliic": []}
    with open("{}.txt".format(mode), "w") as f:
        for index in tqdm(range(len(data_frame))):
            premise = data_frame.premise[index]
            hypothesis = data_frame.hypothesis[index]
            score = data_frame.score[index]
            blacklist = blacklist_dict[mode]
            if mode == "sherliic":
                prem_expand = 2
                filt = False
                process_function = text_to_4lang.process_deps
                similarity_function = similarity.asim_jac_edges
                graph_premise = text_to_4lang.process_deps(
                    premise, method="default")
                graph_hypothesis = text_to_4lang.process_deps(
                    hypothesis, method="default")
                syn_premise = [premise]
                syn_hypothesis = [hypothesis]
                # root names look like "word_POS"; strip the suffix
                premise_root = graph_premise.root.split('_')[0]
                hypothesis_root = graph_hypothesis.root.split('_')[0]
                if premise_root in lexicon.wiktionary_synonyms:
                    syn_premise = synonym_graph(
                        premise, premise_root,
                        lexicon.wiktionary_synonyms[premise_root])
                if hypothesis_root in lexicon.wiktionary_synonyms:
                    syn_hypothesis = synonym_graph(
                        hypothesis, hypothesis_root,
                        lexicon.wiktionary_synonyms[hypothesis_root])
            else:
                prem_expand = 3
                filt = True
                process_function = text_to_4lang.process_text
                similarity_function = similarity.asim_jac_nodes
                syn_premise = [premise] + \
                    lexicon.wiktionary_synonyms[premise]
                # BUG FIX: this line previously re-assigned syn_premise,
                # leaving syn_hypothesis undefined (NameError below)
                syn_hypothesis = [hypothesis] + \
                    lexicon.wiktionary_synonyms[hypothesis]
            # try every synonym pair and keep the best similarity
            best_match = 0
            for syn_prem in syn_premise:
                for syn_hyp in syn_hypothesis:
                    graph_premise = process_function(
                        syn_prem, method="expand", depth=prem_expand,
                        filt=filt, blacklist=blacklist,
                        black_or_white="black")
                    graph_hypothesis = process_function(
                        syn_hyp, method="expand", depth=1, filt=filt,
                        blacklist=blacklist, black_or_white="black")
                    pred = similarity_function(graph_premise,
                                               graph_hypothesis)
                    if pred > best_match:
                        best_match = pred
            f.write("{}\t{}\t{}\t{}\n".format(
                premise, hypothesis, best_match, score))
def text_to_4lang_demo(self, text, expand, fn='pic', dep_fn='deps'):
    """Parse *text*, build word machines, optionally expand them with
    the lexicon, then render a dependency graph and a machine graph
    into the temp directory.

    :param text: the sentence to analyze (byte string; this path
        calls .decode(), i.e. it targets Python 2 input).
    :param expand: if true, expand the machines with the lexicon.
    :param fn: basename for the machine-graph picture.
    :param dep_fn: basename for the dependency-graph picture.
    :return: (dependency graph basename, machine graph basename)
    """
    sen = TextTo4lang.preprocess_text(text.strip().decode('utf-8'))
    deps, corefs = self.parser_wrapper.parse_text(sen)
    words2machines = self.dep_to_4lang.get_machines_from_deps_and_corefs(
        deps, corefs)
    # remember which nodes came from the sentence itself so the drawing
    # can distinguish them from lexicon-expansion nodes
    orig_machines = set()
    for m in words2machines.itervalues():
        orig_machines.update(
            MachineTraverser.get_nodes(m, names_only=False,
                                       keep_upper=True))
    if expand:
        self.dep_to_4lang.lexicon.expand(words2machines)
    pic_path = draw_text_graph(
        words2machines, self.tmp_dir, fn=fn, orig_machines=orig_machines)
    dep_path = draw_dep_graph(deps[0], self.tmp_dir, dep_fn)
    return os.path.basename(dep_path), os.path.basename(pic_path)
def main():
    """Read sentences from the configured input file, extract their
    dependencies, dump them as JSON and feed them to process_deps.

    Expects the config file path as the first CLI argument.
    """
    logging.basicConfig(
        level="INFO",
        format="%(asctime)s : " +
        "%(module)s (%(lineno)s) - %(levelname)s - %(message)s")
    cfg_file = sys.argv[1]
    cfg = get_cfg(cfg_file)
    text_to_4lang = TextTo4lang(cfg)
    fn = cfg.get('text', 'input_sens')
    base_fn = os.path.basename(fn)
    deps_fn = os.path.join(
        text_to_4lang.deps_dir, "{0}.deps".format(base_fn))
    # column positions differ between the Hungarian and default formats
    if text_to_4lang.lang == 'hu':
        id_field, word_field, lemma_field, msd_field, gov_field, dep_field = (
            0, 1, 3, 4, -4, -2)
    else:
        id_field, word_field, lemma_field, msd_field, gov_field, dep_field = (
            0, 1, None, None, -4, -3)
    # BUG FIX: use a list comprehension instead of map() so deps is a
    # real, JSON-serializable list on Python 3 too (a map object would
    # make json.dumps fail in the non-'en' branch); also close the file
    with open(fn) as in_f:
        deps = [
            get_dependencies(s, id_field, word_field, lemma_field,
                             msd_field, gov_field, dep_field)
            for s in sentence_iterator(in_f)]
    if text_to_4lang.lang == 'en':
        # convert to old deps (for now, see issue #51)
        c_deps = []
        for sen in deps:
            c_deps.append([
                (d['type'],
                 (d['gov']['word'], d['gov']['id']),
                 (d['dep']['word'], d['dep']['id']))
                for d in sen])
    else:
        c_deps = deps
    with open(deps_fn, 'w') as out_f:
        out_f.write("{0}\n".format(
            json.dumps({"deps": c_deps, "corefs": []})))
    text_to_4lang.process_deps(deps_fn)
def run(synonyms, depth, threshold, combine, dataset="dev",
        blacklist=None):
    """Evaluate 4lang-based entailment on the SherLIiC dataset, print
    precision/recall/F-score plus the confusion matrix, and dump the
    per-pair predictions to sherlic_output.txt.

    :param synonyms: synonym-handling flag passed through to process().
    :param depth: premise expansion depth passed through to process().
    :param threshold: decision threshold passed through to process().
    :param combine: score-combination flag passed through to process().
    :param dataset: split name; reads data/<dataset>.csv.
    :param blacklist: edge labels to ignore; defaults to
        ["in", "of", "on"].
    """
    # BUG FIX: the default used to be a mutable list literal, shared
    # across calls; use None and build a fresh list each time
    if blacklist is None:
        blacklist = ["in", "of", "on"]
    print("Initializing modules...")  # typo fixed ("Initializng")
    text_to_4lang = TextTo4lang(lang="en")
    data = read_sherliic("data/" + dataset + ".csv",
                         ud_path="data/relation_index.tsv",
                         keep_context=True)
    data_frame = build_graph(data)
    data['premise_text'] = data["prem_argleft"] + " " + \
        data["premise"] + " " + data["prem_argright"]
    data['hyp_text'] = data["hypo_argleft"] + " " + \
        data["hypothesis"] + " " + data["hypo_argright"]
    preds = process(text_to_4lang, data_frame, synonyms, depth,
                    threshold, combine, blacklist)
    bPrecis, bRecall, bFscore, bSupport = pr(
        data_frame.score.tolist(), preds)
    print("Precision: " + str(bPrecis[1]))
    print("Recall: " + str(bRecall[1]))
    print("Fscore: " + str(bFscore[1]))
    tn, fp, fn, tp = cm(data_frame.score.tolist(), preds).ravel()
    print("Scores")
    print("TN: " + str(tn))
    print("FP: " + str(fp))
    print("FN: " + str(fn))
    print("TP: " + str(tp))
    # NOTE(review): file name looks like a typo for "sherliic_output";
    # kept as-is since downstream tooling may depend on it
    with open("sherlic_output.txt", "w+") as f:
        for i, pred in enumerate(preds):
            premise = data.premise_text[i]
            hypothesis = data.hyp_text[i]
            f.write(str(premise) + " " + str(hypothesis) + " " +
                    str(pred) + "\n")
def main():
    """Read pre-computed dependency strings (one per line, sentences
    separated by blank lines), dump them as JSON and hand the file to
    process_deps. Expects the config path as the first CLI argument.
    """
    logging.basicConfig(
        level="INFO",
        format="%(asctime)s : " +
        "%(module)s (%(lineno)s) - %(levelname)s - %(message)s")
    cfg = get_cfg(sys.argv[1])
    text_to_4lang = TextTo4lang(cfg)
    input_fn = cfg.get('text', 'input_sens')
    deps_fn = os.path.join(
        text_to_4lang.deps_dir,
        "{0}.deps".format(os.path.basename(input_fn)))
    # group the dependency strings into sentences at blank lines
    sentences = [[]]
    for raw_line in open(input_fn):
        stripped = raw_line.strip()
        if stripped:
            sentences[-1].append(stripped)
        else:
            sentences.append([])
    with open(deps_fn, 'w') as out_f:
        out_f.write("{0}\n".format(
            json.dumps({"deps": sentences, "corefs": []})))
    text_to_4lang.process_deps(deps_fn)
def _print_scores(label, gold, preds):
    """Print P/R/F for the positive class and confusion-matrix counts."""
    bPrecis, bRecall, bFscore, bSupport = pr(gold, preds)
    print(label)
    print("Precision: " + str(bPrecis[1]))
    print("Recall: " + str(bRecall[1]))
    print("Fscore: " + str(bFscore[1]))
    tn, fp, fn, tp = cm(gold, preds).ravel()
    print("Scores")
    print("TN: " + str(tn))
    print("FP: " + str(fp))
    print("FN: " + str(fn))
    print("TP: " + str(tp))


def run(synonyms, filtering, depth, threshold, language, data_type,
        votes, blacklist, port, combine, wordnet_only=False):
    """Evaluate 4lang entailment (optionally combined with per-language
    voting) on the SemEval data, print scores for both the raw 4lang
    votes and the final predictions, and dump per-pair predictions to
    semeval_output.txt.

    :param language: one of "en", "it", "de".
    :param data_type: "graded" selects the graded dataset.
    :param votes: if true, post-process the 4lang votes with a voter.
    :param wordnet_only: skip 4lang processing and use all-zero votes.
    :raises Exception: for unsupported languages.
    """
    print("Initializing modules...")  # typo fixed ("Initializng")
    graded = data_type == "graded"  # simpler than True if ... else False
    data_frame = read(language, graded=graded)
    supported_languages = ["en", "it", "de"]
    if language not in supported_languages:
        raise Exception("Not supported language")
    text_to_4lang = TextTo4lang(lang=language, port=port)
    if not wordnet_only:
        fourlang_votes = process_fourlang_votes(
            text_to_4lang, language, data_frame, synonyms, filtering,
            depth, threshold, blacklist, combine)
    else:
        fourlang_votes = len(data_frame) * [0]
    if votes:
        if language == "it" or language == "en":
            preds = process(language, data_frame, fourlang_votes)
        else:
            preds = process_de(data_frame, fourlang_votes)
    else:
        preds = fourlang_votes
    # the two previously-duplicated reporting stanzas share one helper
    gold = data_frame.score.tolist()
    _print_scores("4lang", gold, fourlang_votes)
    _print_scores("Voting", gold, preds)
    with open("semeval_output.txt", "w+") as f:
        for i, pred in enumerate(preds):
            premise = data_frame.premise[i]
            hypothesis = data_frame.hypothesis[i]
            f.write(str(premise) + " " + str(hypothesis) + " " +
                    str(pred) + "\n")
def main(self, graded):
    """Start the ELMo and UD services, compute every similarity metric
    for each premise/hypothesis pair, and write one result file per
    metric under ../results/<graded|binary>/<self.lang>/.

    :param graded: if True read the graded dataset, else the binary one.
    """
    # Start services
    self.pid_elmo = subprocess.Popen("{} {} -p 1666".format(
        sys.executable,
        os.path.join(os.path.dirname(__file__),
                     "elmo_service.py")).split(' '))
    # BUG FIX: 'lang' was a bare, undefined name here; use self.lang
    self.pid_ud = subprocess.Popen("{} {} -p 5005 -l {}".format(
        sys.executable,
        os.path.join(os.path.dirname(__file__),
                     "../fourlang/service/ud_service.py"),
        self.lang).split(' '))
    # poll until the UD service answers HTTP requests
    ud_server_up = False
    while not ud_server_up:
        try:
            requests.head("http://127.0.0.1:5005")
            ud_server_up = True
        except requests.exceptions.ConnectionError:
            time.sleep(5)
    # Create objects
    self.similarity = Similarity(self.lang)
    self.text_to_4lang = TextTo4lang(self.lang)
    self.lexicon = Lexicon(self.lang)
    graded_text = "graded" if graded else "binary"
    data = read(self.lang, graded=graded)
    # Initialize similarity output files, one per metric
    result_dir = os.path.join(
        os.path.dirname(__file__),
        "../results/{}/{}".format(graded_text, self.lang))
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)
    # NOTE: the historical key/filename mismatch for asim_jac_word ->
    # asim_jac_bow.txt is preserved on purpose
    file_names = {
        "asim_jac_word": "asim_jac_bow.txt",
        "asim_jac_node": "asim_jac_node.txt",
        "asim_jac_edge": "asim_jac_edge.txt",
        "asim_jac_bow_elmo": "asim_jac_bow_elmo.txt",
        "asim_jac_node_elmo": "asim_jac_node_elmo.txt",
        "asim_jac_edge_elmo": "asim_jac_edge_elmo.txt",
        "elmo_similarity": "elmo_similarity.txt",
    }
    results = {
        key: open(os.path.join(result_dir, name), "w")
        for key, name in file_names.items()}
    try:
        for index, row in data.iterrows():
            premise = row.premise
            hypothesis = row.hypothesis
            graph_premise = self.text_to_4lang.process_text(premise, True)
            graph_hypothesis = self.text_to_4lang.process_text(
                hypothesis, True)
            def_premise = ""
            def_hypothesis = ""
            lemma_premise = \
                self.text_to_4lang.parser_wrapper.lemmatize_word(premise)
            lemma_hypothesis = \
                self.text_to_4lang.parser_wrapper.lemmatize_word(
                    hypothesis)
            # prefer the surface form's definition; append the lemma's
            # definition when it exists and differs
            if premise in self.lexicon.lexicon:
                def_premise = self.lexicon.lexicon[premise]
                if lemma_premise in self.lexicon.lexicon and \
                        self.lexicon.lexicon[lemma_premise] != \
                        self.lexicon.lexicon[premise]:
                    def_premise = " . ".join(
                        [def_premise,
                         self.lexicon.lexicon[lemma_premise]])
            elif lemma_premise in self.lexicon.lexicon:
                def_premise = self.lexicon.lexicon[lemma_premise]
            if hypothesis in self.lexicon.lexicon:
                def_hypothesis = self.lexicon.lexicon[hypothesis]
                if lemma_hypothesis in self.lexicon.lexicon and \
                        self.lexicon.lexicon[lemma_hypothesis] != \
                        self.lexicon.lexicon[hypothesis]:
                    def_hypothesis = " . ".join(
                        [def_hypothesis,
                         self.lexicon.lexicon[lemma_hypothesis]])
            # BUG FIX: this branch used to test lemma_premise while
            # indexing lemma_hypothesis (possible KeyError); mirror the
            # premise branch above
            elif lemma_hypothesis in self.lexicon.lexicon:
                def_hypothesis = self.lexicon.lexicon[lemma_hypothesis]
            results["asim_jac_word"].write(" ".join([
                premise, hypothesis,
                str(self.similarity.asim_jac_words(
                    def_premise, def_hypothesis))]) + "\n")
            results["asim_jac_node"].write(" ".join([
                premise, hypothesis,
                str(self.similarity.asim_jac_nodes(
                    graph_premise, graph_hypothesis))]) + "\n")
            results["asim_jac_edge"].write(" ".join([
                premise, hypothesis,
                str(self.similarity.asim_jac_edges(
                    graph_premise, graph_hypothesis))]) + "\n")
            results["asim_jac_bow_elmo"].write(" ".join([
                premise, hypothesis,
                str(self.similarity.asim_jac_bow_elmo(
                    def_premise, def_hypothesis))]) + "\n")
            results["asim_jac_node_elmo"].write(" ".join([
                premise, hypothesis,
                str(self.similarity.asim_jac_nodes_elmo(
                    premise, hypothesis, graph_premise,
                    graph_hypothesis, def_premise,
                    def_hypothesis))]) + "\n")
            results["asim_jac_edge_elmo"].write(" ".join([
                premise, hypothesis,
                str(self.similarity.asim_jac_edges_elmo(
                    premise, hypothesis, graph_premise,
                    graph_hypothesis, def_premise,
                    def_hypothesis))]) + "\n")
            results["elmo_similarity"].write(" ".join([
                premise, hypothesis,
                str(self.similarity.word_elmo(
                    premise, hypothesis))]) + "\n")
    finally:
        # close the result files (previously leaked) and always shut the
        # services down, on success and on error alike
        for out_f in results.values():
            out_f.close()
        self.pid_elmo.terminate()
        self.pid_ud.terminate()