import sys

from pymachine.utils import MachineGraph
from fourlang.lexicon import Lexicon


def main():
    # Look up a word in the lexicon (falling back to the extended lexicon)
    # and dump its definition graph in dot format.
    lex_fn, word = sys.argv[1:3]
    lex = Lexicon.load_from_binary(lex_fn)
    machines = lex.lexicon.get(word, lex.ext_lexicon.get(word))
    if machines is None:
        print('404 :(')
    else:
        graph = MachineGraph.create_from_machines(machines)
        sys.stdout.write(graph.to_dot())


if __name__ == '__main__':
    main()
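# A minimal sketch of the same lookup used as a library rather than a script;
# the lexicon path "data/lexicon.bin" and the word "dog" are placeholder
# values, not paths shipped with the repository.
from pymachine.utils import MachineGraph
from fourlang.lexicon import Lexicon

lex = Lexicon.load_from_binary("data/lexicon.bin")  # hypothetical path
machines = lex.lexicon.get("dog", lex.ext_lexicon.get("dog"))
if machines is not None:
    print(MachineGraph.create_from_machines(machines).to_dot())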
def calculate_output(mode):
    # Score premise-hypothesis pairs with 4lang graph similarity and write
    # the predictions to <mode>.txt.
    data_frame = read_data(mode)
    lang = mode if mode != "sherliic" else "en"
    text_to_4lang = TextTo4lang(lang=lang)
    lexicon = Lexicon(lang=lang)
    similarity = Similarity(with_embedding=False)
    blacklist_dict = {"en": ["in", "on", "of"],
                      "de": ["auf", "in"],
                      "it": ["nel", "su", "di"],
                      "sherliic": []}
    with open("{}.txt".format(mode), "w") as f:
        for index in tqdm(range(len(data_frame))):
            premise = data_frame.premise[index]
            hypothesis = data_frame.hypothesis[index]
            score = data_frame.score[index]
            blacklist = blacklist_dict[mode]
            if mode == "sherliic":
                prem_expand = 2
                filt = False
                process_function = text_to_4lang.process_deps
                similarity_function = similarity.asim_jac_edges
                graph_premise = text_to_4lang.process_deps(
                    premise, method="default")
                graph_hypothesis = text_to_4lang.process_deps(
                    hypothesis, method="default")
                syn_premise = [premise]
                syn_hypothesis = [hypothesis]
                premise_root = graph_premise.root.split('_')[0]
                hypothesis_root = graph_hypothesis.root.split('_')[0]
                if premise_root in lexicon.wiktionary_synonyms:
                    syn_premise = synonym_graph(
                        premise, premise_root,
                        lexicon.wiktionary_synonyms[premise_root])
                if hypothesis_root in lexicon.wiktionary_synonyms:
                    syn_hypothesis = synonym_graph(
                        hypothesis, hypothesis_root,
                        lexicon.wiktionary_synonyms[hypothesis_root])
            else:
                prem_expand = 3
                filt = True
                process_function = text_to_4lang.process_text
                similarity_function = similarity.asim_jac_nodes
                syn_premise = [premise] + lexicon.wiktionary_synonyms[premise]
                syn_hypothesis = [hypothesis] + \
                    lexicon.wiktionary_synonyms[hypothesis]
            # Take the best score over all synonym pairs.
            best_match = 0
            for syn_prem in syn_premise:
                for syn_hyp in syn_hypothesis:
                    graph_premise = process_function(
                        syn_prem, method="expand", depth=prem_expand,
                        filt=filt, blacklist=blacklist, black_or_white="black")
                    graph_hypothesis = process_function(
                        syn_hyp, method="expand", depth=1, filt=filt,
                        blacklist=blacklist, black_or_white="black")
                    pred = similarity_function(graph_premise, graph_hypothesis)
                    if pred > best_match:
                        best_match = pred
            # Optional debug rendering of the graphs, kept disabled:
            # Source(graph_premise.to_dot()).render(
            #     'nodes_2_1_sherliic_wo_case/{}_{}_premise.gv'.format(
            #         "-".join(data_frame.premise[index].split(" ")),
            #         "-".join(data_frame.hypothesis[index].split(" "))))
            # Source(graph_hypothesis.to_dot()).render(
            #     'nodes_2_1_sherliic_wo_case/{}_{}_hypothesis.gv'.format(
            #         "-".join(data_frame.premise[index].split(" ")),
            #         "-".join(data_frame.hypothesis[index].split(" "))))
            f.write("{}\t{}\t{}\t{}\n".format(
                premise, hypothesis, best_match, score))
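# The asim_jac_* measures used above are defined elsewhere; assuming they
# compute an asymmetric Jaccard score (overlap divided by the size of the
# hypothesis side), a self-contained sketch of the idea:
def asym_jaccard(premise_items, hypothesis_items):
    # Fraction of the hypothesis covered by the premise, in [0, 1].
    hyp = set(hypothesis_items)
    if not hyp:
        return 0.0
    return len(set(premise_items) & hyp) / len(hyp)


# Toy edge sets: the premise graph covers the single hypothesis edge.
premise_edges = {("bird", 0, "animal"), ("bird", 1, "wing")}
hypothesis_edges = {("bird", 0, "animal")}
print(asym_jaccard(premise_edges, hypothesis_edges))  # 1.0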
import sys

from pymachine.utils import MachineGraph
from fourlang.lexicon import Lexicon

lexicon = Lexicon.load_from_binary(sys.argv[1])
total = 0
total_size = 0
smallest = 999
largest = 0
# Size statistics over the definition graphs in the extended lexicon;
# graphs with no nodes besides the root are skipped.
for word, machines in lexicon.ext_lexicon.items():
    machine = next(iter(machines))
    graph = MachineGraph.create_from_machines([machine])
    size = len(graph.G) - 1
    if size < 1:
        continue
    total += 1
    total_size += size
    smallest = min(smallest, size)
    largest = max(largest, size)

print('processed {0} graphs'.format(total))
print('average size: {0} nodes'.format(total_size / float(total)))
print('smallest: {0}, largest: {1}'.format(smallest, largest))
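# If a full size distribution is needed rather than just min/mean/max, the
# same loop can feed a Counter; a sketch under the same assumptions about
# lexicon.ext_lexicon as above:
from collections import Counter


def size_distribution(lexicon):
    # Map graph size -> number of definition graphs with that size.
    sizes = Counter()
    for word, machines in lexicon.ext_lexicon.items():
        graph = MachineGraph.create_from_machines([next(iter(machines))])
        size = len(graph.G) - 1
        if size >= 1:
            sizes[size] += 1
    return sizes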
def main(self, graded):
    # Start the ELMo embedding and UD parser services as subprocesses.
    self.pid_elmo = subprocess.Popen("{} {} -p 1666".format(
        sys.executable,
        os.path.join(os.path.dirname(__file__), "elmo_service.py")).split(' '))
    self.pid_ud = subprocess.Popen("{} {} -p 5005 -l {}".format(
        sys.executable,
        os.path.join(os.path.dirname(__file__),
                     "../fourlang/service/ud_service.py"),
        self.lang).split(' '))

    # Wait until the UD service answers HTTP requests.
    ud_server_up = False
    while not ud_server_up:
        try:
            requests.head("http://127.0.0.1:5005")
            ud_server_up = True
        except requests.exceptions.ConnectionError:
            time.sleep(5)

    # Create objects
    self.similarity = Similarity(self.lang)
    self.text_to_4lang = TextTo4lang(self.lang)
    self.lexicon = Lexicon(self.lang)

    graded_text = "graded" if graded else "binary"
    data = read(self.lang, graded=graded)

    # One output file per similarity measure.
    result_dir = os.path.join(os.path.dirname(__file__),
                              "../results/{}/{}".format(graded_text, self.lang))
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)
    file_names = {
        "asim_jac_word": "asim_jac_bow.txt",
        "asim_jac_node": "asim_jac_node.txt",
        "asim_jac_edge": "asim_jac_edge.txt",
        "asim_jac_bow_elmo": "asim_jac_bow_elmo.txt",
        "asim_jac_node_elmo": "asim_jac_node_elmo.txt",
        "asim_jac_edge_elmo": "asim_jac_edge_elmo.txt",
        "elmo_similarity": "elmo_similarity.txt",
    }
    results = {key: open(os.path.join(result_dir, file_name), "w")
               for key, file_name in file_names.items()}

    for index, row in data.iterrows():
        premise = row.premise
        hypothesis = row.hypothesis
        graph_premise = self.text_to_4lang.process_text(premise, True)
        graph_hypothesis = self.text_to_4lang.process_text(hypothesis, True)

        # Look up definitions for each word and its lemma; if both exist
        # and differ, concatenate them.
        def_premise = ""
        def_hypothesis = ""
        lemma_premise = self.text_to_4lang.parser_wrapper.lemmatize_word(
            premise)
        lemma_hypothesis = self.text_to_4lang.parser_wrapper.lemmatize_word(
            hypothesis)
        if premise in self.lexicon.lexicon:
            def_premise = self.lexicon.lexicon[premise]
            if (lemma_premise in self.lexicon.lexicon and
                    self.lexicon.lexicon[lemma_premise] !=
                    self.lexicon.lexicon[premise]):
                def_premise = " . ".join(
                    [def_premise, self.lexicon.lexicon[lemma_premise]])
        elif lemma_premise in self.lexicon.lexicon:
            def_premise = self.lexicon.lexicon[lemma_premise]
        if hypothesis in self.lexicon.lexicon:
            def_hypothesis = self.lexicon.lexicon[hypothesis]
            if (lemma_hypothesis in self.lexicon.lexicon and
                    self.lexicon.lexicon[lemma_hypothesis] !=
                    self.lexicon.lexicon[hypothesis]):
                def_hypothesis = " . ".join(
                    [def_hypothesis, self.lexicon.lexicon[lemma_hypothesis]])
        elif lemma_hypothesis in self.lexicon.lexicon:
            def_hypothesis = self.lexicon.lexicon[lemma_hypothesis]

        try:
            results["asim_jac_word"].write(" ".join([
                premise, hypothesis,
                str(self.similarity.asim_jac_words(
                    def_premise, def_hypothesis))]) + "\n")
            results["asim_jac_node"].write(" ".join([
                premise, hypothesis,
                str(self.similarity.asim_jac_nodes(
                    graph_premise, graph_hypothesis))]) + "\n")
            results["asim_jac_edge"].write(" ".join([
                premise, hypothesis,
                str(self.similarity.asim_jac_edges(
                    graph_premise, graph_hypothesis))]) + "\n")
            results["asim_jac_bow_elmo"].write(" ".join([
                premise, hypothesis,
                str(self.similarity.asim_jac_bow_elmo(
                    def_premise, def_hypothesis))]) + "\n")
            results["asim_jac_node_elmo"].write(" ".join([
                premise, hypothesis,
                str(self.similarity.asim_jac_nodes_elmo(
                    premise, hypothesis, graph_premise, graph_hypothesis,
                    def_premise, def_hypothesis))]) + "\n")
            results["asim_jac_edge_elmo"].write(" ".join([
                premise, hypothesis,
                str(self.similarity.asim_jac_edges_elmo(
                    premise, hypothesis, graph_premise, graph_hypothesis,
                    def_premise, def_hypothesis))]) + "\n")
            results["elmo_similarity"].write(" ".join([
                premise, hypothesis,
                str(self.similarity.word_elmo(premise, hypothesis))]) + "\n")
        except Exception:
            # Stop the background services even on failure.
            self.pid_elmo.terminate()
            self.pid_ud.terminate()
            raise

    for out in results.values():
        out.close()
    self.pid_elmo.terminate()
    self.pid_ud.terminate()
from fourlang.lexicon import Lexicon
from fourlang.stanford_wrapper import StanfordParser


def check_lexicon(lexicon):
    # Report words whose lemma is missing from the lexicon, and words whose
    # definition differs from their lemma's definition.
    lemma_not_in_lexicon_file = open("lemma_not_in_lexicon.txt", "w")
    lemma_word_def_different_file = open("lemma_word_def_different.txt", "w")
    lemma_not_in_lexicon = []
    lemma_word_def_different = []
    i = 0
    for word in lexicon:
        lemma = sp.lemmatize_word(word)
        if lemma not in lexicon:
            print("{}\t{}\t{}".format(word, lexicon[word], lemma),
                  file=lemma_not_in_lexicon_file)
            lemma_not_in_lexicon.append((word, lemma))
        elif lexicon[lemma] != lexicon[word]:
            print("{}\t{}\t{}\t{}".format(
                word, lexicon[word], lemma, lexicon[lemma]),
                file=lemma_word_def_different_file)
            lemma_word_def_different.append(
                {word: lexicon[word], lemma: lexicon[lemma]})
        i += 1
        print(i, len(lexicon))  # progress
    print(len(lemma_not_in_lexicon), len(lemma_word_def_different),
          len(lexicon))


if __name__ == '__main__':
    sp = StanfordParser()
    en = Lexicon(lang="en")
    check_lexicon(en.lexicon)
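# A quick sanity check on the report files the script writes, using only the
# output file names defined above (a sketch, run after check_lexicon):
def count_lines(path):
    with open(path) as report:
        return sum(1 for _ in report)


print(count_lines("lemma_not_in_lexicon.txt"),
      count_lines("lemma_word_def_different.txt"))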