Example no. 1
import sys

from pymachine.utils import MachineGraph
from fourlang.lexicon import Lexicon


def main():
    lex_fn, word = sys.argv[1:3]
    lex = Lexicon.load_from_binary(lex_fn)
    # Check the core lexicon first, then fall back to the extended one.
    machines = lex.lexicon.get(word, lex.ext_lexicon.get(word))
    if machines is None:
        print('404 :(')
    else:
        # Emit the word's definition graph in DOT format.
        graph = MachineGraph.create_from_machines(machines)
        sys.stdout.write(graph.to_dot())
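The script above writes raw DOT to standard output. A minimal sketch of rendering such output to an image with the graphviz package (the same Source class the commented-out block in Example no. 2 uses; the dog.dot file name is only an assumption for illustration):

from graphviz import Source

# Render a previously saved DOT file to PNG (file name assumed).
with open('dog.dot') as dot_file:
    Source(dot_file.read()).render('dog', format='png', cleanup=True)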
Example no. 2
from tqdm import tqdm

from fourlang.lexicon import Lexicon
from fourlang.text_to_4lang import TextTo4lang  # module path assumed
from fourlang.similarity import Similarity  # module path assumed


# read_data and synonym_graph are helpers defined elsewhere in the original module.
def calculate_output(mode):
    data_frame = read_data(mode)

    lang = mode if mode != "sherliic" else "en"
    text_to_4lang = TextTo4lang(lang=lang)
    lexicon = Lexicon(lang=lang)
    similarity = Similarity(with_embedding=False)
    # Function words to exclude from the expanded graphs, per language.
    blacklist_dict = {"en": ["in", "on", "of"], "de": ["auf", "in"],
                      "it": ["nel", "su", "di"], "sherliic": []}

    with open("{}.txt".format(mode), "w") as f:
        for index in tqdm(range(len(data_frame))):
            premise = data_frame.premise[index]
            hypothesis = data_frame.hypothesis[index]
            score = data_frame.score[index]

            blacklist = blacklist_dict[mode]

            if mode == "sherliic":
                prem_expand = 2
                filt = False
                process_function = text_to_4lang.process_deps
                similarity_function = similarity.asim_jac_edges
                graph_premise = text_to_4lang.process_deps(premise, method="default")
                graph_hypothesis = text_to_4lang.process_deps(hypothesis, method="default")
                syn_premise = [premise]
                syn_hypothesis = [hypothesis]
                premise_root = graph_premise.root.split('_')[0]
                hypothesis_root = graph_hypothesis.root.split('_')[0]
                if premise_root in lexicon.wiktionary_synonyms:
                    syn_premise = synonym_graph(premise, premise_root, lexicon.wiktionary_synonyms[premise_root])
                if hypothesis_root in lexicon.wiktionary_synonyms:
                    syn_hypothesis = synonym_graph(hypothesis, hypothesis_root, lexicon.wiktionary_synonyms[hypothesis_root])
            else:
                prem_expand = 3
                filt = True
                process_function = text_to_4lang.process_text
                similarity_function = similarity.asim_jac_nodes
                # Guard with .get: not every word has Wiktionary synonyms.
                syn_premise = [premise] + lexicon.wiktionary_synonyms.get(premise, [])
                syn_hypothesis = [hypothesis] + lexicon.wiktionary_synonyms.get(hypothesis, [])
            best_match = 0
            for syn_prem in syn_premise:
                for syn_hyp in syn_hypothesis:
                    graph_premise = process_function(syn_prem, method="expand", depth=prem_expand, filt=filt, blacklist=blacklist, black_or_white="black")
                    graph_hypothesis = process_function(syn_hyp, method="expand", depth=1, filt=filt, blacklist=blacklist, black_or_white="black")
                    pred = similarity_function(graph_premise, graph_hypothesis)
                    if pred > best_match:
                        best_match = pred

            """Source(graph_premise.to_dot()).render('nodes_2_1_sherliic_wo_case/{}_{}_premise.gv'.format(
                "-".join(df.premise[index].split(" ")), "-".join(df.hypothesis[index].split(" "))))
            Source(graph_hypothesis.to_dot()).render('nodes_2_1_sherliic_wo_case/{}_{}_hypothesis.gv'.format(
                "-".join(df.premise[index].split(" ")), "-".join(df.hypothesis[index].split(" "))))"""
            f.write("{}\t{}\t{}\t{}\n".format(premise, hypothesis, best_match, score))
Example no. 3
import sys

from pymachine.utils import MachineGraph

from fourlang.lexicon import Lexicon

lexicon = Lexicon.load_from_binary(sys.argv[1])
total = 0
total_size = 0
smallest = 999
largest = 0
# Iterate over all extended-lexicon entries and collect definition-graph sizes.
for word, machines in lexicon.ext_lexicon.items():
    machine = next(iter(machines))
    graph = MachineGraph.create_from_machines([machine])
    size = len(graph.G) - 1  # node count minus one, presumably excluding the word's own node
    if size < 1:
        continue
    total += 1
    total_size += size
    smallest = min(smallest, size)
    largest = max(largest, size)

print('processed {0} graphs'.format(total))
print('average size: {0} nodes'.format(total_size / total))
print('smallest: {0}, largest: {1}'.format(smallest, largest))
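The 999 sentinel and the division by total make this fragile when no graph qualifies. A sketch of the same statistics computed from a collected list instead (the sizes list is a stand-in, not real data):

import statistics

sizes = [5, 12, 3, 8]  # stand-in for the per-graph sizes computed above
if sizes:
    print('processed {0} graphs'.format(len(sizes)))
    print('average size: {0} nodes'.format(statistics.mean(sizes)))
    print('smallest: {0}, largest: {1}'.format(min(sizes), max(sizes)))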
Example no. 4
    # Excerpt: a method of an evaluation class; subprocess, os, sys, time,
    # requests, the fourlang classes and the read helper are imported at
    # module level in the original.
    def main(self, graded):
        # Start services
        self.pid_elmo = subprocess.Popen("{} {} -p 1666".format(
            sys.executable,
            os.path.join(os.path.dirname(__file__),
                         "elmo_service.py")).split(' '))
        self.pid_ud = subprocess.Popen("{} {} -p 5005 -l {}".format(
            sys.executable,
            os.path.join(os.path.dirname(__file__),
                         "../fourlang/service/ud_service.py"),
            self.lang).split(' '))
        ud_server_up = False
        while not ud_server_up:
            try:
                requests.head("http://127.0.0.1:5005")
                ud_server_up = True
            except requests.exceptions.ConnectionError:
                time.sleep(5)

        # Create objects
        self.similarity = Similarity(self.lang)
        self.text_to_4lang = TextTo4lang(self.lang)
        self.lexicon = Lexicon(self.lang)

        graded_text = "graded" if graded else "binary"
        data = read(self.lang, graded=graded)

        # Initialize result files; the word-level asim_jac scores go to
        # asim_jac_bow.txt, as in the original layout.
        result_dir = os.path.join(
            os.path.dirname(__file__),
            "../results/{}/{}".format(graded_text, self.lang))
        os.makedirs(result_dir, exist_ok=True)
        file_names = {
            "asim_jac_word": "asim_jac_bow",
            "asim_jac_node": "asim_jac_node",
            "asim_jac_edge": "asim_jac_edge",
            "asim_jac_bow_elmo": "asim_jac_bow_elmo",
            "asim_jac_node_elmo": "asim_jac_node_elmo",
            "asim_jac_edge_elmo": "asim_jac_edge_elmo",
            "elmo_similarity": "elmo_similarity",
        }
        results = {
            key: open(os.path.join(result_dir, name + ".txt"), "w")
            for key, name in file_names.items()
        }

        for index, row in data.iterrows():
            premise = row.premise
            hypothesis = row.hypothesis

            graph_premise = self.text_to_4lang.process_text(premise, True)
            graph_hypothesis = self.text_to_4lang.process_text(
                hypothesis, True)
            def_premise = ""
            def_hypothesis = ""
            lemma_premise = self.text_to_4lang.parser_wrapper.lemmatize_word(
                premise)
            lemma_hypothesis = self.text_to_4lang.parser_wrapper.lemmatize_word(
                hypothesis)
            if premise in self.lexicon.lexicon:
                def_premise = self.lexicon.lexicon[premise]
                if lemma_premise in self.lexicon.lexicon and self.lexicon.lexicon[
                        lemma_premise] != self.lexicon.lexicon[premise]:
                    def_premise = " . ".join(
                        [def_premise, self.lexicon.lexicon[lemma_premise]])
            elif lemma_premise in self.lexicon.lexicon:
                def_premise = self.lexicon.lexicon[lemma_premise]

            if hypothesis in self.lexicon.lexicon:
                def_hypothesis = self.lexicon.lexicon[hypothesis]
                if lemma_hypothesis in self.lexicon.lexicon and self.lexicon.lexicon[
                        lemma_hypothesis] != self.lexicon.lexicon[hypothesis]:
                    def_hypothesis = " . ".join([
                        def_hypothesis, self.lexicon.lexicon[lemma_hypothesis]
                    ])
            elif lemma_hypothesis in self.lexicon.lexicon:
                def_hypothesis = self.lexicon.lexicon[lemma_hypothesis]
            try:
                results["asim_jac_word"].write(" ".join([
                    premise, hypothesis,
                    str(
                        self.similarity.asim_jac_words(def_premise,
                                                       def_hypothesis))
                ]) + "\n")
                results["asim_jac_node"].write(" ".join([
                    premise, hypothesis,
                    str(
                        self.similarity.asim_jac_nodes(graph_premise,
                                                       graph_hypothesis))
                ]) + "\n")
                results["asim_jac_edge"].write(" ".join([
                    premise, hypothesis,
                    str(
                        self.similarity.asim_jac_edges(graph_premise,
                                                       graph_hypothesis))
                ]) + "\n")
                results["asim_jac_bow_elmo"].write(" ".join([
                    premise, hypothesis,
                    str(
                        self.similarity.asim_jac_bow_elmo(
                            def_premise, def_hypothesis))
                ]) + "\n")
                results["asim_jac_node_elmo"].write(" ".join([
                    premise, hypothesis,
                    str(
                        self.similarity.asim_jac_nodes_elmo(
                            premise, hypothesis, graph_premise,
                            graph_hypothesis, def_premise, def_hypothesis))
                ]) + "\n")
                results["asim_jac_edge_elmo"].write(" ".join([
                    premise, hypothesis,
                    str(
                        self.similarity.asim_jac_edges_elmo(
                            premise, hypothesis, graph_premise,
                            graph_hypothesis, def_premise, def_hypothesis))
                ]) + "\n")
                results["elmo_similarity"].write(" ".join([
                    premise, hypothesis,
                    str(self.similarity.word_elmo(premise, hypothesis))
                ]) + "\n")
            except Exception as e:
                self.pid_elmo.terminate()
                self.pid_ud.terminate()
                raise e
        self.pid_elmo.terminate()
        self.pid_ud.terminate()
        # Close all result files.
        for handle in results.values():
            handle.close()
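The startup loop above retries forever if the UD service never comes up. A small sketch of the same wait-for-service pattern with a timeout (the helper name and parameters are illustrative, not part of the original code):

import time
import requests

def wait_for_service(url, retry_seconds=5, timeout_seconds=300):
    # Poll with HEAD requests until the service answers or the deadline passes.
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        try:
            requests.head(url)
            return True
        except requests.exceptions.ConnectionError:
            time.sleep(retry_seconds)
    return False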
Example no. 5
from fourlang.lexicon import Lexicon
from fourlang.stanford_wrapper import StanfordParser


def check_lexicon(lexicon, parser):
    lemma_not_in_lexicon = []
    lemma_word_def_different = []
    with open("lemma_not_in_lexicon.txt", "w") as lemma_not_in_lexicon_file, \
            open("lemma_word_def_different.txt", "w") as lemma_word_def_different_file:
        for i, word in enumerate(lexicon, start=1):
            lemma = parser.lemmatize_word(word)
            if lemma not in lexicon:
                # The lemma itself has no definition in the lexicon.
                print("{}\t{}\t{}".format(word, lexicon[word], lemma),
                      file=lemma_not_in_lexicon_file)
                lemma_not_in_lexicon.append((word, lemma))
            elif lexicon[lemma] != lexicon[word]:
                # Word and lemma are both defined, but differently.
                print("{}\t{}\t{}\t{}".format(word, lexicon[word], lemma, lexicon[lemma]),
                      file=lemma_word_def_different_file)
                lemma_word_def_different.append({word: lexicon[word], lemma: lexicon[lemma]})
            print(i, len(lexicon))  # progress indicator
    print(len(lemma_not_in_lexicon), len(lemma_word_def_different), len(lexicon))


if __name__ == '__main__':
    sp = StanfordParser()
    en = Lexicon(lang="en")
    check_lexicon(en.lexicon, sp)
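Lemmatizing every surface form with the Stanford parser dominates the runtime here. A minimal sketch of memoizing lemmas with functools.lru_cache, reusing the sp parser from above and assuming lemmatize_word is deterministic (cached_lemma is an illustrative name):

from functools import lru_cache

@lru_cache(maxsize=None)
def cached_lemma(word):
    # Each distinct surface form hits the parser only once.
    return sp.lemmatize_word(word)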