Example #1
def process(self, text):
    # Preprocess the input and run the dependency parser.
    preproc = TextTo4lang.preprocess_text(text)
    deps, corefs, parse_trees = self.parser_wrapper.parse_text(preproc)
    # Build 4lang machines from the dependencies and expand them with
    # definitions from the lexicon.
    machines = self.dep_to_4lang.get_machines_from_deps_and_corefs(
        deps, corefs)
    self.dep_to_4lang.lexicon.expand(machines)
    # Merge the machines into a single graph and emit it in DOT format.
    graph = MachineGraph.create_from_machines(machines.values())
    print(graph.to_dot())
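A minimal usage sketch for the method above, assuming process is a TextTo4lang method and using the lang= constructor seen in Example #5 (Example #4 shows a cfg-based constructor instead); the sentence is illustrative:

text_to_4lang = TextTo4lang(lang="en")
text_to_4lang.process("Dogs chase cats.")  # prints the expanded graph in DOT format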
Example #2
def calculate_output(mode):
    data_frame = read_data(mode)
    
    lang = mode if mode != "sherliic" else "en"
    text_to_4lang = TextTo4lang(lang=lang)
    lexicon = Lexicon(lang=lang)
    similarity = Similarity(with_embedding=False)
    blacklist_dict = {
        "en": ["in", "on", "of"],
        "de": ["auf", "in"],
        "it": ["nel", "su", "di"],
        "sherliic": []
    }
    
    with open("{}.txt".format(mode), "w") as f:
        for index in tqdm(range(len(data_frame))):
            premise = data_frame.premise[index]
            hypothesis = data_frame.hypothesis[index]
            score = data_frame.score[index]

            blacklist = blacklist_dict[mode]

            if mode == "sherliic":
                prem_expand = 2
                filt = False
                process_function = text_to_4lang.process_deps
                similarity_function = similarity.asim_jac_edges
                graph_premise = text_to_4lang.process_deps(premise, method="default")
                graph_hypothesis = text_to_4lang.process_deps(hypothesis, method="default")
                syn_premise = [premise]
                syn_hypothesis = [hypothesis]
                premise_root = graph_premise.root.split('_')[0]
                hypothesis_root = graph_hypothesis.root.split('_')[0]
                if premise_root in lexicon.wiktionary_synonyms:
                    syn_premise = synonym_graph(premise, premise_root, lexicon.wiktionary_synonyms[premise_root])
                if hypothesis_root in lexicon.wiktionary_synonyms:
                    syn_hypothesis = synonym_graph(hypothesis, hypothesis_root, lexicon.wiktionary_synonyms[hypothesis_root])
            else:
                prem_expand = 3
                filt = True
                process_function = text_to_4lang.process_text
                similarity_function = similarity.asim_jac_nodes
                syn_premise = [premise] + lexicon.wiktionary_synonyms.get(premise, [])
                syn_hypothesis = [hypothesis] + lexicon.wiktionary_synonyms.get(hypothesis, [])
            best_match = 0
            for syn_prem in syn_premise:
                for syn_hyp in syn_hypothesis:
                    graph_premise = process_function(syn_prem, method="expand", depth=prem_expand, filt=filt, blacklist=blacklist, black_or_white="black")
                    graph_hypothesis = process_function(syn_hyp, method="expand", depth=1, filt=filt, blacklist=blacklist, black_or_white="black")
                    pred = similarity_function(graph_premise, graph_hypothesis)
                    if pred > best_match:
                        best_match = pred

            """Source(graph_premise.to_dot()).render('nodes_2_1_sherliic_wo_case/{}_{}_premise.gv'.format(
                "-".join(df.premise[index].split(" ")), "-".join(df.hypothesis[index].split(" "))))
            Source(graph_hypothesis.to_dot()).render('nodes_2_1_sherliic_wo_case/{}_{}_hypothesis.gv'.format(
                "-".join(df.premise[index].split(" ")), "-".join(df.hypothesis[index].split(" "))))"""
            f.write("{}\t{}\t{}\t{}\n".format(premise, hypothesis, best_match, score))
Example #3
def text_to_4lang_demo(self, text, expand, fn='pic', dep_fn='deps'):
    preproc_sen = TextTo4lang.preprocess_text(text.strip())
    deps, corefs = self.parser_wrapper.parse_text(preproc_sen)
    words2machines = self.dep_to_4lang.get_machines_from_deps_and_corefs(
        deps, corefs)
    # record the nodes present before expansion (passed to draw_text_graph
    # so the original machines can be told apart from expansion nodes)
    orig_machines = set()
    for machine in words2machines.values():
        orig_machines |= set(MachineTraverser.get_nodes(
            machine, names_only=False, keep_upper=True))
    if expand:
        self.dep_to_4lang.lexicon.expand(words2machines)
    pic_path = draw_text_graph(
        words2machines, self.tmp_dir, fn=fn,
        orig_machines=orig_machines)
    dep_path = draw_dep_graph(deps[0], self.tmp_dir, dep_fn)
    return os.path.basename(dep_path), os.path.basename(pic_path)
Example #4
def main():
    logging.basicConfig(
        level="INFO",
        format="%(asctime)s : " +
        "%(module)s (%(lineno)s) - %(levelname)s - %(message)s")
    cfg_file = sys.argv[1]
    cfg = get_cfg(cfg_file)
    text_to_4lang = TextTo4lang(cfg)
    fn = cfg.get('text', 'input_sens')
    base_fn = os.path.basename(fn)
    deps_fn = os.path.join(text_to_4lang.deps_dir, "{0}.deps".format(base_fn))

    if text_to_4lang.lang == 'hu':
        id_field, word_field, lemma_field, msd_field, gov_field, dep_field = (
            0, 1, 3, 4, -4, -2)
    else:
        id_field, word_field, lemma_field, msd_field, gov_field, dep_field = (
            0, 1, None, None, -4, -3)

    with open(fn) as in_f:
        deps = [
            get_dependencies(s, id_field, word_field, lemma_field,
                             msd_field, gov_field, dep_field)
            for s in sentence_iterator(in_f)]

    if text_to_4lang.lang == 'en':
        c_deps = []
        for sen in deps:
            c_deps.append([])
            for d in sen:
                c_deps[-1].append(
                    (d['type'], (d['gov']['word'], d['gov']['id']),
                     (d['dep']['word'], d['dep']['id'])))
                # convert to old deps (for now, see issue #51)
    else:
        c_deps = deps
    with open(deps_fn, 'w') as out_f:
        out_f.write("{0}\n".format(json.dumps({"deps": c_deps, "corefs": []})))

    text_to_4lang.process_deps(deps_fn)
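The "old" dependency format written above stores each sentence as a list of (type, (governor_word, governor_id), (dependent_word, dependent_id)) triples. A hypothetical single-sentence entry of c_deps:

# "The dog barks", with illustrative word indices:
[("det", ("dog", 2), ("The", 1)),
 ("nsubj", ("barks", 3), ("dog", 2))]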
Example #5
def run(synonyms,
        depth,
        threshold,
        combine,
        dataset="dev",
        blacklist=["in", "of", "on"]):
    print("Initializng modules...")
    text_to_4lang = TextTo4lang(lang="en")
    data = read_sherliic("data/" + dataset + ".csv",
                         ud_path="data/relation_index.tsv",
                         keep_context=True)
    data_frame = build_graph(data)
    data['premise_text'] = data["prem_argleft"] + " " + \
        data["premise"] + " " + data["prem_argright"]
    data['hyp_text'] = data["hypo_argleft"] + " " + \
        data["hypothesis"] + " " + data["hypo_argright"]
    preds = process(text_to_4lang, data_frame, synonyms, depth, threshold,
                    combine, blacklist)

    bPrecis, bRecall, bFscore, bSupport = pr(data_frame.score.tolist(), preds)

    print("Precision: " + str(bPrecis[1]))
    print("Recall: " + str(bRecall[1]))
    print("Fscore: " + str(bFscore[1]))

    tn, fp, fn, tp = cm(data_frame.score.tolist(), preds).ravel()
    print("Scores")
    print("TN: " + str(tn))
    print("FP: " + str(fp))
    print("FN: " + str(fn))
    print("TP: " + str(tp))

    with open("sherlic_output.txt", "w+") as f:
        for i, pred in enumerate(preds):
            premise = data.premise_text[i]
            hypothesis = data.hyp_text[i]
            f.write(
                str(premise) + " " + str(hypothesis) + " " + str(pred) + "\n")
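pr and cm are not defined in these snippets, but their call signatures match scikit-learn's metrics, so a plausible set of imports (an assumption, not confirmed by the source) is:

# Assumed aliases: pr(...) returns per-class arrays, so the [1] indexing above
# selects the positive class; cm(...).ravel() yields tn, fp, fn, tp for binary labels.
from sklearn.metrics import precision_recall_fscore_support as pr
from sklearn.metrics import confusion_matrix as cm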
Example #6
def main():
    logging.basicConfig(
        level="INFO",
        format="%(asctime)s : " +
        "%(module)s (%(lineno)s) - %(levelname)s - %(message)s")
    cfg_file = sys.argv[1]
    cfg = get_cfg(cfg_file)
    text_to_4lang = TextTo4lang(cfg)
    fn = cfg.get('text', 'input_sens')
    base_fn = os.path.basename(fn)
    deps_fn = os.path.join(text_to_4lang.deps_dir, "{0}.deps".format(base_fn))

    deps = [[]]
    with open(fn) as in_f:
        # blank lines separate sentences; collect one list of dep strings per sentence
        for line in in_f:
            dep_str = line.strip()
            if not dep_str:
                deps.append([])
            else:
                deps[-1].append(dep_str)

    with open(deps_fn, 'w') as out_f:
        out_f.write("{0}\n".format(json.dumps({"deps": deps, "corefs": []})))

    text_to_4lang.process_deps(deps_fn)
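The loop above expects blank-line-separated sentence blocks with one dependency string per line, so a hypothetical input_sens file might look like this (Stanford-style dependency strings are an assumption):

nsubj(barks-2, dog-1)
det(dog-1, The-1)

nsubj(sleeps-2, cat-1)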
Example #7
def run(synonyms,
        filtering,
        depth,
        threshold,
        language,
        data_type,
        votes,
        blacklist,
        port,
        combine,
        wordnet_only=False):
    print("Initializng modules...")
    graded = True if data_type == "graded" else False
    data_frame = read(language, graded=graded)
    supported_languages = ["en", "it", "de"]
    if language not in supported_languages:
        raise ValueError("Unsupported language: {}".format(language))
    text_to_4lang = TextTo4lang(lang=language, port=port)

    if not wordnet_only:
        fourlang_votes = process_fourlang_votes(text_to_4lang, language,
                                                data_frame, synonyms,
                                                filtering, depth, threshold,
                                                blacklist, combine)
    else:
        fourlang_votes = len(data_frame) * [0]
    if votes:
        if language == "it" or language == "en":
            preds = process(language, data_frame, fourlang_votes)
        else:
            preds = process_de(data_frame, fourlang_votes)
    else:
        preds = fourlang_votes

    bPrecis, bRecall, bFscore, bSupport = pr(data_frame.score.tolist(),
                                             fourlang_votes)

    print("4lang")
    print("Precision: " + str(bPrecis[1]))
    print("Recall: " + str(bRecall[1]))
    print("Fscore: " + str(bFscore[1]))

    tn, fp, fn, tp = cm(data_frame.score.tolist(), fourlang_votes).ravel()
    print("Scores")
    print("TN: " + str(tn))
    print("FP: " + str(fp))
    print("FN: " + str(fn))
    print("TP: " + str(tp))

    bPrecis, bRecall, bFscore, bSupport = pr(data_frame.score.tolist(), preds)

    print("Voting")
    print("Precision: " + str(bPrecis[1]))
    print("Recall: " + str(bRecall[1]))
    print("Fscore: " + str(bFscore[1]))

    tn, fp, fn, tp = cm(data_frame.score.tolist(), preds).ravel()
    print("Scores")
    print("TN: " + str(tn))
    print("FP: " + str(fp))
    print("FN: " + str(fn))
    print("TP: " + str(tp))

    with open("semeval_output.txt", "w+") as f:
        for i, pred in enumerate(preds):
            premise = data_frame.premise[i]
            hypothesis = data_frame.hypothesis[i]
            f.write(
                str(premise) + " " + str(hypothesis) + " " + str(pred) + "\n")
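The snippet does not show how process combines the 4lang votes with the other predictors; one plausible combination scheme (illustrative only, not the repository's confirmed method) is a simple majority vote over per-system binary predictions:

def majority_vote(*prediction_lists):
    # Predict 1 when more than half of the systems vote 1.
    combined = []
    for votes in zip(*prediction_lists):
        combined.append(1 if sum(votes) > len(votes) / 2 else 0)
    return combined

# majority_vote([1, 0, 1], [1, 1, 0], [0, 0, 1]) -> [1, 0, 1]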
Example #8
    def main(self, graded):
        # Start services
        self.pid_elmo = subprocess.Popen("{} {} -p 1666".format(
            sys.executable,
            os.path.join(os.path.dirname(__file__),
                         "elmo_service.py")).split(' '))
        self.pid_ud = subprocess.Popen("{} {} -p 5005 -l {}".format(
            sys.executable,
            os.path.join(os.path.dirname(__file__),
                         "../fourlang/service/ud_service.py"),
            self.lang).split(' '))
        ud_server_up = False
        while not ud_server_up:
            try:
                requests.head("http://127.0.0.1:5005")
                ud_server_up = True
            except requests.exceptions.ConnectionError:
                time.sleep(5)

        # Create objects
        self.similarity = Similarity(self.lang)
        self.text_to_4lang = TextTo4lang(self.lang)
        self.lexicon = Lexicon(self.lang)

        graded_text = "graded" if graded else "binary"
        data = read(self.lang, graded=graded)

        # Initialize the similarity output files
        results_dir = os.path.join(
            os.path.dirname(__file__),
            "../results/{}/{}".format(graded_text, self.lang))
        if not os.path.exists(results_dir):
            os.makedirs(results_dir)
        result_files = {
            "asim_jac_word": "asim_jac_bow.txt",
            "asim_jac_node": "asim_jac_node.txt",
            "asim_jac_edge": "asim_jac_edge.txt",
            "asim_jac_bow_elmo": "asim_jac_bow_elmo.txt",
            "asim_jac_node_elmo": "asim_jac_node_elmo.txt",
            "asim_jac_edge_elmo": "asim_jac_edge_elmo.txt",
            "elmo_similarity": "elmo_similarity.txt",
        }
        results = {
            key: open(os.path.join(results_dir, name), "w")
            for key, name in result_files.items()}

        for index, row in data.iterrows():
            premise = row.premise
            hypothesis = row.hypothesis

            graph_premise = self.text_to_4lang.process_text(premise, True)
            graph_hypothesis = self.text_to_4lang.process_text(
                hypothesis, True)
            def_premise = ""
            def_hypothesis = ""
            lemma_premise = self.text_to_4lang.parser_wrapper.lemmatize_word(
                premise)
            lemma_hypothesis = self.text_to_4lang.parser_wrapper.lemmatize_word(
                hypothesis)
            if premise in self.lexicon.lexicon:
                def_premise = self.lexicon.lexicon[premise]
                if lemma_premise in self.lexicon.lexicon and self.lexicon.lexicon[
                        lemma_premise] != self.lexicon.lexicon[premise]:
                    def_premise = " . ".join(
                        [def_premise, self.lexicon.lexicon[lemma_premise]])
            elif lemma_premise in self.lexicon.lexicon:
                def_premise = self.lexicon.lexicon[lemma_premise]

            if hypothesis in self.lexicon.lexicon:
                def_hypothesis = self.lexicon.lexicon[hypothesis]
                if lemma_hypothesis in self.lexicon.lexicon and self.lexicon.lexicon[
                        lemma_hypothesis] != self.lexicon.lexicon[hypothesis]:
                    def_hypothesis = " . ".join([
                        def_hypothesis, self.lexicon.lexicon[lemma_hypothesis]
                    ])
            elif lemma_hypothesis in self.lexicon.lexicon:
                def_hypothesis = self.lexicon.lexicon[lemma_hypothesis]
            try:
                results["asim_jac_word"].write(" ".join([
                    premise, hypothesis,
                    str(
                        self.similarity.asim_jac_words(def_premise,
                                                       def_hypothesis))
                ]) + "\n")
                results["asim_jac_node"].write(" ".join([
                    premise, hypothesis,
                    str(
                        self.similarity.asim_jac_nodes(graph_premise,
                                                       graph_hypothesis))
                ]) + "\n")
                results["asim_jac_edge"].write(" ".join([
                    premise, hypothesis,
                    str(
                        self.similarity.asim_jac_edges(graph_premise,
                                                       graph_hypothesis))
                ]) + "\n")
                results["asim_jac_bow_elmo"].write(" ".join([
                    premise, hypothesis,
                    str(
                        self.similarity.asim_jac_bow_elmo(
                            def_premise, def_hypothesis))
                ]) + "\n")
                results["asim_jac_node_elmo"].write(" ".join([
                    premise, hypothesis,
                    str(
                        self.similarity.asim_jac_nodes_elmo(
                            premise, hypothesis, graph_premise,
                            graph_hypothesis, def_premise, def_hypothesis))
                ]) + "\n")
                results["asim_jac_edge_elmo"].write(" ".join([
                    premise, hypothesis,
                    str(
                        self.similarity.asim_jac_edges_elmo(
                            premise, hypothesis, graph_premise,
                            graph_hypothesis, def_premise, def_hypothesis))
                ]) + "\n")
                results["elmo_similarity"].write(" ".join([
                    premise, hypothesis,
                    str(self.similarity.word_elmo(premise, hypothesis))
                ]) + "\n")
            except Exception as e:
                self.pid_elmo.terminate()
                self.pid_ud.terminate()
                raise e
        self.pid_elmo.terminate()
        self.pid_ud.terminate()
        for out_f in results.values():
            out_f.close()
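The snippet manages the service processes and result files by hand; a sketch of the same file bookkeeping with contextlib.ExitStack (metric names and paths are illustrative stand-ins), which guarantees every result file is closed even if a similarity call raises:

from contextlib import ExitStack

metric_names = ["asim_jac_bow", "asim_jac_node", "asim_jac_edge"]
with ExitStack() as stack:
    # Files registered with the stack are closed automatically when the block
    # exits, even mid-loop on an exception.
    results = {
        name: stack.enter_context(open("{}.txt".format(name), "w"))
        for name in metric_names
    }
    for handle in results.values():
        handle.write("premise hypothesis 0.0\n")  # placeholder output line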