def run(sentence, language='en'):
    """Parse a single sentence into an AMR graph string.

    Runs the language-specific NLP pipeline script on *sentence*, feeds the
    resulting parse through the transition system in PARSE mode, and returns
    the serialized AMR block ("# ::snt" and "# ::alignments" metadata lines
    followed by the graph). Returns None when the language is unsupported or
    when any stage of parsing fails.
    """
    # (model directory, parser shell script) for each supported language.
    pipelines = {
        'en': ("ENGLISH", './corenlp.sh'),
        'it': ("ITALIAN", './tintnlp.sh'),
        'es': ("SPANISH", './freelingnlp.sh'),
        'de': ("GERMAN", './corenlp_de.sh'),
        'zh': ("CHINESE", './corenlp_zh.sh'),
    }
    if language not in pipelines:
        # The original fell through the if/elif chain leaving 'parse' and
        # 'model_dir' undefined; the bare except then silently turned the
        # NameError into None. Make that failure mode explicit.
        return None
    model_dir, script = pipelines[language]

    try:
        parse = subprocess.check_output([script, sentence])
        dependencies, tokens = preprocessing.run_single(parse, language)
        Resources.init_table(model_dir, False)
        embs = Embs("resources_" + language + "/", model_dir)
        data = (copy.deepcopy(tokens), copy.deepcopy(dependencies))

        # Tokens may be multiword units joined by "_"; flatten them and
        # remember each token's span in the flattened sentence so the
        # alignments can be reported in surface-token offsets.
        ununderscored = []
        sent_ranges = {}
        pos = 0
        for tok in tokens:
            units = tok.word.split("_")
            sent_ranges[tok] = str(pos) + "-" + str(pos + len(units))
            ununderscored.extend(units)
            pos += len(units)

        ts = TransitionSystem(embs, data, "PARSE", language, model_dir)
        triples = ts.relations()
        snt_line = "# ::snt " + " ".join(ununderscored)

        if triples == []:
            # No relations were produced: emit a placeholder graph.
            return snt_line + "\n" + "(v / emptygraph)\n"

        graph, graph_indexes, nodes = to_string(triples, "TOP")
        output = str(graph)
        if not output.startswith("("):
            # Single-node result; wrap it so the output is a valid graph.
            return snt_line + "\n" + "(v / " + output + ")"

        # Build the alignments line: "<token-range>|<idx>+<idx> ..." for every
        # token that is aligned to at least one graph node.
        align_line = ""
        for tok, aligned in ts.alignments():
            if len(aligned) > 0:
                align_line += sent_ranges[tok] + "|"
                for n in aligned:
                    for gi in graph_indexes[n]:
                        align_line += gi + "+"
                align_line = align_line[0:-1] + " "

        return snt_line + "\n# ::alignments " + align_line + "\n" + output
    except Exception:
        # Parsing is best-effort: callers treat None as "no graph". The
        # original bare except also masked KeyboardInterrupt/SystemExit.
        return None
# Example #2
def create(prefix, split, language, model_dir):
    print "Loading data.."
    alltokens = pickle.load(open(prefix + ".tokens.p", "rb"))
    alldependencies = pickle.load(open(prefix + ".dependencies.p", "rb"))
    allalignments = pickle.load(open(prefix + ".alignments.p", "rb"))
    allrelations = pickle.load(open(prefix + ".relations.p", "rb"))
    print "Number of sentences ", len(alltokens)

    labels = {}
    labels_counter = 1
    for line in open(model_dir + "/relations.txt"):
        labels[line.strip()] = labels_counter
        labels_counter += 1

    dataset = open(model_dir + "/dataset_" + split + ".txt", "w")
    labels_dataset = open(model_dir + "/labels_dataset_" + split + ".txt", "w")
    reentr_dataset = open(model_dir + "/reentr_dataset_" + split + ".txt", "w")

    counter = 0
    resources_dir = "resources_" + language
    embs = Embs(resources_dir, model_dir)
    for tokens, dependencies, alignments, relations in zip(
            alltokens, alldependencies, allalignments, allrelations):
        counter += 1
        print "Sentence no: ", counter
        data = (tokens, dependencies, relations, alignments)
        t = TransitionSystem(embs, data, "TRAIN", language)
        for feats, action in t.statesactions():
            f_rel, f_lab, f_reentr = feats
            # What's in f_*? They are each a representation of the configurations, but what does the numbers represent? A: Feature representations e.g.
            for f_cat in f_rel:
                for v in f_cat:
                    dataset.write(str(v) + ",")
            dataset.write(str(action.get_id()) + "\n")

            if action.name.endswith("arc"):
                if action.argv in labels:
                    for f_cat in f_lab:
                        for v in f_cat:
                            labels_dataset.write(str(v) + ",")
                    labels_dataset.write(str(labels[action.argv]) + "\n")

            if action.name == "reduce":
                if action.argv is not None:
                    for sib, vec in zip(action.argv[2], f_reentr):
                        for f_cat in vec:
                            for v in f_cat:
                                reentr_dataset.write(str(v) + ",")
                        if sib == action.argv[0]:
                            reentr_dataset.write(str(1) + "\n")
                        else:
                            reentr_dataset.write(str(2) + "\n")
def collect(prefix, language, model_dir):
    Resources.init_table(model_dir, True)

    print "Loading data.."
    # tokens, dependencies, alignments, relations for each sentece in the dataset.
    alltokens = pickle.load(open(prefix + ".tokens.p", "rb"))
    alldependencies = pickle.load(open(prefix + ".dependencies.p", "rb"))
    allalignments = pickle.load(open(prefix + ".alignments.p", "rb"))
    allrelations = pickle.load(open(prefix + ".relations.p", "rb"))

    print "Collecting relation labels.."
    # Store the set of all of the existed relation labels in the "relations.txt".
    seen_r = set()
    fw = open(model_dir + "/relations.txt", "w")
    for relations in allrelations:
        for r in relations:
            if r[1] not in seen_r:
                fw.write(r[1] + "\n")
                seen_r.add(r[1])
    fw.close()

    print "Collecting dependency labels.."
    # Store the set of all of the existed dependencies labels in "dependencies.txt".
    seen_d = set()
    fw = open(model_dir + "/dependencies.txt", "w")
    for dependencies in alldependencies:
        for d in dependencies:
            if d[1] not in seen_d:
                fw.write(d[1] + "\n")
                seen_d.add(d[1])
    fw.close()

    # Initialize the embeddings
    resources_dir = "resources_" + language
    embs = Embs(resources_dir, model_dir, True)

    counter = 0
    for tokens, dependencies, alignments, relations in zip(
            alltokens, alldependencies, allalignments, allrelations):
        counter += 1
        print "Sentence no: ", counter
        data = (tokens, dependencies, relations, alignments)
        t = TransitionSystem(embs, data, "COLLECT", language)

    Resources.store_table(model_dir)
    print "Done"
# Example #4
def collect(prefix, model_dir):
    Resources.init_table(model_dir, True)

    print "Loading data.."
    alltokens = pickle.load(open(prefix + ".tokens.p", "rb"))
    alldependencies = pickle.load(open(prefix + ".dependencies.p", "rb"))
    allalignments = pickle.load(open(prefix + ".alignments.p", "rb"))
    allrelations = pickle.load(open(prefix + ".relations.p", "rb"))

    print "Collecting relation labels.."
    seen_r = set()
    fw = open(model_dir + "/relations.txt", "w")
    for relations in allrelations:
        for r in relations:
            if r[1] not in seen_r:
                fw.write(r[1] + "\n")
                seen_r.add(r[1])
    fw.close()

    print "Collecting dependency labels.."
    seen_d = set()
    fw = open(model_dir + "/dependencies.txt", "w")
    for dependencies in alldependencies:
        for d in dependencies:
            if d[1] not in seen_d:
                fw.write(d[1] + "\n")
                seen_d.add(d[1])
    fw.close()

    counter = 0
    embs = Embs(model_dir, True)
    for tokens, dependencies, alignments, relations in zip(
            alltokens, alldependencies, allalignments, allrelations):
        counter += 1
        print "Sentence no: ", counter
        data = (tokens, dependencies, relations, alignments)
        t = TransitionSystem(embs, data, "COLLECT")

    Resources.store_table(model_dir)
    print "Done"
def main(args):
    """Parse a preprocessed corpus and write AMR graphs to <file>.parsed.

    For each sentence, runs the transition system (ORACLETEST mode when
    args.oracle is set, otherwise PARSE mode) and writes an AMR block with
    ::id, ::snt and — unless args.avoidalignments — ::alignments metadata
    lines; args.nodesedges additionally emits ::node/::root/::edge lines.
    """
    Resources.init_table(args.model, False)

    prefix = args.file
    fw = open(prefix + ".parsed", "w")
    sys.stderr.write("Writing file " + prefix + ".parsed ...\n")
    embs = Embs("resources_" + args.lang, args.model)

    alltokens = pickle.load(open(prefix + ".tokens.p", "rb"))
    alldependencies = pickle.load(open(prefix + ".dependencies.p", "rb"))
    if args.oracle:
        allalignments = pickle.load(open(prefix + ".alignments.p", "rb"))
        allrelations = pickle.load(open(prefix + ".relations.p", "rb"))
        # One gold alignment line per sentence, indexed by sentence number.
        allalignlines = open(prefix + ".alignments").read().splitlines()

    loadModels()

    for idx in range(0, len(alltokens)):
        # Tokens may be multiword units joined by "_"; flatten them and
        # remember each token's span in the flattened sentence.
        ununderscored = []
        sent_ranges = {}
        offset = 0
        for tok in alltokens[idx]:
            units = tok.word.split("_")
            sent_ranges[tok] = str(offset) + "-" + str(offset + len(units))
            ununderscored.extend(units)
            offset += len(units)

        snt = " ".join(ununderscored)
        sys.stderr.write("Sentence " + str((idx + 1)) + ": " + snt + "\n")

        if args.oracle:
            data = (copy.deepcopy(alltokens[idx]),
                    copy.deepcopy(alldependencies[idx]),
                    copy.deepcopy(allrelations[idx]),
                    copy.deepcopy(allalignments[idx]))
            t = TransitionSystem(embs, data, "ORACLETEST", args.lang)
        else:
            data = (copy.deepcopy(alltokens[idx]),
                    copy.deepcopy(alldependencies[idx]))
            t = TransitionSystem(embs, data, "PARSE", args.lang, args.model)

        triples = t.relations()
        if triples == []:
            # No relations: emit a placeholder graph.
            fw.write("# ::id " + str(idx) + "\n# ::snt " + snt +
                     "\n(v / emptygraph)\n\n")
            continue
        # Renamed from 'nodes' to avoid being shadowed by the alignment
        # loop below.
        graph, graph_indexes, graph_nodes = to_string(triples, "TOP")

        graph = graph.strip()
        if not str(graph).startswith("("):
            # Single-node result; wrap it so the output is a valid graph.
            fw.write("# ::id " + str(idx) + "\n# ::snt " + snt + "\n(v / " +
                     str(graph) + ")\n\n")
            continue

        if args.nodesedges and len(graph_nodes) > 0:
            # Prepend ::node / ::root / ::edge metadata lines to the graph.
            nodesedges = ""
            root = graph_nodes[0][1]
            for n in graph_nodes:
                nodesedges += "# ::node\t" + "+".join(
                    graph_indexes[n[0]]) + "\t" + n[1] + "\n"
            nodesedges += "# ::root\t0\t" + root + "\n"
            for tr in triples:
                if tr[2] == ":top":
                    continue
                nodesedges += "# ::edge\t" + tr[1] + "\t" + tr[2] + "\t" + tr[
                    4] + "\t" + "+".join(
                        graph_indexes[tr[0]]) + "\t" + "+".join(
                            graph_indexes[tr[3]]) + "\n"
            graph = nodesedges + graph

        if args.oracle:
            # BUG FIX: the original wrote allalignlines[i], where i was the
            # leftover token counter (also clobbered by an inner loop), not
            # the sentence index.
            output = ("# ::id " + str(idx) + "\n# ::snt " + snt +
                      "\n# ::alignments " + allalignlines[idx] + "\n" +
                      str(graph) + "\n")
        else:
            if args.avoidalignments:
                output = ("# ::id " + str(idx) + "\n# ::snt " + snt + "\n" +
                          str(graph) + "\n")
            else:
                # "<tok-range>|<idx>+<idx> ..."; if a token's aligned nodes
                # have no graph indexes, roll back to avoid a dangling "|".
                align_line = ""
                for tok, aligned in t.alignments():
                    if len(aligned) > 0:
                        before = align_line
                        align_line += sent_ranges[tok] + "|"
                        for n in aligned:
                            for gi in graph_indexes[n]:
                                align_line += gi + "+"
                        if align_line.endswith("|"):
                            align_line = before
                        else:
                            align_line = align_line[0:-1] + " "
                output = ("# ::id " + str(idx) + "\n# ::snt " + snt +
                          "\n# ::alignments " + align_line + "\n" +
                          str(graph) + "\n")
        fw.write(output + "\n")

    fw.close()