# Maps a language code to the (model directory, preprocessing script) pair
# that run() uses for that language.
_RUN_PIPELINES = {
    'en': ("ENGLISH", './corenlp.sh'),
    'it': ("ITALIAN", './tintnlp.sh'),
    'es': ("SPANISH", './freelingnlp.sh'),
    'de': ("GERMAN", './corenlp_de.sh'),
    'zh': ("CHINESE", './corenlp_zh.sh'),
}


def run(sentence, language='en'):
    """Parse a single sentence and return its graph as annotated text.

    The sentence is handed to the language-specific preprocessing script,
    the script output is converted to tokens and dependencies, and a
    TransitionSystem in "PARSE" mode produces the relation triples that are
    rendered with to_string().

    Args:
        sentence: Raw sentence text, passed as the single argument of the
            preprocessing script.
        language: One of 'en', 'it', 'es', 'de', 'zh'.

    Returns:
        A string starting with a "# ::snt" line, followed by either
        "(v / emptygraph)" (no triples), "(v / <graph>)" (graph text that
        does not start with a parenthesis), or a "# ::alignments" line and
        the graph. Returns None when the language is unsupported or any
        step fails (best-effort contract kept from the original).
    """
    try:
        pipeline = _RUN_PIPELINES.get(language)
        if pipeline is None:
            # Previously this case only returned None by way of an
            # undefined-variable error; make the outcome explicit.
            return None
        model_dir, script = pipeline

        parse = subprocess.check_output([script, sentence])
        dependencies, tokens = preprocessing.run_single(parse, language)
        Resources.init_table(model_dir, False)
        embs = Embs("resources_" + language + "/", model_dir)
        data = (copy.deepcopy(tokens), copy.deepcopy(dependencies))

        # Tokens may join several words with "_"; record for every token the
        # "start-end" range it covers in the un-joined word sequence.
        ununderscored = []
        sent_ranges = {}
        start = 0
        for tok in tokens:
            units = tok.word.split("_")
            sent_ranges[tok] = str(start) + "-" + str(start + len(units))
            ununderscored.extend(units)
            start += len(units)
        snt_line = "# ::snt " + " ".join(ununderscored) + "\n"

        system = TransitionSystem(embs, data, "PARSE", language, model_dir)
        triples = system.relations()
        if triples == []:
            return snt_line + "(v / emptygraph)\n"

        graph, graph_indexes, _ = to_string(triples, "TOP")
        output = str(graph)
        if not output.startswith("("):
            return snt_line + "(v / " + output + ")"

        # One "range|idx+idx " entry per aligned token. Tokens whose nodes
        # have no graph index are skipped (same guard as main()); without it
        # the entry would lose its "|" separator and be malformed.
        entries = []
        for tok, aligned in system.alignments():
            indexes = [gi for node in aligned for gi in graph_indexes[node]]
            if indexes:
                entries.append(sent_ranges[tok] + "|" + "+".join(indexes) + " ")
        align_line = "".join(entries)

        return snt_line + "# ::alignments " + align_line + "\n" + output
    except Exception:
        # Best effort: callers receive None on any failure. Unlike a bare
        # "except:", this lets KeyboardInterrupt/SystemExit propagate.
        return None
def create(prefix, split, language, model_dir):
    """Write the training datasets for one data split.

    Loads the pickled tokens, dependencies, alignments and relations found
    at ``prefix + ".<name>.p"``, runs a TransitionSystem in "TRAIN" mode over
    every sentence and writes one comma-separated feature row per
    state/action pair to three files inside ``model_dir``:

    * ``dataset_<split>.txt``: features followed by the action id.
    * ``labels_dataset_<split>.txt``: for actions whose name ends with
      "arc" and whose argument is a label listed in ``relations.txt``;
      features followed by the 1-based line number of that label.
    * ``reentr_dataset_<split>.txt``: for "reduce" actions that carry an
      argument; one row per (sibling, feature vector) pair, ending in 1 when
      the sibling equals ``action.argv[0]`` and 2 otherwise.

    Args:
        prefix: Path prefix of the pickled input files.
        split: Name of the split, used in the output file names.
        language: Language code; selects the ``resources_<language>`` dir.
        model_dir: Directory holding ``relations.txt`` and receiving output.
    """
    def load(suffix):
        # NOTE: pickle can execute arbitrary code while loading; these files
        # must come from a trusted preprocessing run.
        with open(prefix + suffix, "rb") as handle:
            return pickle.load(handle)

    def flatten(groups):
        # Render every value of every feature group as "value,".
        return "".join(str(v) + "," for group in groups for v in group)

    print("Loading data..")
    alltokens = load(".tokens.p")
    alldependencies = load(".dependencies.p")
    allalignments = load(".alignments.p")
    allrelations = load(".relations.p")
    print("Number of sentences  " + str(len(alltokens)))

    # Relation label -> 1-based line number in relations.txt.
    labels = {}
    with open(model_dir + "/relations.txt") as handle:
        for label_id, line in enumerate(handle, 1):
            labels[line.strip()] = label_id

    dataset = open(model_dir + "/dataset_" + split + ".txt", "w")
    try:
        labels_dataset = open(
            model_dir + "/labels_dataset_" + split + ".txt", "w")
        try:
            reentr_dataset = open(
                model_dir + "/reentr_dataset_" + split + ".txt", "w")
            try:
                resources_dir = "resources_" + language
                embs = Embs(resources_dir, model_dir)
                sentences = zip(
                    alltokens, alldependencies, allalignments, allrelations)
                for counter, sentence in enumerate(sentences, 1):
                    tokens, dependencies, alignments, relations = sentence
                    print("Sentence no:  " + str(counter))
                    data = (tokens, dependencies, relations, alignments)
                    system = TransitionSystem(embs, data, "TRAIN", language)
                    for feats, action in system.statesactions():
                        # Each f_* is a feature representation of the
                        # current configuration, one per dataset.
                        f_rel, f_lab, f_reentr = feats
                        dataset.write(
                            flatten(f_rel) + str(action.get_id()) + "\n")

                        if (action.name.endswith("arc")
                                and action.argv in labels):
                            labels_dataset.write(
                                flatten(f_lab)
                                + str(labels[action.argv]) + "\n")

                        if (action.name == "reduce"
                                and action.argv is not None):
                            pairs = zip(action.argv[2], f_reentr)
                            for sib, vec in pairs:
                                target = "1" if sib == action.argv[0] else "2"
                                reentr_dataset.write(
                                    flatten(vec) + target + "\n")
            finally:
                reentr_dataset.close()
        finally:
            labels_dataset.close()
    finally:
        # Closing explicitly guarantees buffered rows reach disk even when
        # processing stops early.
        dataset.close()
def collect(prefix, language, model_dir):
    """Collect label inventories and resource tables from a dataset.

    Loads the pickled tokens, dependencies, alignments and relations found
    at ``prefix + ".<name>.p"``, writes every distinct relation label to
    ``model_dir/relations.txt`` and every distinct dependency label to
    ``model_dir/dependencies.txt`` (first-seen order, one per line), then
    constructs a TransitionSystem in "COLLECT" mode for each sentence and
    finally stores the Resources table.

    Args:
        prefix: Path prefix of the pickled input files.
        language: Language code; selects the ``resources_<language>`` dir.
        model_dir: Directory that receives the label files and the table.
    """
    def load(suffix):
        # NOTE: pickle can execute arbitrary code while loading; these files
        # must come from a trusted preprocessing run.
        with open(prefix + suffix, "rb") as handle:
            return pickle.load(handle)

    def write_unique_labels(path, groups):
        # The label is element 1 of every item; each one is written once.
        seen = set()
        with open(path, "w") as out:
            for group in groups:
                for item in group:
                    label = item[1]
                    if label not in seen:
                        out.write(label + "\n")
                        seen.add(label)

    Resources.init_table(model_dir, True)

    print("Loading data..")
    # Tokens, dependencies, alignments and relations of every sentence.
    alltokens = load(".tokens.p")
    alldependencies = load(".dependencies.p")
    allalignments = load(".alignments.p")
    allrelations = load(".relations.p")

    print("Collecting relation labels..")
    write_unique_labels(model_dir + "/relations.txt", allrelations)

    print("Collecting dependency labels..")
    write_unique_labels(model_dir + "/dependencies.txt", alldependencies)

    # Initialize the embeddings.
    resources_dir = "resources_" + language
    embs = Embs(resources_dir, model_dir, True)

    sentences = zip(alltokens, alldependencies, allalignments, allrelations)
    for counter, sentence in enumerate(sentences, 1):
        tokens, dependencies, alignments, relations = sentence
        print("Sentence no:  " + str(counter))
        data = (tokens, dependencies, relations, alignments)
        # The instance itself is not used; presumably construction in
        # "COLLECT" mode is what fills the Resources table. TODO confirm.
        TransitionSystem(embs, data, "COLLECT", language)

    Resources.store_table(model_dir)
    print("Done")
def collect(prefix, model_dir):
    """Collect label inventories and resource tables from a dataset.

    Loads the pickled tokens, dependencies, alignments and relations found
    at ``prefix + ".<name>.p"``, writes every distinct relation label to
    ``model_dir/relations.txt`` and every distinct dependency label to
    ``model_dir/dependencies.txt`` (first-seen order, one per line), then
    constructs a TransitionSystem in "COLLECT" mode for each sentence and
    finally stores the Resources table.

    NOTE(review): this source also contains
    ``collect(prefix, language, model_dir)``. If both definitions live in
    the same module, the later one replaces the earlier one.

    Args:
        prefix: Path prefix of the pickled input files.
        model_dir: Directory that receives the label files and the table.
    """
    def load(suffix):
        # NOTE: pickle can execute arbitrary code while loading; these files
        # must come from a trusted preprocessing run.
        with open(prefix + suffix, "rb") as handle:
            return pickle.load(handle)

    Resources.init_table(model_dir, True)

    print("Loading data..")
    alltokens = load(".tokens.p")
    alldependencies = load(".dependencies.p")
    allalignments = load(".alignments.p")
    allrelations = load(".relations.p")

    # Both inventories take the label from element 1 of every item and keep
    # only its first occurrence.
    inventories = (
        ("Collecting relation labels..", "/relations.txt", allrelations),
        ("Collecting dependency labels..", "/dependencies.txt",
         alldependencies),
    )
    for message, filename, groups in inventories:
        print(message)
        seen = set()
        with open(model_dir + filename, "w") as out:
            for group in groups:
                for item in group:
                    if item[1] not in seen:
                        out.write(item[1] + "\n")
                        seen.add(item[1])

    embs = Embs(model_dir, True)
    sentences = zip(alltokens, alldependencies, allalignments, allrelations)
    for counter, sentence in enumerate(sentences, 1):
        tokens, dependencies, alignments, relations = sentence
        print("Sentence no:  " + str(counter))
        data = (tokens, dependencies, relations, alignments)
        # The instance itself is not used; presumably construction in
        # "COLLECT" mode is what fills the Resources table. TODO confirm.
        TransitionSystem(embs, data, "COLLECT")

    Resources.store_table(model_dir)
    print("Done")
def _nodes_edges_header(nodes, triples, graph_indexes):
    """Build the "# ::node" / "# ::root" / "# ::edge" comment lines."""
    lines = []
    for node in nodes:
        lines.append("# ::node\t" + "+".join(graph_indexes[node[0]])
                     + "\t" + node[1] + "\n")
    lines.append("# ::root\t0\t" + nodes[0][1] + "\n")
    for tr in triples:
        if tr[2] == ":top":
            continue
        lines.append("# ::edge\t" + tr[1] + "\t" + tr[2] + "\t" + tr[4]
                     + "\t" + "+".join(graph_indexes[tr[0]])
                     + "\t" + "+".join(graph_indexes[tr[3]]) + "\n")
    return "".join(lines)


def _alignments_line(system, sent_ranges, graph_indexes):
    """Build the "range|idx+idx " entries for every aligned token."""
    entries = []
    for tok, aligned in system.alignments():
        indexes = [gi for node in aligned for gi in graph_indexes[node]]
        # Tokens whose nodes have no graph index produce no entry at all.
        if indexes:
            entries.append(sent_ranges[tok] + "|" + "+".join(indexes) + " ")
    return "".join(entries)


def main(args):
    """Parse every sentence of a preprocessed file into ``<file>.parsed``.

    Reads the pickled tokens and dependencies at ``args.file + ".<name>.p"``
    (plus alignments, relations and the plain ``.alignments`` lines in
    oracle mode), runs a TransitionSystem per sentence and writes one entry
    per sentence, separated by blank lines.

    Args:
        args: Namespace providing ``model``, ``file``, ``lang``, ``oracle``,
            ``nodesedges`` and ``avoidalignments``.
    """
    def load(suffix):
        # NOTE: pickle can execute arbitrary code while loading; these files
        # must come from a trusted preprocessing run.
        with open(prefix + suffix, "rb") as handle:
            return pickle.load(handle)

    Resources.init_table(args.model, False)
    prefix = args.file

    # The context manager closes the output even if a sentence fails.
    with open(prefix + ".parsed", "w") as fw:
        sys.stderr.write("Writing file " + prefix + ".parsed ...\n")
        embs = Embs("resources_" + args.lang, args.model)

        alltokens = load(".tokens.p")
        alldependencies = load(".dependencies.p")
        if args.oracle:
            allalignments = load(".alignments.p")
            allrelations = load(".relations.p")
            with open(prefix + ".alignments") as handle:
                allalignlines = handle.read().splitlines()

        # NOTE(review): assumed to run in both modes; the flattened original
        # does not show whether this call sat inside the oracle branch.
        loadModels()

        for idx, sent_tokens in enumerate(alltokens):
            # Tokens may join several words with "_"; record for every token
            # the "start-end" range it covers in the un-joined sequence.
            ununderscored = []
            sent_ranges = {}
            start = 0
            for tok in sent_tokens:
                units = tok.word.split("_")
                sent_ranges[tok] = str(start) + "-" + str(start + len(units))
                ununderscored.extend(units)
                start += len(units)
            sentence = " ".join(ununderscored)
            sys.stderr.write(
                "Sentence " + str(idx + 1) + ": " + sentence + "\n")

            if args.oracle:
                data = (copy.deepcopy(sent_tokens),
                        copy.deepcopy(alldependencies[idx]),
                        copy.deepcopy(allrelations[idx]),
                        copy.deepcopy(allalignments[idx]))
                system = TransitionSystem(embs, data, "ORACLETEST", args.lang)
            else:
                data = (copy.deepcopy(sent_tokens),
                        copy.deepcopy(alldependencies[idx]))
                system = TransitionSystem(
                    embs, data, "PARSE", args.lang, args.model)

            header = "# ::id " + str(idx) + "\n# ::snt " + sentence
            triples = system.relations()
            if triples == []:
                fw.write(header + "\n(v / emptygraph)\n\n")
                continue

            graph, graph_indexes, nodes = to_string(triples, "TOP")
            graph = graph.strip()
            if not str(graph).startswith("("):
                fw.write(header + "\n(v / " + str(graph) + ")\n\n")
                continue

            if args.nodesedges and len(nodes) > 0:
                graph = _nodes_edges_header(
                    nodes, triples, graph_indexes) + graph

            if args.oracle:
                # Bug fix: index by sentence (idx). The original used the
                # leftover word-unit counter, which picked the wrong line.
                output = (header + "\n# ::alignments " + allalignlines[idx]
                          + "\n" + str(graph) + "\n")
            elif args.avoidalignments:
                output = header + "\n" + str(graph) + "\n"
            else:
                align_line = _alignments_line(
                    system, sent_ranges, graph_indexes)
                output = (header + "\n# ::alignments " + align_line
                          + "\n" + str(graph) + "\n")
            fw.write(output + "\n")