def get_depend_graph(semantics):
    """Build the dependency graph of *semantics* in two renderings.

    Every item of the semantics is labelled with a ``"<roman>,<function>"``
    tag, where ``<roman>`` is the roman-numeral name of the item's
    coordinate with every ``"-"`` and ``"b"`` character removed.

    Args:
        semantics: the semantics object to convert; it is passed straight to
            the grammar formalism and to ``semantics_to_dependency_graph``.

    Returns:
        A two-element list ``[index_graph, tag_graph]``: the evaluated
        result of the graph's ``get_graph_index()`` followed by the evaluated
        result of ``get_graph_pos(tags)``.
    """
    grammar = get_grammar()
    # 'coord', 'xycoord', 'alpha' or 'roman'
    # (presumably the accepted tsformat values -- TODO confirm)
    grammar.formalism.cl_output_options("tsformat=coord")

    # Only the first field of each returned item is used. A comprehension is
    # used rather than ``zip(*items)[0]`` because the latter only works on
    # Python 2 and raises IndexError when there are no items at all.
    coords = [
        item[0]
        for item in grammar.formalism.semantics_to_coordinates(semantics)
    ]
    funs = [
        item[0]
        for item in grammar.formalism.semantics_to_functions(semantics)
    ]

    tags = []
    for coord, fun in zip(coords, funs):
        roman = coordinate_to_roman_name(coord).replace("-", "").replace("b", "")
        tags.append("%s,%s" % (roman, fun))

    # The second element of the returned pair is not needed here.
    gold_graph, _ = semantics_to_dependency_graph(semantics)

    # NOTE(review): eval() is applied to the string form of the graph output.
    # This looks safe only as long as that text is generated internally;
    # consider ast.literal_eval if it is always a plain Python literal.
    depend_graph_tags = eval("%s" % gold_graph.get_graph_pos(tags))
    gold_graph = eval("%s" % gold_graph.get_graph_index())
    return [gold_graph, depend_graph_tags]
def _count_feature(features, key):
    """Record one sighting of *key* in the *features* dict.

    A key is stored as 0 the first time it is seen and incremented on every
    later sighting, so the stored value is (number of sightings - 1).
    NOTE(review): confirm that starting at 0 rather than 1 is intended.
    """
    if key in features:
        features[key] += 1
    else:
        features[key] = 0


def _bigram_key(arc, fields):
    """Return the ``BIGRAM:`` key for one arc, using the given label fields.

    ``arc[0]`` is the comma-separated label of the dependent; ``arc[1]`` is
    either the literal ``"ROOT"`` or the comma-separated label of the head.
    """
    dependent = arc[0].split(",")
    parts = [str(dependent[field]) for field in fields]
    if arc[1] == "ROOT":
        parts.append("ROOT")
    else:
        head = arc[1].split(",")
        parts.extend(str(head[field]) for field in fields)
    return "BIGRAM:" + ":".join(parts)


def _trigram_key(graph, i, fields):
    """Return the ``TRIGRAM:`` key for arc *i* and the two arcs before it."""
    labels = [graph[i - back][0].split(",") for back in range(3)]
    return "TRIGRAM:" + ":".join(
        label[field] for label in labels for field in fields
    )


def _collect_features(features, gold_graph, depend_graph):
    """Add the unigram, bigram and trigram features of one analysis.

    ``gold_graph`` contributes only to the features built from the first
    label field; ``depend_graph`` contributes to all of them.

    NOTE(review): features built from field 0 and from field 1 share the
    same ``UNIGRAM:`` / ``BIGRAM:`` / ``TRIGRAM:`` prefixes, so identical
    strings in the two fields are counted under the same key.
    """
    # (graphs to scan, label fields to use), in the original order:
    # first field ("words"), second field ("tags"), then both together.
    plan = (
        ((gold_graph, depend_graph), (0,)),
        ((depend_graph,), (1,)),
        ((depend_graph,), (0, 1)),
    )

    # Unigrams exist only for the two single-field variants.
    for graphs, fields in plan[:2]:
        for graph in graphs:
            for arc in graph:
                label = arc[0].split(",")
                _count_feature(features, "UNIGRAM:" + str(label[fields[0]]))

    # Bigrams: dependent -> head (or ROOT).
    for graphs, fields in plan:
        for graph in graphs:
            for arc in graph:
                _count_feature(features, _bigram_key(arc, fields))

    # Trigrams: a ROOT arc together with the two preceding non-ROOT arcs.
    for graphs, fields in plan:
        for graph in graphs:
            for i, arc in enumerate(graph):
                # i >= 2 stops graph[i - 1] / graph[i - 2] from wrapping
                # round to the end of the list when a ROOT arc sits at
                # position 0 or 1, which used to yield spurious trigrams.
                if (
                    i >= 2
                    and arc[1] == "ROOT"
                    and graph[i - 1][1] != "ROOT"
                    and graph[i - 2][1] != "ROOT"
                ):
                    _count_feature(features, _trigram_key(graph, i, fields))


def main():
    """Collect n-gram features from the gold semantics of every parse file.

    Each file matched by ``PARSES_FILES`` is loaded as a ``ParseResults``;
    files that fail to load are appended to ``errors`` and skipped, and
    files without any semantics are skipped. For the remaining files the
    gold semantics are converted to a dependency graph and its unigram,
    bigram and trigram features are accumulated in the local ``features``
    dict.
    """
    features = {}
    input_files = glob.glob(PARSES_FILES)
    for file_results in input_files:
        # We read in the whole file (it's pickled, so we have to), but don't
        # keep the pres object after the loop iteration, because it can
        # be very big
        try:
            pres = ParseResults.from_file(file_results)
        except ParseResults.LoadError as err:
            if options.errors:
                # Print all load errors
                sys.stderr.write("Error loading file: %s\n" % (err,))
            # NOTE(review): placement outside the ``if`` is reconstructed
            # from whitespace-mangled source -- confirm.
            errors.append(file_results)
            continue
        print(file_results)
        if not pres.semantics:
            continue
        # NOTE(review): top_result is not used in the code visible here;
        # kept in case later code relies on it.
        top_result = pres.semantics[0][1]
        gold_result = pres.get_gold_semantics()

        # NOTE(review): the grammar set-up looks loop-invariant and could be
        # hoisted if get_grammar() has no per-call side effects.
        grammar = get_grammar()
        # 'coord', 'xycoord', 'alpha' or 'roman'
        grammar.formalism.cl_output_options("tsformat=coord")

        # Only the first field of each returned item is used. A comprehension
        # replaces ``zip(*items)[0]``, which is Python-2-only and raises
        # IndexError on an empty result.
        coords = [
            item[0]
            for item in grammar.formalism.semantics_to_coordinates(gold_result)
        ]
        funs = [
            item[0]
            for item in grammar.formalism.semantics_to_functions(gold_result)
        ]
        tags = [
            "%s,%s" % (coordinate_to_roman_name(coord), fun)
            for coord, fun in zip(coords, funs)
        ]

        gold_graph, gold_time_map = semantics_to_dependency_graph(gold_result)
        # NOTE(review): eval() on the string form of the graph output; safe
        # only while that text is generated internally.
        depend_graph = eval("%s" % gold_graph.get_graph_pos(tags))
        gold_graph = eval("%s" % gold_graph.get_graph_index())

        _collect_features(features, gold_graph, depend_graph)