def main(fileP, output_format="legacy" or "json"): with codecs.open(fileP, "r", "utf-8") as inF: for sentId, sentence in enumerate(read_conll(inF, lookup_lemmas=True)): assert None not in sentence targets = list(get_segmentation(sentence)) if output_format.lower() == "json": sentJ = {"tokens": [tkn.form for tkn in sentence], "frames": []} for target in targets: spansJ = [] for tkn in sorted(target, key=lambda x: x.id): # group contiguous tokens into spans if spansJ and spansJ[-1]["end"] == tkn.id: spansJ["end"] += 1 spansJ["text"] += " " + tkn.form else: spansJ.append({"start": tkn.id - 1, "end": tkn.id, "text": tkn.form}) sentJ["frames"].append({"target": {"spans": spansJ}}) print(json.dumps(sentJ)) else: print( "\t".join( "_".join(str(token.id - 1) for token in target_tokens) + "#true" for target_tokens in targets ) + "\t" + str(sentId) )
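# Illustrative sketch only (hypothetical sentence, not from the data): for a
# four-token sentence "He gave up smoking" whose segmentation yields the
# targets {gave, up} and {smoking}, the two branches above would print
# (JSON wrapped here for readability; json.dumps emits a single line):
#
#   JSON:   {"tokens": ["He", "gave", "up", "smoking"],
#            "frames": [{"target": {"spans": [{"start": 1, "end": 3, "text": "gave up"}]}},
#                       {"target": {"spans": [{"start": 3, "end": 4, "text": "smoking"}]}}]}
#   legacy: 1_2#true<TAB>3#true<TAB>0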
# Noun-phrase typing: for each sentence, look up phrases that appear in
# noun_types and write one TSV line per match: sentence index, the matched
# tokens' ids joined with underscores, and the union of the matched types'
# ancestors in the type hierarchy. `noun_types`, `hierarchy`, and
# `lookup_phrases` are assumed to be defined elsewhere in the module.
def main(lines, out=sys.stdout):
    sentences = read_conll(lines, lookup_lemmas=True)
    for sentence_idx, sentence in enumerate(sentences):
        matches = lookup_phrases(sentence, noun_types)
        for tokens, types in matches:
            # union of the ancestors of every matched type
            all_ancestors = set(chain(*[hierarchy.adj[t] for t, c in types]))
            out.write(u"%s\t%s\t%s\n" % (sentence_idx,
                                         '_'.join(str(t.id) for t in tokens),
                                         ' '.join(all_ancestors)))
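# Illustrative sketch only (hypothetical phrase table and hierarchy): if
# tokens 2-3 of sentence 0 matched the phrase "apple pie" with type
# "dessert", and "dessert" had ancestors "food" and "artifact" in
# `hierarchy`, the line written would be (set iteration order is arbitrary):
#
#   0<TAB>2_3<TAB>food artifact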
}


def extract_gold_data_points(sentence, conll_tokens):
    # one data point per token, labeled True unless the token index is
    # known to fall outside every target
    non_target_token_idxs = get_non_target_token_idxs(sentence)
    is_target = [(i not in non_target_token_idxs) for i in range(len(conll_tokens))]
    features = extract_features(conll_tokens)
    # return [DataPoint(is_target=t, features=f) for t, f in zip(is_target, features)]
    return [dict(f, is_target=t) for t, f in zip(is_target, features)]


def main(gold_sentences, conll_parses, out_file=sys.stdout):
    dps = list(chain(*[extract_gold_data_points(sentence, conll_tokens)
                       for sentence, conll_tokens in zip(gold_sentences, conll_parses)]))
    all_fields = list(chain(*(dp.keys() for dp in dps)))
    # counts = FreqDist(all_fields)
    # deduplicate field names, preserving first-seen order, so DictWriter
    # doesn't emit one copy of each column per data point
    unique_fields = []
    for field in all_fields:
        if field not in unique_fields:
            unique_fields.append(field)
    dict_writer = DictWriter(out_file, unique_fields, delimiter="\t")
    dict_writer.writeheader()
    for dp in dps:
        dict_writer.writerow(dp)


if __name__ == "__main__":
    split_name = sys.argv[1]
    assert split_name in ('train', 'dev', 'test')
    with codecs.open(DEP_PARSED_FILENAME % split_name, encoding='utf8') as dep_file:
        conll_parses = list(read_conll(dep_file))
    with codecs.open(GOLD_FILENAME % split_name, encoding='utf8') as gold_file:
        gold_sentences = [json.loads(line) for line in gold_file if line]
    main(gold_sentences, conll_parses)
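# Illustrative sketch only (feature names are hypothetical): if
# extract_features produced dicts like {"form": "gave", "pos": "VBD"},
# the script would emit a TSV along the lines of
#
#   form<TAB>pos<TAB>is_target
#   gave<TAB>VBD<TAB>True
#   .<TAB>.<TAB>False
#
# (column order follows first-seen key order, which is arbitrary for
# Python 2 dicts)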