Code Example #1
File: targetid_orig.py, Project: chenguandan/semafor
def main(fileP, output_format="legacy" or "json"):
    with codecs.open(fileP, "r", "utf-8") as inF:
        for sentId, sentence in enumerate(read_conll(inF, lookup_lemmas=True)):
            assert None not in sentence

            targets = list(get_segmentation(sentence))

            if output_format.lower() == "json":
                sentJ = {"tokens": [tkn.form for tkn in sentence], "frames": []}
                for target in targets:
                    spansJ = []
                    for tkn in sorted(target, key=lambda x: x.id):
                        # group contiguous tokens into spans
                        if spansJ and spansJ[-1]["end"] == tkn.id:
                            spansJ["end"] += 1
                            spansJ["text"] += " " + tkn.form
                        else:
                            spansJ.append({"start": tkn.id - 1, "end": tkn.id, "text": tkn.form})
                    sentJ["frames"].append({"target": {"spans": spansJ}})
                print(json.dumps(sentJ))
            else:
                print(
                    "\t".join(
                        "_".join(str(token.id - 1) for token in target_tokens) + "#true" for target_tokens in targets
                    )
                    + "\t"
                    + str(sentId)
                )
Code Example #2
File: markup_sentence.py, Project: Noahs-ARK/semafor
import sys
from itertools import chain

# read_conll, lookup_phrases, noun_types, and hierarchy are project-local (not shown)


def main(lines, out=sys.stdout):
    sentences = read_conll(lines, lookup_lemmas=True)
    for sentence_idx, sentence in enumerate(sentences):
        matches = lookup_phrases(sentence, noun_types)
        for tokens, types in matches:
            all_ancestors = set(chain(*[hierarchy.adj[t] for t, c in types]))

            out.write(u"%s\t%s\t%s\n" % (sentence_idx,
                                         '_'.join(str(t.id) for t in tokens),
                                         ' '.join(all_ancestors)))
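Each match yields one tab-separated output line: the sentence index, the matched token ids joined by underscores, and the space-joined ancestor types drawn from hierarchy.adj. Assuming, purely for illustration, a phrase at tokens 3 and 4 whose type has ancestors "beverage" and "food", a line might read (set iteration order is arbitrary):

0	3_4	beverage food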
Code Example #3
File: targetid_simple.py, Project: Noahs-ARK/semafor
import codecs
import json

# read_conll, get_segmentation, and format_target_line are project-local (not shown)


def main(fileP, output_format='legacy'):  # accepted values: 'legacy' or 'json'
    with codecs.open(fileP, 'r', 'utf-8') as inF:
        for sentId, sentence in enumerate(read_conll(inF, lookup_lemmas=True)):
            assert None not in sentence

            targets = list(get_segmentation(sentence))

            if output_format.lower() == 'json':
                sentJ = {"tokens": [tkn.form for tkn in sentence],
                         "frames": []}
                for target in targets:
                    spansJ = []
                    for tkn in sorted(target, key=lambda x: x.id):
                        # group contiguous tokens into spans
                        if spansJ and spansJ[-1]["end"] == tkn.id:
                            spansJ[-1]["end"] += 1
                            spansJ[-1]["text"] += ' ' + tkn.form
                        else:
                            spansJ.append({"start": tkn.id - 1, "end": tkn.id, "text": tkn.form})
                    sentJ["frames"].append({"target": {"spans": spansJ}})
                print(json.dumps(sentJ))
            else:
                print(format_target_line(sentId, targets))
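format_target_line is not defined in this snippet; since this example's legacy branch should match the inline legacy output in Example #1, a plausible reconstruction (an assumption, not the project's verbatim code) is:

def format_target_line(sent_id, targets):
    # one tab-separated field per target: 0-based token ids joined by "_",
    # each suffixed with "#true"; the sentence index is appended as the final field
    return "\t".join(
        "_".join(str(token.id - 1) for token in target_tokens) + "#true"
        for target_tokens in targets
    ) + "\t" + str(sent_id)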
Code Example #4
import codecs
import json
import sys
from collections import OrderedDict
from csv import DictWriter
from itertools import chain

# read_conll, get_non_target_token_idxs, and extract_features are project-local helpers;
# DEP_PARSED_FILENAME and GOLD_FILENAME are module-level path templates (not shown).


def extract_gold_data_points(sentence, conll_tokens):
    non_target_token_idxs = get_non_target_token_idxs(sentence)
    is_target = [(i not in non_target_token_idxs) for i in range(len(conll_tokens))]
    features = extract_features(conll_tokens)
    # return [DataPoint(is_target=t, features=f) for t, f in zip(is_target, features)]
    return [dict(f, is_target=t) for t, f in zip(is_target, features)]


def main(gold_sentences, conll_parses, out_file=sys.stdout):
    dps = list(chain(*[extract_gold_data_points(sentence, conll_tokens)
                       for sentence, conll_tokens in zip(gold_sentences, conll_parses)]))
    # deduplicate feature names (preserving first-seen order) so DictWriter
    # doesn't emit repeated columns
    all_fields = list(OrderedDict.fromkeys(chain(*(dp.keys() for dp in dps))))
    dict_writer = DictWriter(out_file, all_fields, delimiter="\t")
    dict_writer.writeheader()
    for dp in dps:
        dict_writer.writerow(dp)


if __name__ == "__main__":
    split_name = sys.argv[1]
    assert split_name in ('train', 'dev', 'test')
    with codecs.open(DEP_PARSED_FILENAME % split_name, encoding='utf8') as dep_file:
        conll_parses = list(read_conll(dep_file))
    with codecs.open(GOLD_FILENAME % split_name, encoding='utf8') as gold_file:
        gold_sentences = [json.loads(line) for line in gold_file if line]
    main(gold_sentences, conll_parses)
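The entry point takes the data split name as its only command-line argument and writes the tab-separated feature table to stdout. The file this snippet comes from is not named in the listing, so the invocation below uses a placeholder script name:

python extract_gold_features.py train > train_features.tsv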