コード例 #1
0
ファイル: ontonotes-stats.py プロジェクト: ai-ku/uwsd
def process_parse_annotation():
    """Count coarse part-of-speech tags over every parse file and print them.

    Reads all files matching ``.*parse`` under ``annotations_path`` with a
    ``BracketParseCorpusReader``, truncates each token's tag to its first
    two characters, and tallies the ones that are ``NN``, ``VB``, ``RB`` or
    ``JJ``.  The tallies are printed to stdout as a list of
    ``(tag_prefix, count)`` pairs; nothing is returned.
    """
    # Local import so the block does not depend on the file's import header.
    from collections import Counter

    sys.stderr.write("Parsing started\n")
    reader = BracketParseCorpusReader(annotations_path, '.*parse')
    # Word-level tag prefixes for noun, verb, adverb and adjective.
    wanted = frozenset(("NN", "VB", "RB", "JJ"))
    counts = Counter()
    for fileid in reader.fileids():
        for sentence in reader.parsed_sents(fileid):
            for _word, tag in sentence.pos():
                prefix = tag[:2]
                # '-NONE-' needs no separate test: its prefix '-N' is never
                # in ``wanted``.
                if prefix in wanted:
                    counts[prefix] += 1
    print(list(counts.items()))
コード例 #2
0
ファイル: ontonotes-preprocess.py プロジェクト: ai-ku/uwsd
def _clean_tagged_words(tagged_words):
    """Split (word, tag) pairs into parallel word/tag lists without '-NONE-' tokens.

    Words found in the module-level ``fix`` mapping are replaced by their
    mapped value.
    """
    words, tags = [], []
    for word, tag in tagged_words:
        if tag == '-NONE-':
            continue
        if word in fix:
            word = fix[word]
        words.append(word)
        tags.append(tag)
    return words, tags


def annotation_process():
    """Emit one tab-separated record per sense annotation found under ``annotations_path``.

    For every ``*.sense`` file the sibling ``.parse`` file (same directory,
    same base name) is read.  Each non-blank line of the sense file is split
    on whitespace and read as: field 1 = sentence index, field 2 = token
    index, field 3 = target word, last field = sense label.  Field 0 is
    copied into the instance id (presumably a document id -- confirm
    against the data).

    For each annotation a line is printed to stdout with the columns::

        word, word.on.<n>, <field0>-<sent>-<tok>, word-<sense>,
        lexicon_senses, version, ita, <tok>, cleaned sentence

    where ``<n>`` is a per-target-word running number starting at 1 and the
    last three metadata values come from ``get_inventory_info()``.  The tag
    list of the cleaned sentence is written, one line per annotation, to
    ``on.pos.gz`` (formatted with ``str.format``, i.e. as the list's repr).

    Progress is reported on stderr every 1000 files, followed by the total
    number of files processed.
    """
    d = get_inventory_info()
    annotated_files = find_files(annotations_path, "*.sense")
    inst_num_dict = dd(lambda: count(1))
    record = "{}\t{}.on.{}\t{}-{}-{}\t{}-{}\t{}\t{}\t{}\t{}\t{}"
    # Stays 0 when no files are found, so the final summary cannot hit an
    # unbound loop variable.
    files_done = 0
    # The with-block guarantees the gzip trailer is written even on error.
    with gzip.open('on.pos.gz', 'w') as pos_file:
        for num_processed, fn in enumerate(annotated_files):
            if num_processed % 1000 == 0:
                sys.stderr.write("{} files processed\n".format(num_processed))
            directory = os.path.dirname(fn)
            basename = os.path.basename(fn)
            reader = BracketParseCorpusReader(
                directory, basename.replace('.sense', '.parse'))
            fileid = reader.fileids()[0]
            parsed_sents = reader.parsed_sents(fileid)
            # Cache of cleaned sentences for this file, keyed by sentence index.
            sentences = {}
            with open(fn) as sense_file:
                for raw in sense_file:
                    fields = raw.split()
                    if not fields:
                        # Blank line: nothing to annotate.
                        continue
                    tw = fields[3]
                    onto_sense = fields[-1]
                    sent_id, tok_id = int(fields[1]), int(fields[2])
                    cached = sentences.get(sent_id)
                    if cached is None:
                        cached = _clean_tagged_words(parsed_sents[sent_id].pos())
                        sentences[sent_id] = cached
                    clean_sent, clean_pos = cached
                    lexicon_senses, version, ita = d[tw][onto_sense]
                    w = tw.replace('-', '.')  # following the convention of SemEval
                    # Instance numbers are keyed by the original target word.
                    print(record.format(
                        w, w, next(inst_num_dict[tw]), fields[0], sent_id, tok_id,
                        w, onto_sense, lexicon_senses, version, ita, tok_id,
                        " ".join(clean_sent)))
                    pos_file.write("{}\n".format(clean_pos))
            files_done = num_processed + 1
    sys.stderr.write("{} files processed\n".format(files_done))