def process_parse_annotation():
    """Tally coarse part-of-speech tags over every parse file and print them.

    Reads all files matching ``.*parse`` under the module-level
    ``annotations_path``, takes the first two characters of each token's
    tag, and counts the tokens whose prefix is one of NN, VB, RB or JJ.
    Tokens tagged ``-NONE-`` are ignored. The result is printed to stdout
    as a list of ``(prefix, count)`` pairs; nothing is returned.
    """
    print >> sys.stderr, "Parsing started"
    corpus = BracketParseCorpusReader(annotations_path, '.*parse')
    # Word-level POS tag prefixes for noun, verb, adverb and adjective.
    wanted = set("NN VB RB JJ".split())
    tallies = dd(int)
    for fileid in corpus.fileids():
        for tree in corpus.parsed_sents(fileid):
            for _, tag in tree.pos():
                if tag == '-NONE-':
                    continue
                prefix = tag[0:2]
                if prefix in wanted:
                    tallies[prefix] += 1
    # A single parenthesised argument prints the same as the bare
    # Python 2 print statement.
    print(list(tallies.items()))
def _clean_sentence(sentence):
    """Return ``(words, tags)`` for a parsed sentence without trace tokens.

    Tokens tagged ``-NONE-`` are dropped. A word that is a key of the
    module-level ``fix`` mapping is replaced by the value stored there.
    """
    words = []
    tags = []
    for word, tag in sentence.pos():
        if tag == '-NONE-':
            continue
        if word in fix:
            word = fix[word]
        words.append(word)
        tags.append(tag)
    return words, tags


def annotation_process():
    """Emit one tab-separated record per sense annotation, plus its POS tags.

    Every ``*.sense`` file found under the module-level ``annotations_path``
    is paired with the ``.parse`` file of the same base name in the same
    directory. For each whitespace-separated line of the sense file the
    function reads:

    * field 0  -- used verbatim as the first part of the instance location,
    * field 1  -- sentence index into the parsed sentences (int),
    * field 2  -- token index (int),
    * field 3  -- target word, used as the key into the sense inventory,
    * last field -- the sense label.

    A record is printed to stdout containing the target word (with ``-``
    replaced by ``.``), a per-target-word running instance number starting
    at 1, the location triple, the word/sense pair, the three values looked
    up in ``get_inventory_info()`` for that word and sense, the token index
    and the cleaned sentence text. For each record, the cleaned POS tag list
    of the sentence is written as one line to ``on.pos.gz``.

    Progress is reported on stderr every 1000 files and once at the end.

    Raises:
        KeyError: if a target word or sense is missing from the inventory.
        IndexError: if a non-blank sense line has too few fields or refers
            to a sentence index that the parse file does not contain.
    """
    inventory = get_inventory_info()
    annotated_files = find_files(annotations_path, "*.sense")
    inst_num_dict = dd(lambda: count(1))
    record = "{}\t{}.on.{}\t{}-{}-{}\t{}-{}\t{}\t{}\t{}\t{}\t{}"
    # Tracked separately from the loop variable so the final report is
    # correct (and defined) even when no files are found.
    files_done = 0

    # The context manager guarantees the gzip stream is flushed and closed.
    with gzip.open('on.pos.gz', 'w') as pos_file:
        for num_processed, fn in enumerate(annotated_files):
            if num_processed % 1000 == 0:
                print >> sys.stderr, "{} files processed".format(num_processed)
            directory = os.path.dirname(fn)
            basename = os.path.basename(fn)
            reader = BracketParseCorpusReader(
                directory, basename.replace('.sense', '.parse'))
            parsed_sents = reader.parsed_sents(reader.fileids()[0])
            # Cache of sent_id -> (words, tags) so each sentence is cleaned
            # only once per file.
            sentences = {}
            with open(fn) as sense_file:
                for raw_line in sense_file:
                    line = raw_line.split()
                    if not line:
                        # Skip blank lines instead of failing on line[3].
                        continue
                    tw = line[3]
                    onto_sense = line[-1]
                    sent_id, tok_id = int(line[1]), int(line[2])
                    cached = sentences.get(sent_id)
                    if cached is None:
                        cached = _clean_sentence(parsed_sents[sent_id])
                        sentences[sent_id] = cached
                    clean_sent, clean_pos = cached
                    lexicon_senses, version, ita = inventory[tw][onto_sense]
                    w = tw.replace('-', '.')  # following the convention of SemEval
                    # A single parenthesised argument prints the same as the
                    # bare Python 2 print statement.
                    print(record.format(w, w, next(inst_num_dict[tw]),
                                        line[0], sent_id, tok_id,
                                        w, onto_sense,
                                        lexicon_senses, version, ita,
                                        tok_id, " ".join(clean_sent)))
                    # One POS line per printed record.
                    pos_file.write("{}\n".format(clean_pos))
            files_done = num_processed + 1
    print >> sys.stderr, "{} files processed".format(files_done)