def get_inventory_info():
    """Collect sense-inventory information for every *.xml file under inventory_path.

    Returns a nested dict:
        d[file_basename][onto_sense_key] = [wn_senses, wn_version, ita_score]
    where wn_senses is the comma-separated WordNet sense string from the
    first <wn> mapping, wn_version its 'version' attribute, and ita_score
    the file-level inter-annotator agreement (or "ITA_UNDEFINED").
    """
    d = dd(dict)
    files = find_files(inventory_path, "*.xml")
    for num_processed, f in enumerate(files):
        fn = os.path.basename(f).replace('.xml', '')
        if num_processed % 1000 == 0:
            print >> sys.stderr, "{} files processed".format(num_processed)
        soup = BeautifulSoup(open(f), 'xml')
        # Inter-annotator agreement is a per-file property: look it up once
        # per file instead of re-querying the soup for every <sense> element.
        ita = soup.findAll('ita')  # inter-annotator agreement
        ita_score = "ITA_UNDEFINED"
        if len(ita) != 0:
            ita_score = ita[0]['ann_1_2_agreement']
        senses = soup.findAll('sense')
        for sense in senses:
            onto_key = str(sense['n'])
            sense_name = str(sense['name'])
            mapping = sense.findAll('mappings')[0]
            wn = mapping.findAll('wn')[0]
            version = wn['version']
            wn_senses = wn.text.strip()
            # FIXME: None of above sense should be mapped to 3.0 first!
            if sense_name == NONE_OF_ABOVE_SENSE:
                wn_senses = "no_lexicon_sense"
                version = "3.0"
            d[fn][onto_key] = [wn_senses, version, ita_score]
    # NOTE(review): enumerate() is 0-based, so this final message reports one
    # less than the number of files actually processed (kept for parity).
    print >> sys.stderr, "{} files processed".format(num_processed)
    return d
def get_inventory_info():
    """Build a reverse sense mapping from the inventory files.

    Returns a nested dict:
        d[target_word][wordnet_sense] = ontonotes_sense_key
    Only senses whose <wn> version is in the module-level `wn_set` are kept;
    the "none of the above" sense is rerouted to the "no_lexicon_sense" key.
    """
    d = dd(dict)
    files = find_files(inventory_path, "*.xml")
    for num_processed, f in enumerate(files):
        fn = os.path.basename(f).replace('.xml', '')
        if num_processed % 1000 == 0:
            print >> sys.stderr, "{0} files processed".format(num_processed)
        soup = BeautifulSoup(open(f), 'xml')
        # file names use '-' where SemEval-style keys use '.'
        target_word = fn.replace('-', '.')
        for sense in soup.findAll('sense'):
            onto_key = str(sense['n'])
            sense_name = str(sense['name'])
            wn = sense.findAll('mappings')[0].findAll('wn')[0]
            version = wn['version']
            wn_senses = wn.text.strip()
            # FIXME: None of above sense should be mapped to 3.0 first!
            if sense_name == NONE_OF_ABOVE_SENSE:
                wn_senses = "no_lexicon_sense"
                version = "3.0"
            if version in wn_set:
                for wn_s in map(str, wn_senses.split(',')):
                    d[target_word][wn_s] = onto_key
    return d
def process_sense_annotation(): print >> sys.stderr, "Sense Annotation processing started" word_sense_dict, ita_less_90 = get_sense_mappings() sense_freq = dd(lambda : count(0)) # sense freqs (wn3.0, wn2.0 etc) for annotation word_freq = dd(lambda : count(0)) # words frequency in annontation pos_dict = dd(lambda : count(0)) # pos distribution for annotation. num_adjudicated = 0 # Number of instance that adjudicated pattern = "*.sense" annotated_files = find_files(annotations_path, pattern) num_word_processed = 0 ita_less90_count = 0 for num_processed, annotated_file in enumerate(annotated_files): #fn = annotated_file.replace(annotations_path, "") for line in open(annotated_file): line = line.split() num_word_processed += +1 if len(line) == 6: num_adjudicated += 1 word = line[3] pos_tag = word[-1] pos_dict[pos_tag].next() sense_tag = line[-1] word_freq[word].next() version = word_sense_dict[word][sense_tag] sense_freq[version].next() if word in ita_less_90: ita_less90_count += 1 if num_processed % 3000 == 0: print >> sys.stderr, "{} files processed".format(num_processed) ### Printing: Pos Info in annotated corpus ### num_noun = pos_dict['n'].next() num_verb = pos_dict['v'].next() pos_msg = "Noun\tVerb\tNoun+Verb\tTotalWord\n{}\t{}\t{}\t{}" print pos_msg.format(num_noun, num_verb, num_verb + num_noun, num_word_processed) ### Printing: Number of Adjudicated word print "Number of adjudicated case: {}".format(num_adjudicated) ### Writing: sense frequency in annotated data sensefreq_list = [(key, val.next()) for key, val in sense_freq.iteritems()] sensefreq_list = sorted(sensefreq_list, key=lambda x: x[1], reverse=True) with open('ontonotes-sensefreq-annotation.tab', 'w') as f: for key, val in sensefreq_list: f.write("{}\t{}\n".format(key, val)) m = "Number of words that have <90 ita score {} in annotated data" print m.format(ita_less90_count) print >> sys.stderr, "Sense Annotation processing finished"
def get_filtered_set(is_only_wn=True, ita_threshold=.85): inventory_files = find_files(inventory, "*.xml") for num_processed, f in enumerate(inventory_files): fn = os.path.basename(f).replace('.xml', '') if num_processed % 1000 == 0: print >> sys.stderr, "{} files processed".format(num_processed) soup = BeautifulSoup(open(f), 'xml') ita = soup.findAll('ita') # inter-annotator agreement if len(ita) != 0: ita_score = float(ita[0]['ann_1_2_agreement']) if ita_score > ita_threshold: versions = [wn['version'] == '3.0' for wn in soup.findAll('wn')[:-1]] if all(versions): print fn, versions, ita_score
def get_sense_mappings(): version_dict = dd(lambda : count(0)) # keep tracking the annotation version word_sense_dict = dd(dict) # keep tracking words' senses' version inventory_files = find_files(inventory, "*.xml") # ITA INF below: [#ofinstance, #total_ita_score, #of word <90, #total_score_for_<90] ita_inf = [0, 0, 0, 0] nsense = [0, 0] ita_less_90 = set() for num_processed, f in enumerate(inventory_files): fn = os.path.basename(f).replace('.xml', '') if num_processed % 1000 == 0: print >> sys.stderr, "{} files processed".format(num_processed) soup = BeautifulSoup(open(f), 'xml') senses = soup.findAll('sense') nsense[0] += len(senses) nsense[1] += 1 for sense in senses: key = str(sense['n']) mapping = sense.findAll('mappings')[0] wn = mapping.findAll('wn')[0] version = wn['version'] word_sense_dict[fn][key] = version wn_senses = wn.text.split(',') # maybe we can use it later version_dict[version].next() ita = soup.findAll('ita') # inter-annotator agreement if len(ita) != 0: ita_inf[0] += 1 ita_score = float(ita[0]['ann_1_2_agreement']) ita_inf[1] += ita_score if ita_score < 0.9: ita_inf[2] += 1 ita_inf[3] += ita_score ita_less_90.add(fn) ita_inf[1] = ita_inf[1] / ita_inf[0] # averaging.for all instance ita score ita_inf[3] = ita_inf[3] / ita_inf[2] # averaging.ita score for word lower than 0.9 print >> sys.stderr, "{} files processed (total)".format(num_processed) print "ITA informations: {}".format(ita_inf) avg_sense = nsense[0] / float(nsense[1]) print "total sense: {}, avg # of sense: {}".format(nsense[0], avg_sense) version_list = [(key, val.next()) for key, val in version_dict.iteritems()] version_list = sorted(version_list, key=lambda x: x[1], reverse=True) with open('ontonotes-sensefreq-inventory.tab', 'w') as f: for key, val in version_list: f.write("{}\t{}\n".format(key, val)) return word_sense_dict, ita_less_90
def annotation_process(): d = get_inventory_info() annotated_files = find_files(annotations_path, "*.sense") pos_file = gzip.open('on.pos.gz', 'w') inst_num_dict = dd(lambda: count(1)) for num_processed, fn in enumerate(annotated_files): if num_processed % 1000 == 0: print >> sys.stderr, "{} files processed".format(num_processed) directory = os.path.dirname(fn) basename = os.path.basename(fn) reader = BracketParseCorpusReader(directory, basename.replace('.sense', '.parse')) fileid = reader.fileids()[0] sentences = dict() parsed_sents = reader.parsed_sents(fileid) for line in open(fn): line = line.split() tw = line[3] onto_sense = line[-1] sent_id, tok_id = int(line[1]), int(line[2]) stuple = sentences.setdefault(sent_id, None) if stuple is None: sentence = parsed_sents[sent_id] clean_sent = [] clean_pos = [] for word, p in sentence.pos(): if p != '-NONE-': if word in fix: word = fix[word] clean_sent.append(word) clean_pos.append(p) sentences[sent_id] = (clean_sent, clean_pos) else: clean_sent, clean_pos = stuple lexicon_senses, version, ita = d[tw][onto_sense] w = tw.replace('-', '.') # following the convention of SemEval m = "{}\t{}.on.{}\t{}-{}-{}\t{}-{}\t{}\t{}\t{}\t{}\t{}" print m.format(w, w, inst_num_dict[tw].next(), line[0], sent_id, tok_id, w, onto_sense, lexicon_senses, version, ita, tok_id, " ".join(clean_sent)) pos_file.write("{}\n".format(clean_pos)) print >> sys.stderr, "{} files processed".format(num_processed)
# NOTE(review): this chunk is the tail of a file-generation routine plus
# top-level driver code; the enclosing function header is not visible here,
# so the code is left untouched.
# NOTE(review): `reader`, `parse_file`, `sentids`, `c`, `files`, `path`,
# `extension` and `words` are defined outside this view — verify their
# definitions before refactoring. The `exit()` after the debug prints makes
# the `create_files(d)` call below unreachable — presumably leftover
# debugging; confirm before removing.
sentences = reader.parsed_sents(parse_file) for sentid, triple in sentids.viewitems(): sentence = sentences[sentid] clean_sent_list = [] clean_pos_list = [] for word, p in sentence.pos(): if p != '-NONE-': if word in fix: word = fix[word] clean_sent_list.append(word) clean_pos_list.append(p) for w, tid, senseid in triple: t = clean_sent_list[tid] p = clean_pos_list[tid] w = w.replace('-', '.') mm = "line-{}\t{}\t{}\t{}\t{}\t{}\t{}".format(c, t, c, tid, p, w, tid) ss = "line-{}\t{}\t{}\t{}".format(c, t, w, senseid) print mm write2file(files, [clean_pos_list, clean_sent_list, [ss]]) c += 1 map(lambda f: f.close(), files) #path = "../data/ontonotes_v5/data/files/data/english/annotations/bc/p2.5_a2e/00/" annotated_files = find_files(path, extension) d = get_parse_file_dict(annotated_files, words) print d.keys()[0] print d[d.keys()[0]] exit() print >> sys.stderr, "Dict created: # of keys: {}".format(len(d)) create_files(d)
""" Method processes the index.sense file """ # hood%1:15:00:: 08641944 1 0 d = dd(dict) for line in open(fn): line = line.split() sense_id = line[0] offset, sense_no, freq = line[1:] #print tw, sense_id, offset, sense_no, freq d[sense_id] = sense_no return d onto_dict = get_inventory_info() index_senses = index_sense_process() ans_files = find_files('ims/on/testing-output', '*.ans') not_in_WN3 = [] for fn in ans_files: for line in open(fn): line = line.split() tw = line[0] key = line[-1] if key in index_senses: wn_s = index_senses[key] if wn_s not in onto_dict[tw]: wn_s = 'no_lexicon_sense' try: sense = onto_dict[tw][wn_s] except KeyError: print >> sys.stderr, tw, key, wn_s
#on.n.XYv.128.score #on.n.X-1sc100.2.score #regex = re.compile('F.*Score.*(0\.\d+)') pattern = sys.argv[1] # embedding type directory = sys.argv[2] regex = re.compile('.*F.*Score.*(0\.\d+).*') #fn_regex = re.compile('scores/on\.(\w)\.(\w+)\.(\d+)\.score') fn_regex = re.compile('scores/on\.(\w)\.(.*)\.(\d+)\.score') print directory, pattern def file_sort(fn): return int(fn.split('.')[3]) files = find_files(directory, '*%s*.score' % pattern) for f in sorted(files, key=file_sort): results = [] for line in open(f): match = regex.match(line) if match: results.append(match.group(1)) fn_match = fn_regex.match(f) print "%s-%s\t" % (fn_match.group(3), fn_match.group(1)), print '\t'.join(results)