Ejemplo n.º 1
0
def get_inventory_info():
    """Collect the WordNet mapping of every sense in the sense inventory.

    Every ``*.xml`` file found under ``inventory_path`` is parsed and each of
    its ``<sense>`` elements is recorded.

    Returns:
        The nested mapping created with ``dd(dict)``: the first key is the
        inventory file name without ``.xml``, the second key is the sense's
        ``n`` attribute, and the value is the list
        ``[wn_senses, version, ita_score]``.
    """
    d = dd(dict)
    num_files = 0  # stays 0 when the inventory is empty
    for num_processed, f in enumerate(find_files(inventory_path, "*.xml")):
        if num_processed % 1000 == 0:
            print >> sys.stderr, "{} files processed".format(num_processed)
        num_files = num_processed + 1
        fn = os.path.basename(f).replace('.xml', '')

        # Parse inside a ``with`` block so the handle is released right away
        # instead of leaking one open file per inventory entry.
        with open(f) as inventory_file:
            soup = BeautifulSoup(inventory_file, 'xml')

        # Inter-annotator agreement is looked up on the whole document, so it
        # is the same for every sense of this file: compute it once.
        ita = soup.findAll('ita')
        ita_score = ita[0]['ann_1_2_agreement'] if ita else "ITA_UNDEFINED"

        for sense in soup.findAll('sense'):
            onto_key = str(sense['n'])
            sense_name = str(sense['name'])
            wn = sense.findAll('mappings')[0].findAll('wn')[0]
            version = wn['version']
            wn_senses = wn.text.strip()
            #FIXME: None of above sense should be mapped to 3.0 first!
            if sense_name == NONE_OF_ABOVE_SENSE:
                wn_senses = "no_lexicon_sense"
                version = "3.0"
            d[fn][onto_key] = [wn_senses, version, ita_score]

    # Report the number of files, not the index of the last one.
    print >> sys.stderr, "{} files processed".format(num_files)
    return d
Ejemplo n.º 2
0
def get_inventory_info():
    """Map WordNet sense ids to inventory sense keys for every target word.

    Every ``*.xml`` file found under ``inventory_path`` is parsed. Only the
    ``<wn>`` mappings whose version is contained in ``wn_set`` are kept.

    Returns:
        The nested mapping created with ``dd(dict)``: the first key is the
        target word (file name without ``.xml``, with ``-`` replaced by
        ``.``), the second key is a single WordNet sense id, and the value
        is the ``n`` attribute of the sense that lists it.
    """
    d = dd(dict)
    for num_processed, f in enumerate(find_files(inventory_path, "*.xml")):
        if num_processed % 1000 == 0:
            print >> sys.stderr, "{0} files processed".format(num_processed)

        # Parse inside a ``with`` block so the handle is released right away
        # instead of leaking one open file per inventory entry.
        with open(f) as inventory_file:
            soup = BeautifulSoup(inventory_file, 'xml')

        fn = os.path.basename(f).replace('.xml', '')
        target_word = fn.replace('-', '.')

        for sense in soup.findAll('sense'):
            onto_key = str(sense['n'])
            sense_name = str(sense['name'])
            wn = sense.findAll('mappings')[0].findAll('wn')[0]
            version = wn['version']
            wn_senses = wn.text.strip()
            #FIXME: None of above sense should be mapped to 3.0 first!
            if sense_name == NONE_OF_ABOVE_SENSE:
                wn_senses = "no_lexicon_sense"
                version = "3.0"

            if version in wn_set:
                for wn_s in wn_senses.split(','):
                    # Strip each id so that "1, 2" yields "2" and not " 2".
                    d[target_word][str(wn_s.strip())] = onto_key
    return d
Ejemplo n.º 3
0
def process_sense_annotation():
    
    print >> sys.stderr, "Sense Annotation processing started"

    word_sense_dict, ita_less_90 = get_sense_mappings()
    sense_freq = dd(lambda : count(0)) # sense freqs (wn3.0, wn2.0 etc) for annotation
    word_freq = dd(lambda : count(0)) # words frequency in annontation
    pos_dict = dd(lambda : count(0)) # pos distribution for annotation.
    num_adjudicated = 0 # Number of instance that adjudicated
    pattern = "*.sense"
    annotated_files = find_files(annotations_path, pattern)
    num_word_processed = 0 
    ita_less90_count = 0
    for num_processed, annotated_file in enumerate(annotated_files):
        #fn = annotated_file.replace(annotations_path, "")
        for line in open(annotated_file):
            line = line.split()
            num_word_processed += +1
            if len(line) == 6:
                num_adjudicated += 1
            word = line[3]
            pos_tag = word[-1]
            pos_dict[pos_tag].next()
            sense_tag = line[-1]
            word_freq[word].next()

            version = word_sense_dict[word][sense_tag]
            sense_freq[version].next()

            if word in ita_less_90:
                ita_less90_count += 1

        if num_processed % 3000 == 0:
            print >> sys.stderr, "{} files processed".format(num_processed)

    ### Printing: Pos Info in annotated corpus  ###
    num_noun = pos_dict['n'].next()
    num_verb = pos_dict['v'].next()
    pos_msg = "Noun\tVerb\tNoun+Verb\tTotalWord\n{}\t{}\t{}\t{}"
    print pos_msg.format(num_noun, num_verb, num_verb + num_noun, num_word_processed)

    ### Printing: Number of Adjudicated word
    print "Number of adjudicated case: {}".format(num_adjudicated)

    ### Writing: sense frequency in annotated data 
    sensefreq_list = [(key, val.next()) for key, val in sense_freq.iteritems()]
    sensefreq_list = sorted(sensefreq_list, key=lambda x: x[1], reverse=True)
    with open('ontonotes-sensefreq-annotation.tab', 'w') as f:
        for key, val in sensefreq_list:
             f.write("{}\t{}\n".format(key, val))
    m = "Number of words that have <90 ita score {} in annotated data"
    print m.format(ita_less90_count)
    print >> sys.stderr, "Sense Annotation processing finished"
Ejemplo n.º 4
0
def get_filtered_set(is_only_wn=True, ita_threshold=.85):
    inventory_files = find_files(inventory, "*.xml")
    for num_processed, f in enumerate(inventory_files):
        fn = os.path.basename(f).replace('.xml', '')
        if num_processed % 1000 == 0:
            print >> sys.stderr, "{} files processed".format(num_processed)
        soup = BeautifulSoup(open(f), 'xml')
        ita = soup.findAll('ita') # inter-annotator agreement
        if len(ita) != 0:
            ita_score = float(ita[0]['ann_1_2_agreement'])
            if ita_score > ita_threshold:
                versions = [wn['version'] == '3.0' for wn in soup.findAll('wn')[:-1]]
                if all(versions):
                    print fn, versions, ita_score
Ejemplo n.º 5
0
def get_sense_mappings():
    
    version_dict = dd(lambda : count(0)) # keep tracking the annotation version
    word_sense_dict = dd(dict) # keep tracking words' senses' version
    inventory_files = find_files(inventory, "*.xml")
    # ITA INF below: [#ofinstance, #total_ita_score, #of word <90, #total_score_for_<90]
    ita_inf = [0, 0, 0, 0] 
    nsense = [0, 0]
    ita_less_90 = set()
    for num_processed, f in enumerate(inventory_files):
        fn = os.path.basename(f).replace('.xml', '')
        if num_processed % 1000 == 0:
            print >> sys.stderr, "{} files processed".format(num_processed)
        soup = BeautifulSoup(open(f), 'xml')
        senses = soup.findAll('sense')

        nsense[0] += len(senses)
        nsense[1] += 1

        for sense in senses:
            key = str(sense['n'])
            mapping = sense.findAll('mappings')[0]
            wn = mapping.findAll('wn')[0]
            version = wn['version']
            word_sense_dict[fn][key] = version
            wn_senses = wn.text.split(',') # maybe we can use it later
            version_dict[version].next()
        ita = soup.findAll('ita') # inter-annotator agreement
        if len(ita) != 0:
            ita_inf[0] += 1
            ita_score = float(ita[0]['ann_1_2_agreement'])
            ita_inf[1] += ita_score
            if ita_score < 0.9:
                ita_inf[2] += 1
                ita_inf[3] += ita_score
                ita_less_90.add(fn)

    ita_inf[1] = ita_inf[1] / ita_inf[0] # averaging.for all instance ita score
    ita_inf[3] = ita_inf[3] / ita_inf[2] # averaging.ita score for word lower than 0.9
    print >> sys.stderr, "{} files processed (total)".format(num_processed)
    print "ITA informations: {}".format(ita_inf)
    avg_sense = nsense[0] / float(nsense[1])
    print "total sense: {}, avg # of sense: {}".format(nsense[0], avg_sense)
    version_list = [(key, val.next()) for key, val in version_dict.iteritems()]
    version_list = sorted(version_list, key=lambda x: x[1], reverse=True)
    with open('ontonotes-sensefreq-inventory.tab', 'w') as f:
        for key, val in version_list:
             f.write("{}\t{}\n".format(key, val))
    return word_sense_dict, ita_less_90
Ejemplo n.º 6
0
def annotation_process():
    d = get_inventory_info()
    annotated_files = find_files(annotations_path, "*.sense")
    pos_file = gzip.open('on.pos.gz', 'w')
    inst_num_dict = dd(lambda: count(1))
    for num_processed, fn in enumerate(annotated_files):
        if num_processed % 1000 == 0:
            print >> sys.stderr, "{} files processed".format(num_processed)
        directory = os.path.dirname(fn)
        basename = os.path.basename(fn)
        reader = BracketParseCorpusReader(directory, basename.replace('.sense', '.parse'))
        fileid = reader.fileids()[0]
        sentences = dict()
        parsed_sents = reader.parsed_sents(fileid)
        for line in open(fn):
            line = line.split()
            tw = line[3]
            onto_sense = line[-1]
            sent_id, tok_id = int(line[1]), int(line[2])
            stuple = sentences.setdefault(sent_id, None)
            if stuple is None:
                sentence = parsed_sents[sent_id]
                clean_sent = []
                clean_pos = []
                for word, p in sentence.pos():
                    if p != '-NONE-':
                        if word in fix:
                            word = fix[word]
                        clean_sent.append(word)
                        clean_pos.append(p)
                sentences[sent_id] = (clean_sent, clean_pos)
            else:
                clean_sent, clean_pos = stuple
            lexicon_senses, version, ita = d[tw][onto_sense]
            w = tw.replace('-', '.') # following the convention of SemEval
            m = "{}\t{}.on.{}\t{}-{}-{}\t{}-{}\t{}\t{}\t{}\t{}\t{}"
            print m.format(w, w, inst_num_dict[tw].next(), line[0], sent_id, tok_id,
                w, onto_sense, lexicon_senses, version, ita, tok_id, " ".join(clean_sent))
            pos_file.write("{}\n".format(clean_pos))
    print >> sys.stderr, "{} files processed".format(num_processed)
Ejemplo n.º 7
0
        sentences = reader.parsed_sents(parse_file)
        for sentid, triple in sentids.viewitems():
            sentence = sentences[sentid]
            clean_sent_list = []
            clean_pos_list = []
            for word, p in sentence.pos():
                if p != '-NONE-':
                    if word in fix:
                        word = fix[word]
                    clean_sent_list.append(word)
                    clean_pos_list.append(p)
            for w, tid, senseid in triple:
                t = clean_sent_list[tid]
                p = clean_pos_list[tid]
                w = w.replace('-', '.')
                mm = "line-{}\t{}\t{}\t{}\t{}\t{}\t{}".format(c, t, c, tid, p, w, tid)
                ss = "line-{}\t{}\t{}\t{}".format(c, t, w, senseid)
                print mm
                write2file(files, [clean_pos_list, clean_sent_list, [ss]])
                c += 1
    map(lambda f: f.close(), files)

# Script entry: build the dictionary from the annotation files and create the
# output files from it.
# Example value for ``path``, kept for reference:
#path = "../data/ontonotes_v5/data/files/data/english/annotations/bc/p2.5_a2e/00/"
annotated_files = find_files(path, extension)
d = get_parse_file_dict(annotated_files, words)
# Debug output: one key of the dictionary and the value stored under it.
print d.keys()[0]
print d[d.keys()[0]]
# NOTE(review): exit() ends the script here, so the two statements below are
# never executed. This looks like a debugging leftover; confirm before removing.
exit()
print >> sys.stderr, "Dict created: # of keys: {}".format(len(d))
create_files(d)
Ejemplo n.º 8
0
    """ Method processes the index.sense file """

    # hood%1:15:00:: 08641944 1 0
    d = dd(dict)
    for line in open(fn):
        line = line.split()
        sense_id = line[0]
        offset, sense_no, freq = line[1:]
        #print tw, sense_id, offset, sense_no, freq
        d[sense_id] = sense_no
    return d

# Lookup tables: onto_dict[target word][WordNet sense number] and
# index_senses[sense key] -> WordNet sense number.
onto_dict = get_inventory_info()
index_senses = index_sense_process()

ans_files = find_files('ims/on/testing-output', '*.ans')

not_in_WN3 = []
for fn in ans_files:
    # ``with`` closes each answer file as soon as it has been read.
    with open(fn) as ans_file:
        for line in ans_file:
            line = line.split()
            if not line:
                # A blank line carries no answer.
                continue
            tw = line[0]  # first field: target word
            key = line[-1]  # last field: sense key of the answer
            if key not in index_senses:
                continue
            wn_s = index_senses[key]
            # Sense numbers the inventory does not list for this word fall
            # back to the 'no_lexicon_sense' entry.
            if wn_s not in onto_dict[tw]:
                wn_s = 'no_lexicon_sense'
            try:
                sense = onto_dict[tw][wn_s]
            except KeyError:
                # Not even the fallback entry exists for this word: report it.
                print >> sys.stderr, tw, key, wn_s
Ejemplo n.º 9
0

#on.n.XYv.128.score
#on.n.X-1sc100.2.score

#regex = re.compile('F.*Score.*(0\.\d+)')

pattern = sys.argv[1] # embedding type
directory = sys.argv[2]

regex = re.compile('.*F.*Score.*(0\.\d+).*')
#fn_regex = re.compile('scores/on\.(\w)\.(\w+)\.(\d+)\.score')
fn_regex = re.compile('scores/on\.(\w)\.(.*)\.(\d+)\.score')


print directory, pattern

def file_sort(fn):
    """Return the integer that precedes the final extension of *fn*.

    For ``scores/on.n.XYv.128.score`` this is ``128``. The component is
    taken from the end of the name, so dots in the directory part or in the
    embedding name do not shift it.

    Raises:
        ValueError: if that component is not an integer.
        IndexError: if *fn* contains no dot at all.
    """
    return int(fn.rsplit('.', 2)[-2])

files = find_files(directory, '*%s*.score' % pattern)
for f in sorted(files, key=file_sort):
    results = []
    for line in open(f):
        match = regex.match(line)
        if match:
            results.append(match.group(1))
    fn_match = fn_regex.match(f)
    print "%s-%s\t" % (fn_match.group(3), fn_match.group(1)),
    print '\t'.join(results)