Esempio n. 1
0
def create_mapping(babelnet_dir, adagram_fpath, match_fpath, threshold=THRESHOLD):
    print ">>> write babelnet vocabulary"
    babelnet = BabelNet(babelnet_keys=BABELNET_KEYS, babelnet_fpath=babelnet_dir)
    babelnet_voc_fpath = join(babelnet_dir, "voc.csv")
    with codecs.open(babelnet_voc_fpath, "w", "utf-8") as out:
        out.write("word\n")
        for w in babelnet.data: out.write("%s\n" % w)
    print "BabelNet vocabulary:", babelnet_voc_fpath

    print ">>> make a subset of adagram"
    adagram_voc_fpath = adagram_fpath + "-voc.csv"
    filter_ddt_by_voc(adagram_fpath, babelnet_voc_fpath, adagram_voc_fpath)

    print ">>> calculare similarities between all words"
    adagram = SenseClusters(adagram_voc_fpath, normalized_bow=True)
    m = Matcher(babelnet, adagram)
    match_all_fpath = match_fpath + "-all.csv"
    m.match_file(babelnet_voc_fpath, match_all_fpath, threshold_percentile=0.0)

    print ">>> threshold the similarity"
    match_df = read_csv(match_all_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False)
    all_words_num = len(set(match_df.word))
    candidates_num = len(match_df)
    match_df = match_df[match_df.sim >= threshold]
    print "# of mapping candidates", candidates_num
    print "# of mappings:", len(match_df)
    print "# total words", all_words_num
    print "# mapped words", len(set(match_df.word))
    raw_match_fpath =  match_fpath + "-raw.csv"
    match_df.to_csv(raw_match_fpath, sep="\t", encoding="utf-8", float_format='%.3f', index=False)
    print "Raw mapping file:", raw_match_fpath

    print ">>> map to wordnet and reformat"
    wordnet = WordNetOffsets()
    bn2wn = babelnet.wn_mapping()
    df = read_csv(raw_match_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False)
    df["wordnet_id"] = Series("", df.index)
    df["wordnet_cluster"] = Series("", df.index)

    for i, row in df.iterrows():
        bn_id = row.sense_i

        if row.word in bn2wn and bn_id in bn2wn[row.word] and len(bn2wn[row.word][bn_id]) > 0:
            wn_id = bn2wn[row.word][bn_id][0]
            df.loc[i,"wordnet_id"] = wn_id
            s = wordnet.get(wn_id)
            if s is not None:
                df.loc[i,"wordnet_cluster"] = s.definition() + u" " + ". ".join(s.examples())
            if len(bn2wn[row.word][bn_id]) > 1: print "*",

    df.columns = [u'word', u'babelnet_id', u"adagram_id", u"babelnet_adagram_sim",
                  u"babelnet_cluster", u"adagram_cluster", u"wordnet_id", "wordnet_cluster"]
    df = df[[u'word', u'babelnet_id', u"adagram_id", u"wordnet_id", u"babelnet_adagram_sim",
                  u"babelnet_cluster", u"adagram_cluster", u"wordnet_cluster"]]
    df.to_csv(match_fpath, sep="\t", encoding="utf-8", float_format='%.3f', index=False)

    print "Final mapping:", match_fpath
Esempio n. 2
0
from jnt.matching.synset_fetchers import BabelNet, BABELNET_KEYS

# Locations of the cached BabelNet data and of the word-frequency list.
babelnet_dir = "/Users/alex/tmp/matching/babelnet/"
freq_fpath = "/Users/alex/tmp/st/word-freq-t10.csv"

# Load the BabelNet sense inventory from the local cache.
babelnet = BabelNet(babelnet_keys=BABELNET_KEYS,
                    babelnet_fpath=babelnet_dir,
                    freq_fpath=freq_fpath)

Esempio n. 3
0
#     match_fpath = voc_fpath + "-match-p%d.csv" % p
#     m.match_file(voc_fpath, match_fpath, threshold_percentile=p)
#     df = read_csv(match_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False)
#     res["matches"].append(len(df))
#     res["words"].append(len(set(df.word.values)))
#     res["matches/words"].append(float(len(df)) / len(set(df.word.values)))
#     res["p"].append(p)
#     l = [len(rows) for word, rows in df.groupby(["word"])]
#     res["std"].append(std(l))


from jnt.matching.synset_fetchers import BabelNet, SenseClusters, BABELNET_KEYS
from jnt.matching.matcher import Matcher

BABELNET_DIR = "/Users/alex/tmp/matching/babelnet-eval/"
ADAGRAM_DDT = "/Users/alex/tmp/matching/ddt-adagram-ukwac+wacky.csv.gz.voc.out"
voc_fpath = "/Users/alex/work/joint/src/data/ambigous-words-mine.csv"
freq_fpath = "" # "/Users/alex/tmp/st/word-freq-t10.csv"

from time import time
tic = time()
babelnet = BabelNet(babelnet_keys=BABELNET_KEYS, babelnet_fpath=BABELNET_DIR, freq_fpath=freq_fpath, divide_by_freq=False)
print "BabelNet load time:", time()-tic

tic = time()
adagram = SenseClusters(ADAGRAM_DDT, normalized_bow=True)
print "SenseClusters load time:", time()-tic

m = Matcher(babelnet, adagram)
m.match_file(voc_fpath, voc_fpath + "-match.csv")
Esempio n. 4
0
from jnt.matching.synset_fetchers import BabelNet, SenseClusters
from jnt.matching.matcher import Matcher

# Configuration of the demo run.
BABELNET_KEY = ""  # insert a BabelNet API key here (not required)
ADAGRAM_BOW = "data/ddt-adagram-50.csv"  # bow with Adagram nearest neighbours
BABELNET_BOW = "data/babelnet-bow-5190.pkl"  # wget http://panchenko.me/data/joint/adagram/data/babelnet-bow-5190.pkl
VOC = "data/voc-50.csv"  # the mapping will be performed for these words
OUTPUT = VOC + ".match.csv"

# Load both sense inventories, then match every word of VOC.
babelnet = BabelNet(babelnet_keys=[BABELNET_KEY], babelnet_fpath=BABELNET_BOW)
adagram = SenseClusters(sense_clusters_fpath=ADAGRAM_BOW, strip_dst_senses=True)
matcher = Matcher(babelnet, adagram)
matcher.match_file(words_fpath=VOC, output_fpath=OUTPUT)
Esempio n. 5
0
from jnt.common import load_voc
import codecs 
from jnt.matching.synset_fetchers import BabelNet, BABELNET_KEYS
from jnt.common import take

MAX_WORDS = 999

voc_fpath = "/Users/alex/work/joint/src/data/ambigous-words-mine.csv"
output_fpath = voc_fpath + "-babelnet.csv"
babelnet_dir = "/Users/alex/tmp/matching/babelnet-eval/"
adagram_voc_fpath = "/Users/alex/tmp/adagram/HugeModel-voc.csv"


babelnet = BabelNet(babelnet_keys=BABELNET_KEYS, babelnet_fpath=babelnet_dir,
                    freq_fpath="", divide_by_freq=False)
adagram_voc = load_voc(adagram_voc_fpath)
voc = load_voc(voc_fpath)


with codecs.open(output_fpath, "w", "utf-8") as out:
    for word in voc:
        senses = babelnet.get_senses(word)
        for sense_id, bow in senses:
            bow_words = []
            for w in sorted(bow, key=bow.get, reverse=True):
                if w in adagram_voc and w != word:
                    bow_words.append(w) 
            out.write("%s\t%s\t%s\n" % (word, sense_id, ' '.join(take(MAX_WORDS,bow_words))))
        
print "Output:", output_fpath
Esempio n. 6
0
def create_mapping(babelnet_dir,
                   adagram_fpath,
                   match_fpath,
                   threshold=THRESHOLD):
    print ">>> write babelnet vocabulary"
    babelnet = BabelNet(babelnet_keys=BABELNET_KEYS,
                        babelnet_fpath=babelnet_dir)
    babelnet_voc_fpath = join(babelnet_dir, "voc.csv")
    with codecs.open(babelnet_voc_fpath, "w", "utf-8") as out:
        out.write("word\n")
        for w in babelnet.data:
            out.write("%s\n" % w)
    print "BabelNet vocabulary:", babelnet_voc_fpath

    print ">>> make a subset of adagram"
    adagram_voc_fpath = adagram_fpath + "-voc.csv"
    filter_ddt_by_voc(adagram_fpath, babelnet_voc_fpath, adagram_voc_fpath)

    print ">>> calculare similarities between all words"
    adagram = SenseClusters(adagram_voc_fpath, normalized_bow=True)
    m = Matcher(babelnet, adagram)
    match_all_fpath = match_fpath + "-all.csv"
    m.match_file(babelnet_voc_fpath, match_all_fpath, threshold_percentile=0.0)

    print ">>> threshold the similarity"
    match_df = read_csv(match_all_fpath,
                        encoding='utf-8',
                        delimiter="\t",
                        error_bad_lines=False)
    all_words_num = len(set(match_df.word))
    candidates_num = len(match_df)
    match_df = match_df[match_df.sim >= threshold]
    print "# of mapping candidates", candidates_num
    print "# of mappings:", len(match_df)
    print "# total words", all_words_num
    print "# mapped words", len(set(match_df.word))
    raw_match_fpath = match_fpath + "-raw.csv"
    match_df.to_csv(raw_match_fpath,
                    sep="\t",
                    encoding="utf-8",
                    float_format='%.3f',
                    index=False)
    print "Raw mapping file:", raw_match_fpath

    print ">>> map to wordnet and reformat"
    wordnet = WordNetOffsets()
    bn2wn = babelnet.wn_mapping()
    df = read_csv(raw_match_fpath,
                  encoding='utf-8',
                  delimiter="\t",
                  error_bad_lines=False)
    df["wordnet_id"] = Series("", df.index)
    df["wordnet_cluster"] = Series("", df.index)

    for i, row in df.iterrows():
        bn_id = row.sense_i

        if row.word in bn2wn and bn_id in bn2wn[row.word] and len(
                bn2wn[row.word][bn_id]) > 0:
            wn_id = bn2wn[row.word][bn_id][0]
            df.loc[i, "wordnet_id"] = wn_id
            s = wordnet.get(wn_id)
            if s is not None:
                df.loc[i,
                       "wordnet_cluster"] = s.definition() + u" " + ". ".join(
                           s.examples())
            if len(bn2wn[row.word][bn_id]) > 1: print "*",

    df.columns = [
        u'word', u'babelnet_id', u"adagram_id", u"babelnet_adagram_sim",
        u"babelnet_cluster", u"adagram_cluster", u"wordnet_id",
        "wordnet_cluster"
    ]
    df = df[[
        u'word', u'babelnet_id', u"adagram_id", u"wordnet_id",
        u"babelnet_adagram_sim", u"babelnet_cluster", u"adagram_cluster",
        u"wordnet_cluster"
    ]]
    df.to_csv(match_fpath,
              sep="\t",
              encoding="utf-8",
              float_format='%.3f',
              index=False)

    print "Final mapping:", match_fpath