# Align BabelNet synsets with AdaGram sense clusters for a small test
# vocabulary and write the resulting mapping to a CSV next to the input.

from jnt.matching.synset_fetchers import BabelNet, SenseClusters
from jnt.matching.matcher import Matcher

BABELNET_KEY = ""  # insert a BabelNet API key here (not required)

# Input resources.
# wget http://panchenko.me/data/joint/adagram/data/babelnet-bow-5190.pkl
ADAGRAM_BOW = "data/ddt-adagram-50.csv"      # bow with Adagram nearest neighbours
BABELNET_BOW = "data/babelnet-bow-5190.pkl"
VOC = "data/voc-50.csv"                      # the mapping will be performed for these words
OUTPUT = VOC + ".match.csv"

# Load the two sense inventories, then match them word by word.
babelnet_senses = SenseClusters  # placeholder removed below; see actual construction
del babelnet_senses

babelnet_inventory = BabelNet(babelnet_keys=[BABELNET_KEY], babelnet_fpath=BABELNET_BOW)
adagram_inventory = SenseClusters(sense_clusters_fpath=ADAGRAM_BOW, strip_dst_senses=True)

matcher = Matcher(babelnet_inventory, adagram_inventory)
matcher.match_file(words_fpath=VOC, output_fpath=OUTPUT)
# match_fpath = voc_fpath + "-match-p%d.csv" % p # m.match_file(voc_fpath, match_fpath, threshold_percentile=p) # df = read_csv(match_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False) # res["matches"].append(len(df)) # res["words"].append(len(set(df.word.values))) # res["matches/words"].append(float(len(df)) / len(set(df.word.values))) # res["p"].append(p) # l = [len(rows) for word, rows in df.groupby(["word"])] # res["std"].append(std(l)) from jnt.matching.synset_fetchers import BabelNet, SenseClusters, BABELNET_KEYS from jnt.matching.matcher import Matcher BABELNET_DIR = "/Users/alex/tmp/matching/babelnet-eval/" ADAGRAM_DDT = "/Users/alex/tmp/matching/ddt-adagram-ukwac+wacky.csv.gz.voc.out" voc_fpath = "/Users/alex/work/joint/src/data/ambigous-words-mine.csv" freq_fpath = "" # "/Users/alex/tmp/st/word-freq-t10.csv" from time import time tic = time() babelnet = BabelNet(babelnet_keys=BABELNET_KEYS, babelnet_fpath=BABELNET_DIR, freq_fpath=freq_fpath, divide_by_freq=False) print "BabelNet load time:", time()-tic tic = time() adagram = SenseClusters(ADAGRAM_DDT, normalized_bow=True) print "SenseClusters load time:", time()-tic m = Matcher(babelnet, adagram) m.match_file(voc_fpath, voc_fpath + "-match.csv")