# Module-level imports assumed by this method snippet; the project-specific
# constants (PERCENTILE, WORDS_SEP, CLUSTER_SIZE), DailyLimitException and the
# take helper come from the surrounding jnt package.
import codecs
import operator
from traceback import format_exc
from pandas import read_csv


def generate_non_matching(self, words_fpath, output_fpath, continue_locally=True,
                          threshold_percentile=PERCENTILE):
    """Write every sense pair of each word that was NOT matched by _match,
    with similarity 0.0, in the same TSV format as match_file."""
    with codecs.open(output_fpath, "w", "utf-8") as output_file:
        print >> output_file, "word\tsense_i\tsense_j\tsim\tsense_i_cluster\tsense_j_cluster"

        df = read_csv(words_fpath, encoding='utf-8', delimiter=WORDS_SEP,
                      error_bad_lines=False)

        for i, row in df.iterrows():
            try:
                senses1 = self._fetcher1.get_senses(row.word)
                senses2 = self._fetcher2.get_senses(row.word)
                res = self._match(row.word, senses1, senses2, q=threshold_percentile)

                for sid1, bow1 in senses1:
                    for sid2, bow2 in senses2:
                        # A pair is non-matching when sid1 was never matched at
                        # all, or sid2 is not among its matches. This must be
                        # 'or', not 'and': with 'and', res[sid1] raises KeyError
                        # whenever sid1 is missing from res.
                        if sid1 not in res or sid2 not in res[sid1]:
                            cluster1 = ','.join(take(CLUSTER_SIZE, [x[0] for x in sorted(
                                self._fetcher1.get_cluster(row.word, sid1).items(),
                                reverse=True, key=operator.itemgetter(1))]))
                            cluster2 = ','.join(take(CLUSTER_SIZE, [x[0] for x in sorted(
                                self._fetcher2.get_cluster(row.word, sid2).items(),
                                reverse=True, key=operator.itemgetter(1))]))
                            output_file.write("%s\t%s\t%s\t%.2f\t%s\t%s\n" % (
                                row.word, sid1, sid2, 0.0, cluster1, cluster2))
            except KeyboardInterrupt:
                print "Keyboard interrupt"
                return
            except DailyLimitException:
                if continue_locally:
                    print "Skipping due to API limit:", row.word
                    continue
                else:
                    print "BabelNet API daily limit reached"
                    return
            except:
                print "Error:", row
                print format_exc()

    print "Non-matching senses:", output_fpath
    return output_fpath
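# Hedged usage sketch (an assumption, not code from the project): the method
# above belongs to a matcher class, hypothetically named SenseMatcher here,
# that holds the two sense inventories as _fetcher1/_fetcher2. The class name
# and file paths below are illustrative only.
#
#   matcher = SenseMatcher(fetcher1=babelnet, fetcher2=adagram)
#   matcher.generate_non_matching("ambiguous-words.csv", "non-matching.csv")
#   # Each output row: word, sense_i, sense_j, 0.00, plus the top
#   # CLUSTER_SIZE cluster words of each sense, tab-separated.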
# Dump, for each ambiguous word in the vocabulary, its BabelNet senses and the
# sense BoW restricted to words that also occur in the AdaGram vocabulary.
import codecs
from jnt.common import load_voc, take
from jnt.matching.synset_fetchers import BabelNet, BABELNET_KEYS

MAX_WORDS = 999
voc_fpath = "/Users/alex/work/joint/src/data/ambigous-words-mine.csv"
output_fpath = voc_fpath + "-babelnet.csv"
babelnet_dir = "/Users/alex/tmp/matching/babelnet-eval/"
adagram_voc_fpath = "/Users/alex/tmp/adagram/HugeModel-voc.csv"

babelnet = BabelNet(babelnet_keys=BABELNET_KEYS, babelnet_fpath=babelnet_dir,
                    freq_fpath="", divide_by_freq=False)
adagram_voc = load_voc(adagram_voc_fpath)
voc = load_voc(voc_fpath)

with codecs.open(output_fpath, "w", "utf-8") as out:
    for word in voc:
        senses = babelnet.get_senses(word)
        for sense_id, bow in senses:
            # Keep only BoW words known to the AdaGram model (highest weight
            # first), excluding the target word itself.
            bow_words = []
            for w in sorted(bow, key=bow.get, reverse=True):
                if w in adagram_voc and w != word:
                    bow_words.append(w)
            out.write("%s\t%s\t%s\n" % (word, sense_id,
                                        ' '.join(take(MAX_WORDS, bow_words))))

print "Output:", output_fpath
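# Hedged sketch (an assumption, not jnt.common's verified source): take is
# used in both snippets above like the standard itertools recipe that returns
# the first n items of an iterable as a list. A minimal equivalent:
from itertools import islice

def take(n, iterable):
    """Return the first n items of the iterable as a list."""
    return list(islice(iterable, n))

# e.g. take(3, "abcdef") -> ['a', 'b', 'c']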
# Same module-level imports as generate_non_matching above (codecs, operator,
# pandas' read_csv, traceback's format_exc).
def match_file(self, words_fpath, output_fpath, continue_locally=True,
               threshold_percentile=PERCENTILE):
    """Write the matched sense pairs of each word, ranked by similarity, as
    TSV: word, sense ids, similarity, and the top cluster words of each sense."""
    with codecs.open(output_fpath, "w", "utf-8") as output_file:
        print >> output_file, "word\tsense_i\tsense_j\tsim\tsense_i_cluster\tsense_j_cluster"

        df = read_csv(words_fpath, encoding='utf-8', delimiter=WORDS_SEP,
                      error_bad_lines=False)

        for i, row in df.iterrows():
            try:
                senses1 = self._fetcher1.get_senses(row.word, min_prob=MIN_SENSE_PROB)
                senses2 = self._fetcher2.get_senses(row.word, min_prob=MIN_SENSE_PROB)
                res = self._match(row.word, senses1, senses2, q=threshold_percentile)

                for sid1 in res:
                    # Emit the matches of sid1 in decreasing order of similarity.
                    for sid2, sim in sorted(res[sid1].items(),
                                            key=operator.itemgetter(1), reverse=True):
                        cluster1 = ','.join(take(CLUSTER_SIZE, [x[0] for x in sorted(
                            self._fetcher1.get_cluster(row.word, sid1).items(),
                            reverse=True, key=operator.itemgetter(1))]))
                        cluster2 = ','.join(take(CLUSTER_SIZE, [x[0] for x in sorted(
                            self._fetcher2.get_cluster(row.word, sid2).items(),
                            reverse=True, key=operator.itemgetter(1))]))
                        output_file.write("%s\t%s\t%s\t%.6f\t%s\t%s\n" % (
                            row.word, sid1, sid2, sim, cluster1, cluster2))
            except KeyboardInterrupt:
                print "Keyboard interrupt"
                return
            except DailyLimitException:
                if continue_locally:
                    print "Skipping due to API limit:", row.word
                    continue
                else:
                    print "BabelNet API daily limit reached"
                    return
            except:
                print "Error:", row
                print format_exc()

    print "Matched senses:", output_fpath
    return output_fpath
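# A hedged, self-contained toy example (sense ids invented here) of the nested
# result shape the loops above assume _match to return:
# {sense_id_1: {sense_id_2: similarity}}.
import operator

res = {"bn:001n": {"adagram#2": 0.83, "adagram#1": 0.12},
       "bn:002n": {"adagram#1": 0.40}}
for sid1 in res:
    for sid2, sim in sorted(res[sid1].items(), key=operator.itemgetter(1),
                            reverse=True):
        print "%s\t%s\t%.6f" % (sid1, sid2, sim)
# Prints, e.g.: bn:001n  adagram#2  0.830000 (highest-similarity match first).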