import codecs
import json
import pickle
from collections import Counter, defaultdict
from os.path import exists, join, splitext
from time import sleep, time
from traceback import format_exc

import requests
from pandas import read_csv

# Project-level helpers and constants (ensure_dir, FreqDictionary, good_token,
# tokenize, DailyLimitException, DEFAULT_LANG, BABELNET_ENDPOINT, GET_SYNSET_IDS,
# GET_SYNSET, HEADERS, CODE_OK, LIMIT_MSG, BABELNET_PKL, BABELNET_SLEEP,
# LOWERCASE_BOW, REMOVE_BOW_STOPWORDS, IMAGE_STOP_LIST, _re_norm_babel,
# _re_norm_babel_dash, _re_whitespaces2) are assumed to be imported from
# elsewhere in the project.


class BabelNet(object):
    def __init__(self, babelnet_keys, babelnet_fpath="", freq_fpath="",
                 normalized=True, divide_by_freq=False, force_api=False):
        self._babelnet_keys = babelnet_keys
        self._babelnet_dir = babelnet_fpath
        ensure_dir(self._babelnet_dir)
        self._normalized = normalized
        self._force_api = force_api
        self._freq = FreqDictionary(freq_fpath)
        # format: word -> sense_id -> {"bow", "wnOffsets"}
        self._babelnet = self._load(babelnet_fpath, divide_by_freq=divide_by_freq)

    @property
    def data(self):
        return self._babelnet

    def _load(self, babelnet_fpath, divide_by_freq=False, sanity_check=True):
        """ Loads the pickled word->sense_id->sense cache and normalizes the BoWs. """

        if not exists(babelnet_fpath):
            return defaultdict(dict)

        with open(babelnet_fpath, 'rb') as babelnet_file:
            bn = pickle.load(babelnet_file)

        if sanity_check:
            err_num = 0
            for word in bn:
                if len(bn[word]) <= 0:
                    err_num += 1
                    print "Warning: local word with no senses", word
            if err_num > 0:
                print "Warning:", err_num, "local words with no senses"

        print "Loaded BabelNet with %d words from: %s" % (len(bn), babelnet_fpath)

        self._block_save = False
        if self._normalized:
            for word in bn:
                for sense_id in bn[word]:
                    if divide_by_freq:
                        # weight each token by its inverse corpus frequency and block
                        # saving, so the reweighted cache is never pickled back
                        bow = Counter({w: bn[word][sense_id]["bow"][w] / self._freq.freq(w)
                                       for w in bn[word][sense_id]["bow"] if good_token(w)})
                        self._block_save = True
                    else:
                        bow = bn[word][sense_id]["bow"]

                    # scale the weights to the [0, 1] range
                    max_freq_norm = float(max(bow.values())) if len(bow) > 0 else 1.0
                    if max_freq_norm == 0.0:
                        max_freq_norm = 1.0
                    bow_range_norm = Counter({w: bow[w] / max_freq_norm
                                              for w in bow if good_token(w)})
                    bn[word][sense_id]["bow"] = bow_range_norm

        return bn

    def wn_mapping(self):
        """ Returns a dict word -> babelnet_id -> list of WordNet offset ids. """

        wn_ids = defaultdict(dict)
        for word in self.data.keys():
            for bn_id in self.data[word]:
                if len(self.data[word][bn_id]["wnOffsets"]) == 0:
                    continue
                wn_ids[word][bn_id] = []
                for wnid_dict in self.data[word][bn_id]["wnOffsets"]:
                    if "id" not in wnid_dict:
                        continue
                    wn_ids[word][bn_id].append(wnid_dict["id"])
                if len(wn_ids[word][bn_id]) > 1:
                    print "Warning: more than one mapping", word, bn_id, wn_ids[word][bn_id]

        return wn_ids

    def save(self, babelnet_dir=""):
        """ Pickles the local cache to <babelnet_dir>/BABELNET_PKL. """

        if self._block_save:
            print "Save blocked"
            return
        if babelnet_dir == "" and self._babelnet_dir == "":
            print "Error: specify path to the output file"
            return

        babelnet_fpath = (join(babelnet_dir, BABELNET_PKL) if babelnet_dir != ""
                          else join(self._babelnet_dir, BABELNET_PKL))
        with open(babelnet_fpath, 'wb') as babelnet_file:
            pickle.dump(self._babelnet, babelnet_file)
        print "BabelNet saved to:", babelnet_fpath

    def _get_key(self):
        return self._babelnet_keys[0]

    def _get_synset_ids(self, word, lang=DEFAULT_LANG, pos=""):
        """ Queries the BabelNet API for the synset ids of a word. """

        params = {"word": word, "lang": lang.upper(), "key": self._get_key()}
        response = requests.get(BABELNET_ENDPOINT + GET_SYNSET_IDS,
                                params=params, headers=HEADERS, verify=True)

        if response.status_code == CODE_OK:
            content = json.loads(response.content)
            if content == LIMIT_MSG and len(self._babelnet_keys) > 1:
                # the current key hit its daily limit: drop it and retry with the next one
                self._babelnet_keys.pop(0)
                return self._get_synset_ids(word, lang=lang, pos=pos)
            elif content == LIMIT_MSG:
                raise DailyLimitException()
            else:
                return map(lambda x: x["id"], content)
        else:
            print "Error: cannot process query '%s'. Status code: %d.\n" % (word, response.status_code)
            return []

    def _get_synset(self, synset_id):
        """ Queries the BabelNet API for the full synset of a synset id. """

        params = {"id": synset_id, "key": self._get_key()}
        response = requests.get(BABELNET_ENDPOINT + GET_SYNSET,
                                params=params, headers=HEADERS, verify=True)

        if response.status_code == CODE_OK:
            content = json.loads(response.content)
            if content == LIMIT_MSG:
                print "Error: BabelNet daily limit is over."
                return {}
            else:
                return content
        else:
            print "Error: cannot process query '%s'. Status code: %d.\n" % (synset_id, response.status_code)
            return {}

    def get_wordnet_senseids(self, word):
        """ Returns a list of dicts {'wordnet': sense_id, 'babelnet': sense_id} """

        senses = []
        if word not in self._babelnet:
            return senses

        for babelnet_id in self._babelnet[word]:
            if "wnOffsets" not in self._babelnet[word][babelnet_id]:
                print "Warning:", babelnet_id, "no wnOffsets"
                continue
            if len(self._babelnet[word][babelnet_id]["wnOffsets"]) == 0:
                print "Warning:", babelnet_id, "no wordnet senses"
                continue
            for wn_sense in self._babelnet[word][babelnet_id]["wnOffsets"]:
                if "id" not in wn_sense:
                    print "Warning:", babelnet_id, "no id"
                    continue
                senses.append({'babelnet': babelnet_id, 'wordnet': wn_sense["id"]})

        return senses

    def _normalize(self, word, dash=False):
        """ Lowercases the word and replaces punctuation (optionally dashes) with spaces. """

        word = _re_norm_babel_dash.sub(u" ", word) if dash else _re_norm_babel.sub(u" ", word)
        word = _re_whitespaces2.sub(u" ", word)
        return word.lower().strip()

    def _get_synset_bow(self, synset, lang=DEFAULT_LANG, senses=True, glosses=False,
                        categories=False, image_names=False):
        """ Builds a bag-of-words Counter from the selected fields of a synset. """

        bow = Counter()

        if senses and "senses" in synset:
            for s in synset["senses"]:
                if s["language"] != lang:
                    continue
                lemma = self._normalize(s["lemma"])
                bow[lemma] += 1
                bow.update(lemma.split(" "))
                slemma = self._normalize(s["simpleLemma"])
                bow[slemma] += 1
                bow.update(slemma.split(" "))

        if glosses and "glosses" in synset:
            for s in synset["glosses"]:
                if s["language"] != lang:
                    continue
                bow.update(tokenize(s["gloss"], lowercase=LOWERCASE_BOW,
                                    remove_stopwords=REMOVE_BOW_STOPWORDS))

        if categories and "categories" in synset:
            for s in synset["categories"]:
                if s["language"] != lang:
                    continue
                bow.update(tokenize(self._normalize(s["category"]), lowercase=LOWERCASE_BOW,
                                    remove_stopwords=REMOVE_BOW_STOPWORDS))

        if image_names and "images" in synset:
            names = set(s["name"] for s in synset["images"] if s["name"])
            for name in names:
                bow.update(t for t in tokenize(self._normalize(splitext(name)[0], dash=True))
                           if t not in IMAGE_STOP_LIST)

        return bow

    def fetch_from_voc(self, voc_fpath):
        """ Fetches and caches senses for every word of a tab-separated vocabulary file. """

        for i, row in read_csv(voc_fpath, "\t", encoding='utf8', error_bad_lines=False).iterrows():
            try:
                s = self.get_senses(row.word)  # saving to self._babelnet
                print row.word, len(s)
            except KeyboardInterrupt:
                self.save()
                return
            except DailyLimitException:
                print "Error: Daily limit exceeded"
                self.save()
                return
            except:
                print "Error:", row
                print format_exc()
                break

        self.save()

    def _save_synset(self, word, sid, synset):
        """ Dumps the raw synset JSON to <babelnet_dir>/<word>#<sid>.json. """

        try:
            if not exists(self._babelnet_dir):
                return
            output_fpath = join(self._babelnet_dir, word + "#" + sid + ".json")
            with codecs.open(output_fpath, 'w', "utf-8") as outfile:
                print >> outfile, json.dumps(synset, ensure_ascii=False).decode("utf-8")
        except:
            print "Error saving file"
            print format_exc()

    def get_senses(self, word, lang=DEFAULT_LANG, senses=True, glosses=True, categories=True,
                   image_names=True, verbose=False, min_prob=0.0):
        """ Returns a list of tuples (sense_id, bow), where bow is a Counter and
        sense_id is a unicode string. Uses the local cache when available,
        otherwise queries the BabelNet API. """

        if word in self._babelnet and not self._force_api:
            senses_lst = [(sid, self._babelnet[word][sid]["bow"]) for sid in self._babelnet[word]]
            if verbose:
                print word, ": local"
        else:
            senses_lst = []
            for i, sid in enumerate(self._get_synset_ids(word, lang)):
                tic = time()
                synset = self._get_synset(synset_id=sid)
                if len(synset) == 0:
                    continue
                bow = self._get_synset_bow(synset, senses=senses, glosses=glosses,
                                           categories=categories, image_names=image_names)
                senses_lst.append((sid, bow))
                print "%s#%d\t%s\t%.2f sec." % (word, i, sid, time() - tic)

                self._babelnet[word][sid] = {"bow": bow, "wnOffsets": synset.get("wnOffsets", "")}
                self._save_synset(word, sid, synset)

                if verbose:
                    print "\n%s#%d\t%s\n===================================================" % (word.upper(), i, sid)
                    print "senses=True, glosses=True, categories=True, image_names=True"
                    print self._get_synset_bow(synset, senses=True, glosses=True, categories=True, image_names=True)
                    print "\nsenses=True"
                    print self._get_synset_bow(synset, senses=True, glosses=False, categories=False, image_names=False)
                    print "\nglosses=True"
                    print self._get_synset_bow(synset, senses=False, glosses=True, categories=False, image_names=False)
                    print "\ncategories=True"
                    print self._get_synset_bow(synset, senses=False, glosses=False, categories=True, image_names=False)
                    print "\nimage_names=True"
                    print self._get_synset_bow(synset, senses=False, glosses=False, categories=False, image_names=True)
                    print word, ": api"

                # be polite to the API: pause between consecutive requests
                sleep(BABELNET_SLEEP)

        return senses_lst

    def get_cluster(self, word, sense_id):
        """ Returns the cached bag-of-words of a sense, or an empty list if unknown. """

        if word in self._babelnet and sense_id in self._babelnet[word]:
            return self._babelnet[word][sense_id]["bow"]
        else:
            return []