Example 1
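A Python 2 wrapper around the BabelNet HTTP API: it builds bag-of-words representations of word senses and caches them locally as a pickled word -> sense_id -> {"bow", "wnOffsets"} dictionary.
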
class BabelNet(object):
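    """ Wrapper around the BabelNet HTTP API that builds bag-of-words
    representations of word senses and caches them locally as a pickled
    dictionary: word -> sense_id -> {"bow": Counter, "wnOffsets": list}. """
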
    def __init__(self, babelnet_keys, babelnet_fpath="", freq_fpath="", normalized=True, divide_by_freq=False, force_api=False):
        self._babelnet_keys = babelnet_keys
        self._babelnet_dir = babelnet_fpath
        ensure_dir(self._babelnet_dir)
        self._normalized = normalized
        self._force_api = force_api
        self._freq = FreqDictionary(freq_fpath)
        self._babelnet = self._load(babelnet_fpath, divide_by_freq=divide_by_freq)  # format: word -> sense_id -> {"bow": ..., "wnOffsets": ...}

    @property
    def data(self):
        return self._babelnet

    def _load(self, babelnet_fpath, divide_by_freq=False, sanity_check=True):
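        """ Loads the pickled sense cache, returning an empty defaultdict if the
        file does not exist. Optionally warns about cached words with no senses
        and range-normalizes each bag-of-words to [0, 1]; with divide_by_freq,
        token counts are first divided by their corpus frequency, which blocks
        saving the modified cache back to disk. """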
        if not exists(babelnet_fpath): return defaultdict(dict)

        with open(babelnet_fpath, 'rb') as babelnet_file:
            bn = pickle.load(babelnet_file)

        if sanity_check:
            err_num = 0
            for word in bn:
                if len(bn[word]) <= 0:
                    err_num += 1
                    print "Warning: local word with no senses", word
            if err_num > 0:
                print "Warning:", err_num, "local words with no senses"

            print "Loaded BabelNet with %d words from: %s" % (len(bn), babelnet_fpath)

        self._block_save = False
        if self._normalized:
            for word in bn:
                for sense_id in bn[word]:
                    if divide_by_freq:
                        bow = Counter({w: bn[word][sense_id]["bow"][w] / self._freq.freq(w) for w in bn[word][sense_id]["bow"] if good_token(w)})
                        self._block_save = True
                    else:
                        bow = bn[word][sense_id]["bow"]

                    max_freq_norm = float(max(bow.values())) if len(bow) > 0 else 1.0
                    if max_freq_norm == 0.0: max_freq_norm = 1.0
                    bow_range_norm = Counter({w: bow[w] / max_freq_norm for w in bow if good_token(w)})

                    bn[word][sense_id]["bow"] = bow_range_norm

        return bn

    def wn_mapping(self):
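        """ Returns a mapping word -> BabelNet sense id -> list of WordNet ids
        extracted from the cached wnOffsets. """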
        words = self.data.keys()
        wn_ids = defaultdict(dict)
        for word in words:
            for bn_id in self.data[word]:
                if len(self.data[word][bn_id]["wnOffsets"]) == 0: continue
                wn_ids[word][bn_id] = []
                for wnid_dict in self.data[word][bn_id]["wnOffsets"]:
                    if "id" in wnid_dict:
                        wn_ids[word][bn_id].append(wnid_dict["id"])

                if len(wn_ids[word][bn_id]) > 1:
                    print "Warning: more than one mapping", word, bn_id, wn_ids[word][bn_id]

        return wn_ids

    def save(self, babelnet_dir=""):
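        """ Pickles the in-memory sense cache to <dir>/BABELNET_PKL, unless
        saving was blocked by a frequency-weighted load. """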
        if self._block_save:
            print "Save blocked"
            return

        if babelnet_dir == "" and self._babelnet_dir == "":
            print "Error: specify path to the output file"
            return
        babelnet_fpath = join(babelnet_dir, BABELNET_PKL) if babelnet_dir != "" else join(self._babelnet_dir, BABELNET_PKL)
        with open(babelnet_fpath, 'wb') as babelnet_file:
            pickle.dump(self._babelnet, babelnet_file)

        print "BabelNet saved to:", babelnet_fpath

    def _get_key(self):
        return self._babelnet_keys[0]

    def _get_synset_ids(self, word, lang=DEFAULT_LANG, pos=""):
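        """ Queries the synset ids of a word from the BabelNet API. When the
        current key hits its daily limit, retires it and retries with the next
        key, raising DailyLimitException once all keys are exhausted. """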
        params = {"word": word, "lang":lang.upper(), "key": self._get_key()}
        response = requests.get(BABELNET_ENDPOINT + GET_SYNSET_IDS, params=params, headers=HEADERS, verify=True)

        if response.status_code == CODE_OK:
            content = json.loads(response.content)
            if content == LIMIT_MSG and len(self._babelnet_keys) > 1:
                self._babelnet_keys.pop(0)
                return self._get_synset_ids(word, lang=lang, pos=pos)
            elif content == LIMIT_MSG:
                raise DailyLimitException()
            else:
                return [x["id"] for x in content]
        else:
            print "Error: cannot process query '%s'. Status code: %d.\n" % (word, response.status_code)
            return []

    def _get_synset(self, synset_id):
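        """ Retrieves a synset by id from the BabelNet API; returns an empty
        dict on errors or when the daily limit is reached. """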
        params = {"id": synset_id, "key": self._get_key()}
        response = requests.get(BABELNET_ENDPOINT + GET_SYNSET, params=params, headers=HEADERS, verify=True)

        if response.status_code == CODE_OK:
            content = json.loads(response.content)
            if content == LIMIT_MSG:
                print "Error: BabelNet daily limit is over."
                return {}
            else:
                return content
        else:
            print "Error: cannot process query '%s'. Status code: %d.\n" % (word, response.status_code)
            return {}

    def get_wordnet_senseids(self, word):
        """ Returns a list of dicts {'wordnet': sense_id, 'babelnet': sense_id} """

        senses = []
        if word not in self._babelnet: return senses

        for babelnet_id in self._babelnet[word]:
            if "wnOffsets" not in self._babelnet[word][babelnet_id]:
                print "Warning:", babelnet_id, "no wnOffsets"
                continue

            if len(self._babelnet[word][babelnet_id]["wnOffsets"]) == 0:
                print "Warning:", babelnet_id, "no wordnet senses"
                continue

            for wn_sense in self._babelnet[word][babelnet_id]["wnOffsets"]:
                if "id" not in wn_sense:
                    print "Warning:", babelnet_id, "no id"
                    continue
                senses.append({'babelnet': babelnet_id, 'wordnet': wn_sense["id"]})

        return senses


    def _normalize(self, word, dash=False):
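        """ Replaces characters matched by the module-level normalization
        regexes with spaces, collapses whitespace, and lowercases the word. """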
        word = _re_norm_babel_dash.sub(u" ", word) if dash else _re_norm_babel.sub(u" ", word)
        word = _re_whitespaces2.sub(u" ", word)
        return word.lower().strip()

    def _get_synset_bow(self, synset, lang=DEFAULT_LANG, senses=True, glosses=False, categories=False, image_names=False):
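        """ Builds a bag-of-words Counter for a synset from its senses and,
        optionally, its glosses, categories, and image file names. """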
        bow = Counter()
        if senses and "senses" in synset:
            for s in synset["senses"]:
                if s["language"] != lang: continue
                lemma = self._normalize(s["lemma"])
                bow[lemma] += 1
                bow.update(lemma.split(" "))
                slemma = self._normalize(s["simpleLemma"])
                bow[slemma] += 1
                bow.update(slemma.split(" "))

        if glosses and "glosses" in synset:
            for s in synset["glosses"]:
                if s["language"] != lang: continue
                bow.update(tokenize(s["gloss"], lowercase=LOWERCASE_BOW, remove_stopwords=REMOVE_BOW_STOPWORDS))

        if categories and "categories" in synset:
            for s in synset["categories"]:
                if s["language"] != lang: continue
                bow.update(tokenize(self._normalize(s["category"]), lowercase=LOWERCASE_BOW, remove_stopwords=REMOVE_BOW_STOPWORDS))

        if image_names and "images" in synset:
            names = set(s["name"] for s in synset["images"] if s["name"])
            for name in names:
                tokens = tokenize(self._normalize(splitext(name)[0], dash=True))
                bow.update(t for t in tokens if t not in IMAGE_STOP_LIST)

        return bow

    def fetch_from_voc(self, voc_fpath):
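        """ Downloads and caches senses for every word of a tab-separated
        vocabulary file, saving the cache on interruption, on daily-limit
        exhaustion, and on completion. """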
        for i, row in read_csv(voc_fpath, "\t", encoding='utf8', error_bad_lines=False).iterrows():
            try:
                s = self.get_senses(row.word)  # saving to self._babelnet
                print row.word, len(s) 
            except KeyboardInterrupt:
                self.save()
                return
            except DailyLimitException:
                print "Error: Daily limit exceeded"
                self.save()
                return
            except:
                print "Error:", row
                print format_exc()
                break
        self.save()


    def _save_synset(self, word, sid, synset):
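        """ Dumps the raw synset JSON to <babelnet_dir>/<word>#<sense_id>.json. """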
        try:
            if not exists(self._babelnet_dir): return
            output_fpath = join(self._babelnet_dir, word + "#" + sid + ".json")
            with codecs.open(output_fpath, 'w', "utf-8") as outfile:
                print >> outfile, json.dumps(synset, ensure_ascii=False).decode("utf-8")
        except:
            print "Error saving file"
            print format_exc()

    def get_senses(self, word, lang=DEFAULT_LANG, senses=True, glosses=True, categories=True, image_names=True,
                   verbose=False, min_prob=0.0):
        """ Returns a list of tuples (sense_id, bow), where bow is a Counter and sense_id is a unicode """

        if word in self._babelnet and not self._force_api:
            senses_lst = [(sid, self._babelnet[word][sid]["bow"]) for sid in self._babelnet[word]]
            if verbose: print word, ": local"
        else:
            senses_lst = []
            for i, sid in enumerate(self._get_synset_ids(word, lang)):
                tic = time()
                synset = self._get_synset(synset_id=sid)
                if len(synset) == 0: continue
                bow = self._get_synset_bow(synset, senses=senses, glosses=glosses, categories=categories, image_names=image_names)
                senses_lst.append((sid, bow))
                print "%s#%d\t%s\t%.2f sec." % (word, i, sid, time()-tic)

                self._babelnet[word][sid] = {"bow": bow, "wnOffsets": synset.get("wnOffsets", [])}
                self._save_synset(word, sid, synset)

                if verbose:
                    print "\n%s#%d\t%s\n===================================================" % (word.upper(), i, sid)
                    print "senses=True, glosses=True, categories=True, image_names=True"
                    print self._get_synset_bow(synset, senses=True, glosses=True, categories=True, image_names=True)

                    print "\nsenses=True"
                    print self._get_synset_bow(synset, senses=True, glosses=False, categories=False, image_names=False)

                    print "\nglosses=True"
                    print self._get_synset_bow(synset, senses=False, glosses=True, categories=False, image_names=False)

                    print "\ncategories=True"
                    print self._get_synset_bow(synset, senses=False, glosses=False, categories=True, image_names=False)

                    print "\nimage_names=True"
                    print self._get_synset_bow(synset, senses=False, glosses=False, categories=False, image_names=True)

            print word, ": api"
            sleep(BABELNET_SLEEP)

        return senses_lst

    def get_cluster(self, word, sense_id):
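        """ Returns the cached bag-of-words of the given sense, or an empty
        list if the word or sense is unknown. """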
        if word in self._babelnet and sense_id in self._babelnet[word]:
            return self._babelnet[word][sense_id]["bow"]
        else: return []
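
A minimal usage sketch, assuming the module-level dependencies of the class (FreqDictionary, ensure_dir, tokenize, the endpoint constants, and so on) are importable; the API key, cache directory, and query word below are placeholders, not real values:

# Hypothetical usage; "YOUR_KEY" is a placeholder, and babelnet_fpath is left
# at its default to start from an empty in-memory cache.
bn = BabelNet(babelnet_keys=["YOUR_KEY"])

# The first call for a word queries the API and caches the senses in memory;
# later calls for the same word are answered from the local cache.
for sense_id, bow in bn.get_senses("python"):
    print sense_id, bow.most_common(5)  # bow is a Counter over normalized tokens

# Persist the cache as a pickle (the target directory is assumed to exist).
bn.save("babelnet_cache")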