Esempio n. 1
0
    def __init__(self, params):
        """Load the resources used for scoring and compile the text-cleaning patterns.

        Args:
            params: configuration mapping. The keys read here are ``wiki_path``,
                ``wiki_vectors_path``, ``node2vec_path``, ``n``, ``projection_path``
                and ``language``; the whole mapping is also handed to the parent
                initialiser.
        """
        super().__init__(params)
        self.wiktionary = self.__get_wiktionary(params['wiki_path'])
        # Both vector files are read in the textual (binary=False) word2vec format.
        self.wiki_model = KeyedVectors.load_word2vec_format(params['wiki_vectors_path'], binary=False)
        self.node2vec = KeyedVectors.load_word2vec_format(params["node2vec_path"], binary=False)
        self.n = params['n']
        # self.w2v_data is not assigned in this method; presumably the parent initialiser sets it.
        self.projection = ProjectionVectorizer(self.w2v_data, params["projection_path"])

        # Non-greedy match of one parenthesised span.
        self.delete_bracets = re.compile(r"\(.+?\)")
        # Raw literals: the compiled patterns are unchanged, but "\-" inside a plain string
        # literal is an invalid escape sequence that newer interpreters warn about.
        if params['language'] == 'ru':
            # NOTE(review): the range А-я does not contain "ё"/"Ё" - confirm this is intended.
            self.pattern = re.compile(r"[^А-я \-]")
        else:
            # NOTE(review): A-z also spans the punctuation between "Z" and "a"
            # ([ \ ] ^ _ `). Left untouched because the cleaned strings may have to match
            # keys produced with this exact pattern - confirm before narrowing to A-Za-z.
            self.pattern = re.compile(r"[^A-z \-]")
Esempio n. 2
0
class Node2VecRankedModel(RankedModel):
    """Ranks hypernym candidates gathered from the node2vec neighbours of a projected word."""

    def __init__(self, params):
        """Load the node2vec vectors and build the projection helper.

        Args:
            params: configuration mapping; ``node2vec_path`` and ``projection_path`` are
                read here and the whole mapping is passed to the parent initialiser.
        """
        super().__init__(params)
        self.node2vec = KeyedVectors.load_word2vec_format(params["node2vec_path"], binary=False)
        # self.w2v_data is not assigned in this class; presumably the parent initialiser sets it.
        self.projection = ProjectionVectorizer(self.w2v_data, params["projection_path"])

    def compute_candidates(self, neologism, get_hypernym_fn, get_hyponym_fn, get_taxonomy_name_fn, topn=10):
        """Return at most ``topn`` hypernym candidates for ``neologism``, highest score first.

        Args:
            neologism: the word to attach to the taxonomy.
            get_hypernym_fn: callable returning the hypernyms of a taxonomy node.
            get_hyponym_fn: unused by this model.
            get_taxonomy_name_fn: unused by this model.
            topn: maximum number of candidates returned.

        Returns:
            A list of candidate identifiers ordered by descending score.
        """
        first_order, node2vec_vector = self.generate_node2vec(neologism, get_hypernym_fn, topn)
        second_order = [parent for hypernym in first_order for parent in get_hypernym_fn(hypernym)]
        # The Counter value is the number of times a candidate was proposed; it scales the score.
        all_hypernyms = Counter(first_order + second_order)

        ranked = sorted(all_hypernyms.items(),
                        key=lambda item: self.get_node2vec_score(neologism, node2vec_vector, *item))
        # Reverse an ascending sort instead of using reverse=True: the two differ in how
        # equally-scored candidates are ordered, and the existing order is preserved here.
        ranked.reverse()

        return [candidate for candidate, _ in ranked[:topn]]

    def generate_node2vec(self, neologism, compute_hypernyms, topn=10) -> tuple:
        """Collect the hypernyms of the node2vec neighbours of ``neologism``.

        Args:
            neologism: the word to project into the node2vec space.
            compute_hypernyms: callable returning the hypernyms of a taxonomy node.
            topn: accepted for interface compatibility; it is not forwarded to the projection.

        Returns:
            A ``(hypernyms, node2vec_vector)`` pair: the flat list of hypernyms of every
            neighbour and the second value returned by ``predict_projection_word``.
        """
        neighbours, node2vec_vector = self.projection.predict_projection_word(neologism, self.node2vec)
        # Each neighbour is indexable; element 0 is what gets passed to compute_hypernyms.
        associates = map(itemgetter(0), neighbours)
        hchs = [hypernym for associate in associates for hypernym in compute_hypernyms(associate)]
        return hchs, node2vec_vector

    def get_node2vec_score(self, neologism, node2vec_vector, candidate, count):
        """Score ``candidate`` as its proposal count times its similarity to ``neologism``.

        NOTE(review): ``node2vec_vector`` is accepted but not used. A node2vec similarity
        term (``get_node2vec_similarity(node2vec_vector, candidate)``) was previously added
        inside the product and is currently disabled - confirm whether it should return.
        """
        return count * self.get_similarity(neologism, candidate)

    def get_node2vec_similarity(self, v1, candidate):
        """Return the cosine similarity between ``v1`` and the node2vec vector of ``candidate``.

        Indexing ``self.node2vec`` with a key it does not hold will raise.
        """
        v2 = self.node2vec[candidate]
        # Both vectors are scaled to unit length before the cosine distance is taken.
        v1 = v1 / (sum(v1 ** 2) ** 0.5)
        v2 = v2 / (sum(v2 ** 2) ** 0.5)
        return 1 - spatial.distance.cosine(v1, v2)
Esempio n. 3
0
class Node2vecBaselineModel(BaselineModel):
    """Baseline whose associates are the node2vec neighbours of the projected word."""

    def __init__(self, params):
        """Load the node2vec vectors and build the projection helper.

        Args:
            params: configuration mapping; ``node2vec_path`` and ``projection_path`` are
                read here and the whole mapping is passed to the parent initialiser.
        """
        super().__init__(params)
        self.node2vec = KeyedVectors.load_word2vec_format(params["node2vec_path"], binary=False)
        # self.w2v_data is not assigned in this class; presumably the parent initialiser sets it.
        self.projection = ProjectionVectorizer(self.w2v_data, params["projection_path"])

    def generate_associates(self, neologism, topn=10) -> list:
        """Return the node2vec neighbours of ``neologism``.

        Args:
            neologism: the word to project into the node2vec space.
            topn: number of neighbours requested. It is forwarded to the projection so
                that a caller asking for more (or fewer) neighbours actually gets them;
                previously the argument was accepted and ignored.

        Returns:
            The first value returned by ``predict_projection_word``.
        """
        neighbours, _ = self.projection.predict_projection_word(neologism, self.node2vec, topn=topn)
        return neighbours
class ClassifierNode2VecRankedModel(RankedModel):
    """Ranked model that also loads externally labelled (neologism, candidate) pairs."""

    def __init__(self, params):
        """Load the node2vec vectors, the projection helper and the labelled predictions.

        Args:
            params: configuration mapping. ``node2vec_path``, ``projection_path`` and
                ``predictions`` are read here; ``db_path`` and ``ruwordnet_path`` are read
                by ``generate_predictions``.
        """
        super().__init__(params)
        # Must be stored before generate_predictions() runs: that method reads self.params.
        self.params = params
        self.node2vec = KeyedVectors.load_word2vec_format(params["node2vec_path"], binary=False)
        # self.w2v_data is not assigned in this class; presumably the parent initialiser sets it.
        self.projection = ProjectionVectorizer(self.w2v_data, params["projection_path"])
        self.predicted = self.generate_predictions(params["predictions"])

    def generate_predictions(self, path):
        """Read the labelled TSV at ``path`` and keep the positively labelled candidates.

        Each non-blank line must hold four tab-separated fields: label, an ignored field,
        the neologism and the candidate name. Rows whose label equals ``1.0`` are kept.

        Args:
            path: location of the UTF-8 TSV file (for example ``./labelled_hch.tsv``).

        Returns:
            A ``defaultdict(list)`` mapping each neologism to the identifiers that
            ``RuWordnet.get_id_by_name`` returned for its positive candidates.

        Raises:
            ValueError: if a non-blank line does not have exactly four fields or its label
                is not a number.
        """
        positives = defaultdict(list)
        ruwordnet = RuWordnet(self.params["db_path"], self.params["ruwordnet_path"])

        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                row = line.strip()
                if not row:
                    # A blank line would otherwise fail the four-field unpacking below.
                    continue
                label, _, neologism, candidate_word = row.split("\t")
                if float(label) == 1.0:
                    # Resolve the identifier only for rows that are kept.
                    positives[neologism].append(ruwordnet.get_id_by_name(candidate_word))
        return positives

    def compute_candidates(self, neologism, get_hypernym_fn, get_hyponym_fn, get_taxonomy_name_fn, topn=10):
        """Return at most ``topn`` hypernym candidates for ``neologism``, highest score first.

        Args:
            neologism: the word to attach to the taxonomy.
            get_hypernym_fn: callable returning the hypernyms of a taxonomy node.
            get_hyponym_fn: unused by this model.
            get_taxonomy_name_fn: unused by this model.
            topn: maximum number of candidates returned.

        Returns:
            A list of candidate identifiers ordered by descending score.
        """
        hypernyms = self.compute_hchs(neologism, get_hypernym_fn, topn)
        second_order = [parent for hypernym in hypernyms for parent in get_hypernym_fn(hypernym)]

        # Only the projected vector is used here; the hypernym list from node2vec is discarded.
        _, node2vec_vector = self.generate_node2vec(neologism, get_hypernym_fn, topn)
        all_hypernyms = Counter(hypernyms + second_order)

        ranked = sorted(all_hypernyms.items(),
                        key=lambda item: self.get_node2vec_score(neologism, node2vec_vector, *item))
        # Reverse an ascending sort instead of using reverse=True: the two differ in how
        # equally-scored candidates are ordered, and the existing order is preserved here.
        ranked.reverse()

        return [candidate for candidate, _ in ranked[:topn]]

    def generate_node2vec(self, neologism, compute_hypernyms, topn=10) -> tuple:
        """Collect the hypernyms of the node2vec neighbours of ``neologism``.

        Args:
            neologism: the word to project into the node2vec space.
            compute_hypernyms: callable returning the hypernyms of a taxonomy node.
            topn: accepted for interface compatibility; it is not forwarded to the projection.

        Returns:
            A ``(hypernyms, node2vec_vector)`` pair: the flat list of hypernyms of every
            neighbour and the second value returned by ``predict_projection_word``.
        """
        neighbours, node2vec_vector = self.projection.predict_projection_word(neologism, self.node2vec)
        associates = map(itemgetter(0), neighbours)
        hchs = [hypernym for associate in associates for hypernym in compute_hypernyms(associate)]
        return hchs, node2vec_vector

    def get_node2vec_score(self, neologism, node2vec_vector, candidate, count):
        """Score ``candidate``: count-weighted similarity plus node2vec similarity.

        NOTE(review): the labelled predictions in ``self.predicted`` do not influence the
        score. A weight (0.5 when the candidate was among the predictions for the
        neologism, otherwise 1) used to be computed here and then never used; the dead
        lookup was removed because it only inserted empty entries into the defaultdict.
        Confirm how that weight was meant to enter the formula.
        """
        return count * self.get_similarity(neologism, candidate) + \
            self.get_node2vec_similarity(node2vec_vector, candidate)

    def get_node2vec_similarity(self, v1, candidate):
        """Return the cosine similarity between ``v1`` and the node2vec vector of ``candidate``.

        Indexing ``self.node2vec`` with a key it does not hold will raise.
        """
        v2 = self.node2vec[candidate]
        # Both vectors are scaled to unit length before the cosine distance is taken.
        v1 = v1 / (sum(v1 ** 2) ** 0.5)
        v2 = v2 / (sum(v2 ** 2) ** 0.5)
        return 1 - spatial.distance.cosine(v1, v2)
 def __init__(self, params):
     """Load the node2vec vectors, the projection helper and the labelled predictions.

     Reads ``node2vec_path``, ``projection_path`` and ``predictions`` from ``params``
     and passes the whole mapping to the parent initialiser.
     """
     super().__init__(params)
     # Stored before generate_predictions() is called below; presumably that method
     # reads further settings from self.params - verify against its definition.
     self.params = params
     # Vectors are read in the textual (binary=False) word2vec format.
     self.node2vec = KeyedVectors.load_word2vec_format(params["node2vec_path"], binary=False)
     # self.w2v_data is not assigned here; presumably the parent initialiser sets it.
     self.projection = ProjectionVectorizer(self.w2v_data, params["projection_path"])
     self.predicted = self.generate_predictions(params["predictions"])
Esempio n. 6
0
class AllModel(HCHModel):
    """Hypernym ranker that combines Wiktionary, word2vec, node2vec and Poincare evidence."""

    def __init__(self, params):
        """Load every external resource and compile the text-cleaning patterns.

        Args:
            params: configuration mapping. The keys read here are ``wiki_path``,
                ``wiki_vectors_path``, ``node2vec_path``, ``projection_path``,
                ``poincare_path``, ``n`` and ``language``; the whole mapping is also
                handed to the parent initialiser.
        """
        super().__init__(params)
        self.wiktionary = self.__get_wiktionary(params['wiki_path'])
        # All vector files are read in the textual (binary=False) word2vec format.
        self.wiki_model = KeyedVectors.load_word2vec_format(params['wiki_vectors_path'], binary=False)
        self.node2vec = KeyedVectors.load_word2vec_format(params["node2vec_path"], binary=False)
        # self.w2v_data is not assigned in this class; presumably the parent initialiser sets it.
        self.projection = ProjectionVectorizer(self.w2v_data, params["projection_path"])
        self.poincare_model = PoincareKeyedVectors.load_word2vec_format(params["poincare_path"], binary=False)
        # Upper bound on how many synsets aggregate() averages.
        self.n = params["n"]

        # Non-greedy match of one parenthesised span.
        self.delete_bracets = re.compile(r"\(.+?\)")
        # Raw literals: the compiled patterns are unchanged, but "\-" inside a plain string
        # literal is an invalid escape sequence that newer interpreters warn about.
        if params['language'] == 'ru':
            # NOTE(review): the range А-я does not contain "ё"/"Ё" - confirm this is intended.
            self.pattern = re.compile(r"[^А-я \-]")
        else:
            # NOTE(review): A-z also spans the punctuation between "Z" and "a"
            # ([ \ ] ^ _ `). Left untouched because the cleaned strings are used as keys
            # into self.wiki_model - confirm before narrowing to A-Za-z.
            self.pattern = re.compile(r"[^A-z \-]")

    def __get_wiktionary(self, path):
        """Read a JSON-lines Wiktionary dump into a dictionary keyed by word.

        Args:
            path: file with one JSON object per line, each holding the keys ``word``,
                ``hypernyms``, ``synonyms`` and ``meanings``.

        Returns:
            ``{word: {"hypernyms": ..., "synonyms": ..., "meanings": ...}}``; a word that
            occurs more than once keeps its last entry.
        """
        wiktionary = {}
        # Explicit UTF-8 so the result does not depend on the platform's default encoding.
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                entry = json.loads(line)
                wiktionary[entry['word']] = {"hypernyms": entry['hypernyms'], "synonyms": entry['synonyms'],
                                             "meanings": entry['meanings']}
        return wiktionary

    def compute_candidates(self, neologism, get_hypernym_fn, get_hyponym_fn, get_taxonomy_name_fn, topn=10):
        """Return at most ``topn`` hypernym candidates for ``neologism``, highest score first.

        Args:
            neologism: the word to attach to the taxonomy.
            get_hypernym_fn: callable returning the hypernyms of a taxonomy node.
            get_hyponym_fn: unused by this model.
            get_taxonomy_name_fn: callable returning the textual name of a taxonomy node.
            topn: maximum number of candidates returned.

        Returns:
            A list of candidate identifiers ordered by descending ``get_wiki_score``.
        """
        hypernyms = self.compute_hchs(neologism, get_hypernym_fn, topn)
        second_order_hypernyms = [s_o for hypernym in hypernyms for s_o in get_hypernym_fn(hypernym)]
        all_hypernyms = Counter(hypernyms + second_order_hypernyms)
        associates = self.generate_associates(neologism, 50)

        # Only the projected vector is used here; the hypernym list from node2vec is discarded.
        _, mean_node2vec = self.generate_node2vec(neologism, get_hypernym_fn, topn)
        similars = [i[0] for i in self.w2v_synsets.similar_by_vector(self.w2v_data[neologism])]
        poincare_vector = self.aggregate(similars)

        # Every hypernym of an associate receives that associate's similarity as a vote.
        votes = Counter()
        for associate, similarity in associates:
            for hypernym in get_hypernym_fn(associate):
                votes[hypernym] += similarity

        # Counter addition keeps only entries whose combined value is positive.
        combined = all_hypernyms + votes
        ranked = sorted(combined.items(),
                        key=lambda item: self.get_wiki_score(neologism, get_taxonomy_name_fn,
                                                             mean_node2vec, poincare_vector, *item))
        # Reverse an ascending sort instead of using reverse=True: the two differ in how
        # equally-scored candidates are ordered, and the existing order is preserved here.
        ranked.reverse()
        return [candidate for candidate, _ in ranked[:topn]]

    def get_wiki_score(self, neologism, get_taxonomy_fn, node2vec_vector, poincare_vector, candidate, count):
        """Combine all evidence for ``candidate`` into a single ranking score.

        Args:
            neologism: the word being attached.
            get_taxonomy_fn: callable returning the textual name of ``candidate``.
            node2vec_vector: projected node2vec vector of the neologism.
            poincare_vector: aggregated Poincare vector produced by ``aggregate``.
            candidate: identifier of the candidate hypernym.
            count: combined count/vote weight of the candidate.

        Returns:
            The weighted sum of the Wiktionary indicators and the similarity terms.
        """
        # Defaults apply when the neologism has no Wiktionary entry or nothing matches.
        wiki_count = 0.3
        definition_count = 0.8
        synonym_count = 1
        wiki_similarity = 1

        wiktionary_data = self.wiktionary.get(neologism.lower())
        if wiktionary_data is not None:
            taxonomy_name = self.delete_bracets.sub("", get_taxonomy_fn(candidate))
            # Removing a bracketed span and splitting on commas leaves stray spaces
            # ("word (sense)" -> "word "), which made exact matches fail, and can leave an
            # empty fragment, which is a substring of every meaning. Strip and drop empties.
            candidate_words = [word for word in (part.strip().lower() for part in taxonomy_name.split(','))
                               if word]

            if any(word in wiktionary_data['hypernyms'] for word in candidate_words):
                wiki_count = 2

            if any(word in meaning for meaning in wiktionary_data['meanings'] for word in candidate_words):
                definition_count = 2

            if any(word in wiktionary_data['synonyms'] for word in candidate_words):
                synonym_count = 2

            wiki_similarities = []
            for wiki_hypernym in wiktionary_data['hypernyms']:
                cleaned = self.pattern.sub("", wiki_hypernym.replace("|", " ").replace('--', ''))
                # Skip hypernyms that are empty or only spaces after cleaning.
                if cleaned.strip(" "):
                    wiki_similarities.append(self.compute_similarity(cleaned.replace(" ", "_"), candidate))
            if wiki_similarities:
                wiki_similarity = sum(wiki_similarities) / len(wiki_similarities)

        node2vec_similarity = self.get_node2vec_similarity(node2vec_vector, candidate)
        poincare_similarity = self.get_poincare_similarity(poincare_vector, candidate)

        # Earlier weighting, kept for reference:
        # return synonym_count * 0.5 + definition_count * 0.8 + wiki_count * 0.5 + \
        #        0.6 * count * self.get_similarity(neologism, candidate) + 2 * wiki_similarity + 2 * node2vec_similarity
        return (synonym_count * 2 + definition_count * 0.4 + wiki_count * 0.3
                + 0.6 * count * self.get_similarity(neologism, candidate)
                + 2 * wiki_similarity + 2 * node2vec_similarity + 2 * poincare_similarity)

    def compute_similarity(self, neologism, candidate):
        """Return the cosine similarity between a Wiktionary key and a synset vector.

        ``neologism`` is looked up in ``self.wiki_model`` and ``candidate`` in
        ``self.w2v_synsets``; a key missing from either will raise.
        """
        v1 = self.wiki_model[neologism]
        v2 = self.w2v_synsets[candidate]
        # Both vectors are scaled to unit length before the cosine distance is taken.
        v1 = v1 / (sum(v1 ** 2) ** 0.5)
        v2 = v2 / (sum(v2 ** 2) ** 0.5)
        return 1 - spatial.distance.cosine(v1, v2)

    def get_node2vec_similarity(self, v1, candidate):
        """Return the cosine similarity between ``v1`` and the node2vec vector of ``candidate``.

        Indexing ``self.node2vec`` with a key it does not hold will raise.
        """
        v2 = self.node2vec[candidate]
        v1 = v1 / (sum(v1 ** 2) ** 0.5)
        v2 = v2 / (sum(v2 ** 2) ** 0.5)
        return 1 - spatial.distance.cosine(v1, v2)

    def get_node2vec(self, neologism, topn=10) -> list:
        """Return the ``topn`` node2vec neighbours of the projected ``neologism``."""
        neighbours, _ = self.projection.predict_projection_word(neologism, self.node2vec, topn=topn)
        return neighbours

    def generate_node2vec(self, neologism, compute_hypernyms, topn=10) -> tuple:
        """Collect the hypernyms of the node2vec neighbours that are known synsets.

        Args:
            neologism: the word to project into the node2vec space.
            compute_hypernyms: callable returning the hypernyms of a taxonomy node.
            topn: number of neighbours requested from ``get_node2vec``.

        Returns:
            A ``(hypernyms, node2vec_vector)`` pair: the flat list of hypernyms of every
            neighbour present in ``self.w2v_synsets`` and the second value returned by
            ``predict_projection_word``.
        """
        associates = map(itemgetter(0), self.get_node2vec(neologism, topn))
        # Test membership before asking for hypernyms, so neighbours that are not known
        # synsets never reach compute_hypernyms.
        hchs = [hypernym
                for associate in associates if associate in self.w2v_synsets
                for hypernym in compute_hypernyms(associate)]
        _, node2vec_vector = self.projection.predict_projection_word(neologism, self.node2vec)
        return hchs, node2vec_vector

    def get_poincare_similarity(self, neologism, candidate):
        """Turn the Poincare distance to ``candidate`` into a similarity ``1 / (1 + distance)``.

        Despite its name, ``neologism`` receives the aggregated vector from ``aggregate``
        when called from ``get_wiki_score``; it is passed straight to
        ``self.poincare_model.distances``.
        """
        distance = self.poincare_model.distances(neologism, [candidate])[0]
        return 1 / (1 + distance)

    def aggregate(self, synsets):
        """Return a weighted average of the Poincare vectors of the first ``self.n`` synsets.

        Each vector ``v`` found in the Poincare vocabulary is weighted by
        ``1 / sqrt(1 - ||v||^2)``, with the weights normalised to sum to one.

        NOTE(review): when none of the synsets is in the vocabulary the result is the
        integer ``0`` rather than a vector - confirm that callers can cope with that.
        """
        weighted = []
        for synset in synsets[:self.n]:
            if synset in self.poincare_model.vocab:
                vector = self.poincare_model[synset]
                weighted.append((1 / math.sqrt(1 - np.linalg.norm(vector) ** 2), vector))
        total = sum(gamma for gamma, _ in weighted)
        return sum([(gamma / total) * vector for gamma, vector in weighted])