Code example #1
def main():
    args = parse_args()
    file_paths = [
        os.path.join(root, name)
        for root, _, files in os.walk(args.corpus_path)
        for name in files
    ]

    if "ruwordnet_path" in args:
        ruwordnet = RuWordnet(db_path=args.ruwordnet_path, ruwordnet_path="")
        sense2synset = create_sense2synset(ruwordnet.get_all_senses(),
                                           args.pos)
        synset_senses = create_senses_chain(ruwordnet, args.pos)
        for filename in file_paths:
            start_time = time.time()
            retrieve_ruwordnet_positions(filename, args.output_path,
                                         synset_senses, sense2synset)
            print(
                f"---- File {filename} took {(time.time() - start_time)} seconds ----"
            )

    elif "data_path" in args:
        data = read_test_data(args.data_path)
        for filename in file_paths:
            start_time = time.time()
            retrieve_word_positions(filename, args.output_path, data)
            print(
                f"---- File {filename} took {(time.time() - start_time)} seconds ----"
            )
Code example #2
    def generate_predictions(self, path):
        data = defaultdict(list)
        ruwordnet = RuWordnet(self.params["db_path"], self.params["ruwordnet_path"])

        with open(path, 'r', encoding='utf-8') as f:  # "./labelled_hch.tsv"
            for line in f:
                label, _, neologism, candidate_word = line.strip().split("\t")
                label = float(label)
                candidate = ruwordnet.get_id_by_name(candidate_word)
                if label == 1.0:
                    data[neologism].append(candidate)
        return data
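
A minimal usage sketch, assuming generate_predictions is a method of the HCHModel used in the next example (the TSV path echoes the comment in the code above):

model = HCHModel(load_config())
predictions = model.generate_predictions("./labelled_hch.tsv")
for neologism, synset_ids in predictions.items():
    print(neologism, synset_ids)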
Code example #3
def main():
    params = load_config()
    model = HCHModel(params)
    ruwordnet = RuWordnet(db_path=params["db_path"], ruwordnet_path=params["ruwordnet_path"])

    with open(params['test_path'], 'r', encoding='utf-8') as f:
        test_data = f.read().split("\n")[:-1]

    with open("private_nouns_top100_candidates_second_order.tsv", "w", encoding="utf-8") as w:
        for neologism in test_data:
            candidates = model.generate_associates(neologism, topn=10)
            for candidate, similarity in candidates:
                w.write(f"{neologism}\t{candidate}\t{similarity}\n")
                for second_order in ruwordnet.get_hypernyms_by_id(candidate):
                    w.write(f"{neologism}\t{second_order}\t{model.get_similarity(neologism, second_order)}\n")
Code example #4
    def __init__(self, params):
        self.ruwordnet = RuWordnet(db_path=params["db_path"],
                                   ruwordnet_path=params["ruwordnet_path"])
        self.w2v_ruwordnet = KeyedVectors.load_word2vec_format(
            params['ruwordnet_vectors_path'], binary=False)
        self.w2v_data = KeyedVectors.load_word2vec_format(
            params['data_vectors_path'], binary=False)
Code example #5
def generate_taxonomy_fns(params, model):
    # for English WordNet
    if params['language'] == 'en':
        wn = WordNetCorpusReader(params["wordnet_path"], None)
        return lambda x: [hypernym.name() for hypernym in wn.synset(x).hypernyms()
                          if hypernym.name() in model.w2v_synsets.vocab], \
               lambda x: [hyponym.name() for hyponym in wn.synset(x).hyponyms() if hyponym.name()
                          in model.w2v_synsets.vocab], \
               lambda x: x.split(".")[0].replace("_", " ")
    # for RuWordNet
    elif params['language'] == 'ru':
        ruwordnet = RuWordnet(db_path=params["db_path"],
                              ruwordnet_path=params["wordnet_path"])
        return lambda x: ruwordnet.get_hypernyms_by_id(x), lambda x: ruwordnet.get_hyponyms_by_id(x), \
               lambda x: ruwordnet.get_name_by_id(x)
    else:
        raise Exception("task / language is not supported")
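
generate_taxonomy_fns returns three callables: a hypernym lookup, a hyponym lookup, and an id-to-name mapping. A minimal usage sketch (params, model, and synset_id are placeholders):

get_hypernyms, get_hyponyms, get_name = generate_taxonomy_fns(params, model)
for hypernym_id in get_hypernyms(synset_id):
    print(get_name(hypernym_id))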
Code example #6
    # save
    # -------------------------------------------------------------

    @staticmethod
    def save_as_w2v(words: list, vectors: np.ndarray, output_path: str):
        assert len(words) == len(vectors)
        with open(output_path, 'w', encoding='utf-8') as w:
            w.write(f"{vectors.shape[0]} {vectors.shape[1]}\n")
            for word, vector in zip(words, vectors):
                vector_line = " ".join(map(str, vector))
                w.write(f"{word} {vector_line}\n")


if __name__ == '__main__':
    ft_vec = FasttextVectorizer("models/cc.ru.300.bin")
    ruwordnet = RuWordnet(db_path="../dataset/ruwordnet.db",
                          ruwordnet_path=None)
    noun_synsets = defaultdict(list)
    verb_synsets = defaultdict(list)
    for sense_id, synset_id, text in ruwordnet.get_all_senses():
        if synset_id.endswith("N"):
            noun_synsets[synset_id].append(text)
        elif synset_id.endswith("V"):
            verb_synsets[synset_id].append(text)

    ft_vec.vectorize_ruwordnet(noun_synsets,
                               "models/vectors/nouns_ruwordnet_fasttext.txt")
    ft_vec.vectorize_ruwordnet(verb_synsets,
                               "models/vectors/verbs_ruwordnet_fasttext.txt")

    with open("../dataset/public/verbs_public_no_labels.tsv",
              'r',
Code example #7
        hchs = [hypernym for associate in associates for hypernym in compute_hypernyms(associate)]
        return hchs, node2vec_vector

    def get_node2vec_score(self, neologism, node2vec_vector, candidate, count):
        # NOTE: nn_score is computed here but never used in the returned value.
        nn_score = 0.5 if candidate in self.predicted[neologism] else 1
        return count * self.get_similarity(neologism, candidate) + \
               self.get_node2vec_similarity(node2vec_vector, candidate)

    def get_node2vec_similarity(self, v1, candidate):
        v2 = self.node2vec[candidate]
        # Pre-normalization is redundant: cosine distance is scale-invariant.
        v1 = v1 / (sum(v1 ** 2) ** 0.5)
        v2 = v2 / (sum(v2 ** 2) ** 0.5)
        return 1 - spatial.distance.cosine(v1, v2)

data = defaultdict(list)
ruwordnet = RuWordnet("../dataset/ruwordnet.db", None)

with open("./labelled_hch.tsv", 'r', encoding='utf-8') as f:
    for line in f:
        label, similarity, neologism, candidate_word = line.strip().split("\t")
        label = float(label)
        similarity = float(similarity)
        candidate = ruwordnet.get_id_by_name(candidate_word)
        if label == 1.0:
            data[neologism].append((candidate, similarity))

with open("predictions_classification_private_nouns.tsv", 'w', encoding='utf-8') as w:
    for i in data:
        candidates = reversed(sorted(set(data[i]), key=lambda x: x[1]))
        for candidate in candidates:
            w.write(f"{i}\t{candidate[0]}\t{ruwordnet.get_name_by_id(candidate[0])}\n")
Code example #8
    data_parser = subparsers.add_parser('data', help='data help')
    data_parser.add_argument('--data_path',
                             type=str,
                             dest="data_path",
                             help='path to test data')
    data_parser.add_argument('--upper', action="store_true")

    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    bert_vectorizer = BertVectorizer(args.bert_path)

    if 'ruwordnet_path' in args:
        ruwordnet = RuWordnet(args.ruwordnet_path, None)
        synsets = defaultdict(list)
        for sense_id, synset_id, text in ruwordnet.get_all_senses():
            if synset_id.endswith(args.pos):
                synsets[synset_id].append(text.lower())
        bert_vectorizer.vectorize_groups(synsets,
                                         args.output_path,
                                         to_upper=False)

    if 'wordnet_old' in args:
        wn_old = WordNetCorpusReader(args.wordnet_old, None)
        wn_new = WordNetCorpusReader(args.wordnet_new, None)
        synsets = compute_synsets_from_wordnets(wn_old, wn_new, args.pos)
        bert_vectorizer.vectorize_groups(synsets,
                                         args.output_path,
                                         to_upper=False)
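
Since parse_args builds subcommands, the script is invoked with a mode name first, e.g. python vectorize_bert.py data --data_path nouns_public.tsv (script and file names here are placeholders; the ruwordnet/wordnet branches above imply similar subparsers elided from this excerpt).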
Code example #9
    def __init__(self, params, part, phase):
        self.part = part  # the part of speech
        self.phase = phase
        self.ruwordnet = RuWordnet(db_path=params["db_path"],
                                   ruwordnet_path=params["ruwordnet_path"])
        self.w2v_ruwordnet = KeyedVectors.load_word2vec_format(
            params[f"ruwordnet_vectors_{part}_path"], binary=False)
        self.w2v_data = KeyedVectors.load_word2vec_format(
            params[f"{phase}_data_vectors_{part}_path"], binary=False)
Code example #10
            for word, vector in zip(words, vectors):
                vector_line = " ".join(map(str, vector))
                w.write(f"{word.upper()} {vector_line}\n")


def process_data(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        dataset = f.read().lower().split("\n")[:-1]
    w2v_vec.vectorize_data(dataset, output_file)


if __name__ == '__main__':
    from helpers.utils import load_config
    config = load_config()
    w2v_vec = wiki2vecVectorizer(config["vectorizer_path"])
    ruwordnet = RuWordnet(db_path=config["db_path"], ruwordnet_path=config["ruwordnet_path"], with_lemmas=False)
    noun_synsets = defaultdict(list)
    verb_synsets = defaultdict(list)
    for sense_id, synset_id, text in ruwordnet.get_all_senses():
        if synset_id.endswith("N"):
            noun_synsets[synset_id].append(text.lower())
        elif synset_id.endswith("V"):
            verb_synsets[synset_id].append(text.lower())

    w2v_vec.vectorize_ruwordnet(noun_synsets, "models/vectors/ruwordnet_nouns.txt")
    w2v_vec.vectorize_ruwordnet(verb_synsets, "models/vectors/ruwordnet_verbs.txt")

    process_data("../data/public_test/verbs_public.tsv", "models/vectors/verbs_public.txt")
    process_data("../data/public_test/nouns_public.tsv", "models/vectors/nouns_public.txt")
    process_data("../data/private_test/verbs_private.tsv", "models/vectors/verbs_private.txt")
    process_data("../data/private_test/nouns_private.tsv", "models/vectors/nouns_private.txt")
Code example #11
# -*- coding: utf-8 -*-
import json
from collections import defaultdict
from itertools import combinations

import networkx as nx
from pymorphy2 import MorphAnalyzer

from ruwordnet.ruwordnet_reader import RuWordnet

USE_SYNSETS = False
USE_TOPONYMS = False
ruwordnet = RuWordnet("../dataset/ruwordnet.db", None)
morph = MorphAnalyzer()


def read_file(filename):
    with open(filename, encoding='utf-8') as f:
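        # Keep the first ";"-separated field and the last two of each line.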
        return [([i.split(";")[0]] + i.split(";")[-2:])
                for i in f.read().split("\n")[:-1]]


adj_nouns = [
    "ДАУНХИЛ", "ИНФОРМВОЙНА", "МАСТИТ", "ШКОЛЕНЬЕ", "ЭКЗИТПОЛ", "АВАРКОМ",
    "АКТИВ-НЕТТО", "БАМУТСКИЙ", "БАСАЕВСКИЙ", "БАШХИМ", "БЛИННАЯ", "БУЛОЧНАЯ",
    "ГОЙСКОЕ", "ГОНЧАЯ", "ГРАНДЖ", "ДЕТСКАЯ", "ДМИТРИЕВ-ЛЬГОВСКИЙ",
    "ПОДУШЕВОЙ", "ДМИТРОВСК-ОРЛОВСКИЙ", "ЖЕЛЕЗНОГОРСК-ИЛИМСКИЙ", "ИРБИТСКОЕ",
    "КАМЕНКА-ДНЕПРОВСКАЯ", "КАМЕНЬ-КАШИРСКИЙ", "КОНДИТЕРСКАЯ", "ЛИКВИДКОМ",
    "МОГИЛЕВ-ПОДОЛЬСКИЙ", "МРАВИНСКИЙ", "МУНДА", "ОТПУСКНЫЕ", "ПАРИКМАХЕРСКАЯ",
    "ПЕЛЬМЕННАЯ", "ПИРОЖКОВАЯ", "ПЛИССЕ", "ПРАЛИНЕ", "ПРИЕМНАЯ", "РОКОКО",
    "РЮМОЧНАЯ", "СПАССК-РЯЗАНСКИЙ", "ТУ-154М", "УРАЛХИМ", "ЧАЙНАЯ",
Code example #12
def main():
    args = parse_args()

    description1 = "---- File {0} took {1} seconds ----\n"
    description2 = "All: {0}, Found: {1}, Left: {2}"
    # description2 is reindexed in the combined string so it can be formatted
    # with (filename, elapsed, all, found, left)
    description = description1 + "All: {2}, Found: {3}, Left: {4}"

    if "ruwordnet_path1" in args:
        file_paths = tqdm([
            os.path.join(root, name)
            for root, _, files in os.walk(args.corpus_path)
            for name in files
        ])

        # ------------ RuWordnet initialization ------------
        ruwordnet1 = RuWordnet(db_path=args.ruwordnet_path1, ruwordnet_path="")
        ruwordnet2 = RuWordnet(db_path=args.ruwordnet_path2, ruwordnet_path="")
        senses = ruwordnet1.get_all_senses() + ruwordnet2.get_all_senses()
        synset_senses, sense2synset = create_senses_data(senses, args.pos)
        synsets = set(ruwordnet1.get_all_ids(args.pos))
        print(sense2synset)
        # ------------ Find contexts ------------
        # for filename in file_paths:
        #     start_time = time.time()
        #     retrieve_ruwordnet_positions(filename, args.output_path, synset_senses, sense2synset)
        #     file_paths.set_description(description.format(filename, (time.time() - start_time),
        #                                                   len(synsets), len(found_lemmas),
        #                                                   len(synsets.difference(set(found_lemmas)))))
        #
        # print(description2.format(len(synsets), len(found_lemmas), len(synsets.difference(set(found_lemmas)))))
        # print(found_lemmas)
        # print(synsets.difference(set(found_lemmas)))

    if "wordnet_old" in args:
        wordnet_old = WordNetCorpusReader(args.wordnet_old, None)
        wordnet_new = WordNetCorpusReader(args.wordnet_new, None)
        synsets = compute_synsets_from_wordnets(wordnet_old, wordnet_new, 'n')

        for synset in synsets:
            print(
                set([i.name() for i in wordnet_old.synset(synset).lemmas()] +
                    [i.name() for i in wordnet_new.synset(synset).lemmas()]))
        # for filename in file_paths:
        #     start_time = time.time()
        #     retrieve_ruwordnet_positions(filename, args.output_path, synset_senses, sense2synset)
        #     file_paths.set_description(description.format(filename, (time.time() - start_time),
        #                                                   len(synsets), len(found_lemmas),
        #                                                   len(synsets.difference(set(found_lemmas)))))
        #
        # print(description2.format(len(synsets), len(found_lemmas), len(synsets.difference(set(found_lemmas)))))
        # print(found_lemmas)
        # print(synsets.difference(set(found_lemmas)))

    elif "data_path" in args:
        file_paths = tqdm([
            os.path.join(root, name)
            for root, _, files in os.walk(args.corpus_path)
            for name in files
        ])

        data = read_test_data(args.data_path)
        # found_lemmas is assumed to be populated elsewhere (e.g. inside
        # retrieve_word_positions); it is not defined in this excerpt.
        for filename in file_paths:
            start_time = time.time()
            retrieve_word_positions(filename, args.output_path, data)
            file_paths.set_description(
                description.format(filename, (time.time() - start_time),
                                   len(data), len(found_lemmas),
                                   len(data.difference(set(found_lemmas)))))

        print(
            description2.format(len(data), len(found_lemmas),
                                len(data.difference(set(found_lemmas)))))
        print(found_lemmas)
        print(data.difference(set(found_lemmas)))
Code example #13
import os
from collections import defaultdict

from ruwordnet.ruwordnet_reader import RuWordnet
from vectorizers.fasttext_vectorizer import FasttextVectorizer


def process_data(vectorizer, input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        dataset = f.read().lower().split("\n")[:-1]
    vectorizer.vectorize_words(dataset, output_file)


if __name__ == '__main__':
    ft = FasttextVectorizer("models/cc.ru.300.bin")
    ruwordnet = RuWordnet(db_path="../dataset/ruwordnet.db",
                          ruwordnet_path=None)
    vector_path = "models/vectors/fasttext/ru/"

    # ----------------------
    # vectorize synsets
    # ----------------------
    # noun_synsets = defaultdict(list)
    # verb_synsets = defaultdict(list)
    # all_synsets = defaultdict(list)
    #
    # for sense_id, synset_id, text in ruwordnet.get_all_senses():
    #     if synset_id.endswith("N"):
    #         noun_synsets[synset_id].append(text.lower())
    #     elif synset_id.endswith("V"):
    #         verb_synsets[synset_id].append(text.lower())
    #     all_synsets[synset_id].append(text.lower())