Beispiel #1
0
def generate_taxonomy_fns(params, model):
    """Build the lookup callables for the taxonomy selected by ``params``.

    Args:
        params: Mapping read for ``'language'`` and ``"wordnet_path"``; for
            Russian it is also read for ``"db_path"``.
        model: Object whose ``w2v_synsets.vocab`` is used (English only) to
            filter hypernyms / hyponyms by membership.

    Returns:
        A 3-tuple ``(get_hypernyms, get_hyponyms, get_name)`` of one-argument
        callables.

    Raises:
        Exception: If ``params['language']`` is neither ``'en'`` nor ``'ru'``.
    """
    language = params['language']

    if language == 'en':
        # English WordNet
        wn = WordNetCorpusReader(params["wordnet_path"], None)

        def hypernyms_in_vocab(x):
            # Names of the hypernyms of synset ``x`` that the model knows.
            names = []
            for synset in wn.synset(x).hypernyms():
                if synset.name() in model.w2v_synsets.vocab:
                    names.append(synset.name())
            return names

        def hyponyms_in_vocab(x):
            # Names of the hyponyms of synset ``x`` that the model knows.
            names = []
            for synset in wn.synset(x).hyponyms():
                if synset.name() in model.w2v_synsets.vocab:
                    names.append(synset.name())
            return names

        def readable_name(x):
            # Keep the part before the first "." and turn "_" into spaces.
            head = x.split(".")[0]
            return head.replace("_", " ")

        return hypernyms_in_vocab, hyponyms_in_vocab, readable_name

    if language == 'ru':
        # RuWordNet
        ruwordnet = RuWordnet(db_path=params["db_path"],
                              ruwordnet_path=params["wordnet_path"])

        def hypernyms_by_id(x):
            return ruwordnet.get_hypernyms_by_id(x)

        def hyponyms_by_id(x):
            return ruwordnet.get_hyponyms_by_id(x)

        def name_by_id(x):
            return ruwordnet.get_name_by_id(x)

        return hypernyms_by_id, hyponyms_by_id, name_by_id

    raise Exception("task / language is not supported")
Beispiel #2
0
def _list_corpus_files(corpus_path):
    """Return the path of every file below ``corpus_path``, walked recursively."""
    return [
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(corpus_path)
        for filename in filenames
    ]


def main():
    """Run the action(s) selected by the parsed command-line arguments.

    The branch taken depends on which attributes are present on the object
    returned by ``parse_args()``:

    * ``ruwordnet_path1``: lists the corpus files, opens two ``RuWordnet``
      databases, builds the sense data with ``create_senses_data`` and prints
      the resulting ``sense2synset`` value. The context search itself is
      currently disabled.
    * ``wordnet_old``: opens two WordNet readers, gets synsets from
      ``compute_synsets_from_wordnets(..., 'n')`` and prints, for each one,
      the set of lemma names taken from both readers.
    * ``data_path`` (only checked when ``wordnet_old`` is absent): reads the
      test data, calls ``retrieve_word_positions`` for every corpus file and
      prints how many items were found / are still missing.

    ``found_lemmas`` is not defined in this function; it is read from an
    enclosing scope (presumably filled by ``retrieve_word_positions`` --
    verify against its definition).
    """
    args = parse_args()

    # Per-file progress text: {0} file name, {1} elapsed seconds,
    # {2}-{4} the All / Found / Left counts.
    progress_template = ("---- File {0} took {1} seconds ----\n"
                         "All: {2}, Found: {3}, Left: {4}")
    # The final summary receives only the three counts, so it needs its own
    # indices; reusing the {2}/{3}/{4} fragment with three arguments raises
    # IndexError.
    summary_template = "All: {0}, Found: {1}, Left: {2}"

    if "ruwordnet_path1" in args:
        # NOTE: file_paths, synset_senses and synsets are only consumed by the
        # disabled step below; they are still computed so this branch keeps
        # doing the same work as before.
        file_paths = tqdm(_list_corpus_files(args.corpus_path))

        # ------------ RuWordnet initialization ------------
        ruwordnet1 = RuWordnet(db_path=args.ruwordnet_path1, ruwordnet_path="")
        ruwordnet2 = RuWordnet(db_path=args.ruwordnet_path2, ruwordnet_path="")
        senses = ruwordnet1.get_all_senses() + ruwordnet2.get_all_senses()
        synset_senses, sense2synset = create_senses_data(senses, args.pos)
        synsets = set(ruwordnet1.get_all_ids(args.pos))
        print(sense2synset)

        # ------------ Find contexts ------------
        # TODO: disabled step -- for each path in file_paths, call
        # retrieve_ruwordnet_positions(filename, args.output_path,
        # synset_senses, sense2synset), update the progress description, then
        # print the summary, found_lemmas and the synsets still missing.

    # NOTE(review): this is a fresh ``if`` rather than ``elif``, so the branch
    # above does not exclude the two below -- confirm this is intended.
    if "wordnet_old" in args:
        wordnet_old = WordNetCorpusReader(args.wordnet_old, None)
        wordnet_new = WordNetCorpusReader(args.wordnet_new, None)
        synsets = compute_synsets_from_wordnets(wordnet_old, wordnet_new, 'n')

        for synset in synsets:
            # Lemma names of the same synset id in both WordNet readers.
            old_names = [lemma.name() for lemma in wordnet_old.synset(synset).lemmas()]
            new_names = [lemma.name() for lemma in wordnet_new.synset(synset).lemmas()]
            print(set(old_names + new_names))

        # TODO: the per-file context search (retrieve_ruwordnet_positions) is
        # disabled for this branch as well.

    elif "data_path" in args:
        file_paths = tqdm(_list_corpus_files(args.corpus_path))

        data = read_test_data(args.data_path)
        for filename in file_paths:
            start_time = time.time()
            retrieve_word_positions(filename, args.output_path, data)
            elapsed = time.time() - start_time
            # Recomputed on every iteration because found_lemmas is read
            # again after each file.
            file_paths.set_description(
                progress_template.format(
                    filename, elapsed, len(data), len(found_lemmas),
                    len(data.difference(set(found_lemmas)))))

        missing = data.difference(set(found_lemmas))
        print(summary_template.format(len(data), len(found_lemmas), len(missing)))
        print(found_lemmas)
        print(missing)