def generate_taxonomy_fns(params, model):
    """Return (get_hypernyms, get_hyponyms, get_name) functions for the selected wordnet."""
    # for English WordNet
    if params['language'] == 'en':
        wn = WordNetCorpusReader(params["wordnet_path"], None)
        return lambda x: [hypernym.name() for hypernym in wn.synset(x).hypernyms()
                          if hypernym.name() in model.w2v_synsets.vocab], \
               lambda x: [hyponym.name() for hyponym in wn.synset(x).hyponyms()
                          if hyponym.name() in model.w2v_synsets.vocab], \
               lambda x: x.split(".")[0].replace("_", " ")
    # for RuWordNet
    elif params['language'] == 'ru':
        ruwordnet = RuWordnet(db_path=params["db_path"], ruwordnet_path=params["wordnet_path"])
        return lambda x: ruwordnet.get_hypernyms_by_id(x), \
               lambda x: ruwordnet.get_hyponyms_by_id(x), \
               lambda x: ruwordnet.get_name_by_id(x)
    else:
        raise Exception("task / language is not supported")
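
# A minimal usage sketch (illustrative only): `params` and `model` come from the project's
# config and trained synset-embedding model; the concrete values below ("dog.n.01", the
# wordnet path) are assumptions, not values taken from this repository.
#
#   params = {"language": "en", "wordnet_path": "data/wordnet-3.0/dict"}
#   get_hypernyms, get_hyponyms, get_name = generate_taxonomy_fns(params, model)
#   get_hypernyms("dog.n.01")   # hypernym synset names present in model.w2v_synsets.vocab
#   get_name("dog.n.01")        # -> "dog"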
def main():
    args = parse_args()
    # "description" is shown in the progress bar (5 fields); "description2" is printed on its own (3 fields)
    description1 = "---- File {0} took {1} seconds ----\n"
    description2 = "All: {0}, Found: {1}, Left: {2}"
    description = description1 + "All: {2}, Found: {3}, Left: {4}"

    # ------------ RuWordNet contexts (two RuWordNet versions) ------------
    if "ruwordnet_path1" in args:
        file_paths = tqdm([os.path.join(x, i) for x, _, z in os.walk(args.corpus_path) for i in z])

        # ------------ RuWordnet initialization ------------
        ruwordnet1 = RuWordnet(db_path=args.ruwordnet_path1, ruwordnet_path="")
        ruwordnet2 = RuWordnet(db_path=args.ruwordnet_path2, ruwordnet_path="")
        senses = ruwordnet1.get_all_senses() + ruwordnet2.get_all_senses()
        synset_senses, sense2synset = create_senses_data(senses, args.pos)
        synsets = set(ruwordnet1.get_all_ids(args.pos))
        print(sense2synset)

        # ------------ Find contexts ------------
        # for filename in file_paths:
        #     start_time = time.time()
        #     retrieve_ruwordnet_positions(filename, args.output_path, synset_senses, sense2synset)
        #     file_paths.set_description(description.format(filename, (time.time() - start_time),
        #                                                    len(synsets), len(found_lemmas),
        #                                                    len(synsets.difference(set(found_lemmas)))))
        #
        # print(description2.format(len(synsets), len(found_lemmas), len(synsets.difference(set(found_lemmas)))))
        # print(found_lemmas)
        # print(synsets.difference(set(found_lemmas)))

    # ------------ Diachronic English WordNet: new synsets and their lemmas ------------
    if "wordnet_old" in args:
        wordnet_old = WordNetCorpusReader(args.wordnet_old, None)
        wordnet_new = WordNetCorpusReader(args.wordnet_new, None)
        synsets = compute_synsets_from_wordnets(wordnet_old, wordnet_new, 'n')
        for synset in synsets:
            print(set([i.name() for i in wordnet_old.synset(synset).lemmas()] +
                      [i.name() for i in wordnet_new.synset(synset).lemmas()]))

        # for filename in file_paths:
        #     start_time = time.time()
        #     retrieve_ruwordnet_positions(filename, args.output_path, synset_senses, sense2synset)
        #     file_paths.set_description(description.format(filename, (time.time() - start_time),
        #                                                    len(synsets), len(found_lemmas),
        #                                                    len(synsets.difference(set(found_lemmas)))))
        #
        # print(description2.format(len(synsets), len(found_lemmas), len(synsets.difference(set(found_lemmas)))))
        # print(found_lemmas)
        # print(synsets.difference(set(found_lemmas)))

    # ------------ Word positions for the test data ------------
    elif "data_path" in args:
        file_paths = tqdm([os.path.join(x, i) for x, _, z in os.walk(args.corpus_path) for i in z])
        data = read_test_data(args.data_path)
        # found_lemmas is assumed to be a module-level set filled by retrieve_word_positions
        for filename in file_paths:
            start_time = time.time()
            retrieve_word_positions(filename, args.output_path, data)
            file_paths.set_description(description.format(filename, (time.time() - start_time),
                                                           len(data), len(found_lemmas),
                                                           len(data.difference(set(found_lemmas)))))
        print(description2.format(len(data), len(found_lemmas), len(data.difference(set(found_lemmas)))))
        print(found_lemmas)
        print(data.difference(set(found_lemmas)))
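
# Typical entry point; the script is driven entirely by the CLI arguments parsed in
# parse_args() (corpus_path, output_path, and one of ruwordnet_path1/2, wordnet_old/new,
# or data_path, judging by the attributes read above).
if __name__ == "__main__":
    main()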