    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    bert_vectorizer = BertVectorizer(args.bert_path)

    if 'ruwordnet_path' in args:
        ruwordnet = RuWordnet(args.ruwordnet_path, None)
        synsets = defaultdict(list)
        for sense_id, synset_id, text in ruwordnet.get_all_senses():
            if synset_id.endswith(args.pos):
                synsets[synset_id].append(text.lower())
        bert_vectorizer.vectorize_groups(synsets, args.output_path, to_upper=False)

    if 'wordnet_old' in args:
        wn_old = WordNetCorpusReader(args.wordnet_old, None)
        wn_new = WordNetCorpusReader(args.wordnet_new, None)
        synsets = compute_synsets_from_wordnets(wn_old, wn_new, args.pos)
        bert_vectorizer.vectorize_groups(synsets, args.output_path, to_upper=False)

    if "data_path" in args:
        data = read_file(args.data_path, lower=args.upper)
        bert_vectorizer.vectorize_data(data, args.output_path, upper=args.upper)
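# --------------------------------------------------------------------------
# Hypothetical reconstruction of the parse_args() whose tail appears above.
# The real option names, defaults, and required flags are not shown in the
# snippet; the ones below are only inferred from how `args` is used, and
# default=argparse.SUPPRESS is assumed so that the `'...' in args` checks
# above only fire when the corresponding option was actually passed.
# --------------------------------------------------------------------------
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Vectorize synsets or test data with BERT")
    parser.add_argument("--bert-path", dest="bert_path", required=True)
    parser.add_argument("--output-path", dest="output_path", required=True)
    parser.add_argument("--pos", default="N")
    parser.add_argument("--upper", action="store_true")
    parser.add_argument("--ruwordnet-path", dest="ruwordnet_path", default=argparse.SUPPRESS)
    parser.add_argument("--wordnet-old", dest="wordnet_old", default=argparse.SUPPRESS)
    parser.add_argument("--wordnet-new", dest="wordnet_new", default=argparse.SUPPRESS)
    parser.add_argument("--data-path", dest="data_path", default=argparse.SUPPRESS)
    return parser.parse_args()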
from nltk.corpus import WordNetCorpusReader

from fasttext_vectorize_en import compute_synsets_from_wordnets

wn2 = WordNetCorpusReader('D:\\dialogue2020\\semevals\\semeval-2016-task-14\\WN1.7', None)
wn3 = WordNetCorpusReader('D:\\dialogue2020\\semeval-2016-task-14\\WN3.0', None)

input_path = "D:/dialogue2020/semeval-2016-task-14/reader/"
vector_path = "models/vectors/fasttext/en/"

# vectorize wordnet
noun_synsets = compute_synsets_from_wordnets(wn2, wn3, 'n')
verb_synsets = compute_synsets_from_wordnets(wn2, wn3, 'v')
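# --------------------------------------------------------------------------
# Minimal sketch of how the synset dictionaries above could be turned into
# averaged fastText vectors and saved under vector_path. This is NOT the
# repository's fasttext_vectorize_en API: the model path, helper name, and
# output file names are illustrative assumptions, and
# compute_synsets_from_wordnets is assumed to return a dict mapping synset
# ids to lists of lemma strings (as in the BERT script above).
# --------------------------------------------------------------------------
import pickle

import numpy as np
from gensim.models.fasttext import load_facebook_vectors

# hypothetical model path; any pre-trained English fastText .bin would do
ft = load_facebook_vectors("models/cc.en.300.bin")

def vectorize_synsets(synsets, out_file):
    """Average fastText vectors over all lemma tokens of each synset (sketch)."""
    vectors = {}
    for synset_id, lemmas in synsets.items():
        tokens = [tok for lemma in lemmas for tok in lemma.replace("_", " ").split()]
        vectors[synset_id] = np.mean([ft[tok] for tok in tokens], axis=0)
    with open(out_file, "wb") as f:
        pickle.dump(vectors, f)

vectorize_synsets(noun_synsets, vector_path + "nouns.pickle")
vectorize_synsets(verb_synsets, vector_path + "verbs.pickle")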
import os
import xml.etree.ElementTree as ET

import nltk
from nltk.corpus import WordNetCorpusReader
from sqlalchemy import *
from xml.dom import minidom
from nltk.corpus import wordnet as wn
import difflib
import pickle

# load WordNet-1.6
cwd = os.getcwd()
nltk.data.path.append(cwd)
wordnet16_dir = "resources/wordnet-1.6/"
wn16_path = "{0}/dict".format(wordnet16_dir)
WN16 = WordNetCorpusReader(os.path.abspath("{0}/{1}".format(cwd, wn16_path)),
                           nltk.data.find(wn16_path))


# load Wordnet-Affect synsets
# corpus: a-synset.xml
# return: {
#   'noun': {
#     '05586574': { 'categ': 'electricity', 'pos': 'noun', 'offset16': '05586574' }
#   }, ...
# }
def load_asynsets(corpus):
    tree = ET.parse(corpus)
    root = tree.getroot()

    asynsets = {}
    for pos in ["noun", "adj", "verb", "adv"]:
import os
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import *
from sqlalchemy import *
from xml.dom import minidom
from sqlite3 import dbapi2 as sqlite
from functools import reduce

cwd = os.getcwd()
os.environ["NLTK_DATA"] = cwd

import nltk
from nltk.corpus import WordNetCorpusReader

WN16_DIR = "resources/wordnet-1.6/dict"
WN30_DIR = "resources/WordNet-3.0/dict"
WN16 = WordNetCorpusReader(cwd + "/" + WN16_DIR, nltk.data.find(WN16_DIR))
WN = WordNetCorpusReader(cwd + "/" + WN30_DIR, nltk.data.find(WN30_DIR))

DB = create_engine('sqlite:///resources/wnjpn.db')


# load Wordnet-Affect synsets
# corpus: a-synset.xml
# return: {
#   'noun': {
#     '05586574': { 'categ': 'electricity', 'pos': 'noun', 'offset16': '05586574' }
#   }, ...
# }
def load_asynsets(corpus):
    tree = ET.parse(corpus)
    root = tree.getroot()

    asynsets = {}
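# --------------------------------------------------------------------------
# Minimal sketch, not part of the original script, of how a WordNet-Affect
# 1.6 offset could be mapped onto WordNet-3.0 candidates by lemma overlap
# using the WN16 / WN readers defined above. The helper name and the example
# offset (taken from the comment above) are illustrative only.
# --------------------------------------------------------------------------
def map_offset16_to_wn30(offset16, pos='n'):
    """Find WordNet-3.0 synsets sharing lemmas with a WordNet-1.6 synset."""
    syn16 = WN16.synset_from_pos_and_offset(pos, int(offset16))
    lemmas16 = {l.name() for l in syn16.lemmas()}
    candidates = {s for lemma in lemmas16 for s in WN.synsets(lemma, pos=pos)}
    # rank candidates by how many 1.6 lemmas they share
    return sorted(candidates,
                  key=lambda s: len(lemmas16 & {l.name() for l in s.lemmas()}),
                  reverse=True)

print(map_offset16_to_wn30("05586574", "n")[:3])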
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import cmudict, wordnet, WordNetCorpusReader, words
from Word import *
from Properties import *
# from sentiwordnet import SentiWordNetCorpusReader, SentiSynset
from AffectTree import *
import pickle

# preparing resources...
# pronunciation dictionary - cmu
pron_dict = cmudict.dict()

# SentiWordNet
# swn = SentiWordNetCorpusReader(swn_filename)

wordnet_1_6 = WordNetCorpusReader(wn_1_6_corpus_root)

# wordnet dict for id search
english_wordlist = list(w.lower() for w in nltk.corpus.words.words())

with open(Properties.anew_filename, 'r') as f:
    anew_list = {
        l.rstrip().split("\t")[0]: (float(l.rstrip().split("\t")[2]),
                                    float(l.rstrip().split("\t")[4]))
        for l in f.readlines()[1:]
    }

web_text = nltk.Text(word.lower() for word in nltk.corpus.webtext.words())
# if '_word_context_index' not in web_text.__dict__:
#     print 'Building word-context index...'
#     word_context_index = ContextIndex(web_text.tokens, filter=lambda x: x.isalpha(), key=lambda s: s.lower())


def pickle_wn_dict_id():
def main():
    args = parse_args()
    description1 = "---- File {0} took {1} seconds ----\n"
    description2 = "All: {0}, Found: {1}, Left: {2}"
    # combined message keeps the filename/timing fields in the first two slots
    description = description1 + "All: {2}, Found: {3}, Left: {4}"

    if "ruwordnet_path1" in args:
        file_paths = tqdm([os.path.join(x, i) for x, _, z in os.walk(args.corpus_path) for i in z])

        # ------------ RuWordnet initialization ------------
        ruwordnet1 = RuWordnet(db_path=args.ruwordnet_path1, ruwordnet_path="")
        ruwordnet2 = RuWordnet(db_path=args.ruwordnet_path2, ruwordnet_path="")
        senses = ruwordnet1.get_all_senses() + ruwordnet2.get_all_senses()
        synset_senses, sense2synset = create_senses_data(senses, args.pos)
        synsets = set(ruwordnet1.get_all_ids(args.pos))
        print(sense2synset)

        # ------------ Find contexts ------------
        # for filename in file_paths:
        #     start_time = time.time()
        #     retrieve_ruwordnet_positions(filename, args.output_path, synset_senses, sense2synset)
        #     file_paths.set_description(description.format(filename, (time.time() - start_time),
        #                                                   len(synsets), len(found_lemmas),
        #                                                   len(synsets.difference(set(found_lemmas)))))
        #
        # print(description2.format(len(synsets), len(found_lemmas), len(synsets.difference(set(found_lemmas)))))
        # print(found_lemmas)
        # print(synsets.difference(set(found_lemmas)))

    if "wordnet_old" in args:
        wordnet_old = WordNetCorpusReader(args.wordnet_old, None)
        wordnet_new = WordNetCorpusReader(args.wordnet_new, None)
        synsets = compute_synsets_from_wordnets(wordnet_old, wordnet_new, 'n')
        for synset in synsets:
            print(set([i.name() for i in wordnet_old.synset(synset).lemmas()] +
                      [i.name() for i in wordnet_new.synset(synset).lemmas()]))
        # for filename in file_paths:
        #     start_time = time.time()
        #     retrieve_ruwordnet_positions(filename, args.output_path, synset_senses, sense2synset)
        #     file_paths.set_description(description.format(filename, (time.time() - start_time),
        #                                                   len(synsets), len(found_lemmas),
        #                                                   len(synsets.difference(set(found_lemmas)))))
        #
        # print(description2.format(len(synsets), len(found_lemmas), len(synsets.difference(set(found_lemmas)))))
        # print(found_lemmas)
        # print(synsets.difference(set(found_lemmas)))

    elif "data_path" in args:
        file_paths = tqdm([os.path.join(x, i) for x, _, z in os.walk(args.corpus_path) for i in z])
        data = read_test_data(args.data_path)
        for filename in file_paths:
            start_time = time.time()
            retrieve_word_positions(filename, args.output_path, data)
            file_paths.set_description(description.format(filename, (time.time() - start_time),
                                                          len(data), len(found_lemmas),
                                                          len(data.difference(set(found_lemmas)))))
        print(description2.format(len(data), len(found_lemmas), len(data.difference(set(found_lemmas)))))
        print(found_lemmas)
        print(data.difference(set(found_lemmas)))
import nltk
from nltk.corpus import wordnet
from nltk.corpus import WordNetCorpusReader

print('loading wordnet2.0')
wn2 = WordNetCorpusReader(nltk.data.find('corpora/wordnet2.0'), None)
print('done loading')

print('loading wordnet3.0')
wn3 = WordNetCorpusReader(nltk.data.find('corpora/wordnet'), None)
print('done loading')

S2 = wn2.synset
L = wn2.lemma
S3 = wn3.synset

# maps a WordNet 2.0 synset_offset to its domain
domain_list = {}
# maps a WordNet 2.0 synset_offset to the WordNet 2.0 synset
synset2_list = {}
# maps a WordNet 3.0 synset_offset to the WordNet 3.0 synset
synset3_list = {}


def addZero(number):
    numberStr = str(number)
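# --------------------------------------------------------------------------
# addZero is truncated above; presumably it left-pads an offset to the
# 8-digit form used as synset_offset in WordNet data files. A one-line sketch
# of that presumed behaviour, under a different, illustrative name:
# --------------------------------------------------------------------------
def add_zero(number):
    return str(number).zfill(8)  # e.g. 5586574 -> '05586574'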
def setUp(self):
    self.wncr = WordNetCorpusReader(
        resource_filename('clasificador.recursos', 'wordnet_spa'), None)
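# --------------------------------------------------------------------------
# For context, a minimal sketch of the test case this setUp fragment might
# belong to. The class name and the sample lookup are assumptions; only the
# wordnet_spa resource path comes from the snippet.
# --------------------------------------------------------------------------
import unittest
from pkg_resources import resource_filename
from nltk.corpus import WordNetCorpusReader

class WordnetSpaTest(unittest.TestCase):
    def setUp(self):
        self.wncr = WordNetCorpusReader(
            resource_filename('clasificador.recursos', 'wordnet_spa'), None)

    def test_synset_lookup(self):
        # a common Spanish lemma is assumed to yield at least one synset
        self.assertTrue(len(self.wncr.synsets('perro')) > 0)

if __name__ == '__main__':
    unittest.main()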