def __init__(self, wordnet16_dir=None, wn_domains_dir=None):
    """Initializes the WordNet-Affect object."""
    wordnet16_dir = wordnet16_dir or join(dirname(__file__), "wordnet-1.6")
    wn_domains_dir = wn_domains_dir or join(dirname(__file__), "wn-domains-3.2")
    cwd = os.getcwd()
    nltk.data.path.append(cwd)
    wn16_path = "{0}/dict".format(wordnet16_dir)
    self.wn16 = WordNetCorpusReader(
        os.path.abspath("{0}/{1}".format(cwd, wn16_path)),
        nltk.data.find(wn16_path))
    self.flat_pos = {
        'NN': 'NN', 'NNS': 'NN',
        'JJ': 'JJ', 'JJR': 'JJ', 'JJS': 'JJ',
        'RB': 'RB', 'RBR': 'RB', 'RBS': 'RB',
        'VB': 'VB', 'VBD': 'VB', 'VGB': 'VB',
        'VBN': 'VB', 'VBP': 'VB', 'VBZ': 'VB'
    }
    self.wn_pos = {
        'NN': self.wn16.NOUN,
        'JJ': self.wn16.ADJ,
        'VB': self.wn16.VERB,
        'RB': self.wn16.ADV
    }
    self._load_emotions(wn_domains_dir)
    self.synsets = self._load_synsets(wn_domains_dir)
def main():
    ft = FasttextVectorizer("models/cc.en.300.bin")
    wn2 = WordNetCorpusReader(
        'D:\\dialogue2020\\semevals\\semeval-2016-task-14\\WN1.6', None)
    wn3 = WordNetCorpusReader(
        'D:\\dialogue2020\\semevals\\semeval-2016-task-14\\WN3.0', None)
    input_path = "D:/dialogue2020/semevals/semeval-2016-task-14/reader/"
    vector_path = "models/vectors/fasttext/en/new"

    # vectorize wordnet
    noun_synsets = compute_synsets_from_wordnets(wn2, wn3, 'n')
    verb_synsets = compute_synsets_from_wordnets(wn2, wn3, 'v')
    ft.vectorize_groups(noun_synsets,
                        os.path.join(vector_path, "nouns_wordnet_fasttext_1.6-3.0.txt"),
                        to_upper=False)
    ft.vectorize_groups(verb_synsets,
                        os.path.join(vector_path, "verbs_wordnet_fasttext_1.6-3.0.txt"),
                        to_upper=False)

    # vectorize words
    process_data(
        ft,
        os.path.join(input_path, "no_labels_nouns_en_new.1.6-3.0.tsv"),
        os.path.join(vector_path, "nouns_fasttext_cut_1.6-3.0.txt"))
    process_data(
        ft,
        os.path.join(input_path, "no_labels_verbs_en_new.1.6-3.0.tsv"),
        os.path.join(vector_path, "verbs_fasttext_cut_1.6-3.0.txt"))
def __init__(self):
    super(Antonimos, self).__init__()
    self.nombre = "Antonimos"
    self.descripcion = """
        Measures the number of antonym pairs present in the text.
    """
    self.thread_safe = False  # Has concurrency problems: https://github.com/nltk/nltk/issues/803
    self.wncr = WordNetCorpusReader(resource_filename('clasificador.recursos', 'wordnet_spa'), None)
def __init__(self, wordnet16_dir, wn_domains_dir):
    """Initializes the WordNet-Affect object."""
    cwd = os.getcwd()
    nltk.data.path.append(cwd)
    wn16_path = "{0}/dict".format(wordnet16_dir)
    self.wn16 = WordNetCorpusReader(os.path.abspath(wn16_path),
                                    nltk.data.find(wn16_path))
    self.flat_pos = {'NN': 'NN', 'NNS': 'NN',
                     'JJ': 'JJ', 'JJR': 'JJ', 'JJS': 'JJ',
                     'RB': 'RB', 'RBR': 'RB', 'RBS': 'RB',
                     'VB': 'VB', 'VBD': 'VB', 'VGB': 'VB',
                     'VBN': 'VB', 'VBP': 'VB', 'VBZ': 'VB'}
    self.wn_pos = {'NN': self.wn16.NOUN, 'JJ': self.wn16.ADJ,
                   'VB': self.wn16.VERB, 'RB': self.wn16.ADV}
    self._load_emotions(wn_domains_dir)
    self.synsets = self._load_synsets(wn_domains_dir)
class TestWordNet(unittest.TestCase):
    def setUp(self):
        self.wncr = WordNetCorpusReader(resource_filename('clasificador.recursos', 'wordnet_spa'), None)

    # The offset below would need to be updated because it changed, so this test no longer works.
    # def test_invalid_literal_for_int_16(self):
    #     self.wncr._synset_from_pos_and_line('n',
    #         '04122387 00 n 0a agudeza 0 broma 0 chiste 0 chufleta 0 comentario' \
    #         + '_burlón 0 cuchufleta 0 idea 0 ocurrencia 0 pulla 0 salida 0 04 @' \
    #         + ' 04120601 n 0000 + 00620096 v 0000 + 00499330 v 0000 + 00558467' \
    #         + ' v 0000 | comentario ingenioso para hacer reír \n')

    def test_key_error(self):
        self.wncr.lemma("menor.a.09.menor").antonyms()
def yield_17_candidates(corpus):
    wn16 = WordNetCorpusReader('wordnet/1.6/', None)
    wn17 = WordNetCorpusReader('wordnet/1.7.1/', None)
    for w in corpus.get_unique_words():
        synsets17 = wn17.synsets(w)
        lexclasses = list(set(s.lexname() for s in synsets17))
        synsets16 = wn16.synsets(w)
        if synsets16:
            continue
        if len(lexclasses) != 1:
            continue
        if 'noun' not in lexclasses[0]:
            continue
        yield w
def activate(self, *args, **kwargs):
    nltk.download('stopwords')
    self._stopwords = stopwords.words('english')
    # local_path = os.path.dirname(os.path.abspath(__file__))
    self._categories = {
        'anger': ['general-dislike'],
        'fear': ['negative-fear'],
        'disgust': ['shame'],
        'joy': ['gratitude', 'affective', 'enthusiasm', 'love', 'joy', 'liking'],
        'sadness': ['ingrattitude', 'daze', 'humility', 'compassion', 'despair', 'anxiety', 'sadness']
    }
    self._wnaffect_mappings = {
        'anger': 'anger',
        'fear': 'negative-fear',
        'disgust': 'disgust',
        'joy': 'joy',
        'sadness': 'sadness'
    }
    self._load_emotions(self.hierarchy_path)
    if 'total_synsets' not in self.sh:
        total_synsets = self._load_synsets(self.synsets_path)
        self.sh['total_synsets'] = total_synsets
    self._total_synsets = self.sh['total_synsets']
    if 'wn16' not in self.sh:
        self._wn16_path = self.wn16_path
        wn16 = WordNetCorpusReader(os.path.abspath("{0}".format(self._wn16_path)),
                                   nltk.data.find(self._wn16_path))
        self.sh['wn16'] = wn16
    self._wn16 = self.sh['wn16']
class Antonimos(Feature):
    def __init__(self):
        super(Antonimos, self).__init__()
        self.nombre = "Antonimos"
        self.descripcion = """
            Measures the number of antonym pairs present in the text.
        """
        self.thread_safe = False  # Has concurrency problems: https://github.com/nltk/nltk/issues/803
        self.wncr = WordNetCorpusReader(resource_filename('clasificador.recursos', 'wordnet_spa'), None)

    def calcular_feature(self, tweet):
        oraciones = Freeling.procesar_texto(remover_hashtags(remover_usuarios(tweet.texto)))
        tokens = Freeling.get_tokens_de_oraciones(oraciones)
        cant_antonimos = 0
        for token in tokens:
            antonimos = []
            for synset in self.wncr.synsets(token.lemma):
                for lemma in synset.lemmas():
                    antonimos += [lemma_antonimo.name() for lemma_antonimo in lemma.antonyms()]
            for otro_token in tokens:
                if otro_token.lemma in antonimos:
                    cant_antonimos += 1
                    break
        if len(tokens) == 0:
            return 0
        else:
            return cant_antonimos / math.sqrt(len(tokens)) / 2.0  # divide by 2 to count each pair only once
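# A minimal, self-contained sketch of the same antonym-pair counting idea, run against
# the standard English WordNet bundled with NLTK instead of the Spanish corpus reader
# used above; it assumes the 'wordnet' corpus is already downloaded, and the word list
# is a made-up illustration, not project data.
from nltk.corpus import wordnet as wn

def count_antonym_pairs(words):
    """Counts words in `words` that have a WordNet antonym also present in `words`."""
    hits = 0
    for word in words:
        antonyms = set()
        for synset in wn.synsets(word):
            for lemma in synset.lemmas():
                antonyms.update(a.name() for a in lemma.antonyms())
        if any(other in antonyms for other in words if other != word):
            hits += 1
    # each pair is found from both ends, so halve the count
    return hits / 2.0

print(count_antonym_pairs(["hot", "cold", "table"]))  # expected 1.0 for the hot/cold pair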
def main():
    # python fasttext_vectorize_en.py models/cc.en.300.bin ../../datasets/WNs 2.0 3.0 models/vectors/fasttext/en ../../datasets/en
    if len(sys.argv) < 7:
        raise Exception(
            "Required arguments: <fasttext-path> <wn-dir> <old-version> <new-version> <vector-path> <input-path>"
        )
    ft = FasttextVectorizer(sys.argv[1])
    old = sys.argv[3]
    new = sys.argv[4]
    wn2 = WordNetCorpusReader(os.path.join(sys.argv[2], "WN" + old), None)
    vector_path = sys.argv[5]
    if not os.path.exists(vector_path):
        os.makedirs(vector_path)
    data_path = sys.argv[6]

    for pos in ['nouns', 'verbs']:
        synsets = compute_synsets_from_wordnets(wn2, pos[0])
        ft.vectorize_groups(
            synsets,
            os.path.join(vector_path, f"{pos}_wordnet_fasttext_{old}-{new}.txt"),
            False)
        process_data(
            ft,
            os.path.join(data_path, f"no_labels_{pos}_en.{old}-{new}.tsv"),
            os.path.join(vector_path, f"{pos}_fasttext_{old}-{new}.txt"))
def generate_taxonomy_fns(params, model):
    # for English WordNet
    if params['language'] == 'en':
        wn = WordNetCorpusReader(params["wordnet_path"], None)
        return lambda x: [hypernym.name() for hypernym in wn.synset(x).hypernyms()
                          if hypernym.name() in model.w2v_synsets.vocab], \
               lambda x: [hyponym.name() for hyponym in wn.synset(x).hyponyms()
                          if hyponym.name() in model.w2v_synsets.vocab], \
               lambda x: x.split(".")[0].replace("_", " ")
    # for RuWordNet
    elif params['language'] == 'ru':
        ruwordnet = RuWordnet(db_path=params["db_path"], ruwordnet_path=params["wordnet_path"])
        return lambda x: ruwordnet.get_hypernyms_by_id(x), \
               lambda x: ruwordnet.get_hyponyms_by_id(x), \
               lambda x: ruwordnet.get_name_by_id(x)
    else:
        raise Exception("task / language is not supported")
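# A hedged usage sketch for generate_taxonomy_fns above: the params values and the
# `model` object (with a gensim-style w2v_synsets.vocab) are assumptions made up for
# illustration, not part of the original project.
params = {"language": "en", "wordnet_path": "WN3.0/dict"}  # hypothetical path
get_hypernyms, get_hyponyms, get_name = generate_taxonomy_fns(params, model)

print(get_hypernyms("dog.n.01"))  # hypernym synset names that are in the model vocabulary
print(get_name("dog.n.01"))       # -> "dog"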
def main():
    args = parse_args()
    ft_vec = FasttextVectorizer(args.fasttext_path)
    if args.data_path:
        # read data
        with open(args.data_path, 'r', encoding='utf-8') as f:
            dataset = [
                line.split("\t")[1].replace(" ", "_")
                for line in f.read().split("\n") if line
            ]
        # vectorize wordnet
        if "wordnet" in args:
            wn = WordNetCorpusReader(args.wordnet, None)
            for word in dataset:
                print(word, wn.synsets(word, pos=args.pos))
        else:
            ft_vec.vectorize_multiword_data(dataset, args.output_path, to_upper=False)
    elif args.data_dir:
        for system_dir in os.listdir(args.data_dir):
            for dirpath, _, filenames in os.walk(
                    os.path.join(args.data_dir, system_dir, args.language)):
                for filename in filenames:
                    if filename.endswith(".terms"):
                        input_path = os.path.join(dirpath, filename)
                        os.makedirs(os.path.join(args.output_path, system_dir), exist_ok=True)
                        output_path = os.path.join(
                            args.output_path, system_dir,
                            filename.replace(".terms", ".txt").replace(system_dir + "_", ""))
                        with open(input_path, 'r', encoding='utf-8') as f:
                            dataset = [
                                line.split("\t")[1].replace(" ", "_")
                                for line in f.read().split("\n") if line
                            ]
                        ft_vec.vectorize_multiword_data(dataset, output_path, to_upper=False)
                        print(f"Processed: {filename}")
    else:
        raise Exception("Please, specify either --data_dir or --data_path")
def activate(self, *args, **kwargs):
    nltk.download(['stopwords', 'averaged_perceptron_tagger', 'wordnet'])
    self._stopwords = stopwords.words('english')
    self._wnlemma = wordnet.WordNetLemmatizer()
    self._syntactics = {'N': 'n', 'V': 'v', 'J': 'a', 'S': 's', 'R': 'r'}
    local_path = os.path.dirname(os.path.abspath(__file__))
    self._categories = {
        'anger': ['general-dislike'],
        'fear': ['negative-fear'],
        'disgust': ['shame'],
        'joy': ['gratitude', 'affective', 'enthusiasm', 'love', 'joy', 'liking'],
        'sadness': ['ingrattitude', 'daze', 'humility', 'compassion', 'despair', 'anxiety', 'sadness']
    }
    self._wnaffect_mappings = {
        'anger': 'anger',
        'fear': 'negative-fear',
        'disgust': 'disgust',
        'joy': 'joy',
        'sadness': 'sadness'
    }
    self._load_emotions(local_path + self.hierarchy_path)
    if 'total_synsets' not in self.sh:
        total_synsets = self._load_synsets(local_path + self.synsets_path)
        self.sh['total_synsets'] = total_synsets
    self._total_synsets = self.sh['total_synsets']
    self._wn16_path = self.wn16_path
    self._wn16 = WordNetCorpusReader(
        os.path.abspath("{0}".format(local_path + self._wn16_path)),
        nltk.data.find(local_path + self._wn16_path))
def activate(self, *args, **kwargs):
    self._stopwords = stopwords.words('english')
    self._wnlemma = wordnet.WordNetLemmatizer()
    self._syntactics = {'N': 'n', 'V': 'v', 'J': 'a', 'S': 's', 'R': 'r'}
    local_path = os.environ.get("SENPY_DATA")
    self._categories = {
        'anger': ['general-dislike'],
        'fear': ['negative-fear'],
        'disgust': ['shame'],
        'joy': ['gratitude', 'affective', 'enthusiasm', 'love', 'joy', 'liking'],
        'sadness': ['ingrattitude', 'daze', 'humility', 'compassion', 'despair', 'anxiety', 'sadness']
    }
    self._wnaffect_mappings = {
        'anger': 'anger',
        'fear': 'negative-fear',
        'disgust': 'disgust',
        'joy': 'joy',
        'sadness': 'sadness'
    }
    self._load_emotions(self.find_file(self.hierarchy_path))
    if 'total_synsets' not in self.sh:
        total_synsets = self._load_synsets(self.find_file(self.synsets_path))
        self.sh['total_synsets'] = total_synsets
    self._total_synsets = self.sh['total_synsets']
    self._wn16_path = self.wn16_path
    self._wn16 = WordNetCorpusReader(
        self.find_file(self._wn16_path),
        nltk.data.find(self.find_file(self._wn16_path)))
def run_pos_parallel(specs, job_length):

    def sentiment_chunk(specs, start, finish):
        positive = [
            tuple(line.split(","))
            for line in open(specs["positive_filename"]).read().splitlines()
        ]
        negative = [
            tuple(line.split(","))
            for line in open(specs["negative_filename"]).read().splitlines()
        ]
        neutral = [
            tuple(line.split(","))
            for line in open(specs["neutral_filename"]).read().splitlines()
        ]
        lex = sentimentlexicon.SentimentLexicon(positive, negative, neutral,
                                                specs["pos"],
                                                start=start, finish=finish,
                                                weight=0.2)
        sentiment = lex.iterate()
        output_filename = "wn.%s.%s%s-%s.yaml" % (specs["classification"], specs["pos"], start, finish)
        yaml.dump(sentiment, open(output_filename, "w"))
        return sentiment

    # Get the overall size, lc.
    synsets = list(WordNetCorpusReader(wn_root, None).all_synsets(pos=specs["pos"]))
    lc = {}
    for synset in synsets:
        for lemma in synset.lemmas():
            lc[lemma] = True
    lc = len(lc.keys())
    print("Lemma count", lc)
    # Build the jobs.
    start = 0
    finish = job_length
    jobs = []
    ppservers = ()
    job_server = pp.Server(ppservers=ppservers)
    while start < lc:
        jobs.append(
            job_server.submit(sentiment_chunk, (specs, start, finish), (),
                              ("sentimentlexicon", "yaml")))
        start = finish
        finish += job_length
        if finish > lc:
            finish = lc
    print(len(jobs), "jobs created; now running ...")
    all_sentiment = {}
    for job in jobs:
        sentiment = job()
        for key, val in sentiment.items():
            all_sentiment[key] = val
    job_server.print_stats()
    output_filename = "wn.%s.%s.yaml" % (specs["classification"], specs["pos"])
    yaml.dump(all_sentiment, open(output_filename, "w"))
def __init__(self, wordnet_version='3.5'):
    self.wordnet_version = wordnet_version
    if self.wordnet_version == '3.5':
        self.wn = wn
    else:
        nltk.data.path.append(ROOT_PATH)
        wn_dir = "wordnet/resources/WordNet-" + self.wordnet_version + '/'
        wn_path = "{0}/dict".format(wn_dir)
        self.wn = WordNetCorpusReader(
            os.path.abspath("{0}/{1}".format(ROOT_PATH, wn_path)),
            nltk.data.find(wn_path))
def __init__(self, wordnet16_dir, wn_domains_dir):
    """Initializes the WordNet-Affect object."""
    cwd = os.getcwd()
    nltk.data.path.append(cwd)
    wn16_path = "{0}/dict".format(wordnet16_dir)
    self.wn16 = WordNetCorpusReader(os.path.abspath("{0}/{1}".format(cwd, wn16_path)),
                                    nltk.data.find(wn16_path))
    self.flat_pos = {'NN': 'NN', 'NNS': 'NN',
                     'JJ': 'JJ', 'JJR': 'JJ', 'JJS': 'JJ',
                     'RB': 'RB', 'RBR': 'RB', 'RBS': 'RB',
                     'VB': 'VB', 'VBD': 'VB', 'VGB': 'VB',
                     'VBN': 'VB', 'VBP': 'VB', 'VBZ': 'VB'}
    self.wn_pos = {'NN': self.wn16.NOUN, 'JJ': self.wn16.ADJ,
                   'VB': self.wn16.VERB, 'RB': self.wn16.ADV}
    self._load_emotions(wn_domains_dir)
    self.synsets = self._load_synsets(wn_domains_dir)
def __init__(self, wordnet_home):
    assert exists(f'{wordnet_home}/WordNet-2.0'), f'error: missing WordNet-2.0 in {wordnet_home}'
    assert exists(f'{wordnet_home}/wn-domains-3.2'), f'error: missing WordNetDomains in {wordnet_home}'

    # load WordNet2.0
    self.wn = WordNetCorpusReader(f'{wordnet_home}/WordNet-2.0/dict', 'WordNet-2.0/dict')

    # load WordNetDomains (based on https://stackoverflow.com/a/21904027/8759307)
    self.domain2synsets = defaultdict(list)
    self.synset2domains = defaultdict(list)
    for i in open(f'{wordnet_home}/wn-domains-3.2/wn-domains-3.2-20070223', 'r'):
        ssid, doms = i.strip().split('\t')
        doms = doms.split()
        self.synset2domains[ssid] = doms
        for d in doms:
            self.domain2synsets[d].append(ssid)
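# A hedged usage sketch for the loader above; the WordNetDomains class name and the
# wordnet_home path are assumptions, and the key format mirrors the "<offset>-<pos>"
# ids used in the wn-domains-3.2-20070223 mapping file.
wnd = WordNetDomains('/data/wordnet')              # hypothetical install location
synset = wnd.wn.synsets('bank', 'n')[0]
ssid = f'{synset.offset():08d}-{synset.pos()}'     # keys look like '01234567-n'
print(wnd.synset2domains.get(ssid))                # domain labels for that synset
print(len(wnd.domain2synsets['economy']))          # how many synsets carry the 'economy' domain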
class TestTransform(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.languages = ["cat", "eng", "eus", "glg", "spa"]
        cls.wn_names = {}
        for lang in cls.languages:
            cls.wn_names[lang] = '.wordnet_' + lang
            with tarfile.open('wordnet_' + lang + '.tar.gz') as f:
                f.extractall(cls.wn_names[lang])

    def test_all_synsets(self):
        self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
        for synset in self.wncr.all_synsets():
            a = synset
        # success if there is no error
        # This will also test that all synsets in data files are in index files.

    def test_invalid_literal_for_int_16(self):
        self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
        for synset in self.wncr.synsets("agudeza"):
            a = synset
        # self.wncr._synset_from_pos_and_line('n',
        #     "04122387 00 n 0a agudeza 0 broma 0 chiste 0 chufleta 0 comentario_burlón 0 cuchufleta 0 idea 0 ocurrencia 0 pulla 0 salida 0 04 @ 04120601 n 0000 + 00620096 v 0000 + 00499330 v 0000 + 00558467 v 0000 | comentario ingenioso para hacer reír \n")
        # success if there is no error

    def test_key_error(self):
        self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
        self.wncr.lemma("menor.a.09.menor").antonyms()
        # success if there is no error

    def test_load_wordnet(self):
        for lang in self.languages:
            self.wncr = WordNetCorpusReader(self.wn_names[lang], None)
        # success if there is no error

    @classmethod
    def tearDownClass(cls):
        for lang in cls.languages:
            shutil.rmtree(cls.wn_names[lang])
def main():
    # python en_dataset_creation.py ../../datasets/WNs ../../datasets/en/ 2.0 3.0
    if len(sys.argv) < 4:
        raise Exception(
            "The following arguments are required: <WordNet path> <output_path> <old_version_float> <new_version_float>"
        )
    path = sys.argv[1]
    out_path = sys.argv[2]
    old_version = sys.argv[3]
    if len(sys.argv) == 5:
        new_version = sys.argv[4]
    else:
        new_version = "3.0"
    wn2 = WordNetCorpusReader(os.path.join(path, 'WN' + old_version), None)
    wn3 = WordNetCorpusReader(os.path.join(path, 'WN' + new_version), None)
    for pos in ['nouns', 'verbs']:
        synsets_2n = set(wn2.all_synsets(pos[0]))
        synsets_3n = set(wn3.all_synsets(pos[0]))
        reference_nouns = synsets_3n.intersection(synsets_2n)
        new = extract_new_lemmas(synsets_3n.difference(synsets_2n), wn2, pos[0])
        hypernyms = generate_gold(new, wn3, reference_nouns, pos[0])
        print(f"Len {pos} {len(hypernyms)}")
        save(dict(hypernyms), out_path, f"{pos}_en.{old_version}-{new_version}.tsv")
def initialize_s(self):
    """Builds the vectors s, as a dictionary mapping words to reals.
    The domain of the dictionary is the full vocabulary."""
    synsets = list(WordNetCorpusReader(wn_root, None).all_synsets(pos=self.pos))
    for synset in synsets:
        for lemma in synset.lemmas():
            if (lemma.name(), synset.pos()) in self.positive:
                self.s[lemma] = 1.0
                self.s0[lemma] = 1.0
            elif (lemma.name(), synset.pos()) in self.negative:
                self.s[lemma] = -1.0
                self.s0[lemma] = -1.0
            else:
                self.s[lemma] = 0.0
                self.s0[lemma] = 0.0
def __init__(self, wordnet16_dir, wn_domains_dir): """Initializes the WordNet-Affect object.""" try: cwd = os.getcwd() nltk.data.path.append(cwd) wn16_path = "{0}/dict".format(wordnet16_dir) self.wn16 = WordNetCorpusReader( os.path.abspath("{0}/{1}".format(cwd, wn16_path)), nltk.data.find(wn16_path)) self.flat_pos = { 'NN': 'NN', 'NNS': 'NN', 'JJ': 'JJ', 'JJR': 'JJ', 'JJS': 'JJ', 'RB': 'RB', 'RBR': 'RB', 'RBS': 'RB', 'VB': 'VB', 'VBD': 'VB', 'VGB': 'VB', 'VBN': 'VB', 'VBP': 'VB', 'VBZ': 'VB' } self.wn_pos = { 'NN': self.wn16.NOUN, 'JJ': self.wn16.ADJ, 'VB': self.wn16.VERB, 'RB': self.wn16.ADV } self._load_emotions(wn_domains_dir) self.synsets = self._load_synsets(wn_domains_dir) except: print "Please download the dependencies and re-run the script after installing them successfully. Exiting !" exit()
def __init__(self, info, *args, **kwargs):
    super(EmotionTextPlugin, self).__init__(info, *args, **kwargs)
    self.id = info['module']
    self.info = info
    self._stopwords = stopwords.words('english')
    local_path = os.path.dirname(os.path.abspath(__file__))
    self._categories = {
        'anger': ['general-dislike'],
        'fear': ['negative-fear'],
        'disgust': ['shame'],
        'joy': ['gratitude', 'affective', 'enthusiasm', 'love', 'joy', 'liking'],
        'sadness': ['ingrattitude', 'daze', 'humility', 'compassion', 'despair', 'anxiety', 'sadness']
    }
    self._wnaffect_mappings = {
        'anger': 'anger',
        'fear': 'negative-fear',
        'disgust': 'disgust',
        'joy': 'joy',
        'sadness': 'sadness'
    }
    self._load_emotions(local_path + self.info['hierarchy_path'])
    self._total_synsets = self._load_synsets(local_path + self.info['synsets_path'])
    self._wn16_path = local_path + self.info['wn16_path']
    self._wn16 = WordNetCorpusReader(
        os.path.abspath("{0}".format(self._wn16_path)),
        nltk.data.find(self._wn16_path))
    output_path = os.path.join(exp_folder, 'wsd_framework_results.json')
    with open(output_path, 'w') as outfile:
        json.dump(results, outfile)


if __name__ == '__main__':
    # exp_folder = 'coling2018/synset-se13-semcor'
    exp_folder = 'coling2018/synset-se2-framework-semcor'
    scorer_folder = '/Users/marten/Downloads/WSD_Unified_Evaluation_Datasets'

    from nltk.corpus import WordNetCorpusReader

    if any(['se13' in exp_folder, 'framework' in exp_folder]):
        from nltk.corpus import wordnet as wn
    elif 'se2' in exp_folder:
        path_to_wn_dict_folder = '/Users/marten/Downloads/WordNet-1.7.1/dict'
        wn = WordNetCorpusReader(path_to_wn_dict_folder, None)

    create_key_file(wn, exp_folder, debug=1)
    score_using_official_scorer(exp_folder, scorer_folder)
def test_load_wordnet(self):
    for lang in self.languages:
        self.wncr = WordNetCorpusReader(self.wn_names[lang], None)
#!/usr/bin/env python
import sys

from nltk.corpus import WordNetCorpusReader

dict_dir = sys.argv[1]

wn = WordNetCorpusReader(dict_dir, None)

for synset in wn.all_synsets():
    for lem in synset.lemmas():
        print(lem.name(), synset.lexname())
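# A hedged variant of the script above that reads the WordNet data bundled with NLTK
# instead of a user-supplied dict/ directory; it assumes the 'wordnet' corpus has
# already been downloaded (and unzipped) under nltk_data.
from itertools import islice

import nltk
from nltk.corpus import WordNetCorpusReader

wn = WordNetCorpusReader(nltk.data.find('corpora/wordnet'), None)

for synset in islice(wn.all_synsets('n'), 5):
    # print each noun synset's lemma names and its lexicographer file (supersense)
    print([lemma.name() for lemma in synset.lemmas()], synset.lexname())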
import os
import pickle
import re
import csv  # import the csv module

import nltk
from nltk.corpus import WordNetCorpusReader
from nltk.corpus import wordnet as wn
from gensim.models import word2vec
import pandas as pd
from scipy.stats import spearmanr

# load WordNet-1.7.1
cwd = os.getcwd()
nltk.data.path.append(cwd)
wordnet17_dir = "resources/WordNet-1.7.1/"
wn17_path = "{0}/dict".format(wordnet17_dir)
WN17 = WordNetCorpusReader(os.path.abspath("{0}/{1}".format(cwd, wn17_path)),
                           nltk.data.find(wn17_path))

if __name__ == '__main__':
    mapping = ['AutoExtend', 'GlossTfIdf', 'GlossAve', 'Word']
    S = {}
    N = 999  # datanum

    # load the pre-trained word2vec model
    model = word2vec.Word2Vec.load_word2vec_format("../word2vec/models/GoogleNews-vectors-negative300.bin", binary=True)
    with open('synset2vecAE.pickle', 'rb') as f:
        S['AutoExtend'] = pickle.load(f)
    with open('synset2vecG.pickle', 'rb') as f:
        S['GlossTfIdf'] = pickle.load(f)
def test_all_synsets(self):
    self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
    for synset in self.wncr.all_synsets():
        a = synset
from nltk.corpus import WordNetCorpusReader

from fasttext_vectorize_en import compute_synsets_from_wordnets

wn2 = WordNetCorpusReader(
    'D:\\dialogue2020\\semevals\\semeval-2016-task-14\\WN1.7', None)
wn3 = WordNetCorpusReader('D:\\dialogue2020\\semeval-2016-task-14\\WN3.0', None)
input_path = "D:/dialogue2020/semeval-2016-task-14/reader/"
vector_path = "models/vectors/fasttext/en/"

# vectorize wordnet
noun_synsets = compute_synsets_from_wordnets(wn2, wn3, 'n')
verb_synsets = compute_synsets_from_wordnets(wn2, wn3, 'v')
def test_invalid_literal_for_int_16(self):
    self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
    for synset in self.wncr.synsets("agudeza"):
        a = synset
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    bert_vectorizer = BertVectorizer(args.bert_path)

    if 'ruwordnet_path' in args:
        ruwordnet = RuWordnet(args.ruwordnet_path, None)
        synsets = defaultdict(list)
        for sense_id, synset_id, text in ruwordnet.get_all_senses():
            if synset_id.endswith(args.pos):
                synsets[synset_id].append(text.lower())
        bert_vectorizer.vectorize_groups(synsets, args.output_path, to_upper=False)

    if 'wordnet_old' in args:
        wn_old = WordNetCorpusReader(args.wordnet_old, None)
        wn_new = WordNetCorpusReader(args.wordnet_new, None)
        synsets = compute_synsets_from_wordnets(wn_old, wn_new, args.pos)
        bert_vectorizer.vectorize_groups(synsets, args.output_path, to_upper=False)

    if "data_path" in args:
        data = read_file(args.data_path, lower=args.upper)
        bert_vectorizer.vectorize_data(data, args.output_path, upper=args.upper)
class WNAffect(EmotionPlugin, ShelfMixin): ''' Emotion classifier using WordNet-Affect to calculate the percentage of each emotion. This plugin classifies among 6 emotions: anger,fear,disgust,joy,sadness or neutral. The only available language is English (en) ''' name = 'emotion-wnaffect' author = ["@icorcuera", "@balkian"] version = '0.2' extra_params = { 'language': { "@id": 'lang_wnaffect', 'description': 'language of the input', 'aliases': ['language', 'l'], 'required': True, 'options': [ 'en', ] } } synsets_path = "a-synsets.xml" hierarchy_path = "a-hierarchy.xml" wn16_path = "wordnet1.6/dict" onyx__usesEmotionModel = "emoml:big6" nltk_resources = ['stopwords', 'averaged_perceptron_tagger', 'wordnet'] def _load_synsets(self, synsets_path): """Returns a dictionary POS tag -> synset offset -> emotion (str -> int -> str).""" tree = ET.parse(synsets_path) root = tree.getroot() pos_map = {"noun": "NN", "adj": "JJ", "verb": "VB", "adv": "RB"} synsets = {} for pos in ["noun", "adj", "verb", "adv"]: tag = pos_map[pos] synsets[tag] = {} for elem in root.findall(".//{0}-syn-list//{0}-syn".format( pos, pos)): offset = int(elem.get("id")[2:]) if not offset: continue if elem.get("categ"): synsets[tag][offset] = Emo.emotions[elem.get( "categ")] if elem.get( "categ") in Emo.emotions else None elif elem.get("noun-id"): synsets[tag][offset] = synsets[pos_map["noun"]][int( elem.get("noun-id")[2:])] return synsets def _load_emotions(self, hierarchy_path): """Loads the hierarchy of emotions from the WordNet-Affect xml.""" tree = ET.parse(hierarchy_path) root = tree.getroot() for elem in root.findall("categ"): name = elem.get("name") if name == "root": Emo.emotions["root"] = Emo("root") else: Emo.emotions[name] = Emo(name, elem.get("isa")) def activate(self, *args, **kwargs): self._stopwords = stopwords.words('english') self._wnlemma = wordnet.WordNetLemmatizer() self._syntactics = {'N': 'n', 'V': 'v', 'J': 'a', 'S': 's', 'R': 'r'} local_path = os.environ.get("SENPY_DATA") self._categories = { 'anger': [ 'general-dislike', ], 'fear': [ 'negative-fear', ], 'disgust': [ 'shame', ], 'joy': ['gratitude', 'affective', 'enthusiasm', 'love', 'joy', 'liking'], 'sadness': [ 'ingrattitude', 'daze', 'humility', 'compassion', 'despair', 'anxiety', 'sadness' ] } self._wnaffect_mappings = { 'anger': 'anger', 'fear': 'negative-fear', 'disgust': 'disgust', 'joy': 'joy', 'sadness': 'sadness' } self._load_emotions(self.find_file(self.hierarchy_path)) if 'total_synsets' not in self.sh: total_synsets = self._load_synsets( self.find_file(self.synsets_path)) self.sh['total_synsets'] = total_synsets self._total_synsets = self.sh['total_synsets'] self._wn16_path = self.wn16_path self._wn16 = WordNetCorpusReader( self.find_file(self._wn16_path), nltk.data.find(self.find_file(self._wn16_path))) def deactivate(self, *args, **kwargs): self.save() def _my_preprocessor(self, text): regHttp = re.compile( '(http://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?') regHttps = re.compile( '(https://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?') regAt = re.compile('@([a-zA-Z0-9]*[*_/&%#@$]*)*[a-zA-Z0-9]*') text = re.sub(regHttp, '', text) text = re.sub(regAt, '', text) text = re.sub('RT : ', '', text) text = re.sub(regHttps, '', text) text = re.sub('[0-9]', '', text) text = self._delete_punctuation(text) return text def _delete_punctuation(self, text): exclude = set(string.punctuation) s = ''.join(ch for ch in text if ch not in exclude) return s def _extract_ngrams(self, text): unigrams_lemmas = [] pos_tagged = [] unigrams_words = [] tokens = 
text.split() for token in nltk.pos_tag(tokens): unigrams_words.append(token[0]) pos_tagged.append(token[1]) if token[1][0] in self._syntactics.keys(): unigrams_lemmas.append( self._wnlemma.lemmatize(token[0], self._syntactics[token[1][0]])) else: unigrams_lemmas.append(token[0]) return unigrams_words, unigrams_lemmas, pos_tagged def _find_ngrams(self, input_list, n): return zip(*[input_list[i:] for i in range(n)]) def _clean_pos(self, pos_tagged): pos_tags = { 'NN': 'NN', 'NNP': 'NN', 'NNP-LOC': 'NN', 'NNS': 'NN', 'JJ': 'JJ', 'JJR': 'JJ', 'JJS': 'JJ', 'RB': 'RB', 'RBR': 'RB', 'RBS': 'RB', 'VB': 'VB', 'VBD': 'VB', 'VGB': 'VB', 'VBN': 'VB', 'VBP': 'VB', 'VBZ': 'VB' } for i in range(len(pos_tagged)): if pos_tagged[i] in pos_tags: pos_tagged[i] = pos_tags[pos_tagged[i]] return pos_tagged def _extract_features(self, text): feature_set = {k: 0 for k in self._categories} ngrams_words, ngrams_lemmas, pos_tagged = self._extract_ngrams(text) matches = 0 pos_tagged = self._clean_pos(pos_tagged) tag_wn = { 'NN': self._wn16.NOUN, 'JJ': self._wn16.ADJ, 'VB': self._wn16.VERB, 'RB': self._wn16.ADV } for i in range(len(pos_tagged)): if pos_tagged[i] in tag_wn: synsets = self._wn16.synsets(ngrams_words[i], tag_wn[pos_tagged[i]]) if synsets: offset = synsets[0].offset() if offset in self._total_synsets[pos_tagged[i]]: if self._total_synsets[pos_tagged[i]][offset] is None: continue else: emotion = self._total_synsets[ pos_tagged[i]][offset].get_level(5).name matches += 1 for i in self._categories: if emotion in self._categories[i]: feature_set[i] += 1 if matches == 0: matches = 1 for i in feature_set: feature_set[i] = (feature_set[i] / matches) return feature_set def analyse_entry(self, entry, activity): params = activity.params text_input = entry['nif:isString'] text = self._my_preprocessor(text_input) feature_text = self._extract_features(text) emotionSet = EmotionSet(id="Emotions0") emotions = emotionSet.onyx__hasEmotion for i in feature_text: emotions.append( Emotion(onyx__hasEmotionCategory=self._wnaffect_mappings[i], onyx__hasEmotionIntensity=feature_text[i])) entry.emotions = [emotionSet] yield entry def test(self, *args, **kwargs): results = list() params = { 'algo': 'emotion-wnaffect', 'intype': 'direct', 'expanded-jsonld': 0, 'informat': 'text', 'prefix': '', 'plugin_type': 'analysisPlugin', 'urischeme': 'RFC5147String', 'outformat': 'json-ld', 'i': 'Hello World', 'input': 'Hello World', 'conversion': 'full', 'language': 'en', 'algorithm': 'emotion-wnaffect' } self.activate() texts = { 'I hate you': 'anger', 'i am sad': 'sadness', 'i am happy with my marks': 'joy', 'This movie is scary': 'negative-fear' } for text in texts: response = next( self.analyse_entry(Entry(nif__isString=text), self.activity(params))) expected = texts[text] emotionSet = response.emotions[0] max_emotion = max(emotionSet['onyx:hasEmotion'], key=lambda x: x['onyx:hasEmotionIntensity']) assert max_emotion['onyx:hasEmotionCategory'] == expected
import os
import xml.etree.ElementTree as ET

import nltk
from nltk.corpus import WordNetCorpusReader
from sqlalchemy import *
from xml.dom import minidom
from nltk.corpus import wordnet as wn
import difflib
import pickle

# load wordnet-1.6
cwd = os.getcwd()
nltk.data.path.append(cwd)
wordnet16_dir = "resources/wordnet-1.6/"
wn16_path = "{0}/dict".format(wordnet16_dir)
WN16 = WordNetCorpusReader(os.path.abspath("{0}/{1}".format(cwd, wn16_path)),
                           nltk.data.find(wn16_path))


# load Wordnet-Affect synsets
# corpus: a-synset.xml
# return: {
#   'noun': {
#     '05586574': { 'categ': 'electricity', 'pos': 'noun', 'offset16': '05586574' }
#   }, ...
# }
def load_asynsets(corpus):
    tree = ET.parse(corpus)
    root = tree.getroot()

    asynsets = {}
    for pos in ["noun", "adj", "verb", "adv"]:
def test_key_error(self):
    self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
    self.wncr.lemma("menor.a.09.menor").antonyms()
class WNAffect: """WordNet-Affect ressource.""" def __init__(self, wordnet16_dir, wn_domains_dir): """Initializes the WordNet-Affect object.""" cwd = os.getcwd() nltk.data.path.append(cwd) wn16_path = "{0}/dict".format(wordnet16_dir) self.wn16 = WordNetCorpusReader(os.path.abspath("{0}/{1}".format(cwd, wn16_path)), nltk.data.find(wn16_path)) self.flat_pos = {'NN':'NN', 'NNS':'NN', 'JJ':'JJ', 'JJR':'JJ', 'JJS':'JJ', 'RB':'RB', 'RBR':'RB', 'RBS':'RB', 'VB':'VB', 'VBD':'VB', 'VGB':'VB', 'VBN':'VB', 'VBP':'VB', 'VBZ':'VB'} self.wn_pos = {'NN':self.wn16.NOUN, 'JJ':self.wn16.ADJ, 'VB':self.wn16.VERB, 'RB':self.wn16.ADV} self._load_emotions(wn_domains_dir) self.synsets = self._load_synsets(wn_domains_dir) def _load_synsets(self, wn_domains_dir): """Returns a dictionary POS tag -> synset offset -> emotion (str -> int -> str).""" tree = ET.parse("{0}/a-synsets.xml".format(wn_domains_dir)) root = tree.getroot() pos_map = { "noun": "NN", "adj": "JJ", "verb": "VB", "adv": "RB" } synsets = {} for pos in ["noun", "adj", "verb", "adv"]: tag = pos_map[pos] synsets[tag] = {} for elem in root.findall(".//{0}-syn-list//{0}-syn".format(pos, pos)): offset = int(elem.get("id")[2:]) if not offset: continue if elem.get("categ"): synsets[tag][offset] = Emotion.emotions[elem.get("categ")] if elem.get("categ") in Emotion.emotions else None elif elem.get("noun-id"): synsets[tag][offset] = synsets[pos_map["noun"]][int(elem.get("noun-id")[2:])] return synsets def _load_emotions(self, wn_domains_dir): """Loads the hierarchy of emotions from the WordNet-Affect xml.""" tree = ET.parse("{0}/a-hierarchy.xml".format(wn_domains_dir)) root = tree.getroot() for elem in root.findall("categ"): name = elem.get("name") if name == "root": Emotion.emotions["root"] = Emotion("root") else: Emotion.emotions[name] = Emotion(name, elem.get("isa")) def get_emotion(self, word, pos): """Returns the emotion of the word. word -- the word (str) pos -- part-of-speech (str) """ if pos in self.flat_pos: pos = self.flat_pos[pos] synsets = self.wn16.synsets(word, self.wn_pos[pos]) if synsets: offset = synsets[0].offset() if offset in self.synsets[pos]: return self.synsets[pos][offset] return None
def yield_single_sense_nouns_in_corpus(corpus):
    wn = WordNetCorpusReader('wordnet/1.6/', None)
    for word in corpus.get_unique_words():
        synsets = list(wn.synsets(word))
        if len(synsets) == 1 and 'noun' in synsets[0].lexname():
            yield word
def yield_single_supersense_nouns_in_corpus(corpus):
    wn = WordNetCorpusReader('wordnet/1.6/', None)
    for w in corpus.get_unique_words():
        lexclasses = list(set(s.lexname() for s in wn.synsets(w)))
        if len(lexclasses) == 1 and 'noun' in lexclasses[0]:
            yield w
class WNAffect(object): """WordNet-Affect resource.""" def __init__(self, wordnet16_dir=None, wn_domains_dir=None): """Initializes the WordNet-Affect object.""" wordnet16_dir = wordnet16_dir or join(dirname(__file__), "wordnet-1.6") wn_domains_dir = wn_domains_dir or join(dirname(__file__), "wn-domains-3.2") cwd = os.getcwd() nltk.data.path.append(cwd) wn16_path = "{0}/dict".format(wordnet16_dir) self.wn16 = WordNetCorpusReader( os.path.abspath("{0}/{1}".format(cwd, wn16_path)), nltk.data.find(wn16_path)) self.flat_pos = { 'NN': 'NN', 'NNS': 'NN', 'JJ': 'JJ', 'JJR': 'JJ', 'JJS': 'JJ', 'RB': 'RB', 'RBR': 'RB', 'RBS': 'RB', 'VB': 'VB', 'VBD': 'VB', 'VGB': 'VB', 'VBN': 'VB', 'VBP': 'VB', 'VBZ': 'VB' } self.wn_pos = { 'NN': self.wn16.NOUN, 'JJ': self.wn16.ADJ, 'VB': self.wn16.VERB, 'RB': self.wn16.ADV } self._load_emotions(wn_domains_dir) self.synsets = self._load_synsets(wn_domains_dir) def _load_synsets(self, wn_domains_dir): """Returns a dictionary POS tag -> synset offset -> emotion (str -> int -> str).""" tree = ET.parse( "{0}/wn-affect-1.1/a-synsets.xml".format(wn_domains_dir)) root = tree.getroot() pos_map = {"noun": "NN", "adj": "JJ", "verb": "VB", "adv": "RB"} synsets = {} for pos in ["noun", "adj", "verb", "adv"]: tag = pos_map[pos] synsets[tag] = {} for elem in root.findall(".//{0}-syn-list//{0}-syn".format( pos, pos)): offset = int(elem.get("id")[2:]) if not offset: continue if elem.get("categ"): synsets[tag][offset] = WNAffectEmotion.emotions[elem.get( "categ")] if elem.get( "categ") in WNAffectEmotion.emotions else None elif elem.get("noun-id"): synsets[tag][offset] = synsets[pos_map["noun"]][int( elem.get("noun-id")[2:])] return synsets def _load_emotions(self, wn_domains_dir): """Loads the hierarchy of emotions from the WordNet-Affect xml.""" tree = ET.parse( "{0}/wn-affect-1.1/a-hierarchy.xml".format(wn_domains_dir)) root = tree.getroot() for elem in root.findall("categ"): name = elem.get("name") if name == "root": WNAffectEmotion.emotions["root"] = WNAffectEmotion("root") else: WNAffectEmotion.emotions[name] = WNAffectEmotion( name, elem.get("isa")) def get_emotion(self, word, pos): """Returns the emotion of the word. word -- the word (str) pos -- part-of-speech (str) """ if pos in self.flat_pos: pos = self.flat_pos[pos] synsets = self.wn16.synsets(word, self.wn_pos[pos]) if synsets: for synset in synsets: offset = synset.offset() if offset in self.synsets[pos]: return self.synsets[pos][offset] return None def get_emotion_synset(self, offset): """Returns the emotion of the synset. offset -- synset offset (int) """ for pos in self.flat_pos.values(): if offset in self.synsets[pos]: return self.synsets[pos][offset] return None
def setUp(self):
    self.wncr = WordNetCorpusReader(resource_filename('clasificador.recursos', 'wordnet_spa'), None)
class EmotionTextPlugin(EmotionPlugin): def __init__(self, info, *args, **kwargs): super(EmotionTextPlugin, self).__init__(info, *args, **kwargs) self.id = info['module'] self.info = info self._stopwords = stopwords.words('english') local_path=os.path.dirname(os.path.abspath(__file__)) self._categories = {'anger': ['general-dislike',], 'fear': ['negative-fear',], 'disgust': ['shame',], 'joy': ['gratitude','affective','enthusiasm','love','joy','liking'], 'sadness': ['ingrattitude','daze','humility','compassion','despair','anxiety','sadness']} self._wnaffect_mappings = {'anger': 'anger', 'fear': 'negative-fear', 'disgust': 'disgust', 'joy': 'joy', 'sadness': 'sadness'} self._load_emotions(local_path+self.info['hierarchy_path']) self._total_synsets = self._load_synsets(local_path+self.info['synsets_path']) self._wn16_path = local_path+self.info['wn16_path'] self._wn16= None self._wn16 = WordNetCorpusReader(os.path.abspath("{0}".format(self._wn16_path)), nltk.data.find(self._wn16_path)) def _load_synsets(self, synsets_path): """Returns a dictionary POS tag -> synset offset -> emotion (str -> int -> str).""" tree = ET.parse(synsets_path) root = tree.getroot() pos_map = { "noun": "NN", "adj": "JJ", "verb": "VB", "adv": "RB" } synsets = {} for pos in ["noun", "adj", "verb", "adv"]: tag = pos_map[pos] synsets[tag] = {} for elem in root.findall(".//{0}-syn-list//{0}-syn".format(pos, pos)): offset = int(elem.get("id")[2:]) if not offset: continue if elem.get("categ"): synsets[tag][offset] = Emo.emotions[elem.get("categ")] if elem.get("categ") in Emo.emotions else None elif elem.get("noun-id"): synsets[tag][offset] = synsets[pos_map["noun"]][int(elem.get("noun-id")[2:])] return synsets def _load_emotions(self, hierarchy_path): """Loads the hierarchy of emotions from the WordNet-Affect xml.""" tree = ET.parse(hierarchy_path) root = tree.getroot() for elem in root.findall("categ"): name = elem.get("name") if name == "root": Emo.emotions["root"] = Emo("root") else: Emo.emotions[name] = Emo(name, elem.get("isa")) def activate(self, *args, **kwargs): logger.info("EmoText plugin is ready to go!") def deactivate(self, *args, **kwargs): logger.info("EmoText plugin is being deactivated...") def _my_preprocessor(self, text): regHttp = re.compile('(http://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?') regHttps = re.compile('(https://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?') regAt = re.compile('@([a-zA-Z0-9]*[*_/&%#@$]*)*[a-zA-Z0-9]*') text = re.sub(regHttp, '', text) text = re.sub(regAt, '', text) text = re.sub('RT : ', '', text) text = re.sub(regHttps, '', text) text = re.sub('[0-9]', '', text) text = self._delete_punctuation(text) return text def _delete_punctuation(self, text): exclude = set(string.punctuation) s = ''.join(ch for ch in text if ch not in exclude) return s def _extract_ngrams(self, text): unigrams_lemmas = [] pos_tagged = [] unigrams_words = [] sentences = parse(text,lemmata=True).split() for sentence in sentences: for token in sentence: if token[0].lower() not in self._stopwords: unigrams_words.append(token[0].lower()) unigrams_lemmas.append(token[4]) pos_tagged.append(token[1]) return unigrams_words,unigrams_lemmas,pos_tagged def _find_ngrams(self, input_list, n): return zip(*[input_list[i:] for i in range(n)]) def _clean_pos(self, pos_tagged): pos_tags={'NN':'NN', 'NNP':'NN','NNP-LOC':'NN', 'NNS':'NN', 'JJ':'JJ', 'JJR':'JJ', 'JJS':'JJ', 'RB':'RB', 'RBR':'RB', 'RBS':'RB', 'VB':'VB', 'VBD':'VB', 'VGB':'VB', 'VBN':'VB', 'VBP':'VB', 'VBZ':'VB'} for i in range(len(pos_tagged)): if 
pos_tagged[i] in pos_tags: pos_tagged[i]=pos_tags[pos_tagged[i]] return pos_tagged def _extract_features(self, text): feature_set={k:0 for k in self._categories} ngrams_words,ngrams_lemmas,pos_tagged = self._extract_ngrams(text) matches=0 pos_tagged=self._clean_pos(pos_tagged) tag_wn={'NN':self._wn16.NOUN,'JJ':self._wn16.ADJ,'VB':self._wn16.VERB,'RB':self._wn16.ADV} for i in range(len(pos_tagged)): if pos_tagged[i] in tag_wn: synsets = self._wn16.synsets(ngrams_words[i], tag_wn[pos_tagged[i]]) if synsets: offset = synsets[0].offset() if offset in self._total_synsets[pos_tagged[i]]: if self._total_synsets[pos_tagged[i]][offset] is None: continue else: emotion = self._total_synsets[pos_tagged[i]][offset].get_level(5).name matches+=1 for i in self._categories: if emotion in self._categories[i]: feature_set[i]+=1 if matches == 0: matches=1 for i in feature_set: feature_set[i] = (feature_set[i]/matches)*100 return feature_set def analyse(self, **params): logger.debug("Analysing with params {}".format(params)) text_input = params.get("input", None) text=self._my_preprocessor(text_input) feature_text=self._extract_features(text) response = Results() entry = Entry(id="Entry", text=text_input) emotionSet = EmotionSet(id="Emotions0") emotions = emotionSet.onyx__hasEmotion for i in feature_text: emotions.append(Emotion(onyx__hasEmotionCategory=self._wnaffect_mappings[i], onyx__hasEmotionIntensity=feature_text[i])) entry.emotions = [emotionSet] response.entries.append(entry) return response