def __init__(self):
    '''Load the data.'''
    root = nltk.data.find('corpora/wordnet')
    cd = os.path.dirname(__file__)
    if cd == "":
        cd = "."
    filename = cd + '/wnjpn-ok.tab'
    WordNetCorpusReader.__init__(self, root)
    with codecs.open(filename, encoding="utf-8") as f:
        self._jword2offset = {}
        counter = 0
        for line in f:
            try:
                _cells = line.strip().split('\t')
                _offset_pos = _cells[0]
                _word = _cells[1]
                if len(_cells) > 2:
                    _tag = _cells[2]
                _offset, _pos = _offset_pos.split('-')
                self._jword2offset[_word] = {'offset': int(_offset), 'pos': _pos}
                counter += 1
            except Exception:
                print("failed to load line %d" % counter)
def __init__(self, root, filename):
    WordNetCorpusReader.__init__(self, root)
    import codecs
    self._jword2offset = {}
    with codecs.open(filename, encoding="utf-8") as f:
        for line in f:
            _cells = line.strip().split('\t')
            _offset_pos = _cells[0]
            _word = _cells[1]
            if len(_cells) > 2:
                _tag = _cells[2]
            _offset, _pos = _offset_pos.split('-')
            # A word can map to several senses, so keep a list per word.
            self._jword2offset.setdefault(_word, []).append(
                {'offset': int(_offset), 'pos': _pos})
def synset(self, word): "synsetの取得" if word in self._jword2offset: return WordNetCorpusReader._synset_from_pos_and_offset( self, self._jword2offset[word]['pos'], self._jword2offset[word]['offset'] ) else: return None
def synsets(self, word):
    if word in self._jword2offset:
        results = []
        for offset in self._jword2offset[word]:
            results.append(WordNetCorpusReader._synset_from_pos_and_offset(
                self, offset['pos'], offset['offset']
            ))
        return results
    else:
        return None
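A minimal sketch of how the multi-sense pieces above could be assembled into one reader for the Japanese WordNet tab file. The class name JapaneseWordNetCorpusReader, the wnjpn-ok.tab path, and the one-argument WordNetCorpusReader constructor are assumptions carried over from the snippets, not guaranteed APIs; older NLTK releases accept the one-argument form.

# Hypothetical assembly of the __init__/synsets methods shown above; assumes
# an NLTK version whose WordNetCorpusReader constructor takes just a root.
import codecs
import nltk
from nltk.corpus.reader.wordnet import WordNetCorpusReader


class JapaneseWordNetCorpusReader(WordNetCorpusReader):  # hypothetical name
    def __init__(self, root, filename):
        WordNetCorpusReader.__init__(self, root)
        self._jword2offset = {}
        with codecs.open(filename, encoding="utf-8") as f:
            for line in f:
                cells = line.strip().split('\t')
                offset, pos = cells[0].split('-')
                # Keep every sense registered for a Japanese word.
                self._jword2offset.setdefault(cells[1], []).append(
                    {'offset': int(offset), 'pos': pos})

    def synsets(self, word):
        return [WordNetCorpusReader._synset_from_pos_and_offset(
                    self, e['pos'], e['offset'])
                for e in self._jword2offset.get(word, [])]


# Usage sketch (the tab-file path is an assumption):
# jwn = JapaneseWordNetCorpusReader(nltk.data.find('corpora/wordnet'),
#                                   'wnjpn-ok.tab')
# print(jwn.synsets(u'犬'))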
def load_wn(version="3.0", location="../../data/wordnet/", base="wn"): """ I kept forgetting how to load WordNet, and this makes it easier to handle different versions of wordnet. Assumes that in the nltk_data directory a directory called "alt_wordnets" exists, and the dict directory of every version is named "base-0.0" (e.g. "wn-1.6") inside that directory. Returns an initialized wn reader. Defaults to the normal installation if it can't find the WN you're looking for (pay attention to the error message if that happens, as you might not be using the version you thought). """ path = location + "%s-%s" % (base, version) print "Looking for ", path if os.path.exists(path): return WordNetCorpusReader(path) else: print("Failed to find WN - defaulting to NLTK's version") return WordNetCorpusReader(nltk.data.find("corpora/wordnet"))
def synsets(self, word):
    # results = [[ ], [ ]]
    if word in self._jword2offset:
        results = []
        for offset in self._jword2offset[word]:
            results.append(
                WordNetCorpusReader._synset_from_pos_and_offset(
                    self, offset['pos'], offset['offset']))
        return results
    else:
        return []
class WordNetLookup(object):
    def __init__(self, path='corpora/wordnet'):
        self.path = path
        self.WN = None

    def wn(self):
        if not self.WN:
            self.WN = WordNetCorpusReader(nltk.data.find(self.path))

    def is_superclass_of(self, first, second):
        "Is the second noun the superclass of the first one?"
        self.wn()
        # We cannot guarantee it is a noun. By the time we deal with DRSs,
        # this is just a condition, and could easily have come from an
        # adjective (if the user does not provide features for nouns, as we
        # do in our grammar).
        try:
            num_of_senses_first = self._num_of_senses(first)
            num_of_senses_second = self._num_of_senses(second)
        except KeyError:
            return False
        # At first I wanted to take the first senses of both words, but the
        # first sense is not always the basic meaning of the word, e.g.:
        # S('hammer.n.1').definition: 'the part of a gunlock that strikes the
        #   percussion cap when the trigger is pulled'
        # S('hammer.n.2').definition: 'a hand tool with a heavy rigid head and
        #   a handle; used to deliver an impulsive force by striking'
        for n in range(num_of_senses_second):
            synset_second = self._noun_synset(second, ind=n)
            for i in range(num_of_senses_first):
                # print synset_second, self._noun_synset(first, i).common_hypernyms(synset_second)
                if synset_second in self._noun_synset(first, i).common_hypernyms(synset_second):
                    # print "+++ first", first, "second", second, True
                    return True
        return False

    def is_adjective(self, word):
        try:
            self._num_of_senses(word, 'a')
            return True
        except KeyError:
            return False

    def _noun_synset(self, noun, ind):
        self.wn()
        return self.WN.synset("%s.n.%s" % (noun, ind))

    def _num_of_senses(self, word, pos='n'):
        self.wn()
        return len(self.WN._lemma_pos_offset_map[word][pos])

    def is_person(self, word):
        return self.is_superclass_of(word, 'person')

    def is_animal(self, word):
        return self.is_superclass_of(word, 'animal')
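A short usage sketch for the WordNetLookup helper above; the words and expected results are illustrative and depend on the WordNet data actually installed under corpora/wordnet.

# Illustrative calls; results depend on the installed WordNet data.
lookup = WordNetLookup()
print(lookup.is_person('teacher'))    # expected True: person is a hypernym of teacher
print(lookup.is_animal('hammer'))     # expected False
print(lookup.is_adjective('red'))     # expected True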
def __init__(self): "データのロード" root = nltk.data.find('corpora/wordnet') cd = os.path.dirname(__file__) if cd == "": cd = "." filename = cd+'/wnjpn-ok.tab' WordNetCorpusReader.__init__(self, root) import codecs with codecs.open(filename, encoding="utf-8") as f: self._jword2offset = {} counter = 0 for line in f: try: _cells = line.strip().split('\t') _offset_pos = _cells[0] _word = _cells[1] if len(_cells)>2: _tag = _cells[2] _offset, _pos = _offset_pos.split('-') self._jword2offset[_word] = {'offset': int(_offset), 'pos': _pos} counter += 1 except: print "failed to lead line %d" % counter
case_strategy = args.use_case_strategy == 'True'
number_strategy = args.use_number_strategy == 'True'
lp_strategy = args.use_lp == 'True'

case_freq = pickle.load(open(args.path_case_freq, 'rb'))
plural_freq = pickle.load(open(args.path_plural_freq, 'rb'))
lp_info = dict()

the_wn_version = '30'

# load relevant wordnet
if '171' in args.wsd_df_path:
    the_wn_version = '171'
    cwd = os.path.dirname(os.path.realpath(__file__))
    path_to_wn_dict_folder = os.path.join(cwd, 'scripts', 'wordnets', '171',
                                          'WordNet-1.7.1', 'dict')
    wn = WordNetCorpusReader(path_to_wn_dict_folder, None)

with open(args.sense_embeddings_path + '.freq', 'rb') as infile:
    meaning_freqs = pickle.load(infile)

with open(args.log_path, 'w') as outfile:
    json.dump(args.__dict__, outfile)


def lp_output(row, lp_info, candidate_synsets, debug=False):
    target_lemma = row['target_lemma']
    target_pos = row['pos']
    key = (target_lemma, target_pos)

    if key not in lp_info:
def wn(self):
    if not self.WN:
        self.WN = WordNetCorpusReader(nltk.data.find(self.path))
# encoding: utf-8
from nltk.corpus.reader.wordnet import WordNetCorpusReader

wn = WordNetCorpusReader(YOUR_WORDNET_PATH, '.*')  # constructing the reader this way gives you autocompletion
print('wordnet version %s: %s' % (wn.get_version(), YOUR_WORDNET_PATH))

print('get gloss from sensekey......')
key = 'dance%1:04:00::'
lemma = wn.lemma_from_key(key)
synset = lemma.synset()
print(synset.definition())
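A small follow-on sketch, assuming the same wn reader from the snippet above, that resolves several sense keys and guards against keys missing from the loaded WordNet version; the second key is illustrative.

from nltk.corpus.reader.wordnet import WordNetError

# Map a few sense keys to glosses; WordNetError is raised for keys that the
# loaded WordNet version does not contain.
for key in ['dance%1:04:00::', 'dog%1:05:00::']:
    try:
        synset = wn.lemma_from_key(key).synset()
        print('%s -> %s: %s' % (key, synset.name(), synset.definition()))
    except WordNetError:
        print('%s -> not found in this WordNet version' % key)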
import lxml.etree as et
import math
import numpy as np
import collections
import re
import random
from bs4 import BeautifulSoup
from bs4 import NavigableString
import pickle
from utils import path
from nltk.corpus.reader.wordnet import WordNetCorpusReader
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

# download wordnet: import nltk; nltk.download("wordnet")  (see readme.txt)
_path = path.WSD_path()
wn = WordNetCorpusReader(_path.WORDNET_PATH, '.*')
print('wordnet version %s: %s' % (wn.get_version(), _path.WORDNET_PATH))

path_words_notin_vocab = '../tmp/words_notin_vocab_{}.txt'

pos_dic = {
    'ADJ': u'a',
    'ADV': u'r',
    'NOUN': u'n',
    'VERB': u'v',
}
POS_LIST = pos_dic.values()  # ['a', 'r', 'n', 'v']


def load_train_data(dataset):
    if dataset in _path.LS_DATASET:
def demo():
    # print('loading wordnet')
    # wn = WordNetCorpusReader(nltk.data.find('corpora/wordnet'))
    # print('done loading')
    # S = wn.synset
    # L = wn.lemma
    #
    # print('getting a synset for go')
    # move_synset = S('go.v.21')
    # print(move_synset.name, move_synset.pos, move_synset.lexname)
    # print(move_synset.lemma_names)
    # print(move_synset.definition)
    # print(move_synset.examples)
    #
    # zap_n = ['zap.n.01']
    # zap_v = ['zap.v.01', 'zap.v.02', 'nuke.v.01', 'microwave.v.01']
    #
    # def _get_synsets(synset_strings):
    #     return [S(synset) for synset in synset_strings]
    #
    # zap_n_synsets = _get_synsets(zap_n)
    # zap_v_synsets = _get_synsets(zap_v)
    # zap_synsets = set(zap_n_synsets + zap_v_synsets)
    #
    # print(zap_n_synsets)
    # print(zap_v_synsets)
    #
    # print("Navigations:")
    # print(S('travel.v.01').hypernyms())
    # print(S('travel.v.02').hypernyms())
    # print(S('travel.v.03').hypernyms())
    #
    # print(L('zap.v.03.nuke').derivationally_related_forms())
    # print(L('zap.v.03.atomize').derivationally_related_forms())
    # print(L('zap.v.03.atomise').derivationally_related_forms())
    # print(L('zap.v.03.zap').derivationally_related_forms())
    #
    # print(S('dog.n.01').member_holonyms())
    # print(S('dog.n.01').part_meronyms())
    #
    # print(S('breakfast.n.1').hypernyms())
    # print(S('meal.n.1').hyponyms())
    # print(S('Austen.n.1').instance_hypernyms())
    # print(S('composer.n.1').instance_hyponyms())
    #
    # print(S('faculty.n.2').member_meronyms())
    # print(S('copilot.n.1').member_holonyms())
    #
    # print(S('table.n.2').part_meronyms())
    # print(S('course.n.7').part_holonyms())
    #
    # print(S('water.n.1').substance_meronyms())
    # print(S('gin.n.1').substance_holonyms())
    #
    # print(L('leader.n.1.leader').antonyms())
    # print(L('increase.v.1.increase').antonyms())
    #
    # print(S('snore.v.1').entailments())
    # print(S('heavy.a.1').similar_tos())
    # print(S('light.a.1').attributes())
    # print(S('heavy.a.1').attributes())
    #
    # print(L('English.a.1.English').pertainyms())
    #
    # print(S('person.n.01').root_hypernyms())
    # print(S('sail.v.01').root_hypernyms())
    # print(S('fall.v.12').root_hypernyms())
    #
    # print(S('person.n.01').lowest_common_hypernyms(S('dog.n.01')))
    #
    # print(S('dog.n.01').path_similarity(S('cat.n.01')))
    # print(S('dog.n.01').lch_similarity(S('cat.n.01')))
    # print(S('dog.n.01').wup_similarity(S('cat.n.01')))
    #
    # wnic = WordNetICCorpusReader(nltk.data.find('corpora/wordnet_ic'), '.*\.dat')
    # ic = wnic.ic('ic-brown.dat')
    # print(S('dog.n.01').jcn_similarity(S('cat.n.01'), ic))
    #
    # ic = wnic.ic('ic-semcor.dat')
    # print(S('dog.n.01').lin_similarity(S('cat.n.01'), ic))
    #
    # print(S('code.n.03').topic_domains())
    # print(S('pukka.a.01').region_domains())
    # print(S('freaky.a.01').usage_domains())

    wn = WordNetCorpusReader(nltk.data.find('corpora/wordnet'))

    # word = wn.synset('street.n.01')
    #
    # print word.lemma_names
    # print word.definition
    # print word.examples
    # print wn.lemma('dog.n.01.dog').synset
    # print word.hypernyms()
    # print word.hyponyms()
    # print word.member_holonyms()
    # print word.member_meronyms()
    # print word.root_hypernyms()
    # print
    #
    # paths = word.hypernym_paths()
    # for path in paths:
    #     print simple_path(path)
    #
    # from itertools import islice
    # for synset in islice(wn.all_synsets('n'), 5):
    #     print synset, synset.hypernyms()
    # for synset in list(wn.all_synsets('n'))[:10]:
    #     print synset
    #
    # print len(list(wn.all_synsets('n')))
    # road = wn.synsets("road", pos = wn.NOUN)
    # road = wn.synset('road.n.01')
    # paths = road.hypernym_paths()
    # for path in paths:
    #     print simple_path(path)
    #
    # paths = wn.synset("street.n.01").hypernym_paths()
    # for path in paths:
    #     print simple_path(path)
    # print wn.synsets('geographic_area')
    # print_hyponyms(find_all_hyponyms(wn, wn.synset('way.n.06')))
    # print_hyponyms(find_all_hyponyms(wn, wn.synset('geological_formation.n.01')))
    # print wn.synsets('am', pos = wn.VERB)
    # print_hyponyms(find_all_hyponyms(wn, wn.synset('structure.n.01')))
    # syset = wn.synset('geographical_area.n.01').hyponyms()
    # syset = wn.synset('country.n.04').hyponyms()
    # for hyponym in syset:
    #     print hyponym
    #     print hyponym.definition
    #     print
    # print len(syset)
    # print wn.synsets("institution", pos = wn.NOUN)

    for synset in wn.synsets('go', pos=wn.VERB):
        paths = synset.hypernym_paths()
        print(synset)
        print(len(paths))
        print(synset.definition())
        for path in paths:
            print(simple_path(path))
        print()
"january", "fabruary", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december" ] #with open('data/words.txt') as f: # dictionary = f.readlines() dictionary = set(line.strip() for line in open('data/words.txt')) #dictionary = set(line.strip() for line in open('data/words2.txt')) common_names = set(line.strip() for line in open('data/common_names.txt')) common_surnames = set(line.strip() for line in open('data/common_surnames_conv3.txt')) print('loading wordnet') wn = WordNetCorpusReader(nltk.data.find('corpora/wordnet'), None) print('done loading') S = wn.synset L = wn.lemma tweetSentences = list([]) class TokenSentenceData: def __init__(self, token, tokenId): self.token = token # instance variable unique to each instance self.tokenId = tokenId def __str__(self): return self.token
from nltk.corpus.reader.wordnet import WordNetCorpusReader
from matplotlib import pyplot as plt
from matplotlib_venn import venn3_unweighted

wn = WordNetCorpusReader("./resources/WordNet-3.0/dict", None)

adjectives = {a for a in wn.all_synsets('a')}
attributes = {n for n in wn.all_synsets('n') if n.lexname() == 'noun.attribute'}
direct_attributes = {attribute
                     for adjective in adjectives
                     for attribute in adjective.attributes()}
morphologically_related = {related_lemma.synset()
                           for adjective in adjectives
                           for lemma in adjective.lemmas()
                           for related_lemma in lemma.derivationally_related_forms()
                           if related_lemma.synset().pos() == 'n'}

diagram = venn3_unweighted(
    [attributes, direct_attributes, morphologically_related],
    ['labeled as\nnoun.attribute', 'direct\nattributes', 'morphologically\nrelated nouns'])
for patch in diagram.patches:
    patch.set_edgecolor('k')
    patch.set_facecolor('w')  # remove this line for a color diagram
plt.savefig('./images/venn.pdf')