def wordnet_get_gloss_byref(_ref):
    """Return the gloss (definition) for a WordNet sense key.

    Parameters
    ----------
    _ref : str
        WordNet sense key (e.g. ``friday%1:28:00::``).

    Returns
    -------
    str
        The synset definition; ``''`` when ``_ref`` is empty/falsy; the
        legacy sentinel ``'WN Error'`` when the key cannot be resolved
        (kept for backward compatibility with existing callers).
    """
    if not _ref:  # empty/None reference: nothing to look up
        return ''
    try:
        return wn.synset_from_sense_key(_ref).definition()
    except (AttributeError, WordNetError, ValueError):
        # Unknown or malformed key: preserve the legacy sentinel string.
        return 'WN Error'
def get_gloss_relation(self, name):
    """Return the synsets gloss-related to the named synset.

    Parameters
    ----------
    name : str
        Synset name accepted by ``wn.synset`` (e.g. ``'dog.n.01'``).

    Returns
    -------
    list
        WordNet synsets reachable through gloss relations; keys that
        fail to resolve are silently skipped.
    """
    synset = wn.synset(name)
    synset_pos = synset.pos()
    # WordNet tags satellite adjectives as 's'; the wn:<offset><pos>
    # id scheme expects plain 'a'.
    if synset_pos == "s":
        synset_pos = "a"
    synset_id = 'wn:{}{}'.format(str(synset.offset()).zfill(8), synset_pos)
    related_synset_keys = self.sense.getGlossRelatedWordNetSynsetIds(synset_id)
    related_gloss_synsets = []
    for synset_key in related_synset_keys:
        try:
            related_gloss_synsets.append(wn.synset_from_sense_key(synset_key))
        except (AttributeError, WordNetError, KeyError, ValueError):
            # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
            # are no longer swallowed. Unresolvable keys are skipped.
            continue
    return related_gloss_synsets
def get_other_senses_byref(_ref, select_name=False):
    """Return the other senses (synsets) of the word behind a sense key.

    Parameters
    ----------
    _ref : str
        WordNet sense key (e.g. ``friday%1:28:00::``).
    select_name : bool
        When True, additionally keep only synsets whose name contains
        the reference word itself.

    Returns
    -------
    list or str
        The sibling synsets (excluding the referenced one), or the
        legacy sentinel ``'WN Error'`` when the key cannot be resolved
        (kept for backward compatibility with existing callers).
    """
    try:
        ref_syn = wn.synset_from_sense_key(_ref)
        ref_word = ref_syn.name().split('.')[0]
        candidates = wn.synsets(ref_word)  # looked up once for both branches
        if not select_name:
            return [syn for syn in candidates if syn != ref_syn]
        return [syn for syn in candidates
                if syn != ref_syn and ref_word in syn.name()]
    except (AttributeError, WordNetError, KeyError, ValueError):
        return 'WN Error'
def create_hypernynm_gloss_data(config):
    """Build gloss and hypernym training samples from raw WSD files.

    Parameters
    ----------
    config : dict
        Requires 'train_raw_files' (iterable of file names),
        'train_raw_dir' (directory path) and 'oversample_ratio' (int).
        Each file is tab-separated with a header row and columns
        target_id, label, sentence, gloss, sense_key.

    Returns
    -------
    tuple
        ``(gloss_data, hyp_data, triplet_data)`` — two lists of
        ``[target_id, label, sentence, gloss, sense_key]`` rows and a
        dict mapping each sentence to its TripletSample.
    """
    train_files = config['train_raw_files']
    gloss_data, hyp_data = [], []
    oversample_ratio = config['oversample_ratio']
    triplet_data = {}
    for file in train_files:
        # `with` guarantees the handle is closed even on an exception
        # (the original only closed it on the happy path), and streaming
        # the file avoids materializing it via readlines().
        with open(os.path.join(config['train_raw_dir'], file), 'r',
                  encoding='utf8') as train_file:
            next(train_file, None)  # skip the header row
            for line in train_file:
                info = line.strip().split('\t')
                target_id, label, sentence, gloss, sense_key = info[:5]
                gloss_data.append(
                    [target_id, label, sentence, gloss, sense_key])
                if sentence in triplet_data:
                    triplet_obj = triplet_data[sentence]
                else:
                    triplet_obj = TripletSample(sentence)
                    triplet_data[sentence] = triplet_obj
                if label == '1':
                    triplet_obj.positives.add(gloss)
                    # Oversample positive examples to balance the data.
                    for _ in range(oversample_ratio - 1):
                        gloss_data.append(
                            [target_id, label, sentence, gloss, sense_key])
                else:
                    triplet_obj.negatives.add(gloss)
                try:
                    wn_synset = wn.synset_from_sense_key(sense_key)
                except (AttributeError, WordNetError, KeyError, ValueError):
                    # Was a bare `except:`; unresolvable keys are skipped.
                    continue
                for hyp in wn_synset.hypernyms():
                    defs = hyp.definition()
                    word = hyp.lemmas()[0].name().replace('_', ' ')
                    new_gloss = word + ' : ' + defs
                    hyp_data.append(
                        [target_id, label, sentence, new_gloss, sense_key])
                    if label == '1':
                        for _ in range(oversample_ratio - 1):
                            hyp_data.append([
                                target_id, label, sentence, new_gloss,
                                sense_key
                            ])
    return gloss_data, hyp_data, triplet_data
def accuracy(y, y_pred, by_synset=False):
    """Fraction of instances whose predicted sense matches a gold sense.

    Parameters
    ----------
    y : dict
        Maps instance id -> list of gold sense keys.
    y_pred : dict
        Maps instance id -> predicted synset (an object exposing
        ``.lemmas()[0].key()``).
    by_synset : bool
        When True, convert gold sense keys to synsets before comparing.

    Returns
    -------
    float
        Accuracy in [0, 1]; 0.0 for an empty gold dict (the original
        raised ZeroDivisionError).
    """
    n = len(y)
    if n == 0:  # guard: no gold labels means nothing can be correct
        return 0.0
    k = 0
    for key in y:  # a dict iterates its keys directly
        try:
            senses = [
                wn.synset_from_sense_key(sense) if by_synset else sense
                for sense in y[key]
            ]
            if y_pred[key].lemmas()[0].key() in senses:
                k += 1
        except WordNetError:
            # Some gold keys (e.g. "budget") are missing from this
            # WordNet version; they simply count as wrong.
            pass
    return k / n
def get_lemma_classifiers(instances, keys, classifier=MultinomialNB,
                          binary=False, idf=True):
    '''Train one sense classifier per lemma from annotated instances.

    :type instances: dict of WSDInstance
    :type keys: dict of list
    :param classifier: sklearn classifier class instantiated per lemma
    :param binary: forwarded to TfidfVectorizer
    :param idf: forwarded to TfidfVectorizer as ``use_idf``
    :returns: dict mapping lemma -> {'vct': vectorizer, 'clf': classifier};
        lemmas with fewer than two distinct senses are omitted.
    '''
    classifiers = {}
    data = {}
    for k, inst in instances.items():
        if inst.lemma not in data:  # membership test on the dict itself
            data[inst.lemma] = [[], []]  # [[context lists], [sense labels]]
        try:
            for sense in keys[k]:
                # Resolve the label BEFORE appending anything: the original
                # appended the context first, so a WordNetError mid-loop
                # left the context/label lists misaligned for this lemma.
                label = wn.synset_from_sense_key(sense).name()
                data[inst.lemma][0].append(inst.context)
                data[inst.lemma][1].append(label)
        except WordNetError:
            # Some keys (e.g. "budget") are absent from this WordNet
            # version; their senses contribute no training rows.
            pass
    for lemma, (contexts, labels) in data.items():
        if not labels:
            continue  # every key for this lemma failed to resolve
        vctr = TfidfVectorizer(lowercase=False,
                               preprocessor=lambda x: x,
                               tokenizer=lambda x: x,
                               binary=binary,
                               use_idf=idf)
        x = vctr.fit_transform(contexts)
        clfr = classifier()
        clfr.fit(x.toarray(), labels)
        # A single-class classifier cannot discriminate between senses;
        # skip it rather than emit a degenerate model.
        if len(clfr.classes_) > 1:
            classifiers[lemma] = {'vct': vctr, 'clf': clfr}
    return classifiers
def get_synset_id_from_sense_key(sense_key):
    """Resolve a WordNet sense key to its synset name (e.g. ``'dog.n.01'``)."""
    synset = wn.synset_from_sense_key(sense_key)
    return synset.name()
from nltk.corpus import wordnet as wn

dataset = 'ALL'

if __name__ == "__main__":
    # Filter the gold key file down to noun senses, rewriting each
    # sense key as its synset name.
    base = f'./data/mono/evaluation/{dataset}'
    with open(f'{base}/{dataset}.gold.key.txt') as src, \
            open(f'{base}/{dataset}_n.gold.key.txt', 'w') as out:
        # `with` closes both files (the original never closed the input).
        for line in src:
            # `id` renamed: the original shadowed the builtin.
            instance_id, sense_key = line.rstrip('\n').split(' ')[:2]
            syn = wn.synset_from_sense_key(sense_key)
            if syn.pos() != 'n':
                continue
            out.write(f"{instance_id} {syn.name()}\n")