import nltk
from nltk.corpus import wordnet as wn


def get_synset(word):
    # The last character carries the POS tag; everything before the separator is the lemma.
    lemma, pos = word[:-2], word[-1]
    if pos == 'j':
        pos = "s"
    try:
        offsets = wn._lemma_pos_offset_map[lemma][pos]
    except KeyError:
        offsets = [syn._offset for syn in wn.synsets(lemma)]
    padding = [pad(ss) for ss in offsets]  # pad(): helper from the surrounding file that zero-pads offsets for of2ss()
    omw_list = [str(ss) + "-" + str(pos) for ss in padding]
    syn_list = []
    for offset in omw_list:
        syn = wn.synset("oven.n.01")  # fallback synset used when the offset cannot be resolved
        try:
            syn = wn.of2ss(offset)
        except (StopIteration, AssertionError):
            pass
        except (nltk.corpus.reader.wordnet.WordNetError, ValueError):
            continue
        finally:
            # the finally clause runs even on continue, so every offset appends a synset
            syn_list.append(syn)
    return syn_list
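# A minimal usage sketch for get_synset above (not from the original source). It assumes the
# input packs lemma and POS into one string such as "dog_n", and it supplies a hypothetical
# pad() standing in for the original helper so the sketch runs on its own (of2ss() expects
# 8-digit, zero-padded offsets).
def pad(offset):
    return str(offset).zfill(8)


if __name__ == "__main__":
    for syn in get_synset("dog_n"):
        print(syn, "-", syn.definition())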
def decompile(inf, out_dir):
    session = get_session()
    for lemma, grouping in gen_groupings(inf):
        with open(pjoin(out_dir, lemma), "w") as outf:
            first = True
            for group_num, synsets in grouping.items():
                if not first:
                    outf.write("\n")
                else:
                    first = False
                for synset in synsets:
                    outf.write(synset)
                    outf.write(" # ")
                    if is_wn_ref(synset):
                        sense = wordnet.of2ss(synset).definition()
                    else:
                        sense = session.execute(select([
                            word_sense.c.sense,
                        ]).select_from(joined).where(
                            (headword.c.name == lemma) &
                            (word_sense.c.sense_id == synset)
                        )).fetchone()["sense"]
                    tokens = word_tokenize(sense)
                    outf.write(" ".join(tokens))
                    outf.write("\n")
def main():
    inf = fileinput.input()
    next(inf)  # skip the header line
    for line in inf:
        frame, ssof = line.strip().split(",", 1)
        ss = wordnet.of2ss(ssof)
        print(frame, " ".join(l.name() for l in ss.lemmas(lang="fin")))
from nltk.corpus import wordnet as wn


def get_wn_ss(imagenet_id):
    """
    Transforms an ImageNet id into a WordNet synset.

    :param imagenet_id: ImageNet id, i.e. a POS letter followed by the 8-digit WordNet offset
    :return: the corresponding WordNet synset
    """
    return wn.of2ss(imagenet_id[1:] + '-' + imagenet_id[0])
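# Hedged usage sketch (not from the original source): ImageNet wnids are assumed to be WordNet
# 3.0 offsets prefixed with a POS letter, so 'n02084071' becomes '02084071-n' for of2ss().
if __name__ == "__main__":
    ss = get_wn_ss('n02084071')
    print(ss, "-", ss.definition())  # should print the dog synset and its gloss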
def load_csi(self):
    with open('data/csi_data/wn_synset2csi.txt') as csi_map_f:
        for line in csi_map_f:
            elems = line.strip().split('\t')
            wn_offset, csi_labels = elems[0], elems[1:]
            wn_offset = wn_offset.lstrip('wn:')  # drop the leading 'wn:' prefix
            syn = wn.of2ss(wn_offset)
            self.map_syn2csi[syn.name()] = csi_labels[0]
def ann2ss(ann):
    from stiff.munge.utils import synset_id_of_ann
    from nltk.corpus import wordnet
    from finntk.wordnet.utils import pre_id_to_post

    synset_id = pre_id_to_post(synset_id_of_ann(ann))
    # TODO: proper handling of new FinnWordNet synsets
    if synset_id[0] == "9":
        return
    return wordnet.of2ss(synset_id)
def get_lemma_names(ssof, wns):
    wns = list(wns)
    lemmas = []
    if "qf2" in wns:
        fi_ssof = en2fi_post(ssof)
        ss = fiwn.of2ss(fi_ssof)
        lemmas.extend(ss.lemmas())
        wns.remove("qf2")
    for wnref in wns:
        ss = wordnet.of2ss(ssof)
        lemmas.extend(ss.lemmas(lang=wnref))
    return {l.name() for l in lemmas}
def bbl2wn(self, babelSynsetID):
    service_url = 'https://babelnet.io/v4/getSynset'
    params = {'id': babelSynsetID, 'key': BABEL_KEY}
    data = self.get(service_url, params)
    wnOffsets = data['wnOffsets']
    if len(wnOffsets) != 0:
        # take the first WordNet 3.0 offset in the mapping
        wnOffsets = data['wnOffsets'][0]['mapping']['WN_30'][0]
        sense = wn.of2ss(wnOffsets)
    else:
        sense = None
    return sense
def is_bird(model, img_path):
    # Load image and transform for model input
    x = load_img(img_path, target_size=(224, 224))
    x = img_to_array(x)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)

    # Run image through network and decode result
    preds = model.predict(x)
    top_pred = decode_predictions(preds, top=1)[0][0]  # (offset_id, name, prob)
    return synset_is_bird(wn.of2ss(top_pred[0][1:] + top_pred[0][0]))
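# Hedged usage sketch for is_bird (not from the original source): it assumes the standard Keras
# ImageNet helpers provide load_img, preprocess_input and decode_predictions, and it supplies a
# hypothetical synset_is_bird() standing in for the one defined elsewhere in the original file.
import numpy as np
from nltk.corpus import wordnet as wn
from keras.preprocessing.image import load_img, img_to_array
from keras.applications.resnet50 import ResNet50, preprocess_input, decode_predictions


def synset_is_bird(ss):
    # hypothetical stand-in: is bird.n.01 in the synset's hypernym closure?
    return wn.synset('bird.n.01') in ss.closure(lambda s: s.hypernyms())


if __name__ == "__main__":
    print(is_bird(ResNet50(weights='imagenet'), 'some_photo.jpg'))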
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn


def pessimistic_score(synsets):
    selected_synset = None
    max_score = 0
    for synset in synsets:
        if not synset[0][0] == '8':
            # ignore synset offsets 8.......-. :
            # they are odd synsets that WordNet can't find
            synset_name = wn.of2ss(synset[0]).name()
            # Get score from SentiWordNet
            neg_score = swn.senti_synset(synset_name).neg_score()
            if neg_score > max_score:
                max_score = neg_score
                selected_synset = synset_name
    if selected_synset is not None:
        # NOTE: the third element repeats pos_score(); obj_score() may have been intended
        return (swn.senti_synset(selected_synset).pos_score(),
                swn.senti_synset(selected_synset).neg_score(),
                swn.senti_synset(selected_synset).pos_score())
    else:
        return 0, 0, 0
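# Hedged usage sketch for pessimistic_score (not from the original source): the argument is
# assumed to be a Freeling-style sense ranking, i.e. a list of (offset, weight) pairs; the
# weights below are made up, and only the offsets are actually used.
if __name__ == "__main__":
    ranked = [(wn.ss2of(ss), 1.0) for ss in wn.synsets('disaster', pos='n')[:2]]
    print(pessimistic_score(ranked))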
def offset_to_label(wnet_offset):
    return wn.of2ss(wnet_offset.split('n')[1] + '-n')
# for c2 in range(1000):
#     if sim[c1][c2] != sim[c2][c1]:
#         print "diff " + str(labels[c1]) + " " + str(labels[c2])

# Read labels and transform into NLTK-compatible offsets
inf = open('labels.txt', 'r')
labels = list()
synsets = list()
labelsNLTK = list()
for line in inf:
    labels.append(line)
    labelsNLTK.append((line.split()[0])[1:] + '-n')

# Obtain synsets
for s1 in labelsNLTK:
    synsets.append(wn.of2ss(s1))

d_s = wn.synset('dog.n.01').closure(lambda s: s.hyponyms())
dogs = []
for d in d_s:
    dogs.append(d)

w_s = wn.synset('wheeled_vehicle.n.01').closure(lambda s: s.hyponyms())
wheeled = []
for w in w_s:
    wheeled.append(w)
wheeled.append(wn.synset('school_bus.n.01'))
wheeled.append(wn.synset('minibus.n.01'))
wheeled.append(wn.synset('trolleybus.n.01'))

a_s = wn.synset('animal.n.01').closure(lambda s: s.hyponyms())
labels = {hyper: hyper.name() for hyper in gg}
pos = graphviz_layout(graph)
nx.draw_networkx(graph, pos, labels=labels)
# nx.draw_networkx_labels(graph, pos, labels)
plt.show()

with open(map_clsloc) as ifs:
    classes_temp = ifs.read().strip().split('\n')
imagenet_classes = [kls.split() for kls in classes_temp]
imagenet_classes = {k: v for k, _, v in imagenet_classes}

orig_hypos = []
for wnid, label in imagenet_classes.items():
    offset = wnid.split('n')[-1]
    hypo = wn.of2ss(offset + 'n')
    orig_hypos.append(hypo)

all_hypos = set()
set_hypers = set()
hyper_to_hypo = {}
for orig_hypo in orig_hypos:
    for hyper in orig_hypo.closure(get_hypernyms, depth=1):
        set_hypers.add(hyper)
        for new_hypo in hyper.closure(get_hyponyms, depth=1):
            all_hypos.add(new_hypo)
            hyper_to_hypo.setdefault(hyper, set()).add(new_hypo)

for hyper, hypos in sorted(hyper_to_hypo.items(),
                           key=lambda x: (x[0].max_depth(), -1 * len(x[1]))):
    print(hyper.max_depth(), len(hypos), hyper.name())
    else:
        sim = 0
    return sim


for similarity in similarity_list:
    np_matrix = np.zeros((len(wnid_list), len(wnid_list)), float)
    print(similarity)
    matrix_path = "/home/li/datasets/csv/" + str(similarity) + "_similarity_" + str(len(wnid_list)) + ".csv"
    for i in range(len(wnid_list)):
        if i % 100 == 0:
            print(i)
        wnid_1 = wnid_list[i]
        offset_1 = str(wnid_1[1:]) + "n"
        synset_1 = wn.of2ss(offset_1)
        for j in range(len(wnid_list) - i):
            wnid_2 = wnid_list[j + i]
            offset_2 = str(wnid_2[1:]) + "n"
            synset_2 = wn.of2ss(offset_2)
            # fill both symmetric cells, aligned with wnid_list indices i and j + i
            np_matrix[i][j + i] = wn_similarity(synset_1, synset_2, similarity=similarity)
            np_matrix[j + i][i] = np_matrix[i][j + i]
    df1 = DataFrame(np_matrix, index=wnid_list, columns=wnid_list)
    df1.to_csv(matrix_path)


# ################## Choose Valid Synsets from Japanese wordnet
'''
japanese_wn_list = []
def synid2syn(self, synid):
    return wn.of2ss(synid)
def add_synsets_to_sentences(sentences, print_synsets=False,
                             _state_queue=None, _id_process=None, freeling_modules=None):
    """
    Performs a Freeling process to disambiguate words of the sentences according to their
    context (UKB algorithm), linking them to a unique synset (if possible).\n
    Our sentences are converted to Freeling Sentences before processing.\n
    Notice that even if we may have already computed the Lemmas, for example, Freeling Sentences
    generated from our sentences are "raw sentences", without any analysis linked to their Words.
    So we redo the whole Freeling process from scratch every time, except *tokenization* and
    *sentence splitting*, to avoid any confusion.

    .. note:: This function should be used only inside the file_process.add_files() function.

    :param sentences: Sentences to process
    :type sentences: :obj:`list` of |Sentence|
    :param print_synsets: If True, print disambiguation results
    :type print_synsets: boolean
    """
    from loacore.conf import DB_TIMEOUT
    from loacore.utils.db import safe_commit, safe_execute

    freeling_sentences = [sentence.compute_freeling_sentence() for sentence in sentences]

    if freeling_modules is None:
        if _state_queue is not None:
            _state_queue.put(ProcessState(_id_process, os.getpid(), "Loading Freeling...", " - "))
        morfo, tagger, sen, wsd = init_freeling()
    else:
        morfo, tagger, sen, wsd = freeling_modules

    _disambiguation_state(_state_queue, _id_process)

    # Perform morphosyntactic analysis and disambiguation
    processed_sentences = morfo.analyze(freeling_sentences)
    processed_sentences = tagger.analyze(processed_sentences)
    # Annotate and disambiguate senses
    processed_sentences = sen.analyze(processed_sentences)
    processed_sentences = wsd.analyze(processed_sentences)

    # Copy Freeling results into our Words
    for s in range(len(sentences)):
        sentence = sentences[s]
        if not len(sentence.words) == len(processed_sentences[s]):
            print("/!\\ Warning, sentence offset error in synset_process /!\\")
            print(sentence.sentence_str())
            print([w.get_form() for w in processed_sentences[s]])
        for w in range(len(sentence.words)):
            word = sentence.words[w]
            rank = processed_sentences[s][w].get_senses()
            if len(rank) > 0:
                if not rank[0][0][0] == '8':
                    # ignore synset offsets 8.......-. :
                    # they are odd synsets that WordNet can't find
                    word.synset = Synset(None, word.id_word, rank[0][0],
                                         wn.of2ss(rank[0][0]).name(), None, None, None)
                    if print_synsets:
                        print("Word : " + word.word)
                        print("Synset code : " + rank[0][0])
                        print("Synset name : " + wn.of2ss(rank[0][0]).name())

    # Add synsets to database
    conn = sql.connect(DB_PATH, timeout=DB_TIMEOUT)
    c = conn.cursor()
    sentence_count = 0
    total_sentence = len(sentences)
    for sentence in sentences:
        # Print state
        sentence_count += 1
        _commit_state(_state_queue, _id_process, sentence_count, total_sentence)
        for word in sentence.words:
            synset = word.synset
            if synset is not None:
                # Add synset
                safe_execute(
                    c,
                    "INSERT INTO Synset (ID_Word, Synset_Code, Synset_Name) VALUES (?, ?, ?)",
                    0, _state_queue, _id_process,
                    mark_args=(word.id_word, synset.synset_code, synset.synset_name))

                # Get back id of the last inserted row
                safe_execute(c, "SELECT last_insert_rowid()", 0, _state_queue, _id_process)
                id_synset = c.fetchone()[0]

                # Update Word table
                safe_execute(
                    c,
                    "UPDATE Word SET ID_Synset = " + str(id_synset) +
                    " WHERE ID_Word = " + str(word.id_word),
                    0, _state_queue, _id_process)

    safe_commit(conn, 0, _state_queue, _id_process)
    conn.close()
def test_dog_is_cat():
    dog = Label('02099601-n', 'golden retriever', '')
    cat = wordnet.of2ss('02123045-n')
    assert not dog.is_a(cat)
from operator import itemgetter

import nltk
from nltk.corpus import wordnet as wn

import IO

nltk.data.path.append(IO.data_source_dir + "/nltk_data")

imagenet_classes = IO.read_imagenet_wnid_words_file()
imagenet_labels = list(imagenet_classes.values())
imagenet_synsets = []
for wnid, label in imagenet_classes.items():
    offset = wnid.split('n')[-1]
    synset = wn.of2ss(offset + 'n')
    imagenet_synsets.append(synset)

p_tallies = IO.read_pixabay_tally_file(hit_limit=0, top3=True)
p_metadata = IO.read_pixabay_metadata_file()

# How many images have 3, 2, 1, and 0 labels from ImageNet?
# How many images have 3, 2, 1, and 0 words from WordNet?
id_tags_dict = {ii: meta['top3'] for ii, meta in p_metadata.items()}
num_images_with_tags_in_imagenet = {0: 0, 1: 0, 2: 0, 3: 0}
for ii, tags in id_tags_dict.items():
    jj = 0
    for tag in tags:
def __init__(self, id: str, name: str, uri: str):
    self.id = id
    self.syn = wordnet.of2ss(id)
    self.name = name
    self.uri = uri
# -*- coding: utf-8 -*-
"""
Get definition and examples from WordNet ID
"""
from nltk.corpus import wordnet as wn

word = input("WordNet ID : ")
word_wn = wn.of2ss(word.replace('-', ''))
print(word_wn.definition())
print(word_wn.examples())
print("WordNet ID : " + wn.ss2of(word_wn))
    result += x
    return result


exclude_ss = []
exclude_hypos_of = [
    '01326291-n',  # microorganism
    '07992450-n',  # taxonomic group
]
for synset in exclude_hypos_of:
    ss_set = extracthypos(wn.of2ss(synset))
    for ss in ss_set:
        exclude_ss.append(ss)

pwn = open('pwn_data.py', 'w+')
pwn.write("from collections import defaultdict as dd\n")
pwn.write("pwn = dd(lambda: dd())\n")
for ss in wn.all_synsets():
    if ss not in exclude_ss:
        pos = ss.pos()
        if pos == 's':
            pos = 'a'
from nltk.corpus import wordnet as wn


def wnid2synset(wnids):
    # an ImageNet wnid 'nXXXXXXXX' is rearranged into the 'XXXXXXXXn' form that of2ss() accepts
    _wnid2synset = lambda id: wn.of2ss(id[1:] + id[0])
    if isinstance(wnids, (tuple, list)):
        return [_wnid2synset(id) for id in wnids]
    return _wnid2synset(wnids)
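# Hedged usage sketch (not from the original source); the wnids below are example ImageNet ids
# and, assuming WordNet 3.0 offsets (which ImageNet uses), the first should resolve to the dog synset.
if __name__ == "__main__":
    print(wnid2synset('n02084071'))                  # a single wnid -> one synset
    print(wnid2synset(['n02084071', 'n02121620']))   # a list of wnids -> a list of synsets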
from nltk.corpus import wordnet


def get_synset(imagenet_synset_id):
    return wordnet.of2ss(imagenet_synset_id[1:] + 'n')
    tag = tag.next_sibling.next_sibling

# Build the pa_child and parent lists by converting the English words
# (search the index in the English-version WordNet and translate it into the corresponding Chinese words)
for id in pa_child:
    # for every id that has Chinese words, look for its parent and child nodes
    b = id.split('-')
    english_id = b[2] + b[3]  # format: 15028818n
    # build the parent dict
    parent[id] = []
    for name in pa_child[id]:
        # push the names in
        parent[id].append(name)
    try:
        english_name = wn.of2ss(english_id)  # english_name format: Synset('isoagglutinin.n.01')
    except:
        # leave both lists of this node empty: this Chinese id has no matching English id,
        # even though the Chinese id does have corresponding words
        continue
    else:
        # first build the pa_child dict
        children_names = english_name.hyponyms()
        if children_names:
            # the node has child nodes
            for child_name in children_names:
                child_id = str(child_name.offset()).zfill(8) + '-' + child_name.pos()
                chinese_child_id = 'cmn-10-' + child_id
                if chinese_child_id in pa_child.keys():
                    for name in pa_child[id]:
                        pa_child[id][name].append(chinese_child_id)
# keep paragraphs
parag_act = wf.attrib['para']
if parag_act != parag_ant:
    fo.write(bcolors.JUMP)

# search words that have a synset
expr = "//term/span/target[@id='" + wf.attrib['id'] + "']"
term = tree.find(expr).getparent().getparent()
wordsense = term.find("./externalReferences/externalRef")
# if it has a sense
if wordsense is not None:
    ref = wordsense.attrib['reference']
    ref = ref.replace('ili-30-', '')
    syn = wordnet.of2ss(ref)
    try:
        # find its antonym
        lema1 = syn.lemmas(lang='eng')[0]
        lemma2 = lema1.antonyms()[0]
        # get the lemma in the desired language
        synAnt = lemma2.synset()
        lemma = synAnt.lemma_names(lang1)[0]
        fo.write(colors[lang1] + lemma.encode('utf8') + bcolors.ENDC + " ")
    except:
        # if something goes wrong, write the original word
        fo.write(wf.text.encode('utf8') + " ")
else:
    distances_wup.append(sim3)
    distances_res.append(sim4)
    distances_jcn.append(sim5)
    distances_lin.append(sim6)
    distances_res_bnc.append(sim7)
    distances_jcn_bnc.append(sim8)
    distances_lin_bnc.append(sim9)

# Import IC calculation
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown-resnik-add1.dat')
bnc_ic = wordnet_ic.ic('ic-bnc-resnik-add1.dat')

# For each pair of synsets, compute distance
for s1 in synsets:
    syn1 = wn.of2ss(s1)
    for s2 in synsets:
        syn2 = wn.of2ss(s2)
        distances_path[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.path_similarity(syn1, syn2)
        distances_lch[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.lch_similarity(syn1, syn2)
        distances_wup[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.wup_similarity(syn1, syn2)
        distances_res[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.res_similarity(syn1, syn2, brown_ic)
        distances_jcn[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.jcn_similarity(syn1, syn2, brown_ic)
        distances_lin[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.lin_similarity(syn1, syn2, brown_ic)
        distances_res_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.res_similarity(syn1, syn2, bnc_ic)
        distances_jcn_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.jcn_similarity(syn1, syn2, bnc_ic)
        distances_lin_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.lin_similarity(syn1, syn2, bnc_ic)
        # distances_path[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)
        # distances_lch[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)
        # distances_wup[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)
        # distances_res[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)