Code example #1
def get_synset(word):
    lemma, pos = word[:-2], word[-1]
    if pos == 'j':
        pos = "s"
    try:
        offsets = wn._lemma_pos_offset_map[lemma][pos]
    except KeyError:
        offsets = [syn._offset for syn in wn.synsets(lemma)]

    padding = [pad(ss) for ss in offsets]
    omw_list = [str(ss) + "-" + str(pos) for ss in padding]
    syn_list = []
    for offset in omw_list:
        # Default to a placeholder synset; keep it when of2ss merely fails to
        # resolve the offset (StopIteration / AssertionError), but skip the
        # offset entirely on WordNetError / ValueError.
        syn = wn.synset("oven.n.01")
        try:
            syn = wn.of2ss(offset)
        except (StopIteration, AssertionError):
            pass
        except (nltk.corpus.reader.wordnet.WordNetError, ValueError):
            continue
        syn_list.append(syn)

    return syn_list
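
The pad helper used above is not defined in this snippet. A minimal sketch, assuming it only zero-pads the integer offset to the eight-digit string that of2ss expects:

def pad(offset):
    # hypothetical helper: left-pad a WordNet offset with zeros to 8 digits,
    # e.g. 2084071 -> "02084071"
    return str(offset).zfill(8)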
Code example #2
File: man_clus.py Project: frankier/finn-sense-clust
def decompile(inf, out_dir):
    session = get_session()
    for lemma, grouping in gen_groupings(inf):
        with open(pjoin(out_dir, lemma), "w") as outf:
            first = True
            for group_num, synsets in grouping.items():
                if not first:
                    outf.write("\n")
                else:
                    first = False
                for synset in synsets:
                    outf.write(synset)
                    outf.write(" # ")
                    if is_wn_ref(synset):
                        sense = wordnet.of2ss(synset).definition()
                    else:
                        sense = session.execute(select([
                            word_sense.c.sense,
                        ]).select_from(joined).where(
                            (headword.c.name == lemma) &
                            (word_sense.c.sense_id == synset)
                        )).fetchone()["sense"]
                    tokens = word_tokenize(sense)
                    outf.write(" ".join(tokens))
                    outf.write("\n")
Code example #3
File: dump-synsets.py Project: frankier/FinnLink
def main():
    inf = fileinput.input()
    next(inf)
    for line in inf:
        frame, ssof = line.strip().split(",", 1)
        ss = wordnet.of2ss(ssof)
        print(frame, " ".join((l.name() for l in ss.lemmas(lang="fin"))))
Code example #4
def get_wn_ss(imagenet_id):
    """
    Transforms an imagenet id into a wordnet synset
    :param imagenet_id:
    :return:
    """
    return wn.of2ss(imagenet_id[1:] + '-' + imagenet_id[0])
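
A hedged usage sketch for the function above (assuming wn is nltk.corpus.wordnet, as in the snippet, and the WordNet 3.0 data that ships with recent NLTK):

# 'n02084071' is the ImageNet WNID for dog; under WordNet 3.0 it resolves to
# Synset('dog.n.01')
print(get_wn_ss('n02084071'))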
Code example #5
 def load_csi(self):
     with open('data/csi_data/wn_synset2csi.txt') as csi_map_f:
         for line in csi_map_f:
             elems = line.strip().split('\t')
             wn_offset, csi_labels = elems[0], elems[1:]
             # NB: lstrip removes a set of characters, not a literal prefix; it
             # works here because the offset after "wn:" starts with a digit
             wn_offset = wn_offset.lstrip('wn:')
             syn = wn.of2ss(wn_offset)
             self.map_syn2csi[syn.name()] = csi_labels[0]
Code example #6
        def ann2ss(ann):
            from stiff.munge.utils import synset_id_of_ann
            from nltk.corpus import wordnet
            from finntk.wordnet.utils import pre_id_to_post

            synset_id = pre_id_to_post(synset_id_of_ann(ann))
            # TODO: proper handling of new FinnWordNet synsets
            if synset_id[0] == "9":
                return
            return wordnet.of2ss(synset_id)
Code example #7
File: wordnet.py Project: frankier/finn-sense-clust
def get_lemma_names(ssof, wns):
    wns = list(wns)
    lemmas = []
    if "qf2" in wns:
        fi_ssof = en2fi_post(ssof)
        ss = fiwn.of2ss(fi_ssof)
        lemmas.extend(ss.lemmas())
        wns.remove("qf2")
    for wnref in wns:
        ss = wordnet.of2ss(ssof)
        lemmas.extend(ss.lemmas(lang=wnref))
    return {l.name() for l in lemmas}
Code example #8
    def bbl2wn(self, babelSynsetID):
        service_url = 'https://babelnet.io/v4/getSynset'
        params = {'id': babelSynsetID, 'key': BABEL_KEY}

        data = self.get(service_url, params)
        wnOffsets = data['wnOffsets']
        if len(wnOffsets) != 0:
            wnOffsets = data['wnOffsets'][0]['mapping']['WN_30'][0]
            sense = wn.of2ss(wnOffsets)
        else:
            sense = None

        return sense
Code example #9
def is_bird(model, img_path):
    # Load image and transform for model input
    x = load_img(img_path, target_size=(224, 224))
    x = img_to_array(x)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)

    # Run image through network and decode result
    preds = model.predict(x)
    top_pred = decode_predictions(preds,
                                  top=1)[0][0]  # (wnid, class_name, probability)

    return synset_is_bird(wn.of2ss(top_pred[0][1:] + top_pred[0][0]))
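
synset_is_bird is not shown in this example. A minimal sketch, under the assumption that it simply tests membership in the hyponym tree of bird.n.01:

from nltk.corpus import wordnet as wn

def synset_is_bird(synset):
    # hypothetical helper: True if the synset is bird.n.01 itself or one of its
    # (transitive) hyponyms, i.e. bird.n.01 appears among its hypernyms
    bird = wn.synset('bird.n.01')
    return synset == bird or bird in synset.closure(lambda s: s.hypernyms())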
Code example #10
 def pessimistic_score(synsets):
     selected_synset = None
     max_score = 0
     for synset in synsets:
         if not synset[0][0] == '8':
             # ignore synsets offsets 8.......-.
             # they are odd synsets that WordNet can't find...
             synset_name = wn.of2ss(synset[0]).name()
             # Get score from SentiWordNet
             neg_score = swn.senti_synset(synset_name).neg_score()
             if neg_score > max_score:
                 max_score = neg_score
                 selected_synset = synset_name
     if selected_synset is not None:
         return (swn.senti_synset(selected_synset).pos_score(),
                 swn.senti_synset(selected_synset).neg_score(),
                 swn.senti_synset(selected_synset).obj_score())
     else:
         return 0, 0, 0
Code example #11
def offset_to_label(wnet_offset):
    return wn.of2ss(wnet_offset.split('n')[1]+'-n')
Code example #12
File: build_graph2.py Project: dariogarcia/tiramisu
#  for c2 in range(1000):
#    if sim[c1][c2] != sim[c2][c1]:
#      print "diff " + str(labels[c1]) + " " + str(labels[c2])

#Read labels and transform into NLTK compatible
inf = open('labels.txt', 'r')
labels = list()
synsets = list()
labelsNLTK = list()
for line in inf:
  labels.append(line)
  labelsNLTK.append((line.split()[0])[1:]+'-n')

#Obtain synsets
for s1 in labelsNLTK:
  synsets.append(wn.of2ss(s1))

d_s = wn.synset('dog.n.01').closure(lambda s:s.hyponyms())
dogs = []
for d in d_s:
  dogs.append(d)

w_s = wn.synset('wheeled_vehicle.n.01').closure(lambda s:s.hyponyms())
wheeled = []
for w in w_s:
  wheeled.append(w)
wheeled.append(wn.synset('school_bus.n.01'))
wheeled.append(wn.synset('minibus.n.01'))
wheeled.append(wn.synset('trolleybus.n.01'))

a_s = wn.synset('animal.n.01').closure(lambda s:s.hyponyms())
Code example #13
labels = {hyper: hyper.name() for hyper in gg}
pos = graphviz_layout(graph)
nx.draw_networkx(graph, pos, labels=labels)
# nx.draw_networkx_labels(graph, pos, labels)
plt.show()

with open(map_clsloc) as ifs:
    classes_temp = ifs.read().strip().split('\n')

imagenet_classes = [kls.split() for kls in classes_temp]
imagenet_classes = {k: v for k, _, v in imagenet_classes}

orig_hypos = []
for wnid, label in imagenet_classes.items():
    offset = wnid.split('n')[-1]
    hypo = wn.of2ss(offset + 'n')
    orig_hypos.append(hypo)

all_hypos = set()
set_hypers = set()
hyper_to_hypo = {}
for orig_hypo in orig_hypos:
    for hyper in orig_hypo.closure(get_hypernyms, depth=1):
        set_hypers.add(hyper)
        for new_hypo in hyper.closure(get_hyponyms, depth=1):
            all_hypos.add(new_hypo)
            hyper_to_hypo.setdefault(hyper, set()).add(new_hypo)

for hyper, hypos in sorted(hyper_to_hypo.items(),
                           key=lambda x: (x[0].max_depth(), -1 * len(x[1]))):
    print(hyper.max_depth(), len(hypos), hyper.name())
Code example #14
File: db_text_2.py Project: nixidekaoya/webapi
    else:
        sim = 0
    return sim
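
Only the tail of wn_similarity (its fallback branch and return) is visible above. A minimal sketch of such a dispatcher, assuming it picks an NLTK similarity measure by name and falls back to 0 when the measure is undefined for the pair:

def wn_similarity(synset_1, synset_2, similarity="path"):
    # hypothetical reconstruction: dispatch on the measure name
    if similarity == "path":
        sim = synset_1.path_similarity(synset_2)
    elif similarity == "wup":
        sim = synset_1.wup_similarity(synset_2)
    elif similarity == "lch":
        sim = synset_1.lch_similarity(synset_2)
    else:
        sim = 0
    return sim if sim is not None else 0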


for similarity in similarity_list:
    np_matrix = np.zeros((len(wnid_list), len(wnid_list)), float)
    print(similarity)
    matrix_path = "/home/li/datasets/csv/" + str(
        similarity) + "_similarity_" + str(len(wnid_list)) + ".csv"
    for i in range(len(wnid_list)):
        if i % 100 == 0:
            print(i)
        wnid_1 = wnid_list[i]
        offset_1 = str(wnid_1[1:]) + "n"
        synset_1 = wn.of2ss(offset_1)
        for j in range(i, len(wnid_list)):
            wnid_2 = wnid_list[j]
            offset_2 = str(wnid_2[1:]) + "n"
            synset_2 = wn.of2ss(offset_2)
            # fill the upper triangle, then mirror it to keep the matrix symmetric
            np_matrix[i][j] = wn_similarity(synset_1,
                                            synset_2,
                                            similarity=similarity)
            np_matrix[j][i] = np_matrix[i][j]
    df1 = DataFrame(np_matrix, index=wnid_list, columns=wnid_list)
    df1.to_csv(matrix_path)

################### Choose Valid Synsets from Japanese wordnet
'''

japanese_wn_list = []
Code example #15
 def synid2syn(self, synid):
     return wn.of2ss(synid)
Code example #16
def add_synsets_to_sentences(sentences,
                             print_synsets=False,
                             _state_queue=None,
                             _id_process=None,
                             freeling_modules=None):
    """
    Performs a Freeling process to disambiguate words of the sentences according to their context
    (UKB algorithm) linking them to a unique synset (if possible).\n
    Our sentences are converted to Freeling Sentences before processing.\n
    Notice that even if we may have already computed the Lemmas for example, Freeling Sentences generated from our
    sentences are "raw sentences", without any analysis linked to their Words. So we make all the Freeling
    process from scratch every time, except *tokenization* and *sentence splitting*, to avoid any confusion.

    .. note:: This function should be used only inside the file_process.add_files() function.

    :param sentences: Sentences to process
    :type sentences: :obj:`list` of |Sentence|
    :param print_synsets: If True, print disambiguation results
    :type print_synsets: boolean
    :param _state_queue: (optional) queue used to report progress to the parent process
    :param _id_process: (optional) identifier of the running process, used in progress messages
    :param freeling_modules: (optional) pre-loaded (morfo, tagger, sen, wsd) Freeling modules;
        if None, they are loaded via init_freeling()
    """

    from loacore.conf import DB_TIMEOUT
    from loacore.utils.db import safe_commit, safe_execute

    freeling_sentences = [
        sentence.compute_freeling_sentence() for sentence in sentences
    ]

    if freeling_modules is None:
        if _state_queue is not None:
            _state_queue.put(
                ProcessState(_id_process, os.getpid(), "Loading Freeling...",
                             " - "))
        morfo, tagger, sen, wsd = init_freeling()
    else:
        morfo, tagger, sen, wsd = freeling_modules

    _disambiguation_state(_state_queue, _id_process)
    # perform morphosyntactic analysis and disambiguation
    processed_sentences = morfo.analyze(freeling_sentences)
    processed_sentences = tagger.analyze(processed_sentences)
    # annotate and disambiguate senses
    processed_sentences = sen.analyze(processed_sentences)
    processed_sentences = wsd.analyze(processed_sentences)

    # Copy freeling results into our Words
    for s in range(len(sentences)):
        sentence = sentences[s]

        if not len(sentence.words) == len(processed_sentences[s]):
            print("/!\\ Warning, sentence offset error in synset_process /!\\")
            print(sentence.sentence_str())
            print([w.get_form() for w in processed_sentences[s]])

        for w in range(len(sentence.words)):
            word = sentence.words[w]
            rank = processed_sentences[s][w].get_senses()
            if len(rank) > 0:
                if not rank[0][0][0] == '8':
                    # ignore synsets offsets 8.......-.
                    # they are odd synsets that WordNet can't find...
                    word.synset = Synset(None, word.id_word, rank[0][0],
                                         wn.of2ss(rank[0][0]).name(), None,
                                         None, None)
                if print_synsets:
                    print("Word : " + word.word)
                    print("Synset code : " + rank[0][0])
                    print("Synset name : " + wn.of2ss(rank[0][0]).name())

    # Add synsets to database

    conn = sql.connect(DB_PATH, timeout=DB_TIMEOUT)
    c = conn.cursor()

    sentence_count = 0
    total_sentence = len(sentences)
    for sentence in sentences:
        # Print state
        sentence_count += 1
        _commit_state(_state_queue, _id_process, sentence_count,
                      total_sentence)

        for word in sentence.words:
            synset = word.synset

            if synset is not None:
                # Add synset

                safe_execute(
                    c,
                    "INSERT INTO Synset (ID_Word, Synset_Code, Synset_Name) "
                    "VALUES (?, ?, ?)",
                    0,
                    _state_queue,
                    _id_process,
                    mark_args=(word.id_word, synset.synset_code,
                               synset.synset_name))

                # Get back id of last inserted review
                safe_execute(c, "SELECT last_insert_rowid()", 0, _state_queue,
                             _id_process)
                id_synset = c.fetchone()[0]

                # Update Word table
                safe_execute(
                    c, "UPDATE Word SET ID_Synset = " + str(id_synset) +
                    " WHERE ID_Word = " + str(word.id_word), 0, _state_queue,
                    _id_process)

    safe_commit(conn, 0, _state_queue, _id_process)

    conn.close()
Code example #17
def test_dog_is_cat():
    dog = Label('02099601-n', 'golden retriever', '')
    cat = wordnet.of2ss('02123045-n')

    assert not dog.is_a(cat)
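
Label.is_a belongs to the surrounding project and is not shown here. A minimal sketch of the behaviour the test assumes, namely a hypernym-closure check against the Label's own synset (see code example #19 for the constructor):

# hypothetical sketch of a method on Label
def is_a(self, synset):
    # a Label "is a" synset when its own synset equals it or has it as a
    # (transitive) hypernym
    return self.syn == synset or synset in self.syn.closure(lambda s: s.hypernyms())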
Code example #18
from operator import itemgetter

import nltk
from nltk.corpus import wordnet as wn

import IO

nltk.data.path.append(IO.data_source_dir+"/nltk_data")

imagenet_classes = IO.read_imagenet_wnid_words_file()
imagenet_labels = list(imagenet_classes.values())

imagenet_synsets = []
for wnid, label in imagenet_classes.items():
    offset = wnid.split('n')[-1]
    synset = wn.of2ss(offset + 'n')
    imagenet_synsets.append(synset)

p_tallies = IO.read_pixabay_tally_file(hit_limit=0, top3=True)

p_metadata = IO.read_pixabay_metadata_file()

# How many images have 3, 2, 1, and 0 labels from ImageNet?
# How many images have 3, 2, 1, and 0 words from WordNet?

id_tags_dict = {ii: meta['top3'] for ii, meta in p_metadata.items()}

num_images_with_tags_in_imagenet = {0: 0, 1: 0, 2: 0, 3: 0}
for ii, tags in id_tags_dict.items():
    jj = 0
    for tag in tags:
Code example #19
 def __init__(self, id: str, name: str, uri: str):
     self.id = id
     self.syn = wordnet.of2ss(id)
     self.name = name
     self.uri = uri
Code example #20
File: find_id.py Project: wannaphong/IsanNLP
# -*- coding: utf-8 -*-
"""
Get definition and examples from WordNet ID
"""
from nltk.corpus import wordnet as wn
word = input("WordNet ID : ")
word_wn = wn.of2ss(word.replace('-', ''))
print(word_wn.definition())
print(word_wn.examples())
print("WordNet ID : " + wn.ss2of(word_wn))
Code example #21
            result += x
    return result




exclude_ss = []

exclude_hypos_of = [
    '01326291-n',     # microorganism
    '07992450-n'      # taxonomic group
]

for synset in exclude_hypos_of:

    ss_set = extracthypos(wn.of2ss(synset))
    for ss in ss_set:
        exclude_ss.append(ss)

pwn = open('pwn_data.py', 'w+')
pwn.write("from collections import defaultdict as dd\n")
pwn.write("pwn = dd(lambda: dd())\n")

for ss in wn.all_synsets():

    if ss not in exclude_ss:
    
        pos = ss.pos()
        if pos == 's':
            pos = 'a'
Code example #22
File: utils.py Project: nicoperetti/Tesis
def wnid2synset(wnids):
    _wnid2synset = lambda id: wn.of2ss(id[1:] + id[0])
    if isinstance(wnids, (tuple, list)):
        return [_wnid2synset(id) for id in wnids]
    return _wnid2synset(wnids)
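
A hedged usage note: wnid2synset accepts either a single WNID string or a list/tuple of them (the offset below assumes WordNet 3.0):

# single WNID -> single synset; 'n02084071' maps to Synset('dog.n.01')
wnid2synset('n02084071')

# list of WNIDs -> list of synsets, in the same order
wnid2synset(['n02084071', 'n02123045'])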
Code example #23
def get_synset(imagenet_synset_id):
    return wordnet.of2ss(imagenet_synset_id[1:] + 'n')
Code example #24
File: task_half_trans.py Project: zhanshichen/git_l
        tag = tag.next_sibling.next_sibling

    #create pa_child and parent list (searching index in english-version wordnet and translating into corresponding Chinese words)
    for id in pa_child:  #for each id that has Chinese words, find its parent and child nodes
        b = id.split('-')
        english_id = b[2] + b[3]
        #format like 15028818n

        #build the parent dict
        parent[id] = []
        for name in pa_child[id]:  #push in the names
            parent[id].append(name)

        try:
            english_name = wn.of2ss(
                english_id)  #english_name looks like: Synset('isoagglutinin.n.01')
        except:
            continue  #leave both lists for this node empty: this Chinese id has words but no matching English id
        else:

            #build the pa_child dict first
            children_names = english_name.hyponyms()
            if children_names:  #has child nodes
                for child_name in children_names:
                    child_id = str(
                        child_name.offset()).zfill(8) + '-' + child_name.pos()
                    chinese_child_id = 'cmn-10-' + child_id
                    if chinese_child_id in pa_child.keys():
                        for name in pa_child[id]:
                            pa_child[id][name].append(chinese_child_id)
Code example #25
File: script_synonym.py Project: dss2016eu/codefest
	
	#keep paragraphs
	parag_act = wf.attrib['para']
	if parag_act != parag_ant:
		fo.write(bcolors.JUMP)
	
	#search words that have a synset
	expr="//term/span/target[@id='"+wf.attrib['id']+"']"
	term=tree.find(expr).getparent().getparent()
	wordsense = term.find("./externalReferences/externalRef")
	
	#if it has a sense
	if wordsense is not None:		
		ref = wordsense.attrib['reference']
		ref = ref.replace('ili-30-','')
		syn = wordnet.of2ss(ref)
		
		try:
			#find its antonym
			lema1 = syn.lemmas(lang='eng')[0]
			lemma2 = lema1.antonyms()[0]
			#get the lemma in the desired language
			synAnt = lemma2.synset()
			lemma = synAnt.lemma_names(lang1)[0]
			fo.write(colors[lang1] + lemma.encode('utf8') +  bcolors.ENDC + " ")

		except:
			#if something goes wrong, write the original word
			fo.write(wf.text.encode('utf8') + " ")
				
	else:
Code example #26
  distances_wup.append(sim3)
  distances_res.append(sim4)
  distances_jcn.append(sim5)
  distances_lin.append(sim6)
  distances_res_bnc.append(sim7)
  distances_jcn_bnc.append(sim8)
  distances_lin_bnc.append(sim9)

#Import IC calculation
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown-resnik-add1.dat')
bnc_ic = wordnet_ic.ic('ic-bnc-resnik-add1.dat')

#For each pair of synsets, compute distance
for s1 in synsets:
  syn1 = wn.of2ss(s1)
  for s2 in synsets:
    syn2 = wn.of2ss(s2)
    distances_path[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.path_similarity(syn1,syn2)
    distances_lch[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.lch_similarity(syn1,syn2)
    distances_wup[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.wup_similarity(syn1,syn2)
    distances_res[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.res_similarity(syn1,syn2,brown_ic)
    distances_jcn[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.jcn_similarity(syn1,syn2,brown_ic)
    distances_lin[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.lin_similarity(syn1,syn2,brown_ic)
    distances_res_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.res_similarity(syn1,syn2,bnc_ic)
    distances_jcn_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.jcn_similarity(syn1,syn2,bnc_ic)
    distances_lin_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.lin_similarity(syn1,syn2,bnc_ic)
    #distances_path[labelsNLTK.index(s1)][labelsNLTK.index(s2)] =1/(labelsNLTK.index(s2)+1) 
    #distances_lch[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)
    #distances_wup[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)  
    #distances_res[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)