Example #1
def process():
    # Install Open Multilingual Wordnet if not already installed.
    nltkd = nltk.downloader.Downloader()
    if not nltkd.is_installed('omw'):
        nltk.download('omw')

    # Figure out ISO 639-2 code for specified locale. Exit if unavailable.
    print args.language
    iso639_2 = langcodes.best_match(args.language, wn.langs())[0]
    print iso639_2
    print wn.langs()
    if iso639_2 == 'und': # Nearest ISO 639-2 code is undefined.
        exit("Requested language is not available on this NLTK Wordnet installation.")

    # Obtain set of lowercased lemmas that belong to only one part of speech.
    posdb = dict()
    single_pos_lemmas = set()
    for pos in ['a', 'r', 'n', 'v']:
        posdb[pos] = set()
        # Note: wn.all_lemma_names() returns the lemma names in all lowercase.
        # To remove lemmas that are sometimes or always capitalised in normal
        # writing (e.g. "China" or "Arulo"), we will need to obtain capitalised
        # lemmas from Wordnet later on, and remove members of our output set
        # that are identical to the lowercased transformation of those
        # capitalised lemmas.
        for lemma in wn.all_lemma_names(pos=pos, lang=iso639_2):
            posdb[pos].add(lemma)
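        # Note: symmetric_difference_update() keeps lemmas found in an odd
        # number of the POS sets processed so far, so after all four passes
        # the set holds lemmas appearing under exactly one or exactly three
        # POS tags, an approximation of "only one part of speech".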
        single_pos_lemmas.symmetric_difference_update(posdb[pos])

    # Remove lemmas containing characters other than a-z.
    output_set = set()
    for term in single_pos_lemmas:
        if non_word.search(term) is not None:
            continue
        output_set.add(term)

    # Obtain a set of lemmas that are typically capitalised in normal writing.
    unlowered_lemmas = set()
    for synset in list(wn.all_synsets()):
        for lemma in synset.lemma_names():
            unlowered_lemmas.add(lemma)
    # As described above, remove words whose capitalised form appears in
    # Wordnet, since only their lowercased spellings made it into output_set.
    capitalised_lowered = set(
        lemma.lower() for lemma in unlowered_lemmas if lemma != lemma.lower())
    output_set = set(word for word in output_set
                     if word not in capitalised_lowered)
    # Filter inspiration: http://stackoverflow.com/a/16562558
    output_set = filter(lambda x:len(x) > 4 and len(x) < 7, output_set)
    names_lowered = set()
    for name in nltk.corpus.names.words():
        names_lowered.add(name.lower())
    output_set = filter(lambda x: x not in names_lowered, output_set)
    print output_set
    # print single_pos_lemmas
    print len(single_pos_lemmas)
    print len(output_set)
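Example #1 refers to several names defined elsewhere in its script (args, non_word, and the nltk/langcodes/wordnet imports). A minimal sketch of what those surrounding definitions might look like, offered purely as assumptions for context:

# Plausible surrounding definitions for Example #1 (assumptions, not the original code).
import argparse
import re
import nltk
import langcodes
from nltk.corpus import wordnet as wn

parser = argparse.ArgumentParser()
parser.add_argument('language', help="Locale or language name, e.g. 'en' or 'French'")
args = parser.parse_args()

# Matches any character outside a-z, used to filter out non-word lemmas.
non_word = re.compile(r'[^a-z]')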
Example #2
def assert_lang_supported_by_wordnet(lang):
    lang = iso_639_alpha3(lang)  # wordnet needs iso-639-2
    if lang in wn.langs():
        return True
    raise ValueError(
        f"Language '{lang}' not implemented in WordNet. Implemented languages are : {sorted(wn.langs())}"
    )
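A brief, hypothetical usage sketch of this helper, assuming iso_639_alpha3 converts a two-letter code such as 'en' into its three-letter form 'eng':

assert_lang_supported_by_wordnet("en")       # True: 'eng' is in wn.langs()
try:
    assert_lang_supported_by_wordnet("tlh")  # not in wn.langs(): raises ValueError
except ValueError as err:
    print(err)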
Example #3
 def __init__(self, folder, lang):
     self.folder = folder
     if lang in wn.langs():
         self.lang = lang
     else:
         print("language: '%s' is not supported, try another language" %
               lang)
     #initialize
     self.WordCounter = {}
     self.SynsetCounter = {}
     self.pos_list = ['a', 's', 'r', 'n', 'v']
Example #4
 def __init__(self, file_name, folder, lang):
     self.file_name = file_name
     self.folder = folder
     if lang in wn.langs():
         self.lang = lang
     else:
         print("language: '%s' is not supported, try another language" % lang)
     #initialize
     self.WordIndex = {}
     self.SynsetIndex = {}
     self.pos_list = ['a', 's', 'r', 'n', 'v']
     self.Shared = util.Shared()
Example #5
 def __init__(self, file_name, folder, lang):
     self.file_name = file_name  #where word2vec model exists
     self.folder = folder
     if lang in wn.langs():
         self.lang = lang
     else:
         print("language: '%s' is not supported, try another language" %
               lang)
     #initialize
     self.WordGenerality = {}
     self.pos_list = ['a', 's', 'r', 'n', 'v']
     self.model = word2vec.Word2Vec.load(self.file_name)
Example #6
 def all_langs(self, part1_format=False):
     """
     $ python -m sagas.nlu.wordnet_procs all_langs
     :return:
     """
     from sagas.nlu.locales import iso_locales
     langs = wn.langs()
     if part1_format:
         print(', '.join(sorted(iso_locales.iso_map.keys())))
     else:
         print('total', len(langs))
         print(', '.join(sorted(langs)))
         print(', '.join(sorted(iso_locales.iso_map.keys())))
Example #7
 def __init__(self, file_name, folder, lang):
     self.file_name = file_name
     self.folder = folder
     if lang in wn.langs():
         self.lang = lang
     else:
         print("language: '%s' is not supported, try another language" % lang)
     #initialize
     self.WordIndex = {}
     self.SynsetIndex = {}
     self.pos_list = ['a', 's', 'r', 'n', 'v']
     self.pointer_map = {"@":"hypernym", "&":"similar", "$":"verbGroup", "!":"antonym"}
     self.Shared = util.Shared()
Example #8
def langs():
    """
        This function returns a list of ISO 639 language codes.

        :return: ISO-639 language codes
        :rtype: list[str]

        :Example:
            >>> from pythainlp.corpus.wordnet import langs
            >>> langs()
            ['eng', 'als', 'arb', 'bul', 'cat', 'cmn', 'dan',
             'ell', 'eus', 'fas', 'fin', 'fra', 'glg', 'heb',
             'hrv', 'ind', 'ita', 'jpn', 'nld', 'nno', 'nob',
             'pol', 'por', 'qcn', 'slv', 'spa', 'swe', 'tha',
             'zsm']
    """
    return wordnet.langs()
Example #9
    def nltk_locales(self):
        """
        $ python -m sagas.nlu.locales nltk_locales
        :return:
        """
        from nltk.corpus import wordnet as wn
        from iso639 import languages
        import sagas
        langs = wn.langs()
        print(len(langs), sorted(langs))
        rs = []
        excepts = ['qcn']
        for lang in langs:
            if lang not in excepts:
                loc = languages.get(part3=lang)
                rs.append((loc.part3, loc.macro, loc.name))

        df=sagas.to_df(rs, ['code', 'micro', 'name'])
        sagas.print_df(df)
Example #10
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize

multi_lang = sorted(wn.langs())

# import _lang_sel.py

usr_in = input('enter one word here: ')

tokenized = sent_tokenize(usr_in)
for i in tokenized[:2]:
    words = nltk.word_tokenize(i)
    tagged = nltk.pos_tag(words)
    print("NLTK Tag = ", tagged)

tagged = str(tagged)
char1 = ","
char2 = ")"
tag_str = tagged[tagged.find(char1) + 3:tagged.find(char2) - 1]
print("pos_tag = ", tag_str)
"""
char1 = '('
char2 = ')'
mystr = "mystring(123234sample)"
print mystr[mystr.find(char1)+1 : mystr.find(char2)]
123234sample
"""

syns = wordnet.synsets(usr_in)
Example #11
import nltk
from tabulate import tabulate
# Install Open Multilingual Wordnet and Wordnet
# if not already installed.
nltkd = nltk.downloader.Downloader()
for corpus in ['wordnet','omw']:
    if not nltkd.is_installed(corpus):
        nltk.download(corpus)

from nltk.corpus import wordnet as wn

table = list()

for lang in sorted(wn.langs()):
    my_set_of_all_lemma_names = set()
    from nltk.corpus import wordnet as wn
    for aln_term in list(wn.all_lemma_names(lang=lang)):
        for synset in wn.synsets(aln_term):
            for lemma in synset.lemma_names():
                my_set_of_all_lemma_names.add(lemma)
    table.append([lang,
        len(set(wn.all_lemma_names(lang=lang))),
        len(my_set_of_all_lemma_names)])

print tabulate(table,
    headers=["Language code",
        "all_lemma_names()",
        "lemma_name.synset.lemma.lemma_names()"])
Example #12
def langs():
    return wordnet.langs()
Example #13
def langs():
	return wordnet.langs()
Example #14
    for hyponym2 in hyponyms:
        if hyponym2 != hyponym:
            newlist = []
            newlist.append(hyponym.name())
            newlist.append(hyponym2.name())
            pair = str(sorted(newlist)[0]) + ',' + str(sorted(newlist)[1])
            if pair not in lemdict:
                lemdict[pair] = 0

for key, value in lemdict.items():
    lemm = key.split(',')
    lemm1 = wn.synset(lemm[0])
    lemm2 = wn.synset(lemm[1])

    common = []
    for lang in sorted(wn.langs()):
        lang1 = lemm1.lemma_names(lang)
        lang2 = lemm2.lemma_names(lang)
        for a in lang1:
            for b in lang2:
                if a == b:
                    common.append(a)
    lemdict[key] += len(common)

G = nx.Graph()
for hyponym in hyponyms:
    G.add_node(hyponym.name())

for key, value in lemdict.items():
    word = key.split(',')
    w1 = word[0]
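The fragment above starts inside an outer loop and relies on names set up earlier in its script. A hypothetical sketch of that context, not taken from the original code:

import networkx as nx
from nltk.corpus import wordnet as wn

# Assumed setup: iterate over some synset's hyponyms and count, per pair of
# hyponyms, how many languages share a lemma between them (lemdict maps
# "name1,name2" keys to those counts).
hyponyms = wn.synset('dog.n.01').hyponyms()
lemdict = {}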
Example #15
 def langs(self):
     return wn.langs()
Example #16
# get all data from a word in a language defined by language variable

from nltk.corpus import wordnet as wn
import pprint
from itertools import chain

# get all languages
sorted(wn.langs())

input = input("Enter word to get hyponyms and hypernyms: ")
language = 'spa'
lista = []
words = wn.synsets(input, lang=language)
pp = pprint.PrettyPrinter(indent=4)

for word in words:
    lista.append({
        'id': word.name(),
        'word': word.lemma_names(language),
        'meaning': word.definition(),
        'hypernyms': [],
        'hyponyms': []
    })

    currentHyponyms = []
    currentHypernyms = []

    for hyponym in word.hyponyms():
        currentHyponyms.append(hyponym.lemma_names(language))

    flat_list = [item for sublist in currentHyponyms for item in sublist]
Example #17
for synset in a:
	print( synset.lemma_names())
"""

female_names = names.words('female.txt')
#print(female_names)

cfd = nltk.ConditionalFreqDist((fileid, name[-1])
                               for fileid in names.fileids()
                               for name in names.words(fileid))
#cfd.plot()
print(wordNet.synsets('motorcar'))
print(wordNet.synset('car.n.01').lemma_names())
print(wordNet.synset('car.n.01').definition())

print("\nIDIOMAS: " + str(wordNet.langs()) + "\n")

print(wordNet.synsets('jugar', lang='spa'))

vero = wordNet.synsets('jugar', lang='spa')

vero = wordNet.synsets('computer')
print(vero)

pc = wordNet.synsets('automobile')
print("PC-> " + str(pc))
tiposComputadora = wordNet.synset('automobile.n.01')

print("hyponyms-> " + str(tiposComputadora.hyponyms()))
print("Hypernyms-> " + str(tiposComputadora.hypernyms()))
Example #18
from textblob.wordnet import Synset

import nltk
from nltk.corpus import wordnet as wn

# word = Word("pen")
# print(word.synsets)
# define = word.definitions
# print(define)
#
# octopus = Synset('octopus.n.02')
# shrimp = Synset('shrimp.n.03')
# simi = octopus.path_similarity(shrimp)
# print(simi)
#
# wiki = TextBlob("Python is a high-level, general-purpose programming language.")
# print(wiki.tags)
# print(wiki.noun_phrases)

print(wn.synsets('pen'))

print(wn.synset('penitentiary.n.01'))
print(wn.synset('penitentiary.n.01').definition())
print(wn.synset('dog.n.01').lemmas())
print(sorted(wn.langs()))
print(wn.synset('dog.n.01').lemma_names('ind'))

dog = wn.synset('dog.n.01')
print(dog.root_hypernyms())
print(wn.synset('dog.n.01').lowest_common_hypernyms(wn.synset('cat.n.01')))
Example #19
 def __init__(self, language="eng"):
     if language not in wordnet.langs():
         raise ValueError(f"Language {language} not one of {wordnet.langs()}")
     self.language = language
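A short, hypothetical usage sketch; the enclosing class is not shown in the snippet, so SomeWordnetTool stands in for it:

tool = SomeWordnetTool(language="fra")  # accepted when 'fra' is in wordnet.langs()
SomeWordnetTool(language="klingon")     # raises ValueError listing wordnet.langs()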
Example #20
print("res_similarity: ", dog.res_similarity(cat, brown_ic))
print("res_similarity: ", dog.res_similarity(puppy, brown_ic))

# jcn_similarity
print("jcn_similarity: ", dog.jcn_similarity(cat, brown_ic))
print("jcn_similarity: ", dog.jcn_similarity(puppy, brown_ic))

# lin_similarity
print("lin_similarity: ", dog.lin_similarity(cat, brown_ic))
print("lin_similarity: ", dog.lin_similarity(puppy, brown_ic))

# Extend Open Multilingual Wordnet
# Test the use of the Extended Open Multilingual Wordnet

# Print all supported languages; 'cmn' is Mandarin Chinese
print(wn.langs())

cmn = wn.lemmas(u"选择", lang='cmn')
print("cmn: ", cmn)
name = wn.lemma('choose.v.01.选择', lang='cmn').name()
print("name: ", name)

# All synsets of the word "选择" ("choose")
synsets = wn.synsets(u"选择", lang='cmn')
print("synsets: ", synsets)
name = wn.synsets(u"选择", lang='cmn')[0].lemmas()[0].name()
print("name: ", name)

# The Chinese lemma names of one synset.
cmn_synset = wn.synset("choose.v.01").lemma_names('cmn')
print("cmn_synset: ", cmn_synset)
Example #21
    with open(path, mode='w+', encoding='utf-8') as f:
        for key, synsets in lemma2fun.items():
            print('\t'.join(list(key) + synsets), file=f)


def read_possibility_dictionary(path):
    from ast import literal_eval
    lemma_cat2fun = dict()
    print(path)
    with open(path, mode='r', encoding='utf-8') as f:
        for l in f:
            val, funs = literal_eval(l)
            lemma_cat2fun[val] = funs
    return lemma_cat2fun


if __name__ == '__main__':
    #lang2lemma2fun, _ = generate_possibility_dictionary(wn.langs())
    #print(type(lang2lemma2fun))
    lang2lemmacat2fun = generate_possibility_dictionary(wn.langs(),
                                                        usecat=True)
    print(type(lang2lemmacat2fun))
    print('Created dict')
    #for lang in wn.langs():
    #    write_possibility_dictionary('../data/possibility_dictionaries/wn2/{}.txt'.format(lang), lang2lemma2fun[lang])
    for lang in wn.langs():
        write_possibility_dictionary(
            '../data/possibility_dictionaries/wn2/{}.txt'.format(lang),
            lang2lemmacat2fun[lang])
    print('Printed dict')
    print('Done.')
Example #22

def print_synset(synset, langs):
    name = synset.name()
    examples = '|'.join(synset.examples())
    d = synset.definition()
    print('\t'.join([
        name, '\t'.join([synset_to_str(synset, l) for l in langs]), examples, d
    ]))


# http://www.loc.gov/standards/iso639-2/php/code_list.php
# two languages not found: zsm, qcn

langs2 = [
    'en', 'sq', 'ar', 'bg', 'ca', 'zh', 'da', 'el', 'eu', 'fa', 'fi', 'fr',
    'gl', 'he', 'hr', 'id', 'it', 'jp', 'nl', 'nn', 'nb', 'pl', 'pt', 'qc',
    'sl', 'es', 'sv', 'th', 'ms'
]
langs3 = [
    'eng', 'als', 'arb', 'bul', 'cat', 'cmn', 'dan', 'ell', 'eus', 'fas',
    'fin', 'fra', 'glg', 'heb', 'hrv', 'ind', 'ita', 'jpn', 'nld', 'nno',
    'nob', 'pol', 'por', 'qcn', 'slv', 'spa', 'swe', 'tha', 'zsm'
]

langs = wn.langs()

print('\t'.join(['synset', '\t'.join(langs2), 'examples', 'definitions']))
for s in wn.all_synsets():
    print_synset(s, langs)
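print_synset calls synset_to_str, which is not shown in this example. One plausible definition, offered as an assumption, joins a synset's lemma names for the requested language:

def synset_to_str(synset, lang):
    # Join the synset's lemma names in the given language with '|'.
    return '|'.join(synset.lemma_names(lang=lang))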
Example #23
for synset in list(wn.all_synsets('n'))[:10]:
    print(synset)


print(wn.synsets('dog', pos=wn.VERB))
print("*"*111)
print(wn.synset('dog.n.01'))
print(wn.synset('dog.n.01').definition())
print(len(wn.synset('dog.n.01').examples()))
print(wn.synset('dog.n.01').examples()[0])
print(wn.synset('dog.n.01').lemmas())
a = [str(lemma.name()) for lemma in wn.synset('dog.n.01').lemmas()]
print(a)
print(wn.lemma('dog.n.01.dog').synset())
print("*"*111)
print(sorted(wn.langs()))
print(wn.synsets(b'\xe7\x8a\xac'.decode('utf-8'), lang='jpn'))
print(wn.synset('spy.n.01').lemma_names('jpn'))
print(wn.synset('dog.n.01').lemma_names('ita'))
print("*"*111)

dog = wn.synset('dog.n.01')
print(dog.hypernyms())
print(dog.hyponyms())
print(dog.member_holonyms())
print(dog.root_hypernyms())
print(wn.synset('dog.n.01').lowest_common_hypernyms(wn.synset('cat.n.01')))
print("*"*111)

good = wn.synset('good.a.01')
# print(good.antonyms())
Example #24
print(stopwords.words('english'))
print(not_stopwords(nltk.corpus.reuters.words()))
print(not_stopwords(nltk.corpus.inaugural.words()))

print(wordnet.synsets('cat'))
print(wordnet.synsets('cat', pos=wordnet.VERB))
cat = wordnet.synset('cat.n.01')
dog = wordnet.synset('dog.n.01')

print(cat.definition())
print(len(cat.examples()))
print(cat.lemmas())
print([str(lemma.name()) for lemma in cat.lemmas()])
print(wordnet.lemma('cat.n.01.cat').synset())

print(sorted(wordnet.langs()))
print(cat.lemma_names('ita'))
print(sorted(cat.lemmas('dan')))
print(sorted(cat.lemmas('por')))
print(len(wordnet.all_lemma_names(pos='n', lang='jpn')))
print(cat.hypernyms())
print(cat.hyponyms())
print(cat.member_holonyms())
print(cat.root_hypernyms())
print(cat.lowest_common_hypernyms(dog))

# NLP_w_Python Ch 1

print('=========================================')
print('Looking up a Synset for a Word in WordNet')
print('=========================================')