Exemple #1
0
def exercise_udhr():
    """Demo: compare word-length distributions of the UDHR across languages.

    Prints the corpus file ids, then plots a conditional frequency
    distribution of word lengths for several Latin-1 encoded languages.
    """
    # Bug fix: `print udhr.fileids()` is Python 2 syntax and a SyntaxError
    # under Python 3 (the rest of this file uses print() calls).
    print(udhr.fileids())

    # Word-length differences across translations of the UDHR.
    languages = ["Chickasaw", "English", "German_Deutsch", "Greenlandic_Inuktikut", "Hungarian_Magyar", "Ibibio_Efik"]
    cfd = nltk.ConditionalFreqDist((lang, len(word)) for lang in languages for word in udhr.words(lang + "-Latin1"))
    cfd.plot()
Exemple #2
0
def langToWord_ratio(text):
    """Count, per UDHR language, how many words of *text* occur in it.

    Tokenizes *text*, lower-cases the tokens, then intersects the resulting
    word set with the vocabulary of every UDHR corpus file.

    Returns:
        dict mapping language file id -> number of shared words; only
        languages with at least one shared word are included.
    """
    # Syntax fixes: the original used C-style `//` comments and a full-width
    # period (`。`) inside code, both of which are SyntaxErrors in Python.
    tokens = wordpunct_tokenize(text)

    # Lower-case every token so the comparison is case-insensitive.
    docWords = [token.lower() for token in tokens]
    # Hoisted out of the loop: the document's word set never changes.
    docWords_set = set(docWords)

    langRatios = {}

    for language in udhr.fileids():
        # Vocabulary of this language's UDHR translation.
        udhr_set = set(udhr.words(language))

        # Words appearing both in the document and in this language.
        common_elements = docWords_set.intersection(udhr_set)

        if len(common_elements) > 0:
            langRatios[language] = len(common_elements)

    # Fix: the original built langRatios but never returned it.
    return langRatios
Exemple #3
0
def find_language_word(word):
    """Return UDHR file ids whose first quarter of words contains *word*.

    Bug fix: the original computed ``len(udhr.words(fileid) / 4)``, which
    divides a word list by 4 and raises TypeError; the intended slice bound
    is ``len(udhr.words(fileid)) // 4``.
    """
    opts = []
    for fileid in udhr.fileids():
        words = udhr.words(fileid)  # hoisted: read the corpus file once
        if word in words[:len(words) // 4]:
            opts.append(fileid)
    return opts
def find_language(string):
    """Guess the language of *string* by word overlap with Latin-1 UDHR texts."""
    # Keep only purely alphabetic tokens of the input.
    words = [w for w in string.split(" ") if w.isalpha()]
    l = len(words)
    avail_langs = [f for f in udhr.fileids() if 'Latin1' in f]
    # Pair each candidate language with every input word found in its corpus.
    pairs = []
    for lang in avail_langs:
        vocab = udhr.words(lang)
        for w in words:
            if w in vocab:
                pairs.append((lang, w))
    cfd = ConditionalFreqDist(pairs)
    # Rank candidate languages by how many input words they matched.
    ls = sorted(((lang, cfd[lang]) for lang in avail_langs), key=lambda t: t[1].N())
    print("The most probable language of the text is {0} with {1:3.3f}% probability.".format(ls[-1][0].replace('-Latin1',''), 100*ls[-1][1].N()/l))
 def test_words(self):
     """Every UDHR corpus file should yield a non-empty word list."""
     for fileid in udhr.fileids():
         try:
             tokens = list(udhr.words(fileid))
         except AssertionError:
             # Report which file failed to read before re-raising.
             print(fileid)
             raise
         self.assertTrue(tokens)
Exemple #6
0
 def test_words(self):
     """Reading each UDHR file must succeed and produce at least one word."""
     for corpus_file in udhr.fileids():
         try:
             word_list = list(udhr.words(corpus_file))
         except AssertionError:
             # Name the offending file, then propagate the failure.
             print(corpus_file)
             raise
         self.assertTrue(word_list)
def find_language(wordTested):
    """Return the Latin-1 UDHR file ids whose text contains *wordTested*."""
    # First restrict to Latin-1 encoded files, then keep those containing
    # the word — same two-pass order as the original loops.
    latin_ids = [lang for lang in udhr.fileids() if 'Latin1' in lang]
    return [lang for lang in latin_ids if wordTested in udhr.words(lang)]
def exercise_udhr():
    """Plot word-length distributions of the UDHR for several languages."""
    print(udhr.fileids())

    # Compare word lengths across translations of the UDHR.
    languages = ['Chickasaw', 'English', 'German_Deutsch',
                 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
    pairs = ((lang, len(word))
             for lang in languages
             for word in udhr.words(lang + '-Latin1'))
    cfd = nltk.ConditionalFreqDist(pairs)
    cfd.plot()
Exemple #9
0
def find_language(search_word):
    """Print which Latin-1 UDHR languages contain *search_word*.

    The comparison is case-insensitive; each matching file id is reduced
    to its language name (the part before '-').
    """
    languages = []
    for lang_id in udhr.fileids():
        if 'Latin1' in lang_id:
            for word in udhr.words(lang_id):
                if search_word.lower() == word.lower():
                    languages.append(lang_id.split("-")[0])
                    break  # one match per language is enough
    languages = set(languages)
    if languages:
        # Bug fix: the original referenced the undefined name `search_term`
        # here, raising NameError whenever any match was found.
        print("The word '", search_word, "' is in the following ",
              len(languages), "languages:", languages)
    else:
        print("no results found")
def guess_language(samples):
    """Guess the language of each sample text.

    Args:
        samples: dict mapping a label to a sample text.

    Returns:
        dict mapping the guessed language (first element of the result of
        ``language_frequency``) to the corresponding sample text.
    """
    final_languages = {}
    # Fixes: the original named the sample text `str` (shadowing the
    # builtin) and reused `lang` for both the outer and the inner loop
    # variable, clobbering the outer value.
    for _label, sample_text in samples.items():
        tokens = word_tokenize(sample_text)
        languages = [l for l in udhr.fileids() if 'Latin1' in l]
        languages_having_words = list()

        for token in tokens:
            for candidate in languages:
                vocab = udhr.words(candidate)
                if token in vocab or token.lower() in vocab:
                    languages_having_words.append(candidate)
        final_language = language_frequency(languages_having_words)
        final_languages[final_language[0]] = sample_text
    return final_languages
Exemple #11
0
def find_language(s):
    """Return the Latin-1 UDHR file ids that contain the word *s*."""
    latin = []
    final_langs = []
    # (removed unused counter `ct` from the original)
    for id in udhr.fileids():
        if '-Latin1' in id:
            latin.append(id)
    for lang in latin:
        for word in udhr.words(lang):
            if word == s:
                final_langs.append(lang)
                # Bug fix: Python 2 `print` statement -> print() call.
                print("Found word: " + word, "Search word: " + s)
                break
    return final_langs
Exemple #12
0
def ex25_findlanguage():
  """Build a word -> {languages} map from Latin-1 UDHR files and demo it.

  Bug fix: the final Python 2 ``print`` statement is now a print() call.
  """
  from nltk.corpus import udhr
  word_lang_map = {}
  for fileid in udhr.fileids():
    if fileid.endswith("-Latin1"):
      lang = fileid[:-7]  # strip the '-Latin1' suffix
      for word in udhr.words(fileid):
        # setdefault replaces the original's try/except-KeyError dance
        # and mutates the stored set in place.
        word_lang_map.setdefault(word, set()).add(lang)
  print(word_lang_map["arashobora"])
Exemple #13
0
def ex25_findlanguage():
    """Map each UDHR word to the set of Latin-1 languages containing it.

    Bug fix: the final Python 2 ``print`` statement is now a print() call.
    """
    from nltk.corpus import udhr
    word_lang_map = {}
    for fileid in udhr.fileids():
        if fileid.endswith("-Latin1"):
            lang = fileid[:-7]  # drop the '-Latin1' suffix
            for word in udhr.words(fileid):
                # setdefault replaces the try/except KeyError idiom.
                word_lang_map.setdefault(word, set()).add(lang)
    print(word_lang_map["arashobora"])
def language(texto):
    """Identify the Latin-1 UDHR language whose word-rank distribution best
    matches *texto*, using Spearman rank correlation.

    Returns a string '<fileid>,corr:<correlation>'.
    """
    input_dist = nltk.FreqDist(word_tokenize(texto))
    best_corr = -10000
    best_lang = '-Latin1'

    for candidate in udhr.fileids():
        # Only consider Latin-1 encoded translations.
        if not candidate.endswith('-Latin1'):
            continue
        cand_dist = nltk.FreqDist(word_tokenize(udhr.raw(candidate)))
        corr = nltk.spearman_correlation(
            list(ranks_from_sequence(input_dist)),
            list(ranks_from_sequence(cand_dist)))
        if corr > best_corr:
            best_lang = candidate
            best_corr = corr

    return best_lang + ',corr:' + str(best_corr)
def udhr_rankings(debug=False):
    """ Get the conditional frequency distributions for each language in the udhr corpus.
    :returns: dictionary of language to conditional frequency distribution
    :rtype: dict
    """
    result = dict()

    if debug:
        stdout.write('Preparing training sets')

    # Only files whose id carries an encoding suffix ("Language-Encoding").
    for file_id in (s for s in udhr.fileids() if '-' in s):
        language = file_id.split('-')[0]

        # Only allow some encodings.
        if udhr.encoding(file_id) not in ENCODINGS:
            continue

        try:
            result[language] = FreqDist(udhr.words(file_id))
        except (AssertionError, UnicodeDecodeError):
            # Problems reading, so we skip.
            pass

        if debug:
            stdout.write('.')
            stdout.flush()

    if debug:
        stdout.write('\n')

    return result
Exemple #16
0
import nltk
from nltk.corpus import udhr

files = udhr.fileids()
# Latin-1 encoded UDHR translations only.
languages = [x for x in files if 'Latin1' in x]

def find_language(text):
    """Print each Latin-1 language's lexicon and every word containing *text*.

    Fix: the parameter was named ``str``, shadowing the builtin type; it is
    called positionally below, so the rename is safe.
    """
    for language in languages:
        lexicon = udhr.words(fileids=language)
        print(lexicon)

        # Substring match against each word of the lexicon.
        for word in lexicon:
            if text in word:
                print(word)

find_language("sentence")
# 	return #list of languages with that string
'''
Exercise 2.25

This exercise DOES look trivial...

But fine, I'll do it, just to see how compact and Pythonic I can make it...
Also to review the uhdr api briefly
'''

from nltk.corpus import udhr

latin_languages = [ lang for lang in udhr.fileids() if lang.endswith('-Latin1') ]


def find_language(word):
    '''Return the Latin-1 UDHR languages whose text contains *word*.

    Limitations:
    - only checks nltk.corpus.udhr (Universal Declaration of Human Rights)
    - only checks the Latin-1 encoded files in udhr
    '''
    import string  # used to strip punctuation from the query word
    cleaned = word.strip(string.punctuation)
    matches = []
    for lang in latin_languages:
        if cleaned in set(udhr.words(lang)):
            matches.append(lang)
    return matches
    

    
if __name__ == "__main__":
    # Demo entry point. NOTE(review): this example appears truncated here —
    # `words` is built but never passed to find_language(); confirm against
    # the original source.
    words = 'The quick brown fox jumps over the lazy dog.'.split()
Exemple #18
0
 def test_raw_unicode(self):
     """udhr.raw() must return decoded text (str), never raw bytes."""
     for fileid in udhr.fileids():
         content = udhr.raw(fileid)
         assert not isinstance(content, bytes), fileid
 def test_raw_unicode(self):
     """Raw UDHR text must already be decoded to str for every file."""
     for corpus_file in udhr.fileids():
         raw_text = udhr.raw(corpus_file)
         assert not isinstance(raw_text, bytes), corpus_file
Exemple #20
0
        sorted_items=sorted(list(cfdist[word].items()), key=lambda el: -el[1])[:max([n_most_likely, len(cfdist[word].items())])]
        l1=[word for word,val in sorted_items]
        word=random.choice(l1)
    print(' ')
# List the available Brown corpus categories (result is discarded here —
# presumably a leftover interactive-session line).
brown.categories()
# Bigrams over the alphabetic words of two Brown categories.
bigram=bigrams([word for word in brown.words(categories=['government', 'mystery']) if word.isalpha()])
cfd=ConditionalFreqDist(bigram)
# Successors of "I" with their counts (result also discarded).
cfd["I"].items()
# Generate 25 words starting from "I" using the (partially shown above)
# generate_model helper; n_most_likely presumably bounds the sampling pool.
generate_model(cfd, 'I', num=25, n_most_likely=6)

#25
"""Define a function find_language() that takes a string as its argument and returns
a list of languages that have that string as a word. Use the udhr corpus and limit
your searches to files in the Latin-1 encoding."""
from nltk.corpus import udhr
[file for file in udhr.fileids() if 'Latin1' in file]
from nltk import ConditionalFreqDist

def find_language(string):
    """Print the most likely Latin-1 UDHR language for *string*."""
    tokens = string.split(" ")
    tokens = [t for t in tokens if t.isalpha()]  # alphabetic words only
    l = len(tokens)
    avail_langs = [f for f in udhr.fileids() if 'Latin1' in f]
    # (language, word) pair for every input word found in that language.
    cfd = ConditionalFreqDist(
        (lang, t)
        for lang in avail_langs
        for t in tokens
        if t in udhr.words(lang)
    )
    ranked = sorted([(lang, cfd[lang]) for lang in avail_langs],
                    key=lambda pair: pair[1].N())
    best_lang, best_dist = ranked[-1]
    print("The most probable language of the text is {0} with {1:3.3f}% probability.".format(best_lang.replace('-Latin1',''), 100*best_dist.N()/l))

# Sanity check: the first 20 words of the Danish UDHR should be identified
# as Danish.
find_language(" ".join(udhr.words('Danish_Dansk-Latin1')[:20]))

#expected wrong results because Polish is not in the udhr corpus
find_language('Temistokles Brodowski z wydziału komunikacji społecznej biura potwierdził dziś, że agenci z katowickiej delegatury CBA zatrzymali byłego ministra sprawiedliwości, radcę prawnego Andrzeja K., jego współpracownika z kancelarii Piotra K. oraz gdańskiego biznesmena Marka S. i byłego funkcjonariusza Wojskowych Służb Informacyjnych Jerzego K. Andrzej K. był szefem resortu sprawiedliwości w drugim rządzie Marka Belki.')
import nltk
from nltk.corpus import udhr

# Word -> language frequency distribution over the whole UDHR corpus.
cfd = nltk.ConditionalFreqDist(
    (word, lang)
    for lang in udhr.fileids()
    for word in udhr.words(lang))

def find_language(word):
    """Return the UDHR language in which *word* occurs most often.

    Robustness fix: ``FreqDist.max()`` raises ValueError on an empty
    distribution, so a word absent from the corpus now returns None
    instead of crashing.
    """
    dist = cfd[word]
    return dist.max() if dist else None
#!/usr/bin/python3
# coding: utf-8
from nltk.corpus import udhr  # contains the Universal Declaration of Human Rights in over 300 languages
##################################################################
## Quick smoke test of the udhr corpus reader
print(type(udhr))  # <class 'nltk.corpus.reader.udhr.UdhrCorpusReader'>
print(len(udhr.fileids()))  # 310
print(udhr.fileids()[:2])  # ['Abkhaz-Cyrillic+Abkh', 'Abkhaz-UTF8']
print([lang for lang in udhr.fileids() if lang.startswith('English')])  # ['English-Latin1']
print(len(udhr.words('English-Latin1')))  # 1781
print(udhr.words('English-Latin1')[:5])  # ['Universal', 'Declaration', 'of', 'Human', 'Rights']
languages = ['Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']  # commonly used languages
Exemple #23
0
modals = ['can', 'could', 'may', 'might', 'must', 'will']

# `cfd` and `genres` here come from earlier code outside this excerpt.
cfd.tabulate(conditions=genres, samples=modals)

# plots with CFD
from nltk.corpus import inaugural
# Count occurrences of words starting with 'america'/'citizen' per file-id
# prefix (fileid[:4] — presumably the address year; confirm fileid format).
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))

cfd.plot()

# more plots, universal declaration of human rights
# cumulative word length distributions
from nltk.corpus import udhr
languages = ['Chickasaw', 'Greenlandic_Inuktikut', 'Quechua', 'Indonesian', 'French_Francais']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages 
    for word in udhr.words(lang + '-Latin1'))

cfd.plot(cumulative=True)
# Frequency plot over the raw (character-level) Javanese text.
raw_text = udhr.raw('Javanese-Latin1')
nltk.FreqDist(raw_text).plot()

# Result discarded — presumably an interactive-session leftover.
udhr.fileids()

Exemple #24
0
def find_language(word):
    """Return every UDHR file id whose word list contains *word*."""
    return [fileid for fileid in udhr.fileids() if word in udhr.words(fileid)]
Exemple #25
0
#!/usr/bin/python3
from nltk.corpus import udhr

# List every UDHR corpus file with a 1-based, zero-padded index.
# Idiom fix: enumerate(..., start=1) replaces the hand-rolled counter.
for count, file in enumerate(udhr.fileids(), start=1):
    print("{:03} {}".format(count, file))
Exemple #26
0
# %%
'''
"lang_id" takes a string and outputs a the name of the most probable language in which the text is written.
It uses Spearman's rank correlation to compare how similar the input text is to each of the languages in the Declaration of Human Rights corupus.
Currently it only supports language encoded in "Latin1", therefore languages that rely on utf-8 are not yet supported.

More info about Spearman's Correlation: https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient
'''

import nltk
from nltk.corpus import udhr
import re


# %%
Latin1_langs = [lang for lang in udhr.fileids() if re.search(r"Latin1",lang)]


# %%
def languages_freq(langlist, input_text):
    """Spearman-correlate *input_text* against each language in *langlist*.

    Bug fix: the original ignored its ``langlist`` parameter and always
    iterated the global ``Latin1_langs``, so callers passing a different
    list silently got Latin1 results.

    Returns:
        list of [language, correlation] pairs.
    """
    fdistinput = nltk.FreqDist(input_text)
    result = []
    for language in langlist:
        lang_freqdist = nltk.FreqDist(udhr.words(language))
        # NOTE(review): nltk.spearman_correlation expects rank mappings;
        # passing FreqDists treats raw counts as ranks — confirm intended.
        result.append([language, nltk.spearman_correlation(lang_freqdist, fdistinput)])
    return result


# %%
def mostprobable(final):
    """Return *final* sorted by score (second element), highest first."""
    ordered = list(final)
    ordered.sort(key=lambda item: item[1], reverse=True)
    return ordered
Exemple #27
0
#!/usr/bin/python3
# coding: utf-8
from nltk.corpus import udhr  # contains the Universal Declaration of Human Rights in over 300 languages
##################################################################
## Quick smoke test of the udhr corpus reader
print(type(udhr))  # <class 'nltk.corpus.reader.udhr.UdhrCorpusReader'>
print(len(udhr.fileids()))  # 310
print(udhr.fileids()[:2])  # ['Abkhaz-Cyrillic+Abkh', 'Abkhaz-UTF8']
print([lang for lang in udhr.fileids()
       if lang.startswith('English')])  # ['English-Latin1']
print(len(udhr.words('English-Latin1')))  # 1781
print(udhr.words('English-Latin1')
      [:5])  # ['Universal', 'Declaration', 'of', 'Human', 'Rights']
languages = [
    'Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut',
    'Hungarian_Magyar', 'Ibibio_Efik'
]  # commonly used languages
Exemple #28
0
    print(fileid)
    for w in inaugural.words(fileid):
        print(w)

# Count words starting with 'america'/'citizen' per file-id prefix
# (fileid[:4] — presumably the address year; confirm fileid format).
cfd = nltk.ConditionalFreqDist(
        (target,fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in ['america','citizen']
        if w.lower().startswith(target)
        )
cfd.plot()
print(cfd.items())

from nltk.corpus import udhr
# Result discarded — presumably an interactive-session leftover.
udhr.fileids()
# Word list of the GB2312-encoded Mandarin UDHR translation.
ch = udhr.words('Chinese_Mandarin-GB2312')

nltk.FreqDist(ch).plot()

# Cumulative distribution of word lengths for Mandarin (single condition).
ch = nltk.ConditionalFreqDist(
            (lang,len(word))
            for lang in (['Chinese_Mandarin-GB2312'])
            for word in udhr.words('Chinese_Mandarin-GB2312')
        )

ch.plot(cumulative = True)

# NOTE(review): `ch` is a ConditionalFreqDist at this point, so this counts
# its keys (one language id) — possibly not what was intended; verify.
fdistch = nltk.FreqDist(ch)

import operator
def find_language(word):
    """Return the Latin-1 encoded UDHR file ids whose text contains *word*."""
    matches = []
    for fileid in udhr.fileids():
        # Latin-1 file ids end with the literal suffix 'Latin1'.
        if fileid[-6:] == 'Latin1' and word in udhr.words(fileid):
            matches.append(fileid)
    return matches
Exemple #30
0
 def test_words(self):
     """Each UDHR corpus file must produce at least one word."""
     for fileid in udhr.fileids():
         self.assertTrue(list(udhr.words(fileid)))
Exemple #31
0
def find_language(text):
    """Print the name of each Latin-1 UDHR language that contains *text*."""
    for fileid in udhr.fileids():
        # Skip files that are not Latin-1 encoded.
        if fileid[-6:] != "Latin1":
            continue
        if text in udhr.words(fileid):
            # Strip the trailing '-Latin1' to leave just the language name.
            print(fileid[0:-7])
Exemple #32
0
def find_language(word):
    """Return the Latin-1 UDHR language names whose text contains *word*."""
    # fileid[:-7] drops the '-Latin1' suffix, leaving the language name.
    return [fileid[:-7] for fileid in udhr.fileids()
            if fileid.endswith('-Latin1') and word in udhr.words(fileid)]
Exemple #33
0
def find_language(word):
    """Return the Latin-1 UDHR language names whose text contains *word*."""
    lang = []
    for fileid in udhr.fileids():
        if fileid.endswith('-Latin1') and word in udhr.words(fileid):
            lang.append(fileid[:-7])  # drop the '-Latin1' suffix
    return lang