Example #1
File: 3-43.py Project: jbbe/lang
def find_language_word(word):
    opts = []
    # print(word)
    for fileid in udhr.fileids():
        if word in udhr.words(fileid)[:len(udhr.words(fileid)) // 4]:
            opts.append(fileid)
    return opts
Example #2
def ch03_43_translate():
  import nltk
  from nltk.corpus import udhr
  en_fd = bigram_freqdist(udhr.words("English-Latin1"))
  fr_fd = bigram_freqdist(udhr.words("French_Francais-Latin1"))
  de_fd = bigram_freqdist(udhr.words("German_Deutsch-Latin1"))
  es_fd = bigram_freqdist(udhr.words("Spanish-Latin1"))
  inputs = ["Nice day", "Guten Tag", "Buenas Dias", "Tres Bien"]
  for phrase in inputs:
    words = phrase.lower().split(" ")
    # TODO: remove keys present in reference set
    ranks = map(lambda x: nltk.spearman_correlation(x, bigram_freqdist(words)),
      [en_fd, fr_fd, de_fd, es_fd])
    print(phrase, list(ranks))
Example #3
def runLeaveOutWordTrialUnbiased(language):
    all_words = list(set(filterWords(udhr.words(language))))
    test_set = random.choice(all_words)
    train_set = [w for w in all_words if w != test_set]

    ngrams = [(language, train_set)]
    for lang in LANGUAGES:
        if lang == language:
            continue
        ngrams.append((lang, udhr.words(lang)))

    classifier = NGramClassifier(N, ngrams)

    return [test_set], classifier.classifyWord(test_set)
Example #4
def ch03_43_translate():
    import nltk
    from nltk.corpus import udhr
    en_fd = bigram_freqdist(udhr.words("English-Latin1"))
    fr_fd = bigram_freqdist(udhr.words("French_Francais-Latin1"))
    de_fd = bigram_freqdist(udhr.words("German_Deutsch-Latin1"))
    es_fd = bigram_freqdist(udhr.words("Spanish-Latin1"))
    inputs = ["Nice day", "Guten Tag", "Buenas Dias", "Tres Bien"]
    for phrase in inputs:
        words = phrase.lower().split(" ")
        # TODO: remove keys present in reference set
        ranks = map(
            lambda x: nltk.spearman_correlation(x, bigram_freqdist(words)),
            [en_fd, fr_fd, de_fd, es_fd])
        print(phrase, list(ranks))
Example #5
def guess_language(samples):
    final_languages = {}
    latin_languages = [l for l in udhr.fileids() if 'Latin1' in l]
    for lang, text in samples.items():
        tokens = word_tokenize(text)
        languages_having_words = list()

        for token in tokens:
            for language in latin_languages:
                if token in udhr.words(language) or token.lower() in udhr.words(language):
                    languages_having_words.append(language)
        final_language = language_frequency(languages_having_words)
        final_languages[final_language[0]] = text
    return final_languages
Example #6
def runLeaveOutWordTrialUnbiased(language):
    all_words = list(set(filterWords(udhr.words(language))))
    test_set = random.choice(all_words)
    train_set = [w for w in all_words if w != test_set]

    trigrams = [(language, makeTrigrams(train_set))]
    for lang in LANGUAGES:
        if lang == language:
            continue
        trigrams.append((lang, makeTrigrams(udhr.words(lang))))

    grammars = makeTrigramGrammars(trigrams)

    return [test_set], predictLanguage(test_set, grammars)
Example #7
from nltk.corpus import udhr
from nltk.tokenize import wordpunct_tokenize

def langToWord_ratio(text):
    # tokenize the document text into individual tokens
    tokens = wordpunct_tokenize(text)

    # create an empty list called docWords and fill it with every token, lowercased
    docWords = []
    for tokenToWord in tokens:
        docWords.append(tokenToWord.lower())

#    print("tokens is of variable type: ", type(tokens))        # "type" tells you the class of the variable given
#    print("words is of variable type: ", type(docWords))
#    print("tokens: ", tokens)                                  # print out all the tokens in the document
#    print("Available languages: ", udhr.fileids())             # the udhr method fileids() returns the list of available languages
#    print("\n")

    # empty dictionary mapping language -> number of words shared with the document
    langRatios = {}

    # udhr contains the Universal Declaration of Human Rights in over 300 languages;
    # each language has its own file (fileid) in the corpus
    if len(udhr.fileids()) > 0:
        docWords_set = set(docWords)                # set of words from our document
        for language in udhr.fileids():
            # words(fileid) returns the tokens of that language's file;
            # build the vocabulary set for this language
            udhr_set = set(udhr.words(language))

#            print("language: ", language)
#            print(set(udhr.words(language)))
#            print("\n")

            # intersection: the words that appear in both docWords_set and udhr_set
            common_elements = docWords_set.intersection(udhr_set)

            if len(common_elements) > 0:
                # for each language with at least one common word: language -> number of common words
                langRatios[language] = len(common_elements)

    return langRatios
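A note on usage (not part of the original example): langRatios maps each udhr fileid to the number of word types it shares with the document, so the fileid with the highest count is a reasonable language guess. A minimal sketch, assuming the function above:

# Hypothetical usage: pick the language file sharing the most words with the text.
ratios = langToWord_ratio("All human beings are born free and equal in dignity and rights.")
if ratios:
    best_guess = max(ratios, key=ratios.get)  # fileid with the largest overlap
    print(best_guess)  # e.g. 'English-Latin1'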
Example #8
def prep_language_corpus(fids):
	### preps the language corpus
	# pulls in all the languages, which udhr calls fileids
	# fids = udhr.fileids()
	

	# makes a list of all the available languages that use Latin1 encoding.
	languages = [fileid for fileid in fids if re.findall('Latin1', fileid)]

	#pulls in all of the udhr for all diff. languages broken apart by characters.

	udhr_corpus = [[list(word.lower()) for word in udhr.words(language) if word.isalpha()] for language in languages]

	# flattens that list so that it is a clump of letters for each language

	udhr_corpus = [[item for sublist in language for item in sublist] for language in udhr_corpus]

	# gives the languages all indices, so you can pull in the text of the udhr by its index number, e.g. udhr_corpus[154] returns Spanish

	languages = list(enumerate(languages))

	# gets frequency distributions for all the characters in a list. then converts it to a ranked list

	language_freq_dists = [FreqDist(language) for language in udhr_corpus]
	language_ranks = [list(ranks_from_sequence(dist)) for dist in language_freq_dists]

	return languages, language_ranks
Example #9
def Cal_Pred_Acc(charmodel, chardataset):
    model = LangModel(charmodel)
    words = udhr.words(chardataset)[0:1000]
    word_count = len(words)  # total number of words in the test set
    unigram_acc = 0
    bigram_acc = 0
    trigram_acc = 0

    for word in words:
        uni_pred = model.cal_unigram(word)
        if (uni_pred > 0):
            unigram_acc = unigram_acc + 1
        print("%15s - %19.18f" % (word, uni_pred))
    print("\nAccuracy of unigram model: ", unigram_acc * 100 / word_count, '%',
          '\n')

    for word in words:
        bi_pred = model.cal_bigram(word)
        if (bi_pred > 0):
            bigram_acc = bigram_acc + 1
        print("%15s - %19.18f" % (word, bi_pred))
    print("\nAccuracy of bigram model: ", bigram_acc * 100 / word_count, '%',
          '\n')

    for word in words:
        tri_pred = model.cal_trigram(word)
        if (tri_pred > 0):
            trigram_acc = trigram_acc + 1
        print("%15s - %19.18f" % (word, tri_pred))
    print("\nAccuracy of trigram model: ", trigram_acc * 100 / word_count, '%',
          '\n')
Example #10
def Accuracy(LangModel, Data):
    model = Models(LangModel)
    words = udhr.words(Data)[0:1000]
    WordCount = len(words)
    UniAcc = 0
    BiAcc = 0
    TriAcc = 0

    for word in words:
        UniP = model.CalUni(word)
        if (UniP > 0):
            UniAcc += 1
        print("%15s - %19.18f" % (word, UniP))
    print("\t\t\t\t\t\tAtccuracy of unigram model: ", UniAcc * 100 / WordCount)

    for word in words:
        BiP = model.CalBi(word)
        if (BiP > 0):
            BiAcc += 1
        print("%15s - %19.18f" % (word, BiP))
    print("\t\t\t\t\t\tAccuracy of bigram model: ", BiAcc * 100 / WordCount)

    for word in words:
        TriP = model.CalTri(word)
        if (TriP > 0):
            TriAcc += 1
        print("%15s - %19.18f" % (word, TriP))
    print("\t\t\t\t\t\tAccuracy of trigram model: ", TriAcc * 100 / WordCount)
Example #11
def languages_freq(langlist, input_text):
    fdistinput = nltk.FreqDist(input_text)
    result = []
    for language in langlist:
        lang_freqdist = nltk.FreqDist(udhr.words(language))
        result.append([language, nltk.spearman_correlation(lang_freqdist, fdistinput)])
    return result
Example #12
def fun14():
    """cfd plot"""
    languages = ['Chickasaw', 'English', 'German_Deutsch', \
        'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
    cfd = nltk.ConditionalFreqDist((lang, len(word)) \
        for lang in languages \
        for word in udhr.words(lang + '-Latin1'))
    cfd.plot(cumulative=True)
Example #13
def find_language(word):
	for language in languages:
		lexicon = udhr.words(fileids=language)

		if word in lexicon:
			print(language)
Example #14
File: ch02.py Project: gree2/hobby
def fun14():
    """cfd plot"""
    languages = ['Chickasaw', 'English', 'German_Deutsch', \
        'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
    cfd = nltk.ConditionalFreqDist((lang, len(word)) \
        for lang in languages \
        for word in udhr.words(lang + '-Latin1'))
    cfd.plot(cumulative=True)
Example #15
 def test_words(self):
     for name in udhr.fileids():
         try:
             words = list(udhr.words(name))
         except AssertionError:
             print(name)
             raise
         self.assertTrue(words)
Example #16
 def test_words(self):
     for name in udhr.fileids():
         try:
             words = list(udhr.words(name))
         except AssertionError:
             print(name)
             raise
         self.assertTrue(words)
Example #17
def runLeaveOutWordTrialUnbiased(language):
	"""
	Chooses a single word to exclude from the types of the UDHR and then tests against that
	"""
	all_words = list(set(filterWords(udhr.words(language))))
	test_set = random.choice(all_words)
	train_set = [ w for w in all_words if w != test_set ]

	bigrams = [ (language, makeBigrams(train_set)) ]
	for lang in LANGUAGES:
		if lang == language:
			continue
		bigrams.append( ( lang, makeBigrams(udhr.words(lang)) ) )

	grammars = makeBigramGrammars(bigrams)
	
	return [ test_set ], predictLanguage(test_set, grammars)
Example #18
def get_udhr_word_length_cdf():
    languages = [
        'Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut',
        'Hungarian_Magyar', 'Malay_BahasaMelayu'
    ]
    cfd = nltk.ConditionalFreqDist((lang, len(word)) for lang in languages
                                   for word in udhr.words(lang + '-Latin1'))
    cfd.plot(cumulative=True)
Example #19
    def setUp(self):
        languages = ['English', 'German_Deutsch', 'French_Francais']

        # udhr corpus contains the Universal Declaration of Human Rights in over 300 languages
        language_base = dict((language, udhr.words(language + '-Latin1')) for language in languages)

        # build the language models
        self.langModeler = LangModeler(languages, language_base)
Example #20
def print_udhr():
    from nltk.corpus import udhr
    languages = ['Chickasaw', 'English', 'German_Deutsch']
    cfd = nltk.ConditionalFreqDist(
        (lang, len(word))
        for lang in languages
        for word in udhr.words(lang + '-Latin1')
    )
    cfd.plot(cumulative=True)
Example #21
def get_word_set(lang,talla):

    stop_words = set(stopwords.words(lang))
    print('\n Stopwords: ',len(stop_words))

    words = udhr.words(lang+'-Latin1')
    words = [w.lower() for w in words if w not in stop_words and w.isalpha()]
    fdist = nltk.FreqDist(words)
    words = fdist.most_common(talla)
    return words
Example #22
def find_language(wordTested):
    latinLanguages = list()
    for language in udhr.fileids():
        if 'Latin1' in language:
            latinLanguages.append(language)
    languageContains = list()
    for latinlanguage in latinLanguages:
        if wordTested in udhr.words(latinlanguage):
            languageContains.append(latinlanguage)
    return languageContains
Example #23
def perform_experiment(modelFile, modelLanguage, dataFile, dataLanguage):
    languageModel = LanguageModel(modelFile)
    try:
        # Read test words
        words = udhr.words(dataFile)[0:1000]
    except Exception:
        print("UDHR language file " + dataFile + " does not exist",
              file=sys.stderr)
        sys.exit(1)

    # All words in the test set
    countWords = len(words)
    # Words successfully predicted by unigram model
    unigramPredicted = 0
    # Words successfully predicted by bigram model
    bigramPredicted = 0
    # Words successfully predicted by trigram model
    trigramPredicted = 0

    print("\n# Model: " + modelLanguage + ", Test Dataset: " + dataLanguage)
    print(
        "+----------------------+---------------------+---------------------+---------------------+"
    )
    print(
        "| Word                 | Unigram Probability | Bigram Probability  | Trigram Probability |"
    )
    print(
        "|----------------------|---------------------|---------------------|---------------------|"
    )
    for word in words:
        unigramProbability = languageModel.calculate_unigram_probability(word)
        if (unigramProbability > 0):
            unigramPredicted = unigramPredicted + 1

        bigramProbability = languageModel.calculate_bigram_probability(word)
        if (bigramProbability > 0):
            bigramPredicted = bigramPredicted + 1

        trigramProbability = languageModel.calculate_trigram_probability(word)
        if (trigramProbability > 0):
            trigramPredicted = trigramPredicted + 1

        print(
            "| %20s | %19.17f | %19.17f | %19.17f |" %
            (word, unigramProbability, bigramProbability, trigramProbability))
    print(
        "|----------------------|---------------------|---------------------|---------------------|"
    )
    print("| %20s | %18.5f%% | %18.5f%% | %18.5f%% |" %
          ("Accuracy", unigramPredicted * 100 / countWords, bigramPredicted *
           100 / countWords, trigramPredicted * 100 / countWords))
    print(
        "+----------------------+---------------------+---------------------+---------------------+"
    )
Example #24
def exercise_udhr():
    print(udhr.fileids())

    # compare the word-length differences of the Universal Declaration of Human Rights across languages
    languages = [
        'Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut',
        'Hungarian_Magyar', 'Ibibio_Efik'
    ]
    cfd = nltk.ConditionalFreqDist((lang, len(word)) for lang in languages
                                   for word in udhr.words(lang + '-Latin1'))
    cfd.plot()
Example #25
def conditional_freq_dist():
    from nltk.corpus import udhr
    languages = [
        'Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut',
        'Hungarian_Magyar', 'Ibibio_Efik'
    ]
    cfd = nltk.ConditionalFreqDist((lang, len(word)) for lang in languages
                                   for word in udhr.words(lang + '-Latin1'))
    cfd.tabulate(conditions=['English', 'German_Deutsch'],
                 samples=range(10),
                 cumulative=True)
Example #26
def find_language(word):
    '''Returns the list of languages in which the word is found.
    
    Limitations:
    - currently only checks nltk.corpus.udhr (universal declaration of human rights, i.e., whether word is "universal", haha)
    - currently only checks Latin-1 languages in udhr
    '''
    
    # trivial, like i said. right??
    import string  # to strip off punctuation - my little finishing touch
    return [ lang for lang in latin_languages 
                if word.strip(string.punctuation) in set(udhr.words(lang)) ]
Example #27
def guess_lang(text):
    '''Guess the language of the text. This version includes only Spanish,
    German and English and the sample needs to be quite big but it could be enhanced.'''
    Spanish = udhr.words('Spanish-Latin1')
    German = udhr.words('German_Deutsch-Latin1')
    English = udhr.words('English-Latin1')
    spanfd = fd(Spanish)
    small_spanfd = {}
    gerfd = fd(German)
    small_gerfd = {}
    enfd = fd(English)
    small_enfd = {}

    text_fd = fd(nltk.regexp_tokenize(text.lower(), r'\w+'))

    for key in spanfd.keys():
        if key in text_fd:
            small_spanfd[key] = spanfd[key]

    for key in enfd.keys():
        if key in text_fd:
            small_enfd[key] = enfd[key]

    for key in gerfd.keys():
        if key in text_fd:
            small_gerfd[key] = gerfd[key]

    corwithspan = cor(small_spanfd, text_fd)
    corwithen = cor(small_enfd, text_fd)
    corwithger = cor(small_gerfd, text_fd)

    if abs(corwithspan) == abs(corwithen) == abs(corwithger):
        print("I don't know...")
    elif max(abs(corwithspan), abs(corwithen), abs(corwithger)) == abs(corwithspan):
        print("It's Spanish!")
    elif max(abs(corwithspan), abs(corwithen), abs(corwithger)) == abs(corwithen):
        print("It's English!")
    elif max(abs(corwithspan), abs(corwithen), abs(corwithger)) == abs(corwithger):
        print("It's German!")
Example #28
def find_language(s):
	latin = []
	final_langs = []
	for fileid in udhr.fileids():
		if '-Latin1' in fileid:
			latin.append(fileid)
	for lang in latin:
		for word in udhr.words(lang):
			if word == s:
				final_langs.append(lang)
				print("Found word: " + word, "Search word: " + s)
				break
	return final_langs
Example #29
def find_language(search_word):
    languages = []
    for lang_id in udhr.fileids():
        if 'Latin1' in lang_id:
            for word in udhr.words(lang_id):
                if search_word.lower() == word.lower():
                    languages.append(lang_id.split("-")[0])
                    break
    languages = set(languages)
    if languages:
        print("The word '", search_term, "' is in the following ",
              len(languages), "languages:", languages)
    else:
        print("no results found")
Example #30
def multiLanguages():

    nltk.corpus.cess_esp.words()
    nltk.corpus.floresta.words()
    nltk.corpus.indian.words('hindi.pos')
    nltk.corpus.udhr.fileids()
    nltk.corpus.udhr.words('Javanese-Latin1')[11:]

    languages = ['Chickasaw', 'English', 'German_Deutsch',
            'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
    cfd = nltk.ConditionalFreqDist(
            (lang, len(word))
            for lang in languages
            for word in udhr.words(lang + '-Latin1'))
    cfd.plot(cumulative=True)
Example #31
def ex25_findlanguage():
    from nltk.corpus import udhr
    word_lang_map = {}
    for fileid in udhr.fileids():
        if fileid.endswith("-Latin1"):
            lang = fileid[:-7]
            words = udhr.words(fileid)
            for word in words:
                try:
                    word_lang_map[word]
                except KeyError:
                    word_lang_map[word] = set()
                langs = word_lang_map[word]
                langs.add(lang)
                word_lang_map[word] = langs
    print(word_lang_map["arashobora"])
Example #32
def ex25_findlanguage():
  from nltk.corpus import udhr
  word_lang_map = {}
  for fileid in udhr.fileids():
    if fileid.endswith("-Latin1"):
      lang = fileid[:-7]
      words = udhr.words(fileid)
      for word in words:
        try:
          word_lang_map[word]
        except KeyError:
          word_lang_map[word] = set()
        langs = word_lang_map[word]
        langs.add(lang)
        word_lang_map[word] = langs
  print(word_lang_map["arashobora"])
Example #33
def get_TTRs(languages):
    TTRs = {}
    for lang in languages:
        words = udhr.words(lang)
        ### BEGIN SOLUTION
        TTRs[lang] = []
        for num in range(100, 1301, 100):
            seen = set()
            n_type = 0
            for i in range(num):
                word = words[i].lower()
                if word not in seen:
                    seen.add(word)
                    n_type += 1
            TTRs[lang].append(n_type)
        ### END SOLUTION
    return TTRs
Example #34
def lengthTrial():
    results = pd.DataFrame(columns=['Language', 'Length', 'Accuracy'])
    for lang in LANGUAGES:
        words_by_length = {}
        for word in set(filterWords(udhr.words(lang))):
            words_by_length[len(word)] = [word] + words_by_length.get(
                len(word), [])

        for l, words in words_by_length.items():
            correct = 0
            for w in words:
                result = predictLanguage(w, trigram_grammars)
                prediction = max(result)[1]
                correct += 1 if prediction == lang else 0
            accuracy = correct / len(words)
            results.loc[str(l) + "-" + lang] = [lang, l, accuracy]
    return results
Example #35
def fun3():
    from nltk.corpus import udhr
    languages = [
        'Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut',
        'Hungarian_Magyar', 'Ibibio_Efik'
    ]
    cfd = nltk.ConditionalFreqDist((lang, len(word)) for lang in languages
                                   for word in udhr.words(lang + '-Latin1'))
    # In the plot() and tabulate() methods, the conditions= parameter selects which
    # conditions to display; if we omit it, all conditions are shown. Likewise, the
    # samples= parameter limits which samples are displayed. This lets us load a large
    # amount of data into a conditional frequency distribution and then explore it by
    # plotting or tabulating selected conditions and samples, with full control over
    # their display order. For example, we can tabulate the cumulative frequency data
    # for two languages and words of fewer than 10 characters, as below; the value in
    # the last cell of the top row means English text has 1,638 words of 9 or fewer characters.
    cfd.tabulate(conditions=['English', 'German_Deutsch'],
                 samples=range(10),
                 cumulative=True)
    cfd.plot(conditions=['English', 'German_Deutsch'],
             samples=range(10),
             cumulative=True)
Example #36
def tabulate():

    cfd = nltk.ConditionalFreqDist(
            (target, fileid[:4])
            for fileid in inaugural.fileids()
            for w in inaugural.words(fileid)
            for target in ['america', 'citizen']
            if w.lower().startswith(target))

    languages = ['Chickasaw', 'English', 'German_Deutsch',
            'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']

    cfd = nltk.ConditionalFreqDist(
            (lang, len(word))
            for lang in languages
            for word in udhr.words(lang + '-Latin1'))

    cfd.tabulate(conditions=['English', 'German_Deutsch'],
            samples=range(10), cumulative=True)
Example #37
def tabulate():

    cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                                   for fileid in inaugural.fileids()
                                   for w in inaugural.words(fileid)
                                   for target in ['america', 'citizen']
                                   if w.lower().startswith(target))

    languages = [
        'Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut',
        'Hungarian_Magyar', 'Ibibio_Efik'
    ]

    cfd = nltk.ConditionalFreqDist((lang, len(word)) for lang in languages
                                   for word in udhr.words(lang + '-Latin1'))

    cfd.tabulate(conditions=['English', 'German_Deutsch'],
                 samples=range(10),
                 cumulative=True)
Example #38
def lengthTrial():
	"""
	A separate trial that sorts all the words by length and then examines the accuracy on a per-length basis
	"""
	results = pd.DataFrame(columns=['Language', 'Length', 'Accuracy'])
	for lang in LANGUAGES:
		words_by_length = {}
		for word in set(filterWords(udhr.words(lang))):
			words_by_length[len(word)] = [ word ] + words_by_length.get(len(word), [])

		for l, words in words_by_length.items():
			correct = 0
			for w in words:
				result = predictLanguage(w, bigram_grammars)
				prediction = max(result)[1]
				correct += 1 if prediction == lang else 0
			accuracy = correct / len(words)
			results.loc[str(l) + "-" + lang] = [ lang, l, accuracy ]
	return results
Example #39
def udhr_rankings(debug=False):
    """ Get the conditional frequency distributions for each language in the udhr corpus.
    :returns: dictionary of language to conditional frequency distribution
    :rtype: dict
    """
    result = dict()

    if debug:
        stdout.write('Preparing training sets')

    for _id in [s for s in udhr.fileids() if '-' in s]:
        split_id = _id.split('-')
        language = split_id[0]

        # Only allow some encodings.
        if udhr.encoding(_id) not in ENCODINGS:
            continue

        try:
            words = udhr.words(_id)
            result[language] = FreqDist(words)
        except (AssertionError, UnicodeDecodeError):
            # Problems reading, so we skip.
            pass

        if debug:
            stdout.write('.')
            stdout.flush()

    if debug:
        stdout.write('\n')

    return result
Example #40
 def __init__(self, languages):
     self._langs = languages
     self._language_base = dict((language, udhr.words(language + '-Latin1')) for language in languages)
     self._language_model_cfd = self.build_language_models()
Example #41
import nltk
from nltk.corpus import udhr

cfd = nltk.ConditionalFreqDist(
    (word, lang)
    for lang in udhr.fileids()
    for word in udhr.words(lang))

def find_language(word):
    return cfd[word].max()
Example #42
import nltk
from nltk.corpus import udhr

languages = ['Korean_Hankuko', 'Japanese_Nihongo', 'Vietnamese-ALRN']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-UTF8'))
cfd.plot(cumulative=True)

Example #43
# ◑ Download some text from a language that has vowel harmony (e.g. Hungarian), extract the vowel sequences of words, and create a vowel bigram table.
import nltk
from nltk.corpus import udhr

# pulls in the universal declaration of human rights in hungarian
text = udhr.words('Hungarian_Magyar-Latin1')

def is_vowel(letter):
	"""Checks to see if a letter is a vowel (unaccented ASCII vowels only)."""
	return letter in "aeiou"

def pull_out_vowels(word):
	"""Takes in a word and pulls out all vowels for it."""
	vowels = []
	for letter in word:
		if is_vowel(letter):
			vowels.append(letter)
	vowels = nltk.bigrams(vowels)
	return vowels

def vowels_for_all_words(text):
	"""pulls out all vowels for all words."""
	vowels = []

	for word in text:
		vowels.extend(pull_out_vowels(word))

	return vowels
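The comment at the top of this example asks for a vowel bigram table, but the snippet stops after collecting the bigrams. A minimal completing step, assuming the text and helpers above (note that is_vowel only covers unaccented ASCII vowels, so accented Hungarian vowels are ignored):

# Sketch (not in the original): tabulate the vowel bigrams, with rows for the
# first vowel of each pair and columns for the second.
cfd = nltk.ConditionalFreqDist(vowels_for_all_words(text))
cfd.tabulate()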
Example #44
#!/usr/bin/python
import nltk
from nltk.book import *
from nltk import FreqDist
from nltk.corpus import gutenberg
from nltk.corpus import udhr

for fid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fid))
    num_words = len(gutenberg.words(fid))
    num_sents = len(gutenberg.sents(fid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fid)))
    print ('Average word length: %.1f Average sentence length: %.1f Lexical diversity: %.1f File: %s'
           % (float(num_chars)/num_words, 
           float(num_words)/num_sents, 
           float(num_words)/num_vocab, 
           fid))
    
lang = 'English'
cfd = nltk.ConditionalFreqDist((lang, len(word)) for word in udhr.words(lang + '-Latin1'))

cfd.plot(cumulative=True)

names = nltk.corpus.names

cfd = nltk.ConditionalFreqDist((fid, name[-1]) for fid in names.fileids() for name in names.words(fid))

cfd.plot()
Example #45
import nltk
from nltk.corpus import udhr

languages = ['Chinanteco-Ajitlan-Latin1', 'Chinanteco-UTF8', 'Chinese_Mandarin-GB2312']

cfd = nltk.ConditionalFreqDist((lang, len(word)) for lang in languages for word in udhr.words(lang))
cfd.plot(cumulative = True)
Example #46
import nltk
from nltk.corpus import brown, reuters

reuters.categories(['training/9865', 'training/9880'])
reuters.fileids(['barley', 'corn'])
reuters.words('training/9865')[:14]
reuters.words(['training/9865', 'training/9880'])
reuters.words(categories=['barley', 'corn'])

# inaugural address corpus
from nltk.corpus import inaugural
inaugural.fileids()
# the Universal Declaration of Human Rights in many languages
from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch','Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
        
cfd.plot(cumulative = True)
cfd.tabulate(conditions=['English', 'German_Deutsch'],samples=range(10), cumulative=True)
# conditional frequency distributions
genre_word = [(genre, word) for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]

cfd = nltk.ConditionalFreqDist(genre_word)
cfd.conditions()
list(cfd['romance'])
cfd['romance']['could']

from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
Example #47
def exercise_udhr():
    print(udhr.fileids())

    # compare the word-length differences of the Universal Declaration of Human Rights across languages
    languages = ["Chickasaw", "English", "German_Deutsch", "Greenlandic_Inuktikut", "Hungarian_Magyar", "Ibibio_Efik"]
    cfd = nltk.ConditionalFreqDist((lang, len(word)) for lang in languages for word in udhr.words(lang + "-Latin1"))
    cfd.plot()
Example #48
#!/usr/bin/python3
# coding: utf-8
import re
import nltk
from nltk import ConditionalFreqDist
from nltk.corpus import names
from nltk.corpus import udhr  # contains the Universal Declaration of Human Rights in over 300 languages
##################################################################
## quick tests
print(type(udhr))  # <class 'nltk.corpus.reader.udhr.UdhrCorpusReader'>
print(len(udhr.fileids()))  # 310
print(udhr.fileids()[:2])  # ['Abkhaz-Cyrillic+Abkh', 'Abkhaz-UTF8']
print([lang for lang in udhr.fileids() if lang.startswith('English')])  # ['English-Latin1']
print(len(udhr.words('English-Latin1')))  # 1781
print(udhr.words('English-Latin1')[:5])  # ['Universal', 'Declaration', 'of', 'Human', 'Rights']
languages = ['Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']  # commonly used languages
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
print(cvs[:10])  # ['ka', 'ka', 'ka', 'ka', 'ka', 'ro', 'ka', 'ka', 'vi', 'ko']
cfd = ConditionalFreqDist(cvs)
cfd.tabulate()
#     a    e    i    o    u
# k  418  148   94  420  173
# p   83   31  105   34   51
# r  187   63   84   89   79
# s    0    0  100    2    1
# t   47    8    0  148   37
# v   93   27  105   48   49
##################################################################
## processing udhr: the distribution of word lengths across languages
languages = ['Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = ConditionalFreqDist((lang, len(word)) for lang in languages for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=True)
##################################################################
## plot the distribution of final letters in male vs. female names; It is well known that names ending in the letter a are almost always female.
cfd = nltk.ConditionalFreqDist((fileid, name[-1]) for fileid in names.fileids() for name in names.words(fileid))
cfd.plot()
##################################################################
## predicting words with NLTK
# Task: train and build a word predictor, e.g. given a training corpus, write a program
#       that predicts the next word for a given word. Use this predictor to randomly
#       generate a 20-word sentence.

# To build the predictor, we first compute the distribution of two-word sequences in the
# training corpus, i.e. for each word we accumulate the counts of the words that follow it.
# Once we have that distribution, we can feed in a word, get the list of all its possible
# successors in the corpus, and output one of them at random.
# To generate a random 20-word sentence, we only need a seed word: use the predictor to
# produce the next word, then repeat until the sentence reaches 20 words.
# Listing 2 shows a simple implementation using the modules NLTK provides. We use Jane
# Austen's Persuasion as the training corpus.
def generate_model(cfdist, word, num=20):
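The listing is cut off at the function header. A minimal sketch of the predictor the comments describe, following the NLTK book's generate_model (which deterministically takes the most likely successor via .max() rather than sampling at random):

import nltk

def generate_model(cfdist, word, num=20):
    # Emit the current word, then move to its most frequent successor
    # in the bigram conditional frequency distribution.
    for _ in range(num):
        print(word, end=' ')
        word = cfdist[word].max()

# Hypothetical usage with Jane Austen's Persuasion as the training corpus:
text = nltk.corpus.gutenberg.words('austen-persuasion.txt')
cfd = nltk.ConditionalFreqDist(nltk.bigrams(text))
generate_model(cfd, 'living')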
Example #50
def find_language(word):
    all_languages = [language for language in udhr.fileids()
                     if language[-6:] == 'Latin1']
    word_languages = [language for language in all_languages
                      if word in udhr.words(language)]
    return word_languages