def Training():
    global weight
    print("\nTraining the model using the given data , Please Wait . . . \n")
    ## Read the corpora
    english = udhr.raw("English-Latin1")
    german = udhr.raw("German_Deutsch-Latin1")
    italian = udhr.raw("Italian-Latin1")
    spanish = udhr.raw("Spanish-Latin1")
    ## Pass these to NgramCalculator to calculate n-grams
    NgramCalculator(english, 1)
    NgramCalculator(german, 2)
    NgramCalculator(italian, 3)
    NgramCalculator(spanish, 4)
    print("Taking " + str(weight) + " grams")
    ## Read the news files sequentially
    for i in range(len(names)):
        filename = names[i] + ".txt"
        string = ""
        with open(filename, encoding="utf-8") as file:
            content = file.readlines()
            for line in content:
                string += "".join(line)  # Append to the string
        NgramCalculator(string, i + 1)
    print("\nTraining Completed . . .\n")
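# A minimal sketch of the helper assumed by Training() above (hypothetical --
# the project's real NgramCalculator, `weight`, and `names` are defined
# elsewhere). It counts character n-grams of size `weight` for the language at
# `lang_index` and accumulates them in a module-level table.
from collections import Counter

weight = 3                                            # assumed n-gram size
names = ["english", "german", "italian", "spanish"]  # assumed news-file basenames
language_ngrams = {}                                  # lang_index -> Counter of n-grams

def NgramCalculator(text, lang_index):
    text = text.lower()
    grams = [text[i:i + weight] for i in range(len(text) - weight + 1)]
    language_ngrams.setdefault(lang_index, Counter()).update(grams)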
def test_lang_similar(self):
    print('lang_similar')
    ld = LDChar(languages)
    t_ls = [udhr.raw(language + '-Latin1') for language in languages]
    for t in t_ls:
        print(ld.guess_language(t))
    print('lang_similar FINISH')
def __init__(self, fileid):
    try:
        # Read the UDHR file
        corpus = udhr.raw(fileid)
    except Exception:
        print("UDHR language file " + fileid + " does not exist", file=sys.stderr)
        sys.exit(1)
    # Generate the training dataset, lowercased with newlines converted to spaces
    self.train = re.sub(r'[\n]+', ' ', corpus[0:1000].strip().lower())
    # Generate the dev dataset
    self.dev = corpus[1000:1100]
    # Convert the training text to single characters
    tokens = list(self.train)
    self.unigram = tokens
    self.bigram = list(nltk.bigrams(tokens))
    self.trigram = list(nltk.trigrams(tokens))
    # Generate unigram frequency distribution
    self.unigramFreq = FreqDist(self.unigram)
    # Generate bigram frequency distribution
    self.bigramFreq = ConditionalFreqDist(self.bigram)
    # Generate trigram frequency distribution
    self.trigramFreq = ConditionalFreqDist(
        list(((w0, w1), w2) for w0, w1, w2 in self.trigram))
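# A minimal usage sketch, assuming the __init__ above belongs to a class named
# LangModel (hypothetical name) and scoring is done with plain unsmoothed MLE
# counts from the trigram ConditionalFreqDist built there.
import math
import nltk

def trigram_logprob(model, text):
    """Sum of character trigram log-probabilities of `text` under `model`."""
    chars = list(text.lower())
    total = 0.0
    for w0, w1, w2 in nltk.trigrams(chars):
        dist = model.trigramFreq[(w0, w1)]
        if dist.N() and dist[w2]:
            total += math.log(dist[w2] / dist.N())
        else:
            total += math.log(1e-6)  # crude floor for unseen trigrams
    return total

# english = LangModel('English-Latin1')
# italian = LangModel('Italian-Latin1')
# print(trigram_logprob(english, english.dev) > trigram_logprob(italian, english.dev))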
def test_digest_processor_fr():
    processor = Processor()
    text = udhr.raw('French_Francais-Latin1')
    text_process_params = TextProcessParams(SummarySize.new_relative(0.1),
                                            keywords_number=10)
    document = processor.process_text(text, text_process_params)
    assert isinstance(document, Document)
    assert 5 <= len(document.sentences)
def test_digest_processor_de():
    processor = Processor()
    # text = open(path.join(__location__, 'de_text.txt'), 'r', encoding='utf8').read()
    text = udhr.raw('German_Deutsch-Latin1')
    text_process_params = TextProcessParams(SummarySize.new_absolute(3),
                                            keywords_number=10)
    document = processor.process_text(text, text_process_params)
    assert isinstance(document, Document)
    assert 5 <= len(document.sentences)
def __init__(self, file):
    corpus = udhr.raw(file)
    self.training_set = corpus[0:1000]
    token = list(self.training_set)
    self.unigram = token
    self.bigram = list(nltk.bigrams(token))
    self.trigram = list(nltk.trigrams(token))
    self.unigram_frequency = FreqDist(self.unigram)
    self.bigram_frequency = ConditionalFreqDist(self.bigram)
    self.trigam_frequency = ConditionalFreqDist(
        list(((x, y), z) for x, y, z in self.trigram))
def __init__(self, corpura):
    corpus = udhr.raw(corpura)
    self.TrainingSet = corpus[0:1000]
    token = list(self.TrainingSet)
    self.Uni = token
    self.Bi = list(nltk.bigrams(token))
    self.Tri = list(nltk.trigrams(token))
    self.UniFreq = FreqDist(self.Uni)
    self.BiFreq = ConditionalFreqDist(self.Bi)
    self.TriFreq = ConditionalFreqDist(
        list(((w1, w2), w3) for w1, w2, w3 in self.Tri))
def language(texto):
    #fd = nltk.FreqDist(texto)
    fd = nltk.FreqDist(word_tokenize(texto))
    #print(list(fd))
    #print(fd.most_common(50))
    correlationMax = -10000
    langFinal = '-Latin1'
    for lang in udhr.fileids():
        if lang[-7:] == '-Latin1':
            fdu = nltk.FreqDist(word_tokenize(udhr.raw(lang)))
            #fdu = nltk.FreqDist(udhr.raw(lang))
            correlation = nltk.spearman_correlation(
                list(ranks_from_sequence(fd)),
                list(ranks_from_sequence(fdu)))
            #print(fdu.most_common(50))
            #print(lang, correlation)
            if correlation > correlationMax:
                langFinal = lang
                correlationMax = correlation
    return langFinal + ',corr:' + str(correlationMax)
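# Usage sketch for language() above. ranks_from_sequence and
# spearman_correlation come from nltk.metrics.spearman (spearman_correlation is
# also re-exported as nltk.spearman_correlation); word_tokenize comes from
# nltk.tokenize. Note the loop re-tokenizes every '-Latin1' UDHR file on each
# call, so it is slow. The sample sentence below is illustrative only.
from nltk import word_tokenize
from nltk.metrics.spearman import ranks_from_sequence, spearman_correlation

# print(language("All human beings are born free and equal in dignity and rights."))
# -> expected to return an English fileid together with its correlation score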
def main():
    english = udhr.raw("English-Latin1")  # load the English data from UDHR
    textTokens = re.split(" |\n", english)  # split the text on spaces or newlines
    for i in range(len(textTokens)):
        listNgrams = findNGrams(textTokens[i], ngram)
        for j in range(len(listNgrams)):
            if listNgrams[j] not in dictNGram:
                dictNGram[listNgrams[j]] = list()
            # each n-gram in the dictionary maps to a list of words containing that n-gram
            dictNGram[listNgrams[j]].append(textTokens[i])
    ### dictNGram now holds, for every observed n-gram, the list of words that contain it
    testLine = input("Enter a sentence for suggestions: ")
    nSuggestions = int(input("Number of suggestions you want: "))
    testWords = testLine.split(" ")
    for i in range(len(testWords)):
        candidateWordsDist = list()  # candidate words with their edit distance
        candidateWords = list()      # unique candidate word list
        testNgrams = findNGrams(testWords[i], ngram)
        for ng in testNgrams:
            if ng in dictNGram:
                for wr in dictNGram[ng]:
                    if wr not in candidateWords:
                        candidateWords.append(wr)
                        # compute the edit distance between the test word and the training word
                        candidateWordsDist.append(
                            [wr, editDistance(testWords[i], wr)])
        candidateWordsDist.sort(key=itemgetter(1))
        #if candidateWordsDist[0][1] == 0:
        #    print("The word: " + testWords[i] + " is correct, no suggestions needed")
        #else:
        # the lines below display possible suggestions for every word in the sentence
        print("Possible correct spellings for word => " + testWords[i] +
              " are given below: ")
        print(candidateWordsDist[:nSuggestions])  # print possible corrections
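# Hypothetical stand-ins for the helpers and globals main() above relies on
# (the original findNGrams, editDistance, `ngram`, and `dictNGram` live
# elsewhere in that project). findNGrams returns the character n-grams of a
# word; editDistance simply wraps nltk.edit_distance.
import re
import nltk
from operator import itemgetter
from nltk.corpus import udhr

ngram = 2          # assumed n-gram size
dictNGram = {}     # n-gram -> list of training words containing it

def findNGrams(word, n):
    return [word[i:i + n] for i in range(len(word) - n + 1)]

def editDistance(a, b):
    return nltk.edit_distance(a, b)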
def fun15():
    """freq dist plot"""
    raw_text = udhr.raw('Chinese_Mandarin-GB2312')
    FreqDist(raw_text).plot()
cfd.plot(cumulative=True)

languages = [
    'Chickasaw-Latin1', 'English-Latin1', 'German_Deutsch-Latin1',
    'Greenlandic_Inuktikut-Latin1', 'Hungarian_Magyar-Latin1', 'Ibibio_Efik-Latin1'
]  # , 'Chinese_Mandarin-UTF8']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang))
cfd.plot(cumulative=True)
cfd.tabulate(samples=range(10), cumulative=True)
cfd.tabulate(conditions=['English-Latin1', 'German_Deutsch-Latin1'],
             samples=range(10), cumulative=True)

# Chinese is character-based, so it cannot be read in as words
chinese_mandarin_raw = udhr.raw('Chinese_Mandarin-UTF8')
print(chinese_mandarin_raw)
chinese_mandarin_words = udhr.words('Chinese_Mandarin-UTF8')
chinese_mandarin_words
chinese_mandarin_sents = udhr.sents('Chinese_Mandarin-UTF8')
chinese_mandarin_sents

def generate_model(cfdist, word, num=15):
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()

# 1.8 The structure of a text corpus
raw = gutenberg.raw('burgess-busterbrown.txt')
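# Usage sketch for generate_model() defined above (adapted from the NLTK book):
# build a bigram conditional frequency distribution over the Genesis corpus and
# let the model greedily emit the most likely next word starting from 'living'.
import nltk

text = nltk.corpus.genesis.words('english-kjv.txt')
bigrams = nltk.bigrams(text)
cfd_bigrams = nltk.ConditionalFreqDist(bigrams)
generate_model(cfd_bigrams, 'living')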
#nltk.download('cess_esp')
#nltk.download('floresta')
#nltk.download('indian')
#nltk.download('udhr')

#print(nltk.corpus.cess_esp.words(), '\n')
#print(nltk.corpus.floresta.words(), '\n')
#print(nltk.corpus.indian.words('hindi.pos'), '\n')
#print(nltk.corpus.udhr.fileids(), '\n')
#print(nltk.corpus.udhr.words('Javanese-Latin1')[:11], '\n')

from nltk.corpus import udhr

#languages = ['Chickasaw', 'English', 'German_Deutsch',
#             'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
#cfd = nltk.ConditionalFreqDist(
#    (lang, len(word))
#    for lang in languages
#    for word in udhr.words(lang + '-Latin1'))
#cfd.plot(cumulative=True)

raw_text = udhr.raw('Italian-Latin1')
nltk.FreqDist(raw_text).plot()
__author__ = 'User'

import nltk
from nltk.corpus import udhr

# Russian-Cyrillic, Russian-UTF8, Russian_Russky-Cyrillic, Russian_Russky-UTF8
languages = ['English-Latin1', 'Hungarian_Magyar-UTF8', 'Russian_Russky-UTF8',
             'Russian-Cyrillic']

# cfd = nltk.ConditionalFreqDist(
#     (lang, len(word))
#     for lang in languages
#     for word in udhr.words(lang))
# cfd.plot(cumulative=True)

raw_text = udhr.raw('English-Latin1')
nltk.FreqDist(raw_text).plot()
cfd.tabulate(conditions=genres, samples=modals)

# plots with CFD
from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()

# more plots, Universal Declaration of Human Rights
# cumulative word length distributions
from nltk.corpus import udhr
languages = ['Chickasaw', 'Greenlandic_Inuktikut', 'Quechua', 'Indonesian',
             'French_Francais']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=True)

raw_text = udhr.raw('Javanese-Latin1')
nltk.FreqDist(raw_text).plot()
udhr.fileids()
import nltk, re
from nltk.corpus import udhr, PlaintextCorpusReader

hung = udhr.raw('Spanish-Latin1')

corpus_root = '/Users/jbbe/interesa/txt_books'
file_names = [
    'amuleto.txt', 'estrella_distante.txt', 'putas_asesinas.txt',
    'det_salv_bol.txt', 'sav_det.txt', 'la_invencion_de_morel.txt',
    'don_quixote.txt', 'borges_ficc.txt', 'hot_sur.txt', 'diez_muj.txt',
    'rayuela-cortazar.txt'
]
vowels = ['a', 'e', 'i', 'o', 'u']
books = PlaintextCorpusReader(corpus_root, file_names)
big_spanish_text = ''.join([
    books.raw('det_salv_bol.txt'),
    books.raw('don_quixote.txt'),
    books.raw('rayuela-cortazar.txt'),
    books.raw('borges_ficc.txt'),
    books.raw('amuleto.txt'),
    books.raw('estrella_distante.txt'),
    books.raw('la_invencion_de_morel.txt'),
    books.raw('hot_sur.txt')
])
hung = big_spanish_text

pattern = r'(?x)[aeiou]{2}'
# vowel_seqs = nltk.regexp_tokenize(hung, pattern)
vowel_seqs = re.findall(pattern, hung)
seq_lists = [(seq[0], seq[1]) for seq in vowel_seqs]
print(seq_lists)
#!/usr/bin/python3
from nltk.corpus import wordnet as wn
from nltk.corpus import udhr
import sys, nltk

def find_synonym(token):
    synsets = wn.synsets(token)
    for synset in synsets:
        for lemma in synset.lemmas():
            if lemma.name() != token:
                return lemma.name()
    return token

def main(text):
    for sent in text.split('\n'):
        tokens = [token.lower() for token in nltk.word_tokenize(sent)]
        new_text = ""
        for token in tokens:
            new_text += find_synonym(token) + ' '
        print(new_text)

if __name__ == '__main__':
    if len(sys.argv) > 1:
        main(sys.argv[1])
    else:
        main(udhr.raw('English-Latin1'))
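# Quick usage check for find_synonym() above. The exact lemma returned depends
# on the installed WordNet data, so the outputs noted below are indicative only.
print(find_synonym('dog'))     # typically a sister lemma such as 'domestic_dog'
print(find_synonym('asdfgh'))  # no synsets -> the token is returned unchanged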
def problem1():
    from nltk.corpus import udhr
    import re
    import string

    english = udhr.raw('English-Latin1')
    french = udhr.raw('French_Francais-Latin1')
    italian = udhr.raw('Italian_Italiano-Latin1')
    spanish = udhr.raw('Spanish_Espanol-Latin1')

    english_train, english_dev = english[0:1000], english[1000:1100]
    french_train, french_dev = french[0:1000], french[1000:1100]
    italian_train, italian_dev = italian[0:1000], italian[1000:1100]
    spanish_train, spanish_dev = spanish[0:1000], spanish[1000:1100]

    english_test = udhr.words('English-Latin1')[0:1000]
    french_test = udhr.words('French_Francais-Latin1')[0:1000]
    italian_test = udhr.words('Italian_Italiano-Latin1')[0:1000]
    spanish_test = udhr.words('Spanish_Espanol-Latin1')[0:1000]
    #english_test = english_test.split(" ")
    #spanish_test = spanish_test.split(" ")
    #italian_test = italian_test.split(" ")
    #spanish_test = spanish_test.split(" ")

    """English train Model"""
    words_all = []
    translate_table = dict((ord(char), None) for char in string.punctuation)
    for line in english_train:
        line = line.lower()
        if len(line) != 0:
            line = line.translate(translate_table)
            words_all += line
            # words_all.append(" ")
    all_str = ''.join(words_all)
    all_str = re.sub(' +', ' ', all_str)
    all_str = re.sub('\n', ' ', all_str)
    all_str = re.sub(' +', ' ', all_str)
    all_str = re.sub(' ', ' ', all_str)
    #print(all_str)
    onecharfreqdict = char_frequency(all_str)
    #print(onecharfreqdict)
    bilist = word2grams(all_str)
    bicharfreqdict = ngramfreq(bilist)
    #print(bicharfreqdict)
    trilist = word3grams(all_str)
    tricharfreqdict = ngramfreq(trilist)

    """French train Model"""
    words_all = []
    translate_table = dict((ord(char), None) for char in string.punctuation)
    for line in french_train:
        line = line.lower()
        if len(line) != 0:
            line = line.translate(translate_table)
            words_all += line
    all_str = ''.join(words_all)
    all_str = re.sub(' +', ' ', all_str)
    all_str = re.sub('\n', ' ', all_str)
    all_str = re.sub(' +', ' ', all_str)
    all_str = re.sub(' ', ' ', all_str)
    print(all_str)
    french_onecharfreqdict = char_frequency(all_str)
    print(onecharfreqdict)
    bilist = word2grams(all_str)
    french_bicharfreqdict = ngramfreq(bilist)
    print(bicharfreqdict)
    trilist = word3grams(all_str)
    french_tricharfreqdict = ngramfreq(trilist)

    """English test on English vs French Unigram models:"""
    english_freq = 0
    french_freq = 0
    for i in english_test:
        #print("next word")
        i = "".join(c for c in i if c not in ('!', '.', ':', ',', ' '))
        eng_wordprob = 1
        french_wordprob = 1
        for unichar in word1grams(i):
            if unichar in onecharfreqdict.keys():
                eng_unicharprob = round(
                    onecharfreqdict[unichar] / len(english_test), 4)
                eng_wordprob *= eng_unicharprob
            if unichar in french_onecharfreqdict.keys():
                french_unicharprob = round(
                    french_onecharfreqdict[unichar] / len(english_test), 4)
                french_wordprob *= french_unicharprob
        if eng_wordprob >= french_wordprob:
            english_freq += 1
        else:
            french_freq += 1
        print(i, round(eng_wordprob, 10), round(french_wordprob, 10),
              english_freq, french_freq)
        eng_wordprob = round(eng_wordprob, 10)
        french_wordprob = round(french_wordprob, 10)
    eng_unigram_probability = english_freq / (english_freq + french_freq) * 100
    # print(english_freq, french_freq)
    print("Accuracy of English test on English vs French Uni-gram models: ",
          eng_unigram_probability, "%")

    """English test on English vs French bigram models"""
    english_freq = 0
    french_freq = 0
    for i in english_test:
        #print("next word")
        i = "".join(c for c in i if c not in ('!', '.', ':', ',', ' '))
        eng_wordprob = 1
        french_wordprob = 1
        for bichar in word2grams(i):
            if bichar in bicharfreqdict.keys():
                eng_bicharprob = round(bicharfreqdict[bichar], 4) / round(
                    onecharfreqdict[bichar[0]], 4)
                print(bichar, bicharfreqdict[bichar], bichar[0],
                      onecharfreqdict[bichar[0]], round(eng_bicharprob, 4))
                eng_wordprob *= eng_bicharprob
            if bichar in french_bicharfreqdict.keys():
                french_bicharprob = round(
                    french_bicharfreqdict[bichar], 4) / round(
                        french_onecharfreqdict[bichar[0]], 4)
                french_wordprob *= french_bicharprob
        if eng_wordprob >= french_wordprob:
            english_freq += 1
        else:
            french_freq += 1
        print(i, round(eng_wordprob, 10), round(french_wordprob, 10),
              english_freq, french_freq)
        eng_wordprob = round(eng_wordprob, 10)
        french_wordprob = round(french_wordprob, 10)
    eng_bigram_probability = english_freq / (english_freq + french_freq) * 100
    #print(english_freq, french_freq)
    print("Accuracy of English test on English vs French bigram models: ",
          eng_bigram_probability, "%")

    """Accuracy of English test on English vs French Tri-gram models:"""
    english_freq = 0
    french_freq = 0
    for i in english_test:
        #print("next word")
        i = "".join(c for c in i if c not in ('!', '.', ':', ',', ' '))
        eng_wordprob = 1
        french_wordprob = 1
        for trichar in word3grams(i):
            if trichar in tricharfreqdict.keys():
                eng_tricharprob = round(tricharfreqdict[trichar], 4) / round(
                    onecharfreqdict[trichar[0]], 4)
                print(trichar, tricharfreqdict[trichar], trichar[0],
                      onecharfreqdict[trichar[0]], round(eng_tricharprob, 4))
                eng_wordprob *= eng_tricharprob
            if trichar in french_tricharfreqdict.keys():
                french_tricharprob = round(
                    french_tricharfreqdict[trichar], 4) / round(
                        french_onecharfreqdict[trichar[0]], 4)
                french_wordprob *= french_tricharprob
        if eng_wordprob >= french_wordprob:
            english_freq += 1
        else:
            french_freq += 1
        print(i, round(eng_wordprob, 10), round(french_wordprob, 10),
              english_freq, french_freq)
        eng_wordprob = round(eng_wordprob, 10)
        french_wordprob = round(french_wordprob, 10)
    eng_trigram_probability = english_freq / (english_freq + french_freq) * 100
    print(english_freq, french_freq)
    print("Accuracy of English test on English vs French Tri-gram models: ",
          eng_trigram_probability, "%")

    """Same experiment as above for Spanish vs. Italian"""

    """italian_train Model"""
    words_all = []
    translate_table = dict((ord(char), None) for char in string.punctuation)
    for line in italian_train:
        line = line.lower()
        if len(line) != 0:
            line = line.translate(translate_table)
            words_all += line
    all_str = ''.join(words_all)
    all_str = re.sub(' +', ' ', all_str)
    all_str = re.sub('\n', ' ', all_str)
    all_str = re.sub(' +', ' ', all_str)
    all_str = re.sub(' ', ' ', all_str)
    print(all_str)
    italian_onecharfreqdict = char_frequency(all_str)
    print(onecharfreqdict)
    bilist = word2grams(all_str)
    italian_bicharfreqdict = ngramfreq(bilist)
    print(bicharfreqdict)
    trilist = word3grams(all_str)
    italian_tricharfreqdict = ngramfreq(trilist)

    """spanish_train Model"""
    words_all = []
    translate_table = dict((ord(char), None) for char in string.punctuation)
    for line in spanish_train:
        line = line.lower()
        if len(line) != 0:
            line = line.translate(translate_table)
            words_all += line
    all_str = ''.join(words_all)
    all_str = re.sub(' +', ' ', all_str)
    all_str = re.sub('\n', ' ', all_str)
    all_str = re.sub(' +', ' ', all_str)
    all_str = re.sub(' ', ' ', all_str)
    print(all_str)
    spanish_onecharfreqdict = char_frequency(all_str)
    print(onecharfreqdict)
    bilist = word2grams(all_str)
    spanish_bicharfreqdict = ngramfreq(bilist)
    print(bicharfreqdict)
    trilist = word3grams(all_str)
    spanish_tricharfreqdict = ngramfreq(trilist)

    """Spanish test on Spanish vs Italian Unigram models:"""
    spanish_freq = 0
    italian_freq = 0
    for i in spanish_test:
        #print("next word")
        i = "".join(c for c in i if c not in ('!', '.', ':', ',', ' '))
        spanish_wordprob = 1
        italian_wordprob = 1
        for unichar in word1grams(i):
            if unichar in spanish_onecharfreqdict.keys():
                spanish_unicharprob = round(
                    spanish_onecharfreqdict[unichar] / 1000, 4)
                spanish_wordprob *= spanish_unicharprob
            if unichar in italian_onecharfreqdict.keys():
                italian_unicharprob = round(
                    italian_onecharfreqdict[unichar] / 1000, 4)
                italian_wordprob *= italian_unicharprob
        if spanish_wordprob >= italian_wordprob:
            spanish_freq += 1
        else:
            italian_freq += 1
        #print(i, round(spanish_wordprob, 10), round(italian_wordprob, 10), spanish_freq, italian_freq)
        spanish_wordprob = round(spanish_wordprob, 10)
        italian_wordprob = round(italian_wordprob, 10)
    spanish_unigram_probability = spanish_freq / (spanish_freq + italian_freq) * 100
    #print("Accuracy of Spanish test on Spanish vs Italian Uni-gram models: ", spanish_unigram_probability, "%")

    """Spanish test on Spanish vs Italian Bi-gram models:"""
    spanish_freq = 0
    italian_freq = 0
    for i in spanish_test:
        #print("next word")
        i = "".join(c for c in i if c not in ('!', '.', ':', ',', ' '))
        spanish_wordprob = 1
        italian_wordprob = 1
        for bichar in word2grams(i):
            if bichar in spanish_bicharfreqdict.keys():
                spanish_bicharprob = round(
                    spanish_bicharfreqdict[bichar], 4) / round(
                        spanish_onecharfreqdict[bichar[0]], 4)
                spanish_wordprob *= spanish_bicharprob
            if bichar in italian_bicharfreqdict.keys():
                italian_bicharprob = round(
                    italian_bicharfreqdict[bichar], 4) / round(
                        italian_onecharfreqdict[bichar[0]], 4)
                italian_wordprob *= italian_bicharprob
        if spanish_wordprob > italian_wordprob:
            spanish_freq += 1
        else:
            italian_freq += 1
        #print(i, round(spanish_wordprob, 10), round(italian_wordprob, 10), spanish_freq, italian_freq)
        spanish_wordprob = round(spanish_wordprob, 10)
        italian_wordprob = round(italian_wordprob, 10)
    spanish_bigram_probability = spanish_freq / (spanish_freq + italian_freq) * 100
    #print("Accuracy of Spanish test on Spanish vs Italian bi-gram models: ", spanish_bigram_probability, "%")

    """Spanish test on Spanish vs Italian Tri-gram models:"""
    spanish_freq = 0
    italian_freq = 0
    for i in spanish_test:
        #print("next word")
        i = "".join(c for c in i if c not in ('!', '.', ':', ',', ' '))
        spanish_wordprob = 1
        italian_wordprob = 1
        for trichar in word3grams(i):
            if trichar in spanish_tricharfreqdict.keys():
                spanish_tricharprob = round(
                    spanish_tricharfreqdict[trichar], 4) / round(
                        spanish_onecharfreqdict[trichar[0]], 4)
                print(trichar, spanish_tricharfreqdict[trichar], trichar[0],
                      spanish_onecharfreqdict[trichar[0]], spanish_tricharprob)
                spanish_wordprob *= spanish_tricharprob
            if trichar in italian_tricharfreqdict.keys():
                italian_tricharprob = round(
                    italian_tricharfreqdict[trichar], 4) / round(
                        italian_onecharfreqdict[trichar[0]], 4)
                italian_wordprob *= italian_tricharprob
        if spanish_wordprob >= italian_wordprob:
            spanish_freq += 1
        else:
            italian_freq += 1
        #print(i, round(spanish_wordprob, 10), round(italian_wordprob, 10), spanish_freq, italian_freq)
        spanish_wordprob = round(spanish_wordprob, 10)
        italian_wordprob = round(italian_wordprob, 10)
    spanish_trigram_probability = spanish_freq / (spanish_freq + italian_freq) * 100
    #print("Accuracy of Spanish test on Spanish vs Italian tri-gram models: ", spanish_trigram_probability, "%")

    print("Accuracy of English test on English vs French Uni-gram models: ",
          eng_unigram_probability, "%")
    print("Accuracy of English test on English vs French bigram models: ",
          eng_bigram_probability, "%")
    print("Accuracy of English test on English vs French Tri-gram models: ",
          eng_trigram_probability, "%")
    print("\nAccuracy of Spanish test on Spanish vs Italian Uni-gram models: ",
          spanish_unigram_probability, "%")
    print("Accuracy of Spanish test on Spanish vs Italian bi-gram models: ",
          spanish_bigram_probability, "%")
    print("Accuracy of Spanish test on Spanish vs Italian tri-gram models: ",
          spanish_trigram_probability, "%")
    return
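# Hypothetical stand-ins for the helpers problem1() above assumes
# (char_frequency, word1grams, word2grams, word3grams, and ngramfreq are
# defined elsewhere in the original assignment). These minimal versions count
# characters and character n-grams of a string.
from collections import Counter

def char_frequency(text):
    return dict(Counter(text))

def word1grams(word):
    return list(word)

def word2grams(word):
    return [word[i:i + 2] for i in range(len(word) - 1)]

def word3grams(word):
    return [word[i:i + 3] for i in range(len(word) - 2)]

def ngramfreq(ngram_list):
    return dict(Counter(ngram_list))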
import nltk
from nltk.corpus import udhr as u

# The full text of the declaration in Ibibio-Efik
print(u.raw('Ibibio_Efik-Latin1'))

# The length (in words) of the text in Amahuaca and in Greenlandic, and which one is longer
word_lenA = len(u.words('Amahuaca'))
word_lenG = len(u.words('Greenlandic_Inuktikut-Latin1'))
print('\nThe Amahuaca text has %s words and the Greenlandic text has %s words.'
      % (word_lenA, word_lenG))
if word_lenA > word_lenG:
    print('The Amahuaca text is longer.')
else:
    print('The Greenlandic text is longer.')

# The first sentence of the text in Turkish
sentences = u.sents('Turkish_Turkce-Turkish')
sentence1 = ' '.join(sentences[0])
print('\n', sentence1)
def nltk_corpora():
    ## 1. PROJECT GUTENBERG << formal language - literature; 60K++ ebooks
    emma = nltk.corpus.gutenberg.words("austen-emma.txt")
    emma = nltk.Text(emma)
    len(emma)
    lexical_diversity(emma)
    emma.concordance("brave")
    emma.collocation_list()

    ## traits of the corpus text for each file
    def corp_content(corporad):
        print("{0} File {0} \t\tWord len  Sent len  Vocab  Lexical Complexity"
              .format(" " * 6))
        print("{}".format("-" * 100))
        for i, txt in enumerate(corporad.fileids()):
            sents_l = len(corporad.words(txt))
            try:
                sents_l = len(corporad.sents(txt))
            except:
                sents_l = len(corporad.posts(txt))
            w_len = round(len(corporad.raw(txt)) / len(corporad.words(txt)))
            s_len = round(len(corporad.words(txt)) / sents_l)
            voc = len(set(w.lower() for w in corporad.words(txt)))
            # lexp = round(voc / len([w.lower() for w in gutenberg.words(txt)]) * 100)
            lexp = round(voc / len(corporad.words(txt)) * 100)
            print("{}. {} \t\t{}\t{}\t{}\t{}%\t{}".format(
                i, txt, w_len, s_len, voc, lexp, corporad.raw(txt)[:30]))
            # print("{}. {} \t\t{}\t{}\t{}\t{}%\t{}".format(i, txt, w_len, s_len, voc, lexp, corporad.words(txt)[:5]))

    # 1. Formal language - Project Gutenberg ebooks, 60K++, 16+ languages
    corp_content(gutenberg)

    # 2. Informal language - web content and chat rooms
    corp_content(webtext)
    corp_content(nps_chat)

    # 3. Brown Corpus - 15+ genres, 500+ sources, English << http://icame.uib.no/brown/bcm-los.html
    #    for studying systematic differences between genres, i.e. stylistics
    corp_content(brown)
    brown.categories()
    brown.words(categories="news")
    brown.words(categories=["news", "editorial", "reviews"])

    # example stylistics - modal verb usage between genres
    def modalz(modals):
        print("\tCategory\t", end=" ")
        for m in modals:
            print("\t{}".format(m), end=" ")
        print("\n" + "-" * 100)
        for i, cat in enumerate(brown.categories()):
            print("{}.{}\t\t".format(i, cat), end=" ")
            fdist = nltk.FreqDist(w.lower() for w in brown.words(categories=cat))
            for m in modals:
                print("\t{}".format(fdist[m]), end=" ")
            print("")

    modalz(["can", "could", "may", "might", "must", "will"])
    modalz(["should", "ought", "would", "could", "do", "did", "does"])
    modalz(["what", "when", "where", "why", "who"])

    ## ditto using nltk conditional frequency distributions
    cfdist = nltk.ConditionalFreqDist(
        (genre, word)
        for genre in brown.categories()
        for word in brown.words(categories=genre))
    genz = ["news", "religion", "hobbies", "humor", "romance"]
    modz = ["can", "could", "may", "might", "must", "will"]
    cfdist.tabulate(conditions=genz, samples=modz)

    # 4. Reuters Corpus - news articles, 90 topics, grouped into training and test sets
    #    << apparent goal is to predict the category/topic of a given article
    corp_content(reuters)
    # retrieve the topic(s) of a given article
    reuters.categories("training/9865")
    reuters.categories(["training/9865", "training/9880"])
    # find articles that cover some topic(s)
    reuters.fileids("barley")
    reuters.fileids(["barley", "corn"])
    # the first words are in all CAPS and form the title of the article; the rest is the story text
    for i, txt in enumerate(reuters.fileids(["barley", "oil"])):
        print("{}. {}\t{}".format(i, txt, reuters.words(txt)[:10]))

    # 5. Speeches - Inaugural Address Corpus << 55 US presidential addresses
    #    << interesting because of the time horizon from 1789 to 2009 (first 4 chars of the fileid = year);
    #       can study how language changes over time, which may reflect priorities, culture, ...
    corp_content(inaugural)
    # how 'America' and 'citizen' are used over time
    cfdist = nltk.ConditionalFreqDist((target, fileid[:4])
                                      for fileid in inaugural.fileids()
                                      for w in inaugural.words(fileid)
                                      for target in ['america', 'citizen']
                                      if w.lower().startswith(target))
    cfdist.plot()

    # 6. Annotated text corpora
    #    annotations: POS, named entities, syntactic structures, semantic roles, ...

    # 7. Corpora in other languages
    #    includes udhr = Universal Declaration of Human Rights in over 300 languages
    # word length frequency across languages
    langz = ["English", "Chickasaw", "German_Deutsch", "Kinyarwanda",
             "Swahili_Kiswahili"]
    cfdist = nltk.ConditionalFreqDist((lang, len(word))
                                      for lang in langz
                                      for word in udhr.words(lang + "-Latin1"))
    cfdist.plot()
    cfdist.plot(cumulative=True)
    # alphabet frequency
    nltk.FreqDist(udhr.raw("Kinyarwanda-Latin1")).plot()

    # 8. Loading your own corpora
    #    << txt files. Use PlaintextCorpusReader. Check the directory location.
    # my_corpus = PlaintextCorpusReader("root_dir_path_here", ".*")
    # the second param gives the fileids, either as an explicit list or as a regex pattern
    eg_corpus = PlaintextCorpusReader(
        "D:/zRepoz/dataSaysWhat/DocReader/res/txt_corpus", r".*\.txt")
    eg_corpus.fileids()
    eg_corpus.words("example1.txt")
    len(eg_corpus.sents())

    # BracketParseCorpusReader
    my_corpus = nltk.corpus.BracketParseCorpusReader("path", "file_pattern")
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)

# plots with CFD
from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()

# more plots, Universal Declaration of Human Rights
# cumulative word length distributions
from nltk.corpus import udhr
languages = ['Chickasaw', 'Greenlandic_Inuktikut', 'Quechua', 'Indonesian',
             'French_Francais']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=True)

raw_text = udhr.raw('Javanese-Latin1')
nltk.FreqDist(raw_text).plot()
udhr.fileids()
nltk.corpus.indian.words('hindi.pos')
nltk.corpus.udhr.fileids()
nltk.corpus.udhr.words('Javanese-Latin1')[11:]

from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch',
             'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist((lang, len(word))
                               for lang in languages
                               for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=True)

# Exercise
raw_text = udhr.raw('Romani-Latin1')
nltk.FreqDist(raw_text).plot()

# # # # # # # # # # # # # # # # # # # #
# Text Corpus Structure
# # # # # # # # # # # # # # # # # # # #

# Gutenberg
raw = gutenberg.raw('burgess-busterbrown.txt')
raw[1:20]
words = gutenberg.words('burgess-busterbrown.txt')
words[1:20]
sents = gutenberg.sents('burgess-busterbrown.txt')
sents[1:20]

# With own local copy
if string.strip().lower() == 'q':
    print("Goodbye :)")
    exit()
## Clean the text and send it for processing
else:
    cleaned_text = string.strip().lower()
    Sentence_splitting(cleaned_text)


if __name__ == "__main__":
    ## Used the UDHR corpus for training
    print("\nTraining on UDHR corpus, Please Wait . . .")
    var = udhr.raw("English-Latin1")
    tokenized_values = Tokenizer(var.lower())
    Ngrams = NgramCalculator(tokenized_values)
    aggregator(Ngrams)
    print("\nTraining Done.")

    ## Used Gulliver's Travels for training as well to increase the training data set
    print("\nTraining on Gulliver's Travels book, Please Wait . . .")
    file = open("Gulliver.txt", "r", encoding="utf-8")
    for line in file:
        tokenized_values = Tokenizer(line.strip().lower())
        Ngrams = NgramCalculator(tokenized_values)
        aggregator(Ngrams)
def main():
    ###################### in this block, training data is loaded from text files
    #f = open("eng.txt")
    #alltext = f.read()
    #nGramsEng = nGrams(alltext, 3)
    #f = open("ger.txt")
    #alltext = f.read()
    #nGramsGer = nGrams(alltext, 3)
    #f = open("spn.txt")
    #alltext = f.read()
    #nGramsSpn = nGrams(alltext, 3)
    #f = open("itn.txt")
    #alltext = f.read()
    #nGramsItn = nGrams(alltext, 3)
    #f = open("frn.txt")
    #alltext = f.read()
    #nGramsFrn = nGrams(alltext, 3)
    #f = open("danish.txt")
    #alltext = f.read()
    #nGramsDanish = nGrams(alltext, 3)
    #f = open("swedish.txt")
    #alltext = f.read()
    #nGramsSwedin = nGrams(alltext, 3)

    ################# in this block, training data is loaded from nltk.corpus.udhr
    ################# (Universal Declaration of Human Rights)
    english = udhr.raw("English-Latin1")
    french = udhr.raw("French_Francais-Latin1")
    german = udhr.raw("German_Deutsch-Latin1")
    italian = udhr.raw("Italian-Latin1")
    spanish = udhr.raw("Spanish-Latin1")
    swedish = udhr.raw("Swedish_Svenska-Latin1")
    danish = udhr.raw("Danish_Dansk-Latin1")

    # nGrams() generates all n-grams up to size 3 (1-3) and computes the frequency of each.
    # The user can vary the "up to" value and observe the difference in output.
    upToNgrams = 3
    nGramsEng = nGrams(english, upToNgrams)
    nGramsGer = nGrams(german, upToNgrams)
    nGramsSpn = nGrams(spanish, upToNgrams)
    nGramsItn = nGrams(italian, upToNgrams)
    nGramsFrn = nGrams(french, upToNgrams)
    nGramsDanish = nGrams(danish, upToNgrams)
    nGramsSwedin = nGrams(swedish, upToNgrams)

    # above, n-grams up to size 3 are computed, which gives the best matching results
    # here the user enters whatever string they want to test
    inputStr = input(
        "Write a string to detect language (a longer string gives better results): ")
    ngramsOfInput = nGrams(inputStr, 3)

    # below, the similarity of the test string to each language is computed
    result = {}
    result["English"] = cosineSim(nGramsEng, ngramsOfInput)
    result["German"] = cosineSim(nGramsGer, ngramsOfInput)
    result["Spanish"] = cosineSim(nGramsSpn, ngramsOfInput)
    result["Italian"] = cosineSim(nGramsItn, ngramsOfInput)
    result["French"] = cosineSim(nGramsFrn, ngramsOfInput)
    result["Danish"] = cosineSim(nGramsDanish, ngramsOfInput)
    result["Swedish"] = cosineSim(nGramsSwedin, ngramsOfInput)
    return result
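# Plausible stand-ins for the two helpers main() above relies on (the project's
# real nGrams and cosineSim are defined elsewhere). nGrams counts all character
# n-grams from size 1 up to n; cosineSim computes the cosine similarity between
# two such count dictionaries.
import math
from collections import Counter

def nGrams(text, n):
    text = text.lower()
    counts = Counter()
    for size in range(1, n + 1):
        counts.update(text[i:i + size] for i in range(len(text) - size + 1))
    return counts

def cosineSim(a, b):
    common = set(a) & set(b)
    dot = sum(a[g] * b[g] for g in common)
    norm = (math.sqrt(sum(v * v for v in a.values())) *
            math.sqrt(sum(v * v for v in b.values())))
    return dot / norm if norm else 0.0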
# Stemming words
from nltk.stem import PorterStemmer

ps = PorterStemmer()
example = ['ride', 'rides', 'rider', 'riding']
for w in example:
    print(ps.stem(w))

sentence = "When riders are riding their horses, they often think of how cowboys rode horses."
words = word_tokenize(sentence)
for w in words:
    print(ps.stem(w))

# Video 2
from nltk.corpus import udhr
print(udhr.raw('English-Latin1'))

from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

# Train the PunktSentenceTokenizer
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

# This function tags each tokenized word with a part of speech
def process_content():
    try:
def test_raw_unicode(self):
    for name in udhr.fileids():
        txt = udhr.raw(name)
        assert not isinstance(txt, bytes), name
print(nltk.corpus.indian.words('hindi.pos'))
print(nltk.corpus.udhr.fileids())  # Universal Declaration of Human Rights in > 300 languages
print(nltk.corpus.udhr.words('Javanese-Latin1'))

# cfd for udhr
languages = ['Chickasaw', 'English', 'German_Deutsch',
             'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist((lang, len(word))
                               for lang in languages
                               for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=True)

# frequency distribution of letters in a text
raw_text = udhr.raw('Afrikaans-Latin1')
nltk.FreqDist(raw_text).plot()

# loading your own corpus
# for later (need to download a text corpus)

# conditional frequency distributions (theory)
genre_word = [(genre, word)
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
print(genre_word[:4])
print(genre_word[-4:])
cfd = nltk.ConditionalFreqDist(genre_word)
print(cfd)
print(cfd.conditions())
print(cfd["news"])
print(cfd["romance"])
# coding: utf-8

# In[91]:
import nltk
import string
from nltk.util import ngrams
from nltk.corpus import udhr

english = udhr.raw('English-Latin1')
french = udhr.raw('French_Francais-Latin1')
italian = udhr.raw('Italian_Italiano-Latin1')
spanish = udhr.raw('Spanish_Espanol-Latin1')

english_train, english_dev = english[0:1000], english[1000:1100]
french_train, french_dev = french[0:1000], french[1000:1100]
italian_train, italian_dev = italian[0:1000], italian[1000:1100]
spanish_train, spanish_dev = spanish[0:1000], spanish[1000:1100]

english_test = udhr.words('English-Latin1')[0:1000]
french_test = udhr.words('French_Francais-Latin1')[0:1000]
italian_test = udhr.words('Italian_Italiano-Latin1')[0:1000]
spanish_test = udhr.words('Spanish_Espanol-Latin1')[0:1000]

eng_train = list(english_train)
#print(eng_train)
eng_train = [''.join(c for c in s if c not in string.punctuation)
             for s in eng_train]
def train(lang, n):
    langCorpus = []
    for x in lang:
        langCorpus.append(udhr.raw(x + '-Latin1'))
    return multiNgram(langCorpus, n)
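# Usage sketch for train() above. multiNgram is assumed to build one n-gram
# model per corpus in the list; the hypothetical stand-in below just returns a
# character n-gram Counter per language so that the call is runnable.
from collections import Counter
from nltk.corpus import udhr

def multiNgram(corpora, n):
    return [Counter(text[i:i + n] for i in range(len(text) - n + 1))
            for text in corpora]

models = train(['English', 'German_Deutsch', 'Italian'], 3)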
""" cfd.plot() """ # examining length differences in translated languages from nltk.corpus import udhr languages = [ 'Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik' ] cfd = nltk.ConditionalFreqDist((lang, len(word)) for lang in languages for word in udhr.words(lang + '-Latin1')) cfd.plot(cumulative=True) # plot frequency distribution of the letters raw_text = udhr.raw('English-Latin1') nltk.FreqDist(raw_text).plot() # the basic functions of nltk are raw, words, and sents """ # loading your own corpus from nltk.corpus import PlaintextCorpurReader corpus_root = '/data' wordlists = PlaintextCorpusReader(corpus_root, '.*') print(wordlists.fileids()) # reads all the file names print(wordlists.words('filename')) # prints words in file called filename """ # conditional frequency distributions # counting words by genre from nltk.corpus import brown