Example #1
def Training():

    global weight

    print("\nTraining the model using the given data , Please Wait . . . \n")

    ## Read the corpora
    english = udhr.raw("English-Latin1")
    german = udhr.raw("German_Deutsch-Latin1")
    italian = udhr.raw("Italian-Latin1")
    spanish = udhr.raw("Spanish-Latin1")

    ## Pass these to NgramCalculator to calculate n-grams
    NgramCalculator(english, 1)
    NgramCalculator(german, 2)
    NgramCalculator(italian, 3)
    NgramCalculator(spanish, 4)

    print("Taking " + str(weight) + " grams")

    ## Read the news files sequentially
    for i in range(len(names)):
        filename = names[i] + ".txt"
        string = ""
        with open(filename, encoding="utf-8") as file:
            content = file.readlines()
            for line in content:
                string += "".join(line)  # Append to the string

        NgramCalculator(string, i + 1)

    print("\nTraining Completed . . .\n")
Example #2
 def test_lang_similar(self):
     print('lang_similar')
     ld = LDChar(languages)
     t_ls = [udhr.raw(language + '-Latin1') for language in languages]
     for t in t_ls:
         print(ld.guess_language(t))
     print('lang_similar FINISH')
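LDChar is defined elsewhere in that test's project. As a rough, assumed illustration of the idea being tested, a character-frequency guesser over the same UDHR data could be sketched like this:

from nltk import FreqDist
from nltk.corpus import udhr

class SimpleLDChar:
    def __init__(self, languages):
        # one character FreqDist per language, built from the UDHR raw text
        self.models = {lang: FreqDist(udhr.raw(lang + '-Latin1'))
                       for lang in languages}

    def guess_language(self, text):
        fd = FreqDist(text)
        # score each language by how much its character counts overlap with the text
        def overlap(lang):
            model = self.models[lang]
            return sum(min(fd[c], model[c]) for c in fd)
        return max(self.models, key=overlap)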
Example #3
    def __init__(self, fileid):
        try:
            # Reads the UDHR file
            corpus = udhr.raw(fileid)
        except:
            print("UDHR language file " + fileid + " does not exist",
                  file=sys.stderr)
            sys.exit(1)

        # Generate training dataset, lowercase and newlines converted to space
        self.train = re.sub(r'[\n]+', ' ', corpus[0:1000].strip().lower())
        # Generate dev dataset
        self.dev = corpus[1000:1100]

        # Convert training words to single characters
        tokens = list(self.train)
        self.unigram = tokens
        self.bigram = list(nltk.bigrams(tokens))
        self.trigram = list(nltk.trigrams(tokens))
        # Generate unigram frequency distribution
        self.unigramFreq = FreqDist(self.unigram)
        # Generate bigram frequency distribution
        self.bigramFreq = ConditionalFreqDist(self.bigram)
        # Generate trigram frequency distribution
        self.trigramFreq = ConditionalFreqDist(
            list(((w0, w1), w2) for w0, w1, w2 in self.trigram))
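One possible way to use the distributions built above (not part of the original class) is to score the held-out dev slice with the character trigram model; the probability floor for unseen trigrams below is an assumption:

import math
import nltk

def trigram_logprob(model, text):
    # model is an instance of the class above; text is e.g. model.dev
    logp = 0.0
    for w0, w1, w2 in nltk.trigrams(list(text.lower())):
        p = model.trigramFreq[(w0, w1)].freq(w2) or 1e-6  # assumed floor for unseen events
        logp += math.log(p)
    return logp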
Example #4
def test_digest_processor_fr():
    processor = Processor()
    text = udhr.raw('French_Francais-Latin1')
    text_process_params = TextProcessParams(SummarySize.new_relative(0.1),
                                            keywords_number=10)
    document = processor.process_text(text, text_process_params)
    assert isinstance(document, Document)
    assert 5 <= len(document.sentences)
Example #5
def test_digest_processor_de():
    processor = Processor()
    # text = open(path.join(__location__, 'de_text.txt'), 'r', encoding='utf8').read()
    text = udhr.raw('German_Deutsch-Latin1')
    text_process_params = TextProcessParams(SummarySize.new_absolute(3),
                                            keywords_number=10)
    document = processor.process_text(text, text_process_params)
    assert isinstance(document, Document)
    assert 5 <= len(document.sentences)
Example #6
 def __init__(self, file):
     corpus = udhr.raw(file)
     self.training_set = corpus[0:1000]
     token = list(self.training_set)
     self.unigram = token
     self.bigram = list(nltk.bigrams(token))
     self.trigram = list(nltk.trigrams(token))
     self.unigram_frequency = FreqDist(self.unigram)
     self.bigram_frequency = ConditionalFreqDist(self.bigram)
     self.trigam_frequency = ConditionalFreqDist(
         list(((x, y), z) for x, y, z in self.trigram))
Example #7
    def __init__(self, corpura):

        corpus = udhr.raw(corpura)

        self.TrainingSet = corpus[0:1000]
        token = list(self.TrainingSet)

        self.Uni = token
        self.Bi = list(nltk.bigrams(token))
        self.Tri = list(nltk.trigrams(token))

        self.UniFreq = FreqDist(self.Uni)
        self.BiFreq = ConditionalFreqDist(self.Bi)
        self.TriFreq = ConditionalFreqDist(
            list(((w1, w2), w3) for w1, w2, w3 in self.Tri))
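Given an instance of the second class above (attribute names UniFreq, BiFreq, TriFreq), the distributions can be queried directly; the helper below is only an assumed usage illustration:

def most_likely_next(model, context):
    # most frequent character following a 1- or 2-character context
    if len(context) == 1:
        return model.BiFreq[context].max()
    return model.TriFreq[tuple(context)].max()

# e.g. most_likely_next(m, 't') is likely 'h' for English text, and
# m.BiFreq['t'].freq('h') gives the relative frequency of 'h' after 't'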
Example #8
def language(texto):

    #fd = nltk.FreqDist(texto)
    fd = nltk.FreqDist(word_tokenize(texto))
    #print(list(fd))
    #print(fd.most_common(50))
    correlationMax = -10000
    langFinal = '-Latin1'

    for lang in udhr.fileids():
        if lang[-7:] == '-Latin1':
            fdu = nltk.FreqDist(word_tokenize(udhr.raw(lang)))
            #fdu = nltk.FreqDist(udhr.raw(lang))
            correlation = nltk.spearman_correlation(
                list(ranks_from_sequence(fd)), list(ranks_from_sequence(fdu)))
            #print(fdu.most_common(50))
            #print(lang,correlation)
            if correlation > correlationMax:
                langFinal = lang
                correlationMax = correlation

    return langFinal + ',corr:' + str(correlationMax)
Example #9
def main():
    english = udhr.raw("English-Latin1")  # loading data for English from UDHR
    textTokens = re.split(
        " |\n", english)  # split the text on spaces or newline (\n) characters
    for i in range(len(textTokens)):
        listNgrams = findNGrams(textTokens[i], ngram)
        for j in range(len(listNgrams)):
            if (listNgrams[j] not in dictNGram):
                dictNGram[listNgrams[j]] = list()
                # each n-gram key maps to the list of words containing that n-gram
            dictNGram[listNgrams[j]].append(textTokens[i])

    ### now I have all list of words that contain possible ngrams ##
    testLine = input("Enter a sentence for suggestions: ")
    nSuggestions = int(input("Number of suggestions you want: "))
    testWords = testLine.split(" ")
    for i in range(len(testWords)):
        candidateWordsDist = list()  # candidate words with their edit distances
        candidateWords = list()  # unique candidate word list
        testNgrams = findNGrams(testWords[i], ngram)

        for ng in testNgrams:
            if (ng in dictNGram):
                for wr in dictNGram[ng]:
                    if (wr not in candidateWords):
                        candidateWords.append(wr)
                        candidateWordsDist.append(
                            [wr, editDistance(testWords[i], wr)]
                        )  # compute the edit distance between the test word and the training word
        candidateWordsDist.sort(key=itemgetter(1))
        #if(candidateWordsDist[0][1]==0):
        #   print("The word: "+testWords[i]+" is correct no need suggestions")
        #else:
        # the line below displays possible suggestions for every word in the sentence
        print("possible correct spellings for word => " + testWords[i] +
              " are given below: ")
        print(candidateWordsDist[:nSuggestions])  # print possible corrections
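main() above relies on helpers and globals that are not shown here (findNGrams, editDistance, ngram, dictNGram, itemgetter). A plausible sketch, assuming character n-grams and Levenshtein distance:

from operator import itemgetter
from nltk.metrics import edit_distance

ngram = 3       # assumed character n-gram size
dictNGram = {}  # n-gram -> list of training words containing it

def findNGrams(word, n):
    # all character n-grams of a word (empty list if the word is shorter than n)
    return [word[i:i + n] for i in range(len(word) - n + 1)]

def editDistance(a, b):
    # NLTK ships a Levenshtein implementation
    return edit_distance(a, b)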
Example #10
def fun15():
    """freq dist plot"""
    raw_text = udhr.raw('Chinese_Mandarin-GB2312')
    FreqDist(raw_text).plot()
Example #11
cfd.plot(cumulative=True)

languages = [
    'Chickasaw-Latin1', 'English-Latin1', 'German_Deutsch-Latin1',
    'Greenlandic_Inuktikut-Latin1', 'Hungarian_Magyar-Latin1',
    'Ibibio_Efik-Latin1'
]  # , 'Chinese_Mandarin-UTF8']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word)) for lang in languages for word in udhr.words(lang))
cfd.plot(cumulative=True)
cfd.tabulate(samples=range(10), cumulative=True)
cfd.tabulate(conditions=['English-Latin1', 'German_Deutsch-Latin1'],
             samples=range(10),
             cumulative=True)
# Chinese text is character-based, so it cannot be read in as words
chinese_mandarin_raw = udhr.raw('Chinese_Mandarin-UTF8')
print(chinese_mandarin_raw)
chinese_mandarin_words = udhr.words('Chinese_Mandarin-UTF8')
chinese_mandarin_words
chinese_mandarin_sents = udhr.sents('Chinese_Mandarin-UTF8')
chinese_mandarin_sents


def generate_model(cfdist, word, num=15):
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()
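
generate_model() expects a ConditionalFreqDist built over word bigrams. The classic NLTK-book usage looks like this (the genesis corpus and the start word are just an example):

text = nltk.corpus.genesis.words('english-kjv.txt')
cfd_bigrams = nltk.ConditionalFreqDist(nltk.bigrams(text))
generate_model(cfd_bigrams, 'living')  # prints a greedy 15-word chain starting from 'living'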


# 1.8. The structure of text corpora
raw = gutenberg.raw('burgess-busterbrown.txt')
Example #12
#nltk.download('cess_esp')
#nltk.download('floresta')
#nltk.download('indian')
#nltk.download('udhr')

#print(nltk.corpus.cess_esp.words(), '\n')

#print(nltk.corpus.floresta.words(), '\n')

#print(nltk.corpus.indian.words('hindi.pos'), '\n')

#print(nltk.corpus.udhr.fileids(), '\n')

#print(nltk.corpus.udhr.words('Javanese-Latin1')[:11], '\n')

from nltk.corpus import udhr

#languages = ['Chickasaw', 'English', 'German_Deutsch',
#             'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']

#cfd = nltk.ConditionalFreqDist(
#    (lang, len(word))
#    for lang in languages
#    for word in udhr.words(lang + '-Latin1'))

#cfd.plot(cumulative=True)

raw_text = udhr.raw('Italian-Latin1')

nltk.FreqDist(raw_text).plot()
Example #13
__author__ = 'User'

import nltk
from nltk.corpus import udhr

# alternative fileids: 'Russian-Cyrillic', 'Russian-UTF8', 'Russian_Russky-Cyrillic', 'Russian_Russky-UTF8'

languages = ['English-Latin1', 'Hungarian_Magyar-UTF8', 'Russian_Russky-UTF8', 'Russian-Cyrillic'] #
# cfd = nltk.ConditionalFreqDist(
#     (lang, len(word))
#     for lang in languages
#     for word in udhr.words(lang))
# cfd.plot(cumulative=True)
raw_text = udhr.raw('English-Latin1')
nltk.FreqDist(raw_text).plot()
Example #14
cfd.tabulate(conditions=genres, samples=modals)

# plots with CFD
from nltk.corpus import inaugural

cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                               for fileid in inaugural.fileids()
                               for w in inaugural.words(fileid)
                               for target in ['america', 'citizen']
                               if w.lower().startswith(target))

cfd.plot()

# more plots, universal declaration of human rights
# cumulative word length distributions
from nltk.corpus import udhr

languages = [
    'Chickasaw', 'Greenlandic_Inuktikut', 'Quechua', 'Indonesian',
    'French_Francais'
]
cfd = nltk.ConditionalFreqDist((lang, len(word)) for lang in languages
                               for word in udhr.words(lang + '-Latin1'))

cfd.plot(cumulative=True)
raw_text = udhr.raw('Javanese-Latin1')
nltk.FreqDist(raw_text).plot()

udhr.fileids()
Example #15
File: 3-26.py Project: jbbe/lang
import nltk, re
from nltk.corpus import udhr, PlaintextCorpusReader

hung = udhr.raw('Spanish-Latin1')

corpus_root = '/Users/jbbe/interesa/txt_books'
file_names = [
    'amuleto.txt', 'estrella_distante.txt', 'putas_asesinas.txt',
    'det_salv_bol.txt', 'sav_det.txt', 'la_invencion_de_morel.txt',
    'don_quixote.txt', 'borges_ficc.txt', 'hot_sur.txt', 'diez_muj.txt',
    'rayuela-cortazar.txt'
]
vowels = ['a', 'e', 'i', 'o', 'u']

books = PlaintextCorpusReader(corpus_root, file_names)
big_spanish_text = ''.join([
    books.raw('det_salv_bol.txt'),
    books.raw('don_quixote.txt'),
    books.raw('rayuela-cortazar.txt'),
    books.raw('borges_ficc.txt'),
    books.raw('amuleto.txt'),
    books.raw('estrella_distante.txt'),
    books.raw('la_invencion_de_morel.txt'),
    books.raw('hot_sur.txt')
])
hung = big_spanish_text
pattern = r'(?x)[aeiou]{2}'
# vowel_seqs = nltk.regexp_tokenize(hung, pattern)
vowel_seqs = re.findall(pattern, hung)
seq_lists = [(seq[0], seq[1]) for seq in vowel_seqs]
print(seq_lists)
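A natural follow-up (not in the original snippet) is to tally the extracted pairs and see which vowel sequences are most frequent in the Spanish text:

fd_vowel_pairs = nltk.FreqDist(vowel_seqs)
print(fd_vowel_pairs.most_common(10))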
Example #16
#!/usr/bin/python3

from nltk.corpus import wordnet as wn
from nltk.corpus import udhr
import sys, nltk

def find_synonym(token):
    synsets = wn.synsets(token)
    for synset in synsets:
        for lemma in synset.lemmas():
            if lemma.name() != token:
                return lemma.name()
    return token

def main(text):
    for sent in text.split('\n'):
        tokens = [token.lower() for token in nltk.word_tokenize(sent)]
        new_text = ""
        for token in tokens:
            new_text+=find_synonym(token)+' '
        print(new_text)

if __name__ == '__main__':
    if len(sys.argv) > 1:
        main(sys.argv[1])
    else:
        main(udhr.raw('English-Latin1'))
Example #17
def problem1():

    from nltk.corpus import udhr
    import re
    import string

    english = udhr.raw('English-Latin1')
    french = udhr.raw('French_Francais-Latin1')
    italian = udhr.raw('Italian_Italiano-Latin1')
    spanish = udhr.raw('Spanish_Espanol-Latin1')

    english_train, english_dev = english[0:1000], english[1000:1100]
    french_train, french_dev = french[0:1000], french[1000:1100]
    italian_train, italian_dev = italian[0:1000], italian[1000:1100]
    spanish_train, spanish_dev = spanish[0:1000], spanish[1000:1100]

    english_test = udhr.words('English-Latin1')[0:1000]
    french_test = udhr.words('French_Francais-Latin1')[0:1000]
    italian_test = udhr.words('Italian_Italiano-Latin1')[0:1000]
    spanish_test = udhr.words('Spanish_Espanol-Latin1')[0:1000]

    #english_test = english_test.split(" ")
    #spanish_test = spanish_test.split(" ")
    #italian_test = italian_test.split(" ")
    #spanish_test = spanish_test.split(" ")
    """Englis tain Model"""
    words_all = []
    translate_table = dict((ord(char), None) for char in string.punctuation)
    for line in english_train:
        line = line.lower()
        if len(line) != 0:
            line = line.translate(translate_table)
        words_all += line
    #  words_all.append(" ")
    all_str = ''.join(words_all)
    all_str = re.sub(' +', ' ', all_str)
    all_str = re.sub('\n', ' ', all_str)
    all_str = re.sub(' +', ' ', all_str)
    all_str = re.sub(' ', ' ', all_str)
    #print(all_str)
    onecharfreqdict = char_frequency(all_str)
    #print(onecharfreqdict)
    bilist = (word2grams(all_str))
    bicharfreqdict = ngramfreq(bilist)
    #print(bicharfreqdict)
    trilist = (word3grams(all_str))
    tricharfreqdict = ngramfreq(trilist)
    """French tain Model"""
    words_all = []
    translate_table = dict((ord(char), None) for char in string.punctuation)
    for line in french_train:
        line = line.lower()
        if len(line) != 0:
            line = line.translate(translate_table)
        words_all += line
    all_str = ''.join(words_all)
    all_str = re.sub(' +', ' ', all_str)
    all_str = re.sub('\n', ' ', all_str)
    all_str = re.sub(' +', ' ', all_str)
    all_str = re.sub(' ', ' ', all_str)
    print(all_str)
    french_onecharfreqdict = char_frequency(all_str)
    print(onecharfreqdict)
    bilist = (word2grams(all_str))
    french_bicharfreqdict = ngramfreq(bilist)
    print(bicharfreqdict)
    trilist = (word3grams(all_str))
    french_tricharfreqdict = ngramfreq(trilist)
    """English test on English vs French Unigram models:"""

    english_freq = 0
    french_freq = 0

    for i in english_test:
        #print("next word")
        i = "".join(c for c in i if c not in ('!', '.', ':', ',', ' '))
        eng_wordprob = 1
        french_wordprob = 1
        for unichar in (word1grams(i)):
            if unichar in onecharfreqdict.keys():
                eng_unicharprob = round(
                    (onecharfreqdict[unichar] / len(english_test)), 4)
                # print(trichar,onecharfreqdict[trichar],unichar[0],onecharfreqdict[trichar[0]],round(eng_bicharprob,4))
                eng_wordprob *= eng_unicharprob
            if unichar in french_onecharfreqdict.keys():
                french_unicharprob = round(
                    (french_onecharfreqdict[unichar]) / len(english_test), 4)
                french_wordprob *= french_unicharprob

        if eng_wordprob >= french_wordprob:
            english_freq += 1
        else:
            french_freq += 1

        print(i, round(eng_wordprob, 10), round(french_wordprob, 10),
              english_freq, french_freq)
        eng_wordprob = round(eng_wordprob, 10)
        french_wordprob = round(french_wordprob, 10)
        eng_unigram_probability = ((english_freq /
                                    (english_freq + french_freq) * 100))

# print(english_freq,french_freq)
    print("Accuracy of English test on English vs French Uni-gram models:  ",
          eng_unigram_probability, "%")
    """English test on English vs French bigram models """
    english_freq = 0
    french_freq = 0

    for i in english_test:
        #print("next word")
        i = "".join(c for c in i if c not in ('!', '.', ':', ',', ' '))
        eng_wordprob = 1
        french_wordprob = 1
        for bichar in (word2grams(i)):
            if bichar in bicharfreqdict.keys():
                eng_bicharprob = round(bicharfreqdict[bichar], 4) / round(
                    onecharfreqdict[bichar[0]], 4)
                print(bichar, bicharfreqdict[bichar], bichar[0],
                      onecharfreqdict[bichar[0]], round(eng_bicharprob, 4))
                eng_wordprob *= eng_bicharprob
            if bichar in french_bicharfreqdict.keys():
                french_bicharprob = round(
                    french_bicharfreqdict[bichar], 4) / round(
                        french_onecharfreqdict[bichar[0]], 4)
                french_wordprob *= french_bicharprob

        if eng_wordprob >= french_wordprob:
            english_freq += 1
        else:
            french_freq += 1

        print(i, round(eng_wordprob, 10), round(french_wordprob, 10),
              english_freq, french_freq)
        eng_wordprob = round(eng_wordprob, 10)
        french_wordprob = round(french_wordprob, 10)
        eng_bigram_probability = ((english_freq /
                                   (english_freq + french_freq) * 100))
    #print(english_freq,french_freq)
    print("Accuracy of English test on English vs French bigram models:  ",
          eng_bigram_probability, "%")
    """Accuracy of English test on English vs French Tri-gram models: """

    english_freq = 0
    french_freq = 0

    for i in english_test:
        #print("next word")
        i = "".join(c for c in i if c not in ('!', '.', ':', ',', ' '))
        eng_wordprob = 1
        french_wordprob = 1
        for trichar in (word3grams(i)):
            if trichar in tricharfreqdict.keys():
                eng_bicharprob = round(tricharfreqdict[trichar], 4) / round(
                    onecharfreqdict[trichar[0]], 4)
                print(trichar, tricharfreqdict[trichar], trichar[0],
                      onecharfreqdict[trichar[0]], round(eng_bicharprob, 4))
                eng_wordprob *= eng_bicharprob
            if trichar in french_tricharfreqdict.keys():
                french_bicharprob = round(
                    french_tricharfreqdict[trichar], 4) / round(
                        french_onecharfreqdict[trichar[0]], 4)
                french_wordprob *= french_bicharprob

        if eng_wordprob >= french_wordprob:
            english_freq += 1
        else:
            french_freq += 1

        print(i, round(eng_wordprob, 10), round(french_wordprob, 10),
              english_freq, french_freq)
        eng_wordprob = round(eng_wordprob, 10)
        french_wordprob = round(french_wordprob, 10)
        eng_trigram_probability = ((english_freq /
                                    (english_freq + french_freq) * 100))
    print(english_freq, french_freq)
    print("Accuracy of English test on English vs French Tri-gram models:  ",
          eng_trigram_probability, "%")
    """ same experiment as above for Spanish vs. Italian """
    """italian_train  Model"""
    words_all = []
    translate_table = dict((ord(char), None) for char in string.punctuation)
    for line in italian_train:
        line = line.lower()
        if len(line) != 0:
            line = line.translate(translate_table)
        words_all += line
    all_str = ''.join(words_all)
    all_str = re.sub(' +', ' ', all_str)
    all_str = re.sub('\n', ' ', all_str)
    all_str = re.sub(' +', ' ', all_str)
    all_str = re.sub(' ', ' ', all_str)
    print(all_str)
    italian_onecharfreqdict = char_frequency(all_str)
    print(onecharfreqdict)
    bilist = (word2grams(all_str))
    italian_bicharfreqdict = ngramfreq(bilist)
    print(bicharfreqdict)
    trilist = (word3grams(all_str))
    italian_tricharfreqdict = ngramfreq(trilist)
    """spanish_train  Model"""
    words_all = []
    translate_table = dict((ord(char), None) for char in string.punctuation)
    for line in spanish_train:
        line = line.lower()
        if len(line) != 0:
            line = line.translate(translate_table)
        words_all += line
    all_str = ''.join(words_all)
    all_str = re.sub(' +', ' ', all_str)
    all_str = re.sub('\n', ' ', all_str)
    all_str = re.sub(' +', ' ', all_str)
    all_str = re.sub(' ', ' ', all_str)
    print(all_str)
    spanish_onecharfreqdict = char_frequency(all_str)
    print(onecharfreqdict)
    bilist = (word2grams(all_str))
    spanish_bicharfreqdict = ngramfreq(bilist)
    print(bicharfreqdict)
    trilist = (word3grams(all_str))
    spanish_tricharfreqdict = ngramfreq(trilist)
    """spanish test on Spanish vs Italian Unigram models:"""

    spanish_freq = 0
    italian_freq = 0

    for i in spanish_test:
        #print("next word")
        i = "".join(c for c in i if c not in ('!', '.', ':', ',', ' '))
        spanish_wordprob = 1
        italian_wordprob = 1
        for unichar in (word1grams(i)):
            if unichar in spanish_onecharfreqdict.keys():
                spanish_unicharprob = round(
                    (spanish_onecharfreqdict[unichar] / 1000), 4)
                # print(trichar,spanish_onecharfreqdict[trichar],unichar[0],spanish_onecharfreqdict[trichar[0]],round(eng_bicharprob,4))
                spanish_wordprob *= spanish_unicharprob
            if unichar in italian_onecharfreqdict.keys():
                italian_unicharprob = round(
                    (italian_onecharfreqdict[unichar]) / 1000, 4)
                italian_wordprob *= italian_unicharprob

        if spanish_wordprob >= italian_wordprob:
            spanish_freq += 1
        else:
            italian_freq += 1

        #print(i,round(spanish_wordprob,10), round(italian_wordprob,10),spanish_freq,italian_freq)
        spanish_wordprob = round(spanish_wordprob, 10)
        italian_wordprob = round(italian_wordprob, 10)
        spanish_unigram_probability = ((spanish_freq /
                                        (spanish_freq + italian_freq) * 100))
    #print("Accuracy of Spanish test on Spanish vs Italian Uni-gram models:  ", spanish_unigram_probability,"%")
    """spanish test on Spanish vs Italian Bi-gram models:"""

    spanish_freq = 0
    italian_freq = 0

    for i in spanish_test:
        #print("next word")
        i = "".join(c for c in i if c not in ('!', '.', ':', ',', ' '))
        spanish_wordprob = 1
        italian_wordprob = 1
        for bichar in (word2grams(i)):
            if bichar in spanish_bicharfreqdict.keys():
                spanish_bicharprob = round(
                    spanish_bicharfreqdict[bichar], 4) / round(
                        spanish_onecharfreqdict[bichar[0]], 4)
                # print(trichar,spanish_onecharfreqdict[trichar],bichar[0],spanish_onecharfreqdict[trichar[0]],round(eng_bicharprob,4))
                spanish_wordprob *= spanish_bicharprob
            if bichar in italian_bicharfreqdict.keys():
                italian_bicharprob = round(
                    italian_bicharfreqdict[bichar], 4) / round(
                        italian_onecharfreqdict[bichar[0]], 4)
                italian_wordprob *= italian_bicharprob

        if spanish_wordprob > italian_wordprob:
            spanish_freq += 1
        else:
            italian_freq += 1

        #print(i,round(spanish_wordprob,10), round(italian_wordprob,10),spanish_freq,italian_freq)
        spanish_wordprob = round(spanish_wordprob, 10)
        italian_wordprob = round(italian_wordprob, 10)
        spanish_bigram_probability = ((spanish_freq /
                                       (spanish_freq + italian_freq) * 100))
    #print("Accuracy of Spanish test on Spanish vs Italian bi-gram models:  ", spanish_bigram_probability,"%")
    """spanish test on Spanish vs Italian Tri-gram models:"""

    spanish_freq = 0
    italian_freq = 0

    for i in spanish_test:
        #print("next word")
        i = "".join(c for c in i if c not in ('!', '.', ':', ',', ' '))
        spanish_wordprob = 1
        italian_wordprob = 1
        for bichar in (word3grams(i)):
            if bichar in spanish_tricharfreqdict.keys():
                spanish_bicharprob = round(
                    spanish_tricharfreqdict[bichar], 4) / round(
                        spanish_onecharfreqdict[bichar[0]], 4)
                print(bichar, spanish_tricharfreqdict[bichar], bichar[0],
                      spanish_onecharfreqdict[bichar[0]],
                      (spanish_bicharprob))
                spanish_wordprob *= spanish_bicharprob
            if bichar in italian_tricharfreqdict.keys():
                italian_bicharprob = round(
                    italian_tricharfreqdict[bichar], 4) / round(
                        italian_onecharfreqdict[bichar[0]], 4)
                italian_wordprob *= italian_bicharprob

        if spanish_wordprob >= italian_wordprob:
            spanish_freq += 1
        else:
            italian_freq += 1

        #print(i,round(spanish_wordprob,10), round(italian_wordprob,10),spanish_freq,italian_freq)
        spanish_wordprob = round(spanish_wordprob, 10)
        italian_wordprob = round(italian_wordprob, 10)
        spanish_trigram_probability = ((spanish_freq /
                                        (spanish_freq + italian_freq) * 100))
    #print("Accuracy of Spanish test on Spanish vs Italian tri-gram models:  ", spanish_trigram_probability,"%")

    print("Accuracy of English test on English vs French Uni-gram models:  ",
          eng_unigram_probability, "%")
    print("Accuracy of English test on English vs French bigram models:  ",
          eng_bigram_probability, "%")
    print("Accuracy of English test on English vs French Tri-gram models:  ",
          eng_trigram_probability, "%")
    print(
        "\nAccuracy of Spanish test on Spanish vs Italian Uni-gram models:  ",
        spanish_unigram_probability, "%")
    print("Accuracy of Spanish test on Spanish vs Italian bi-gram models:  ",
          spanish_bigram_probability, "%")
    print("Accuracy of Spanish test on Spanish vs Italian tri-gram models:  ",
          spanish_trigram_probability, "%")

    return
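problem1() depends on several helpers defined elsewhere (char_frequency, word1grams, word2grams, word3grams, ngramfreq). Plausible minimal versions, assuming character n-grams and plain count dictionaries:

from collections import Counter

def char_frequency(text):
    # character -> count for a string
    return dict(Counter(text))

def word1grams(word):
    return list(word)

def word2grams(word):
    return [word[i:i + 2] for i in range(len(word) - 1)]

def word3grams(word):
    return [word[i:i + 3] for i in range(len(word) - 2)]

def ngramfreq(ngram_list):
    # n-gram -> count for a list of n-grams
    return dict(Counter(ngram_list))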
Example #18
import nltk
from nltk.corpus import udhr as u
#The full text of the declaration in Ibibio-Efik
print(u.raw('Ibibio_Efik-Latin1'))

#The length (in words) of the text in Amahuaca and in Greenlandic, and which one is longer
word_lenA = len(u.words('Amahuaca'))
word_lenG = len(u.words('Greenlandic_Inuktikut-Latin1'))
print('\nThe Amahuaca text has %s words and the Greenlandic text has %s words.' %
      (word_lenA, word_lenG))
if word_lenA > word_lenG:
    print('The Amahuaca text is longer.')
else:
    print('The Greenlandic text is longer.')

#The first sentence of the text in Turkish
sentences = u.sents('Turkish_Turkce-Turkish')
sentence1 = ' '.join(sentences[1])
print('\n', sentence1)
Example #19
def nltk_corpora():
    ## 1. PROJECT GUTENBERG << formal language - literature; 60K+ ebooks
    emma = nltk.corpus.gutenberg.words("austen-emma.txt")
    emma = nltk.Text(emma)

    len(emma)
    lexical_diversity(emma)

    emma.concordance("brave")
    emma.collocation_list()

    ## summary statistics for each file in a corpus
    def corp_content(corporad):
        print(
            "{0} File {0} \t\tWord len   Sent len   Vocab   Lexical Complexity"
            .format(" " * 6))
        print("{}".format("-" * 100))
        for i, txt in enumerate(corporad.fileids()):
            sents_l = len(corporad.words(txt))
            try:
                sents_l = len(corporad.sents(txt))
            except:
                sents_l = len(corporad.posts(txt))
            w_len = round(len(corporad.raw(txt)) / len(corporad.words(txt)))
            s_len = round(len(corporad.words(txt)) / sents_l)
            voc = len(set(w.lower() for w in corporad.words(txt)))
            # lexp = round( voc / len( [w.lower() for w in gutenberg.words(txt)] ) * 100 )
            lexp = round(voc / len(corporad.words(txt)) * 100)
            print("{}. {} \t\t{}\t{}\t{}\t{}%\t{}".format(
                i, txt, w_len, s_len, voc, lexp,
                corporad.raw(txt)[:30]))
            # print( "{}. {} \t\t{}\t{}\t{}\t{}%\t{}".format(i, txt, w_len, s_len, voc, lexp, corporad.words(txt)[:5] ) )

    # 1. Formal Language - Project Gutenberg, 60K+ ebooks, 16+ languages
    corp_content(gutenberg)

    # 2. Informal Language - Web content and Chat rooms
    corp_content(webtext)
    corp_content(nps_chat)

    # 3. Brown Corpus - 15+ Multi-genre, 500+ sources, En_lang << http://icame.uib.no/brown/bcm-los.html
    # for studying systematic differences between genres I.E. stylistics
    corp_content(brown)

    brown.categories()
    brown.words(categories="news")
    brown.words(categories=["news", "editorial", "reviews"])

    # example stylistics - modal verbs usage between genres
    def modalz(modals):
        print("\tCategory\t", end=" ")
        for m in modals:
            print("\t{}".format(m), end=" ")
        print("\n" + "-" * 100)
        for i, cat in enumerate(brown.categories()):
            print("{}.{}\t\t".format(i, cat), end=" ")
            fdist = nltk.FreqDist(w.lower()
                                  for w in brown.words(categories=cat))
            for m in modals:
                print("\t{}".format(fdist[m]), end=" ")
            print("")

    modalz(["can", "could", "may", "might", "must", "will"])
    modalz(["should", "ought", "would", "could", "do", "did", "does"])
    modalz(["what", "when", "where", "why", "who"])

    ## ditto using nltk conditional frequency distributions
    cfdist = nltk.ConditionalFreqDist(
        (genre, word) for genre in brown.categories()
        for word in brown.words(categories=genre))

    genz = ["news", "religion", "hobbies", "humor", "romance"]
    modz = ["can", "could", "may", "might", "must", "will"]
    cfdist.tabulate(conditions=genz, samples=modz)

    # 4. Reuters Corpus - news articles, 90 topics, grouped into training and testing sets
    # << Apparent goal is to predict the category/topic of a given article??
    corp_content(reuters)
    # retrieve topic(s) of a given article
    reuters.categories("training/9865")
    reuters.categories(["training/9865", "training/9880"])
    # find articles that cover some topic(s)
    reuters.fileids("barley")
    reuters.fileids(["barley", "corn"])

    # the first words are in all CAPS and are the title of the article. The rest is the story text
    for i, txt in enumerate(reuters.fileids(["barley", "oil"])):
        print("{}. {}\t{}".format(i, txt, reuters.words(txt)[:10]))

    # 5. Speeches - Inaugural Address Corpus << 55 USA Presidential addresses
    # << interesting in that there's a time dimension from 1789 to 2009 (first 4 chars of fileid = year); can study how language changes over time; could reflect priorities, culture, ???
    corp_content(inaugural)
    # how 'america' and 'citizen' are used over time
    cfdist = nltk.ConditionalFreqDist((target, fileid[:4])
                                      for fileid in inaugural.fileids()
                                      for w in inaugural.words(fileid)
                                      for target in ['america', 'citizen']
                                      if w.lower().startswith(target))
    cfdist.plot()

    # 6. Annotated Text Corpora
    # annotations: POS, named entities, syntatic structures, semantic roles,

    # 7. Other Languages Corpora
    # includes udhr = Universal Declaration of Human Rights in over 300 languages

    # word length freq by diff languages
    langz = [
        "English", "Chickasaw", "German_Deutsch", "Kinyarwanda",
        "Swahili_Kiswahili"
    ]
    cfdist = nltk.ConditionalFreqDist((lang, len(word)) for lang in langz
                                      for word in udhr.words(lang + "-Latin1"))
    cfdist.plot()
    cfdist.plot(cumulative=True)

    # alphabet freq
    nltk.FreqDist(udhr.raw("Kinyarwanda-Latin1")).plot()

    # 8. Loading your own Corpora
    # << txt files. Use PlaintextCorpusReader. Check dir location
    #
    my_corpus = PlaintextCorpusReader(
        "root_dir_path_here", ".*"
    )  # second param is a list of fileids or a regexp pattern that matches them
    eg_corpus = PlaintextCorpusReader(
        "D:/zRepoz/dataSaysWhat/DocReader/res/txt_corpus", ".txt")
    eg_corpus.fileids()
    eg_corpus.words("example1.txt")
    len(eg_corpus.sents())

    #BracketParseCorpusReader
    my_corpus = nltk.corpus.BracketParseCorpusReader("path", "file_pattern")
Example #20
modals = ['can', 'could', 'may', 'might', 'must', 'will']

cfd.tabulate(conditions=genres, samples=modals)

# plots with CFD
from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))

cfd.plot()

# more plots, universal declaration of human rights
# cumulative word length distributions
from nltk.corpus import udhr
languages = ['Chickasaw', 'Greenlandic_Inuktikut', 'Quechua', 'Indonesian', 'French_Francais']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages 
    for word in udhr.words(lang + '-Latin1'))

cfd.plot(cumulative=True)
raw_text = udhr.raw('Javanese-Latin1')
nltk.FreqDist(raw_text).plot()

udhr.fileids()

Example #21
nltk.corpus.indian.words('hindi.pos')
nltk.corpus.udhr.fileids()
nltk.corpus.udhr.words('Javanese-Latin1')[11:]

from nltk.corpus import udhr

languages = [
    'Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut',
    'Hungarian_Magyar', 'Ibibio_Efik'
]
cfd = nltk.ConditionalFreqDist((lang, len(word)) for lang in languages
                               for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=True)

#Exercise
raw_text = udhr.raw('Romani-Latin1')
nltk.FreqDist(raw_text).plot()

# # # # # # # # # # # # # # # # # # # #
# 	Text Corpus Structure
# # # # # # # # # # # # # # # # # # # #

#Gutenberg
raw = gutenberg.raw('burgess-busterbrown.txt')
raw[1:20]
words = gutenberg.words('burgess-busterbrown.txt')
words[1:20]
sents = gutenberg.sents('burgess-busterbrown.txt')
sents[1:20]

#With own local copy
Example #22
        if string.strip().lower() == 'q':
            print("Goodbye :)")
            exit()

        ## Clean the text and send it for processing
        else:
            cleaned_text = string.strip().lower()
            Sentence_splitting(cleaned_text)


if __name__ == "__main__":

    ## Used UDHR corpus for training
    print("\nTraining on UDHR corpus, Please Wait . . .")

    var = udhr.raw("English-Latin1")
    tokenized_values = Tokenizer(var.lower())
    Ngrams = NgramCalculator(tokenized_values)
    aggregator(Ngrams)

    print("\nTraining Done.")

    ## Used Gulliver's Travels for training as well to increase my training data set
    print("\nTraining on Gulliver's Travels book, Please Wait . . .")

    file = open("Gulliver.txt", "r", encoding="utf-8")
    for line in file:
        tokenized_values = Tokenizer(line.strip().lower())
        Ngrams = NgramCalculator(tokenized_values)
        aggregator(Ngrams)
Example #23
def main():
    ###################### in this block training data is loaded from text files
    #f = open("eng.txt")
    #alltext=f.read()
    #nGramsEng=nGrams(alltext,3)
    #f = open("ger.txt")
    #alltext=f.read()
    #nGramsGer=nGrams(alltext,3)
    #f = open("spn.txt")
    #alltext=f.read()
    #nGramsSpn=nGrams(alltext,3)
    #f = open("itn.txt")
    #alltext=f.read()
    #nGramsItn=nGrams(alltext,3)
    #f = open("frn.txt")
    #alltext=f.read()
    #nGramsFrn=nGrams(alltext,3)
    #f = open("danish.txt")
    #alltext=f.read()
    #nGramsDanish=nGrams(alltext,3)
    #f = open("swedish.txt")
    #alltext=f.read()
    #nGramsSwedin=nGrams(alltext,3)
    ################# in this block training data is loaded from nltk.corpus.udhr (Universal Declaration of Human Rights)

    english = udhr.raw("English-Latin1")
    french = udhr.raw("French_Francais-Latin1")
    german = udhr.raw("German_Deutsch-Latin1")
    italian = udhr.raw("Italian-Latin1")
    spanish = udhr.raw("Spanish-Latin1")
    swedish = udhr.raw("Swedish_Svenska-Latin1")
    danish = udhr.raw("Danish_Dansk-Latin1")

    # nGrams() generates all n-grams up to order 3 (1-3) and computes the frequency of each
    # The user can vary the maximum n-gram order and observe the difference in the output
    upToNgrams = 3
    nGramsEng = nGrams(english, upToNgrams)
    nGramsGer = nGrams(german, upToNgrams)
    nGramsSpn = nGrams(spanish, upToNgrams)
    nGramsItn = nGrams(italian, upToNgrams)
    nGramsFrn = nGrams(french, upToNgrams)
    nGramsDanish = nGrams(danish, upToNgrams)
    nGramsSwedin = nGrams(swedish, upToNgrams)
    # above we compute n-grams up to order 3; matching uses all orders 1-3

    # read whatever string the user wants to test
    inputStr = input(
        "Enter a string to detect its language (a longer string gives better results): "
    )
    ngramsOfInput = nGrams(inputStr, 3)

    # the code below computes the similarity of the test string to each language model
    result = {}
    result["English"] = cosineSim(nGramsEng, ngramsOfInput)
    result["German"] = cosineSim(nGramsGer, ngramsOfInput)
    result["Spanish"] = cosineSim(nGramsSpn, ngramsOfInput)
    result["Italian"] = cosineSim(nGramsItn, ngramsOfInput)
    result["french"] = cosineSim(nGramsFrn, ngramsOfInput)
    result["Danish"] = cosineSim(nGramsDanish, ngramsOfInput)
    result["Swedish"] = cosineSim(nGramsSwedin, ngramsOfInput)

    return result
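nGrams() and cosineSim() are defined elsewhere in that project. One possible sketch, assuming character n-gram count vectors and standard cosine similarity:

import math
from collections import Counter

def nGrams(text, upto):
    # counts of all character n-grams of orders 1..upto
    counts = Counter()
    for n in range(1, upto + 1):
        counts.update(text[i:i + n] for i in range(len(text) - n + 1))
    return counts

def cosineSim(a, b):
    # cosine similarity between two n-gram count dictionaries
    dot = sum(a[g] * b[g] for g in set(a) & set(b))
    norm_a = math.sqrt(sum(v * v for v in a.values()))
    norm_b = math.sqrt(sum(v * v for v in b.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0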
Example #24
File: ch02.py Project: gree2/hobby
def fun15():
    """freq dist plot"""
    raw_text = udhr.raw('Chinese_Mandarin-GB2312')
    FreqDist(raw_text).plot()
Example #25
#Stemming words
from nltk.stem import PorterStemmer
ps = PorterStemmer()
example = ['ride', 'rides', 'rider', 'riding']

for w in example:
    print(ps.stem(w))

sentence = "When riders are riding their horses, they often think of how cowboy rode horses."
words = word_tokenize(sentence)
for w in words:
    print(ps.stem(w))

#Video 2
from nltk.corpus import udhr
print(udhr.raw('English-Latin1'))

from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

#Train the PunktSentenceTokenizer
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)


#This function tags each tokenized word with a part of speech
def process_content():
    try:
Example #26
 def test_raw_unicode(self):
     for name in udhr.fileids():
         txt = udhr.raw(name)
         assert not isinstance(txt, bytes), name
Example #27
print(nltk.corpus.indian.words('hindi.pos'))
print(nltk.corpus.udhr.fileids()
      )  #universal declaration of human rights in > 300 languages
print(nltk.corpus.udhr.words('Javanese-Latin1'))

#cfd for udhr
languages = [
    'Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut',
    'Hungarian_Magyar', 'Ibibio_Efik'
]
cfd = nltk.ConditionalFreqDist((lang, len(word)) for lang in languages
                               for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=True)

#frequency distributions of letters in a text
raw_text = udhr.raw('Afrikaans-Latin1')
nltk.FreqDist(raw_text).plot()

#loading your own corpus
#for later (need to download a text corpus)

#conditional frequency distributions (theory)
genre_word = [(genre, word) for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
print(genre_word[:4])
print(genre_word[-4:])
cfd = nltk.ConditionalFreqDist(genre_word)
print(cfd)
print(cfd.conditions())
print(cfd["news"])
print(cfd["romance"])
Example #28
# coding: utf-8

# In[91]:

import nltk
import string

from nltk.util import ngrams

from nltk.corpus import udhr

english = udhr.raw('English-Latin1')
french = udhr.raw('French_Francais-Latin1')
italian = udhr.raw('Italian_Italiano-Latin1')
spanish = udhr.raw('Spanish_Espanol-Latin1')

english_train, english_dev = english[0:1000], english[1000:1100]
french_train, french_dev = french[0:1000], french[1000:1100]
italian_train, italian_dev = italian[0:1000], italian[1000:1100]
spanish_train, spanish_dev = spanish[0:1000], spanish[1000:1100]
english_test = udhr.words('English-Latin1')[0:1000]
french_test = udhr.words('French_Francais-Latin1')[0:1000]
italian_test = udhr.words('Italian_Italiano-Latin1')[0:1000]
spanish_test = udhr.words('Spanish_Espanol-Latin1')[0:1000]

eng_train = list(english_train)
#print(eng_train)

eng_train = [
    ''.join(c for c in s if c not in string.punctuation) for s in eng_train
]
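A possible next step (not shown in the original) is to build character bigram counts from the cleaned training characters:

eng_train_chars = [c for c in eng_train if c]  # drop the characters removed as punctuation
eng_bigram_cfd = nltk.ConditionalFreqDist(nltk.bigrams(eng_train_chars))
# eng_bigram_cfd['t'].freq('h') estimates P('h' | 't') in the training slice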
Example #29
def train(lang, n):
    langCorpus = []
    for x in lang:
        langCorpus.append(udhr.raw(x + '-Latin1'))
    return multiNgram(langCorpus, n)
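multiNgram() is not shown here; a plausible reading is that it builds one character n-gram frequency model per corpus (a sketch under that assumption):

from collections import Counter

def multiNgram(corpora, n):
    # one Counter of character n-grams of order n per corpus
    return [Counter(text[i:i + n] for i in range(len(text) - n + 1))
            for text in corpora]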
Example #30
 def test_raw_unicode(self):
     for name in udhr.fileids():
         txt = udhr.raw(name)
         assert not isinstance(txt, bytes), name
Example #31
"""
cfd.plot()
"""

# examining length differences in translated languages
from nltk.corpus import udhr
languages = [
    'Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut',
    'Hungarian_Magyar', 'Ibibio_Efik'
]
cfd = nltk.ConditionalFreqDist((lang, len(word)) for lang in languages
                               for word in udhr.words(lang + '-Latin1'))
cfd.plot(cumulative=True)

# plot frequency distribution of the letters
raw_text = udhr.raw('English-Latin1')
nltk.FreqDist(raw_text).plot()

# the basic functions of nltk are raw, words, and sents
"""
# loading your own corpus
from nltk.corpus import PlaintextCorpusReader
corpus_root = '/data'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
print(wordlists.fileids()) # reads all the file names
print(wordlists.words('filename')) # prints words in file called filename
"""

# conditional frequency distributions
# counting words by genre
from nltk.corpus import brown