Example #1
 def test_simple_tokenize(self):
     print("id: " + self.id())
     result_array_text = [
         "This", "text", "is", "written", "only", "for", "testing.",
         "There", "should", "not", "be", "any", "stop", "words", "at", "all"
     ]
     self.assertEqual(corpus.tokenize(self.input_text), result_array_text)
Example #2
 def test_tokenize_with_lemmatization(self):
     print("id: " + self.id())
     self.input_text += " adding words for testing lemmatization functions"
     result_array_text = [
         "This", "text", "be", "write", "only", "for", "testing.", "There",
         "should", "not", "be", "any", "stop", "word", "at", "all", "add",
         "word", "for", "test", "lemmatization", "function"
     ]
     self.assertEqual(corpus.tokenize(self.input_text, lemmatization=True),
                      result_array_text)
Example #3
 def test_tokenize_with_stop_words(self):
     print("id: " + self.id())
     stop_words = [
         "the", "a", "on", "is", "all", "for", "not", "no", "if", "in", "at"
     ]
     result_array_text = [
         "This", "text", "written", "only", "testing.", "There", "should",
         "be", "any", "stop", "words"
     ]
     self.assertEqual(
         corpus.tokenize(self.input_text, stop_words=stop_words),
         result_array_text)
Example #4
 def train_file(self, sentences):
     i = 1
     size = len(sentences)
     tmp = ""
     for line in sentences:
         # accumulate the current line before deciding whether to flush the batch
         tmp += line + " "
         # train on batches of 20 lines, and flush whatever remains at the end
         if i % 20 == 0 or i == size:
             self.train(corpus.tokenize(tmp.lower()))
             tmp = ""
         if i % 1000 == 0 or i == size:
             print(round((i / size) * 100), "% done")
         i += 1
Example #5
def test_subsequent_training():
    lm = LanguageModel(2)
    data = open_file('kn_test.txt')
    lm.train(data)
    model = lm.models[-1]
    wh1_len = len(model.word_hists_dct)
    hw1_len = len(model.hist_words_dct)
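    # train on one more short sentence; each bigram table should gain exactly one new key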
    data = tokenize('This sample.')
    lm.train(data)
    model = lm.models[-1]
    wh2_len = len(model.word_hists_dct)
    hw2_len = len(model.hist_words_dct)
    assert wh2_len - wh1_len == 1
    assert hw2_len - hw1_len == 1
    assert sorted(list(model.word_hists_dct['.'].keys())) \
     == sorted(['text', 'sample'])
    assert sorted(list(model.hist_words_dct['this'].keys())) \
     == sorted(['text', 'sample'])
Example #6
def generate():
    file_name = input("Please enter a training set's filename:")
    file = open(file_name)
    n = int(input("Please input a n-grams value 'n':"))
    model = init(n)
    sequences = []
    for line in file:
        tokens = corpus.tokenize(line)
        sequences.append(tokens)
    '''
    train_result = lm.train(sequences)
    print(lm.train(sequences))
    '''
    model.train(sequences)
    # print(model.counts)
    new_text_list = model.generate()
    # print(lm.generate())
    new_text = corpus.detokenize(new_text_list)
    return new_text
Example #7
def translate(string, model):

    string = tokenize(string)
    seq = []
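    # replace each word with its closest word2vec neighbour,
    # keeping stop words and out-of-vocabulary words unchanged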
    for word in string:

        similar_words = []

        try:
            similar_words = model.wv.most_similar(positive=[word])
        except KeyError:
            pass

        if Corpus.is_stopword(word) or len(similar_words) == 0:
            new = word
        else:
            new = similar_words[0][0]

        seq.append(new)

    return untokenize(seq)
Example #8
def generate_save():
    new_text = ''
    filename = input('Please input a filename:')
    number = int(input('Please input the number of texts to generate:'))

    file_name = input("Please enter a training set's filename:")
    file = open(file_name)
    n = int(input("Please input a n-grams value 'n':"))
    model = init(n)
    sequences = []
    for line in file:
        tokens = corpus.tokenize(line)
        sequences.append(tokens)
    model.train(sequences)

    for i in range(0, number):
        new_text_list = model.generate()
        new_text += corpus.detokenize(new_text_list) + '\n'

    file = open(filename, 'w')
    file.write(new_text)

    file.close()
Example #9
    def start_preprocessing(self,
                            extra_whitespace=True,
                            lowercase=True,
                            numbers=True,
                            special_chars=True,
                            stop_words=True,
                            lemmatization=True):
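        """Apply the selected clean-up steps to input_text, then tokenize it."""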
        self.processed_text = self.input_text
        if lowercase:
            self.processed_text = self.processed_text.lower()
        if numbers:
            self.processed_text = self.replace_numbers(self.processed_text)
        if special_chars:
            self.processed_text = self.remove_special_chars(
                self.processed_text)
        if extra_whitespace:
            self.processed_text = self.remove_whitespace(self.processed_text)
        if stop_words:
            self.init_stop_words()
            self.init_stop_words()

        tokens = tokenize(self.processed_text, self.stop_words, lemmatization)

        return tokens
Example #10
 def test_tokenize_with_stop_words_drop_except(self):
     # stop_words must be a list of strings; passing a bool should raise TypeError
     self.assertRaises(TypeError, corpus.tokenize,
                       self.input_text, stop_words=True)
Example #11
 def test_tokenize(self):
     text = "Oh you can't help that said the Cat"
     lst = corpus.tokenize(text)
     self.assertEqual(len(lst), 8, "Should be the same length, i.e. 8")
Example #12
 def test_apple(self):
     self.assertEqual(corpus.tokenize('This is an apple.'),
                      ['This', 'is', 'an', 'apple', '.'])
Example #13
stopwordsEN = [
    # ... (earlier entries omitted in this excerpt)
    'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does',
    'either', 'else', 'ever', 'every', 'for', 'from', 'get', 'got', 'had',
    'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however', 'i',
    'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like',
    'likely', 'may', 'me', 'might', 'most', 'must', 'my', 'neither', 'no',
    'nor', 'not', 'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our',
    'own', 'rather', 'said', 'say', 'says', 'she', 'should', 'since', 'so',
    'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there', 'these',
    'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants', 'was', 'we',
    'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why',
    'will', 'with', 'would', 'yet', 'you', 'your'
]
stopwordsEN = stopwordsEN + [x.capitalize() for x in stopwordsEN]
#print(stopwordsEN)

mytokens = tokenize(getTextFromFile("pg873.txt"))

# filter out empty string tokens
mytokens = [x for x in mytokens if x]
#print(mytokens)

# filter out stopwords
mytokens = [x for x in mytokens if x not in stopwordsEN]
#print(mytokens)

unigrams = getNGramModel(mytokens, 1)
bigrams = getNGramModel(mytokens, 2)

#print(unigrams)

# prettyPrintFRP(bigrams, myreverse=False)
Example #14
def main():
    while True:

        print("Press 1 : Create a new language model with a user-specified n")
        print(
            "Press 2 : Load texts from a file, and train the language model on those texts"
        )
        print(
            "Press 3 : Generate a text from the language model, and print it to the screen"
        )
        print(
            "Press 4 : Generate a user-specified number of texts from the language model, and write them to a file"
        )
        print(
            "Press 5 : Print the predicted  next word's probability distribution"
        )
        print("Press 6 : Perplexity of language model")
        print("Press 7 : Exit")
        print("Enter your choice (integer) ")
        text = input()
        if text == "1":
            print()
            print("Enter the value of n(integer value)")
            n = int(input())
            c = lm.LanguageModel(n)
            print("The value for ngram language model is ", n, "gram model")

        elif text == "2":
            print()
            print("You have pressed 2")
            print("Enter the filename")
            filename = input()
            # filename = "dev_shakespeare.txt"
            # lst = c.load(filename)
            c.load(filename)
            # print(lst)
            # c.train(lst)
            # print((c.counts))

        elif text == "3":
            print()
            print("You have pressed 3 ")
            print("Generate a random text")
            print(corpus.detokenize(c.generate()))

        elif text == "4":
            print()
            print("You have pressed 4 ")
            print("Enter the number for how many random texts you want")
            number_random = int(input())
            print("Enter the filename you want to save for random text")
            filename = input()
            file = open(filename, "w")
            while True:
                if number_random == 0:
                    break
                file.write(corpus.detokenize(c.generate()) + "\n")
                number_random -= 1
            file.close()
            # print(c.generate())

        elif text == "5":
            print()
            print("You have pressed 5 ")
            print(
                "Enter the text and predict the next word's probability distribution"
            )
            # s = "venture forth, The better part of my affections"
            s = input().lower()
            print(c.p_next(corpus.tokenize(s)))

        elif text == "6":
            print()
            print("You have pressed 6 ")
            print("Perplexity of the current language model is ",
                  round(c.perplexity()))

        elif text == "7":
            print()
            print("You have pressed 7 for exit")
            # for x in c.pdf:
            #     print(x, c.pdf[x])
            #
            # print(len(c.pdf))
            print("Exiting the main program")
            sys.exit(0)

        else:
            print("Incorrect input. Please enter a valid option (1-7).")
Example #15
 def test_empty(self):
     self.assertEqual(corpus.tokenize(''), [])
Example #16
def test_tokenize_returns_list_of_strings():
    tokens = tokenize('this string')
    assert type(tokens) == list
    for item in tokens:
        assert type(item) == str
Example #17
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from corpus import relativizeFP, getTextFromFile, tokenize, removeJunk
from operator import itemgetter



#mylist = [ "A", "B", "C", "D", "E", "A", "B", "C" ]

mytokens = tokenize(getTextFromFile("pg873.txt"))


# characters treated as junk tokens
junk = " \n\t"

# use this:

#mynewtokens = []
#for x in mytokens:
#   if x in junk:
#      continue
#   mynewtokens.append(x)
#mytokens = mynewtokens[:]

# or this:
mytokens = [e for e in mytokens if e not in junk]



def getMeTheNGramModel(tokens, n):
   mydict = {}
   position = 0
Example #18
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

import sys, codecs
sys.stdout = codecs.getwriter('utf-8')(sys.stdout.detach())

from corpus import loadTextFromFile, tokenize, getNGrams

myngrams = getNGrams(tokenize( loadTextFromFile("bbc-1.txt") ))

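# write each bigram as a directed edge in a Graphviz .dot file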
try:
    outfile = open("test2.dot", mode="w", encoding="utf-8")
    print("digraph g {", file=outfile)
    for bigram in myngrams:
        tokens = bigram.split()
        print('"' + tokens[0] + "\" -> \"" + tokens[1] + "\";", file=outfile)
    print("}", file=outfile)
    outfile.close()
except IOError:
    pass

Example #19
def correct_string(string):
    """
    Correct the spelling of a string of words.
    """
    return print_words(correct_words(corpus.tokenize(string)))
Example #20
def test_tokenize_produces_correct_tokens(string, tokens):
    assert tokenize(string) == tokens
Example #21
def test_tokenize_handles_arbitrary_texts(text):
    tokens = tokenize(text)
    assert (tokens is None) or tokens
Example #22
def test_data_has_no_empty_tokens_given_hypothesis(text):
    data = tokenize(text)
    if data is not None:
        for line in data:
            for token in line:
                assert len(token) != 0
Example #23
import sys, codecs

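# re-wrap stdout so that printed output is always encoded as UTF-8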
sys.stdout = codecs.getwriter('utf-8')(sys.stdout.detach())

# use this and then prefix all function names with corpus.
#import corpus
# or use this
from corpus import loadTextFromFile, tokenize, getTokenCounts, prettyPrintFrequencyProfile, relativizeTokenCounts

from math import log


mytext = loadTextFromFile("pg873.txt")

# tokenize mytext and return list of tokens
tokens = tokenize(mytext)

# count tokens
mydict = getTokenCounts(tokens)
relativizeTokenCounts(mydict)

# pretty-print tokens and frequencies
#prettyPrintFrequencyProfile(mydict, sortbyfrq=True, myreverse=True)

mytext = loadTextFromFile("sports-bbc.txt")
mysportsdict = getTokenCounts(tokenize(mytext))
relativizeTokenCounts(mysportsdict)

unknowntext = """Yesterday we scored ten goals in the last 45 minutes of the game."""

Example #24
def test_data_has_no_empty_lists_given_hypothesis(text):
    data = tokenize(text)
    if data:
        for line in data:
            assert len(line) != 0
Example #25
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

import sys, codecs
sys.stdout = codecs.getwriter('utf-8')(sys.stdout.detach())

from corpus import loadTextFromFile, tokenize, getNGrams

mytext = loadTextFromFile("cnn-1.txt")
mytokens = tokenize(mytext)
myngrams = getNGrams(mytokens, 2)

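# write the bigram frequency profile out as a simple XML file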
try:
   outputfile = open("sample-a1.xml", mode='w', encoding='utf-8')

   print("<bigrams>", file=outputfile)
   for bigram in myngrams:
      lefttoken, righttoken = bigram.split()
      print("<bigram>", file=outputfile)
      print("<lefttoken>" + lefttoken + "</lefttoken>", file=outputfile)
      print("<righttoken>" + righttoken + "</righttoken>", file=outputfile)
      print("<frequency>" + str(myngrams[bigram]) + "</frequency>", file=outputfile)
      print("</bigram>", file=outputfile)
   print("</bigrams>", file=outputfile)

   outputfile.close()
except IOError:
   print("Cannot open file...")

Example #26
#!/usr/bin/env python3


from corpus import getTextFromFile, tokenize, makeFrequencyProfile, removeJunk, prettyPrintFRP


for x in range (1,6):
    loadSpam.split_data( x , 5, spamPath)

for file in spamList:
    mytokens = tokenize(getTextFromFile(file) )

mydict = makeFrequencyProfile(mytokens)

junk = " ,;:-+=()[]'\"?!%.<>"

removeJunk(mydict, junk)

if "" in mydict:
   del mydict[""]

prettyPrintFRP (mydict)
Example #27
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

from math import log


#import corpus
from corpus import getTextFromFile, makeFrequencyProfile, tokenize, relativizeFP

mydict = makeFrequencyProfile( tokenize( getTextFromFile("pg873.txt") ) )   
relativizeFP(mydict)

#for key in mydict:
#   print(key, mydict[key], sep="\t")

mysportsdict = makeFrequencyProfile( tokenize( getTextFromFile("sports.txt") ) )
relativizeFP(mysportsdict)

unktokens = tokenize("""
The young King was eating pomegranates and talking about his soul and other emotional issues.
""")

probpomeg = 0.0
probsports = 0.0
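# sum log probabilities under each model; unseen tokens fall back to a
# tiny constant so that log(0) is never taken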
for token in unktokens:
   probpomeg += log(mydict.get(token, 0.00000000000001))
   probsports += log(mysportsdict.get(token, 0.00000000000001))

if probpomeg > probsports:
   print("This text is probably House of Pomeg.")
else:
   print("This text is probably Sports.")
Example #29
from corpus import loadTextFromFile, getNGrams, getTokenCounts, prettyPrintFrequencyProfile, relativizeTokenCounts, tokenize
from math import log



def getMIScore(bigramprob, unigramprobaA, unigramprobB):
    return bigramprob * log(bigramprob / (unigramprobaA * unigramprobB) )

def getMIScoreFromFQP( bigrams, unigrams, bigram):
    tokenA, tokenB = bigram.split()
    return bigrams[bigram] * log( bigrams[bigram] / (unigrams[tokenA] * unigrams[tokenB]) )



tokens = tokenize( loadTextFromFile("pg873.txt") )

unigrams = getNGrams(tokens, 1)
relativizeTokenCounts( unigrams )

bigrams = getNGrams(tokens, 2)
#prettyPrintFrequencyProfile(bigrams, myreverse=False)

relativizeTokenCounts( bigrams )


# "young Fisherman": likelihood ratio
lhr = bigrams["young Fisherman"] / (unigrams["young"] * unigrams["Fisherman"])

# "young Fisherman" - pointwise mutual information
pmi = bigrams["young Fisherman"] * log( lhr )