def test_simple_tokenize(self):
    print("id: " + self.id())
    result_array_text = [
        "This", "text", "is", "written", "only", "for", "testing.", "There",
        "should", "not", "be", "any", "stop", "words", "at", "all"
    ]
    self.assertEqual(corpus.tokenize(self.input_text), result_array_text)
def test_tokenize_with_lemmatization(self):
    print("id: " + self.id())
    self.input_text += " adding words for testing lemmatization functions"
    result_array_text = [
        "This", "text", "be", "write", "only", "for", "testing.", "There",
        "should", "not", "be", "any", "stop", "word", "at", "all", "add",
        "word", "for", "test", "lemmatization", "function"
    ]
    self.assertEqual(
        corpus.tokenize(self.input_text, lemmatization=True),
        result_array_text)
def test_tokenize_with_stop_words(self):
    print("id: " + self.id())
    stop_words = [
        "the", "a", "on", "is", "all", "for", "not", "no", "if", "in", "at"
    ]
    result_array_text = [
        "This", "text", "written", "only", "testing.", "There", "should",
        "be", "any", "stop", "words"
    ]
    self.assertEqual(
        corpus.tokenize(self.input_text, stop_words=stop_words),
        result_array_text)
def train_file(self, sentences):
    size = len(sentences)
    tmp = ""
    for i, line in enumerate(sentences, start=1):
        # accumulate the current line before flushing, so the final line is
        # included in the last training chunk
        tmp += line + " "
        # train on a chunk every 20 lines (and on the final line)
        if i % 20 == 0 or i == size:
            self.train(corpus.tokenize(tmp.lower()))
            tmp = ""
        # progress report every 1000 lines
        if i % 1000 == 0 or i == size:
            print(round((i / size) * 100), "% done")
def test_subsequent_training():
    lm = LanguageModel(2)
    data = open_file('kn_test.txt')
    lm.train(data)
    model = lm.models[-1]
    wh1_len = len(model.word_hists_dct)
    hw1_len = len(model.hist_words_dct)
    data = tokenize('This sample.')
    lm.train(data)
    model = lm.models[-1]
    wh2_len = len(model.word_hists_dct)
    hw2_len = len(model.hist_words_dct)
    assert wh2_len - wh1_len == 1
    assert hw2_len - hw1_len == 1
    assert sorted(list(model.word_hists_dct['.'].keys())) \
        == sorted(['text', 'sample'])
    assert sorted(list(model.hist_words_dct['this'].keys())) \
        == sorted(['text', 'sample'])
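# The assertions above assume a bigram model that maps each word to the
# histories it follows (word_hists_dct) and each history to the words that
# follow it (hist_words_dct). A minimal sketch of how train() might fill
# these nested count dictionaries -- an illustrative assumption, not the
# project's actual implementation:
def _sketch_train_bigrams(tokens, word_hists_dct, hist_words_dct):
    for hist, word in zip(tokens, tokens[1:]):
        word_hists_dct.setdefault(word, {})
        word_hists_dct[word][hist] = word_hists_dct[word].get(hist, 0) + 1
        hist_words_dct.setdefault(hist, {})
        hist_words_dct[hist][word] = hist_words_dct[hist].get(word, 0) + 1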
def generate():
    file_name = input("Please enter a training set's filename:")
    file = open(file_name)
    n = int(input("Please input a n-grams value 'n':"))
    model = init(n)
    sequences = []
    for line in file:
        sequences.append(corpus.tokenize(line))
    file.close()
    model.train(sequences)
    new_text_list = model.generate()
    return corpus.detokenize(new_text_list)
def translate(string, model):
    tokens = tokenize(string)
    seq = []
    for word in tokens:
        similar_words = []
        try:
            similar_words = model.wv.most_similar(positive=[word])
        except KeyError:
            # word is not in the embedding vocabulary; keep it unchanged
            pass
        if Corpus.is_stopword(word) or len(similar_words) == 0:
            new = word
        else:
            new = similar_words[0][0]
        seq.append(new)
    return untokenize(seq)
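# A hedged usage sketch: translate() expects a trained gensim word2vec model,
# i.e. anything exposing model.wv.most_similar. The training file name and the
# parameters below are illustrative assumptions (vector_size is the gensim 4.x
# keyword), not part of the original code.
from gensim.models import Word2Vec

sentences = [tokenize(line) for line in open('training.txt')]
w2v = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1)
print(translate('the young king was eating pomegranates', w2v))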
def generate_save():
    new_text = ''
    filename = input('Please input a filename:')
    number = int(input('Please input the number of desired texts:'))
    file_name = input("Please enter a training set's filename:")
    training_file = open(file_name)
    n = int(input("Please input a n-grams value 'n':"))
    model = init(n)
    sequences = []
    for line in training_file:
        sequences.append(corpus.tokenize(line))
    training_file.close()
    model.train(sequences)
    for i in range(number):
        new_text_list = model.generate()
        new_text += corpus.detokenize(new_text_list) + '\n'
    out_file = open(filename, 'w')
    out_file.write(new_text)
    out_file.close()
def start_preprocessing(self, extra_whitespace=True, lowercase=True,
                        numbers=True, special_chars=True, stop_words=True,
                        lemmatization=True):
    self.processed_text = self.input_text
    if lowercase:
        self.processed_text = self.processed_text.lower()
    if numbers:
        self.processed_text = self.replace_numbers(self.processed_text)
    if special_chars:
        self.processed_text = self.remove_special_chars(self.processed_text)
    if extra_whitespace:
        self.processed_text = self.remove_whitespace(self.processed_text)
    if stop_words:
        self.init_stop_words()
    tokens = tokenize(self.processed_text, self.stop_words, lemmatization)
    return tokens
def test_tokenize_with_stop_words_drop_except(self):
    # stop_words must be a list of strings; passing True should raise TypeError.
    # Pass the callable and its arguments to assertRaises instead of calling
    # tokenize directly, otherwise the exception is raised outside the assertion.
    self.assertRaises(TypeError, corpus.tokenize, self.input_text,
                      stop_words=True)
def test_tokenize(self):
    text = "Oh you can't help that said the Cat"
    lst = corpus.tokenize(text)
    self.assertEqual(len(lst), 8, "Should contain 8 tokens")
def test_apple(self):
    self.assertEqual(corpus.tokenize('This is an apple.'),
                     ['This', 'is', 'an', 'apple', '.'])
    'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does',
    'either', 'else', 'ever', 'every', 'for', 'from', 'get', 'got', 'had',
    'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however', 'i',
    'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like',
    'likely', 'may', 'me', 'might', 'most', 'must', 'my', 'neither', 'no',
    'nor', 'not', 'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our',
    'own', 'rather', 'said', 'say', 'says', 'she', 'should', 'since', 'so',
    'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there', 'these',
    'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants', 'was', 'we',
    'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why',
    'will', 'with', 'would', 'yet', 'you', 'your'
]

# add capitalized variants so sentence-initial stopwords are caught too
stopwordsEN = stopwordsEN + [x.capitalize() for x in stopwordsEN]
#print(stopwordsEN)

mytokens = tokenize(getTextFromFile("pg873.txt"))

# filter out empty string tokens
mytokens = [x for x in mytokens if x]
#print(mytokens)

# filter out stopwords
mytokens = [x for x in mytokens if x not in stopwordsEN]
#print(mytokens)

unigrams = getNGramModel(mytokens, 1)
bigrams = getNGramModel(mytokens, 2)
#print(unigrams)
# prettyPrintFRP(bigrams, myreverse=False)
def main():
    while True:
        print("Press 1 : Create a new language model with a user-specified n")
        print("Press 2 : Load texts from a file, and train the language model on those texts")
        print("Press 3 : Generate a text from the language model, and print it to the screen")
        print("Press 4 : Generate a user-specified number of texts from the language model, and write them to a file")
        print("Press 5 : Print the predicted next word's probability distribution")
        print("Press 6 : Perplexity of language model")
        print("Press 7 : Exit")
        print("Enter your choice (integer)")
        text = input()
        if text == "1":
            print()
            print("Enter the value of n (integer value)")
            n = int(input())
            c = lm.LanguageModel(n)
            print("The value of n for the n-gram language model is", n)
        elif text == "2":
            print()
            print("You have pressed 2")
            print("Enter the filename")
            filename = input()
            c.load(filename)
        elif text == "3":
            print()
            print("You have pressed 3")
            print("Generate a random text")
            print(corpus.detokenize(c.generate()))
        elif text == "4":
            print()
            print("You have pressed 4")
            print("Enter the number for how many random texts you want")
            number_random = int(input())
            print("Enter the filename you want to save the random texts to")
            filename = input()
            file = open(filename, "w")
            while number_random > 0:
                file.write(corpus.detokenize(c.generate()) + "\n")
                number_random -= 1
            file.close()
        elif text == "5":
            print()
            print("You have pressed 5")
            print("Enter the text to predict the next word's probability distribution")
            s = input().lower()
            print(c.p_next(corpus.tokenize(s)))
        elif text == "6":
            print()
            print("You have pressed 6")
            print("Perplexity of the current language model is", round(c.perplexity()))
        elif text == "7":
            print()
            print("You have pressed 7 to exit")
            print("Exiting the main program")
            sys.exit(0)
        else:
            print("Incorrect input. Please enter a valid option")
def test_empty(self):
    self.assertEqual(corpus.tokenize(''), [])
def test_tokenize_returns_list_of_strings():
    tokens = tokenize('this string')
    assert isinstance(tokens, list)
    for item in tokens:
        assert isinstance(item, str)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from corpus import relativizeFP, getTextFromFile, tokenize, removeJunk
from operator import itemgetter

#mylist = [ "A", "B", "C", "D", "E", "A", "B", "C" ]

mytokens = tokenize(getTextFromFile("pg873.txt"))

# drop whitespace-only tokens (junk must be defined for the filter below)
junk = " \n\t"

# use this:
#mynewtokens = []
#for x in mytokens:
#    if x in junk:
#        continue
#    mynewtokens.append(x)
#mytokens = mynewtokens[:]

# or this:
mytokens = [e for e in mytokens if e not in junk]


def getMeTheNGramModel(tokens, n):
    mydict = {}
    position = 0
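    # The function body is cut off above; a hedged completion sketch, assuming
    # the same shape as getNGrams elsewhere in this collection (space-joined
    # n-grams mapped to their counts):
    while position <= len(tokens) - n:
        ngram = " ".join(tokens[position:position + n])
        mydict[ngram] = mydict.get(ngram, 0) + 1
        position += 1
    return mydict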
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

import sys, codecs
sys.stdout = codecs.getwriter('utf-8')(sys.stdout.detach())

from corpus import loadTextFromFile, tokenize, getNGrams

myngrams = getNGrams(tokenize(loadTextFromFile("bbc-1.txt")))

try:
    outfile = open("test2.dot", mode="w", encoding="utf-8")
    print("digraph g {", file=outfile)
    for bigram in myngrams:
        tokens = bigram.split()
        print('"' + tokens[0] + '" -> "' + tokens[1] + '";', file=outfile)
    print("}", file=outfile)
    outfile.close()
except IOError:
    pass
def correct_string(string):
    """
    Correct the spelling of a string of words.
    """
    return print_words(correct_words(corpus.tokenize(string)))
def test_tokenize_produces_correct_tokens(string, tokens):
    assert tokenize(string) == tokens
def test_tokenize_handles_arbitrary_texts(text):
    tokens = tokenize(text)
    assert (tokens is None) or tokens
def test_data_has_no_empty_tokens_given_hypothesis(text):
    data = tokenize(text)
    if data is not None:
        for line in data:
            for token in line:
                assert len(token) != 0
import sys, codecs
sys.stdout = codecs.getwriter('utf-8')(sys.stdout.detach())

# use this and then prefix all function names with corpus.
#import corpus
# or use this
from corpus import loadTextFromFile, tokenize, getTokenCounts, prettyPrintFrequencyProfile, relativizeTokenCounts
from math import log

mytext = loadTextFromFile("pg873.txt")

# tokenize mytext and return list of tokens
tokens = tokenize(mytext)

# count tokens
mydict = getTokenCounts(tokens)
relativizeTokenCounts(mydict)

# pretty-print tokens and frequencies
#prettyPrintFrequencyProfile(mydict, sortbyfrq=True, myreverse=True)

mytext = loadTextFromFile("sports-bbc.txt")
mysportsdict = getTokenCounts(tokenize(mytext))
relativizeTokenCounts(mysportsdict)

unknowntext = """Yesterday we scored ten goals in the last 45 minutes of the game."""
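# The snippet stops here; a hedged sketch of the comparison step, following the
# same naive log-probability pattern used with the frequency profiles elsewhere
# in this collection (the tiny floor for unseen tokens is an assumption):
probpomeg = 0.0
probsports = 0.0
for token in tokenize(unknowntext):
    probpomeg += log(mydict.get(token, 0.00000000000001))
    probsports += log(mysportsdict.get(token, 0.00000000000001))

if probsports > probpomeg:
    print("The unknown text is probably sports.")
else:
    print("The unknown text is probably from pg873.")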
def test_data_has_no_empty_lists_given_hypothesis(text):
    data = tokenize(text)
    if data:
        for line in data:
            assert len(line) != 0
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

import sys, codecs
sys.stdout = codecs.getwriter('utf-8')(sys.stdout.detach())

from corpus import loadTextFromFile, tokenize, getNGrams

mytext = loadTextFromFile("cnn-1.txt")
mytokens = tokenize(mytext)
myngrams = getNGrams(mytokens, 2)

try:
    outputfile = open("sample-a1.xml", mode='w', encoding='utf-8')
    print("<bigrams>", file=outputfile)
    for bigram in myngrams:
        lefttoken, righttoken = bigram.split()
        print("<bigram>", file=outputfile)
        print("<lefttoken>" + lefttoken + "</lefttoken>", file=outputfile)
        print("<righttoken>" + righttoken + "</righttoken>", file=outputfile)
        print("<frequency>" + str(myngrams[bigram]) + "</frequency>", file=outputfile)
        print("</bigram>", file=outputfile)
    print("</bigrams>", file=outputfile)
    outputfile.close()
except IOError:
    print("Cannot open file...")
#!/usr/bin/env python3

from corpus import getTextFromFile, tokenize, makeFrequencyProfile, removeJunk, prettyPrintFRP

# loadSpam, spamPath and spamList are assumed to be provided elsewhere
# (e.g. a helper module that splits the spam corpus into folds)
for x in range(1, 6):
    loadSpam.split_data(x, 5, spamPath)

for file in spamList:
    mytokens = tokenize(getTextFromFile(file))
    mydict = makeFrequencyProfile(mytokens)
    junk = " ,;:-+=()[]'\"?!%.<>"
    removeJunk(mydict, junk)
    if "" in mydict:
        del mydict[""]
    prettyPrintFRP(mydict)
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

from math import log
#import corpus
from corpus import getTextFromFile, makeFrequencyProfile, tokenize, relativizeFP

mydict = makeFrequencyProfile(tokenize(getTextFromFile("pg873.txt")))
relativizeFP(mydict)

#for key in mydict:
#    print(key, mydict[key], sep="\t")

mysportsdict = makeFrequencyProfile(tokenize(getTextFromFile("sports.txt")))
relativizeFP(mysportsdict)

unktokens = tokenize("""
The young King was eating pomegranates and talking about
his soul and other emotional issues.
""")

probpomeg = 0.0
probsports = 0.0
for token in unktokens:
    # unseen tokens get a tiny floor probability to keep log() defined
    probpomeg += log(mydict.get(token, 0.00000000000001))
    probsports += log(mysportsdict.get(token, 0.00000000000001))

if probpomeg > probsports:
    print("This text is probably House of Pomeg.")
else:
    print("This text is probably sports.")
from corpus import loadTextFromFile, getNGrams, getTokenCounts, prettyPrintFrequencyProfile, relativizeTokenCounts, tokenize
from math import log


def getMIScore(bigramprob, unigramprobA, unigramprobB):
    return bigramprob * log(bigramprob / (unigramprobA * unigramprobB))


def getMIScoreFromFQP(bigrams, unigrams, bigram):
    tokenA, tokenB = bigram.split()
    return bigrams[bigram] * log(bigrams[bigram] / (unigrams[tokenA] * unigrams[tokenB]))


tokens = tokenize(loadTextFromFile("pg873.txt"))

unigrams = getNGrams(tokens, 1)
relativizeTokenCounts(unigrams)

bigrams = getNGrams(tokens, 2)
#prettyPrintFrequencyProfile(bigrams, myreverse=False)
relativizeTokenCounts(bigrams)

# "young Fisherman": likelihood ratio
lhr = bigrams["young Fisherman"] / (unigrams["young"] * unigrams["Fisherman"])

# "young Fisherman": pointwise mutual information, weighted by the bigram probability
pmi = bigrams["young Fisherman"] * log(lhr)
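# A hedged usage sketch: rank all bigrams with the MI score defined above.
# It assumes each space-joined bigram key splits into exactly two unigram keys;
# the variable names and the top-20 cutoff are illustrative choices.
scored = []
for bigram in bigrams:
    parts = bigram.split()
    if len(parts) == 2 and all(p in unigrams for p in parts):
        scored.append((getMIScoreFromFQP(bigrams, unigrams, bigram), bigram))

for score, bigram in sorted(scored, reverse=True)[:20]:
    print(round(score, 6), bigram)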