Code Example #1
 def test_simple_tokenize(self):
     print("id: " + self.id())
     result_array_text = [
         "This", "text", "is", "written", "only", "for", "testing.",
         "There", "should", "not", "be", "any", "stop", "words", "at", "all"
     ]
     self.assertEqual(corpus.tokenize(self.input_text), result_array_text)
Code Example #2
 def test_tokenize_with_lemmatization(self):
     print("id: " + self.id())
     self.input_text += " adding words for testing lemmatization functions"
     result_array_text = [
         "This", "text", "be", "write", "only", "for", "testing.", "There",
         "should", "not", "be", "any", "stop", "word", "at", "all", "add",
         "word", "for", "test", "lemmatization", "function"
     ]
     self.assertEqual(corpus.tokenize(self.input_text, lemmatization=True),
                      result_array_text)
Code Example #3
 def test_tokenize_with_stop_words(self):
     print("id: " + self.id())
     stop_words = [
         "the", "a", "on", "is", "all", "for", "not", "no", "if", "in", "at"
     ]
     result_array_text = [
         "This", "text", "written", "only", "testing.", "There", "should",
         "be", "any", "stop", "words"
     ]
     self.assertEqual(
         corpus.tokenize(self.input_text, stop_words=stop_words),
         result_array_text)
Code Example #4
File: lm.py Project: erishan6/ProgCL
 def train_file(self, sentences):
     i = 1
     size = len(sentences)
     tmp = ""
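     # accumulate lines and train the model in batches of 20 (plus any remainder at the end)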
     for line in sentences:
         # print(line)
         tmp += line + " "
         if i % 20 == 0 or i == size:
             self.train(corpus.tokenize(tmp.lower()))
             tmp = ""
         if i % 1000 == 0 or i == size:
             print(round((i / size) * 100), "% done")
         i += 1
Code Example #5
def test_subsequent_training():
    lm = LanguageModel(2)
    data = open_file('kn_test.txt')
    lm.train(data)
    model = lm.models[-1]
    wh1_len = len(model.word_hists_dct)
    hw1_len = len(model.hist_words_dct)
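    # training on one extra sentence should add exactly one new word and one new history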
    data = tokenize('This sample.')
    lm.train(data)
    model = lm.models[-1]
    wh2_len = len(model.word_hists_dct)
    hw2_len = len(model.hist_words_dct)
    assert wh2_len - wh1_len == 1
    assert hw2_len - hw1_len == 1
    assert sorted(list(model.word_hists_dct['.'].keys())) \
     == sorted(['text', 'sample'])
    assert sorted(list(model.hist_words_dct['this'].keys())) \
     == sorted(['text', 'sample'])
Code Example #6
def generate():
    file_name = input("Please enter a training set's filename:")
    file = open(file_name)
    n = int(input("Please input the n-gram order 'n':"))
    model = init(n)
    sequences = []
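    # tokenize the training file line by line and collect the token sequences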
    for line in file:
        tokens = corpus.tokenize(line)
        sequences.append(tokens)
    '''
    train_result = lm.train(sequences)
    print(lm.train(sequences))
    '''
    model.train(sequences)
    # print(model.counts)
    new_text_list = model.generate()
    # print(lm.generate())
    new_text = corpus.detokenize(new_text_list)
    return new_text
Code Example #7
def translate(string, model):

    string = tokenize(string)
    seq = []
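    # replace each token with its nearest neighbour in the word-embedding model,
    # keeping stopwords and out-of-vocabulary words unchanged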
    for word in string:

        similar_words = []

        try:
            similar_words = model.wv.most_similar(positive=[word])
        except KeyError:
            pass

        if Corpus.is_stopword(word) or len(similar_words) == 0:
            new = word
        else:
            new = similar_words[0][0]

        seq.append(new)

    return untokenize(seq)
Code Example #8
def generate_save():
    new_text = ''
    filename = input('Please input an output filename:')
    number = int(input('Please input the number of texts to generate:'))

    file_name = input("Please enter a training set's filename:")
    file = open(file_name)
    n = int(input("Please input the n-gram order 'n':"))
    model = init(n)
    sequences = []
    for line in file:
        tokens = corpus.tokenize(line)
        sequences.append(tokens)
    model.train(sequences)

    for i in range(0, number):
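        # generate one text per iteration, one text per output line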
        new_text_list = model.generate()
        new_text += corpus.detokenize(new_text_list) + '\n'

    file = open(filename, 'w')
    file.write(new_text)

    file.close()
Code Example #9
    def start_preprocessing(self,
                            extra_whitespace=True,
                            lowercase=True,
                            numbers=True,
                            special_chars=True,
                            stop_words=True,
                            lemmatization=True):
        self.processed_text = self.input_text
        if lowercase:
            self.processed_text = self.processed_text.lower()
        if numbers:
            self.processed_text = self.replace_numbers(self.processed_text)
        if special_chars:
            self.processed_text = self.remove_special_chars(
                self.processed_text)
        if extra_whitespace:
            self.processed_text = self.remove_whitespace(self.processed_text)
        if stop_words:
            self.init_stop_words()

        tokens = tokenize(self.processed_text, self.stop_words, lemmatization)

        return tokens
Code Example #10
 def test_tokenize_with_stop_words_drop_except(self):
     # stop_words must be a list; passing a bool is expected to raise TypeError
     self.assertRaises(TypeError, corpus.tokenize, self.input_text,
                       stop_words=True)
Code Example #11
 def test_tokenize(self):
     text = "Oh you can't help that said the Cat"
     lst = corpus.tokenize(text)
     self.assertEqual(len(lst), 8, "Should be same length ie 8")
Code Example #12
File: test.py Project: zhiyin121/Language_Model
 def test_apple(self):
     self.assertEqual(corpus.tokenize('This is an apple.'),
                      ['This', 'is', 'an', 'apple', '.'])
Code Example #13
File: collocations1.py Project: dcavar/Py3L
    'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does',
    'either', 'else', 'ever', 'every', 'for', 'from', 'get', 'got', 'had',
    'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however', 'i',
    'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like',
    'likely', 'may', 'me', 'might', 'most', 'must', 'my', 'neither', 'no',
    'nor', 'not', 'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our',
    'own', 'rather', 'said', 'say', 'says', 'she', 'should', 'since', 'so',
    'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there', 'these',
    'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants', 'was', 'we',
    'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why',
    'will', 'with', 'would', 'yet', 'you', 'your'
]
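# add capitalized variants so sentence-initial stopwords are filtered as well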
stopwordsEN = stopwordsEN + [x.capitalize() for x in stopwordsEN]
#print(stopwordsEN)

mytokens = tokenize(getTextFromFile("pg873.txt"))

# filter out empty string tokens
mytokens = [x for x in mytokens if x]
#print(mytokens)

# filter out stopwords
mytokens = [x for x in mytokens if x not in stopwordsEN]
#print(mytokens)

unigrams = getNGramModel(mytokens, 1)
bigrams = getNGramModel(mytokens, 2)

#print(unigrams)

# prettyPrintFRP(bigrams, myreverse=False)
Code Example #14
def main():
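    # simple interactive menu for creating, training, and querying the n-gram language model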
    while True:

        print("Press 1 : Create a new language model with a user-specified n")
        print(
            "Press 2 : Load texts from a file, and train the language model on those texts"
        )
        print(
            "Press 3 : Generate a text from the language model, and print it to the screen"
        )
        print(
            "Press 4 : Generate a user-specified number of texts from the language model, and write them to a file"
        )
        print(
            "Press 5 : Print the predicted next word's probability distribution"
        )
        print("Press 6 : Perplexity of language model")
        print("Press 7 : Exit")
        print("Enter your choice (integer) ")
        text = input()
        if text == "1":
            print()
            print("Enter the value of n(integer value)")
            n = int(input())
            c = lm.LanguageModel(n)
            print("The value for ngram language model is ", n, "gram model")

        elif text == "2":
            print()
            print("You have pressed 2")
            print("Enter the filename")
            filename = input()
            # filename = "dev_shakespeare.txt"
            # lst = c.load(filename)
            c.load(filename)
            # print(lst)
            # c.train(lst)
            # print((c.counts))

        elif text == "3":
            print()
            print("You have pressed 3 ")
            print("Generate a random text")
            print(corpus.detokenize(c.generate()))

        elif text == "4":
            print()
            print("You have pressed 4 ")
            print("Enter the number for how many random texts you want")
            number_random = int(input())
            print("Enter the filename you want to save for random text")
            filename = input()
            file = open(filename, "w")
            while True:
                if number_random == 0:
                    break
                file.write(corpus.detokenize(c.generate()) + "\n")
                number_random -= 1
            file.close()
            # print(c.generate())

        elif text == "5":
            print()
            print("You have pressed 5 ")
            print(
                "Enter the text to predict the next word's probability distribution"
            )
            # s = "venture forth, The better part of my affections"
            s = input().lower()
            print(c.p_next(corpus.tokenize(s)))

        elif text == "6":
            print()
            print("You have pressed 6 ")
            print("Perplexity of the current language model is ",
                  round(c.perplexity()))

        elif text == "7":
            print()
            print("You have pressed 7 for exit")
            # for x in c.pdf:
            #     print(x, c.pdf[x])
            #
            # print(len(c.pdf))
            print("Exiting the main program")
            sys.exit(0)

        else:
            print("Invalid input. Please enter a valid option.")
Code Example #15
File: test.py Project: zhiyin121/Language_Model
 def test_empty(self):
     self.assertEqual(corpus.tokenize(''), [])
Code Example #16
def test_tokenize_returns_list_of_strings():
    tokens = tokenize('this string')
    assert type(tokens) == list
    for item in tokens:
        assert type(item) == str
Code Example #17
File: list-loop-1.py Project: dcavar/Py3L
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from corpus import relativizeFP, getTextFromFile, tokenize, removeJunk
from operator import itemgetter



#mylist = [ "A", "B", "C", "D", "E", "A", "B", "C" ]

mytokens = tokenize(getTextFromFile("pg873.txt"))


# use this:

#junk = " \n\t"
#mynewtokens = []
#for x in mytokens:
#   if x in junk:
#      continue
#   mynewtokens.append(x)
#mytokens = mynewtokens[:]

# or this:
junk = " \n\t"
mytokens = [e for e in mytokens if e not in junk]



def getMeTheNGramModel(tokens, n):
   mydict = {}
   position = 0
Code Example #18
File: make-ngram.py Project: dcavar/Py3L
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

import sys, codecs
sys.stdout = codecs.getwriter('utf-8')(sys.stdout.detach())

from corpus import loadTextFromFile, tokenize, getNGrams

myngrams = getNGrams(tokenize(loadTextFromFile("bbc-1.txt")))

try:
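    # write each bigram as a directed edge in Graphviz dot format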
    outfile = open("test2.dot", mode="w", encoding="utf-8")
    print("digraph g {", file=outfile)
    for bigram in myngrams:
        tokens = bigram.split()
        print('"' + tokens[0] + "\" -> \"" + tokens[1] + "\";", file=outfile)
    print("}", file=outfile)
    outfile.close()
except IOError:
    pass

Code Example #19
File: correct.py Project: dwgill/dspell
def correct_string(string):
    """
    Correct the spelling of a string of words.
    """
    return print_words(correct_words(corpus.tokenize(string)))
Code Example #20
def test_tokenize_produces_correct_tokens(string, tokens):
    assert tokenize(string) == tokens
Code Example #21
def test_tokenize_handles_arbitrary_texts(text):
    tokens = tokenize(text)
    assert tokens is None or tokens
Code Example #22
def test_data_has_no_empty_tokens_given_hypothesis(text):
    data = tokenize(text)
    if data is not None:
        for line in data:
            for token in line:
                assert len(token) != 0
Code Example #23
import sys, codecs

sys.stdout = codecs.getwriter('utf-8')(sys.stdout.detach())

# use this and then prefix all function names with corpus.
#import corpus
# or use this
from corpus import loadTextFromFile, tokenize, getTokenCounts, prettyPrintFrequencyProfile, relativizeTokenCounts

from math import log


mytext = loadTextFromFile("pg873.txt")

# tokenize mytext and return list of tokens
tokens = tokenize(mytext)

# count tokens
mydict = getTokenCounts(tokens)
relativizeTokenCounts(mydict)

# pretty-print tokens and frequencies
#prettyPrintFrequencyProfile(mydict, sortbyfrq=True, myreverse=True)

mytext = loadTextFromFile("sports-bbc.txt")
mysportsdict = getTokenCounts(tokenize(mytext))
relativizeTokenCounts(mysportsdict)

unknowntext = """Yesterday we scored ten goals in the last 45 minutes of the game."""

Code Example #24
def test_data_has_no_empty_lists_given_hypothesis(text):
    data = tokenize(text)
    if data:
        for line in data:
            assert len(line) != 0
Code Example #25
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

import sys, codecs
sys.stdout = codecs.getwriter('utf-8')(sys.stdout.detach())

from corpus import loadTextFromFile, tokenize, getNGrams

mytext = loadTextFromFile("cnn-1.txt")
mytokens = tokenize(mytext)
myngrams = getNGrams(mytokens, 2)

try:
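   # write each bigram and its frequency out as XML elements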
   outputfile = open("sample-a1.xml", mode='w', encoding='utf-8')

   print("<bigrams>", file=outputfile)
   for bigram in myngrams:
      lefttoken, righttoken = bigram.split()
      print("<bigram>", file=outputfile)
      print("<lefttoken>" + lefttoken + "</lefttoken>", file=outputfile)
      print("<righttoken>" + righttoken + "</righttoken>", file=outputfile)
      print("<frequency>" + str(myngrams[bigram]) + "</frequency>", file=outputfile)
      print("</bigram>", file=outputfile)
   print("</bigrams>", file=outputfile)

   outputfile.close()
except IOError:
   print("Cannot open file...")

Code Example #26
#!/usr/bin/env python3


from corpus import getTextFromFile, tokenize, makeFrequencyProfile, removeJunk, prettyPrintFRP


for x in range(1, 6):
    loadSpam.split_data(x, 5, spamPath)

for file in spamList:
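    # tokenize each spam file (note: only the last file's tokens survive this loop)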
    mytokens = tokenize(getTextFromFile(file) )

mydict = makeFrequencyProfile(mytokens)

junk = " ,;:-+=()[]'\"?!%.<>"

removeJunk(mydict, junk)

if "" in mydict:
   del mydict[""]

prettyPrintFRP (mydict)
Code Example #27
File: generateFRQP.py Project: dcavar/Py3L
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

from math import log


#import corpus
from corpus import getTextFromFile, makeFrequencyProfile, tokenize, relativizeFP

mydict = makeFrequencyProfile(tokenize(getTextFromFile("pg873.txt")))
relativizeFP(mydict)

#for key in mydict:
#   print(key, mydict[key], sep="\t")

mysportsdict = makeFrequencyProfile(tokenize(getTextFromFile("sports.txt")))
relativizeFP(mysportsdict)

unktokens = tokenize("""
The young King was eating pomegranates and talking about his soul and other emotional issues.
""")

probpomeg = 0.0
probsports = 0.0
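# score the unknown text against each relative-frequency profile with summed log probabilities,
# using a tiny floor value for tokens unseen in a profile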
for token in unktokens:
   probpomeg += log(mydict.get(token, 0.00000000000001))
   probsports += log(mysportsdict.get(token, 0.00000000000001))

if probpomeg > probsports:
   print("This text is probably House of Pomeg.")
else:
Code Example #28
File: test-ngrams-1.py Project: dcavar/Py3L
from corpus import loadTextFromFile, getNGrams, getTokenCounts, prettyPrintFrequencyProfile, relativizeTokenCounts, tokenize
from math import log



def getMIScore(bigramprob, unigramprobaA, unigramprobB):
    return bigramprob * log(bigramprob / (unigramprobaA * unigramprobB))


def getMIScoreFromFQP(bigrams, unigrams, bigram):
    tokenA, tokenB = bigram.split()
    return bigrams[bigram] * log(bigrams[bigram] /
                                 (unigrams[tokenA] * unigrams[tokenB]))


tokens = tokenize(loadTextFromFile("pg873.txt"))

unigrams = getNGrams(tokens, 1)
relativizeTokenCounts(unigrams)

bigrams = getNGrams(tokens, 2)
#prettyPrintFrequencyProfile(bigrams, myreverse=False)

relativizeTokenCounts(bigrams)

# "young Fisherman": likelihood ratio
lhr = bigrams["young Fisherman"] / (unigrams["young"] * unigrams["Fisherman"])

# "young Fisherman": pointwise mutual information
pmi = bigrams["young Fisherman"] * log(lhr)