Example #1
 def test_simple_tokenize(self):
     print("id: " + self.id())
     result_array_text = [
         "This", "text", "is", "written", "only", "for", "testing.",
         "There", "should", "not", "be", "any", "stop", "words", "at", "all"
     ]
     self.assertEqual(corpus.tokenize(self.input_text), result_array_text)
Example #2
 def test_tokenize_with_lemmatization(self):
     print("id: " + self.id())
     self.input_text += " adding words for testing lemmatization functions"
     result_array_text = [
         "This", "text", "be", "write", "only", "for", "testing.", "There",
         "should", "not", "be", "any", "stop", "word", "at", "all", "add",
         "word", "for", "test", "lemmatization", "function"
     ]
     self.assertEqual(corpus.tokenize(self.input_text, lemmatization=True),
                      result_array_text)
Example #3
 def test_tokenize_with_stop_words(self):
     print("id: " + self.id())
     stop_words = [
         "the", "a", "on", "is", "all", "for", "not", "no", "if", "in", "at"
     ]
     result_array_text = [
         "This", "text", "written", "only", "testing.", "There", "should",
         "be", "any", "stop", "words"
     ]
     self.assertEqual(
         corpus.tokenize(self.input_text, stop_words=stop_words),
         result_array_text)
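
Note: the three tests above assume a corpus.tokenize() that accepts optional lemmatization and stop_words keyword arguments. A minimal, hypothetical sketch of that signature (the real implementations in these projects may split punctuation and lemmatize differently):

def tokenize(text, lemmatization=False, stop_words=None):
    # naive whitespace tokenization; real versions may separate punctuation
    tokens = text.split()
    if stop_words is not None:
        if not isinstance(stop_words, list):
            # mirrors the TypeError expected in Example #10 below
            raise TypeError("stop_words must be a list of strings")
        tokens = [t for t in tokens if t.lower() not in stop_words]
    if lemmatization:
        # a real implementation would call an NLP library (e.g. spaCy) here
        pass
    return tokens
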
Example #4
 def train_file(self, sentences):
     i = 1
     size = len(sentences)
     tmp = ""
     for line in sentences:
         tmp += line + " "
         # train on the accumulated chunk every 20 lines and on the last line,
         # so the final partial chunk is not dropped
         if i % 20 == 0 or i == size:
             self.train(corpus.tokenize(tmp.lower()))
             tmp = ""
         # report progress every 1000 lines and at the end
         if i % 1000 == 0 or i == size:
             print(round((i / size) * 100), "% done")
         i += 1
Example #5
def test_subsequent_training():
    lm = LanguageModel(2)
    data = open_file('kn_test.txt')
    lm.train(data)
    model = lm.models[-1]
    wh1_len = len(model.word_hists_dct)
    hw1_len = len(model.hist_words_dct)
    data = tokenize('This sample.')
    lm.train(data)
    model = lm.models[-1]
    wh2_len = len(model.word_hists_dct)
    hw2_len = len(model.hist_words_dct)
    assert wh2_len - wh1_len == 1
    assert hw2_len - hw1_len == 1
    assert sorted(list(model.word_hists_dct['.'].keys())) \
     == sorted(['text', 'sample'])
    assert sorted(list(model.hist_words_dct['this'].keys())) \
     == sorted(['text', 'sample'])
Example #6
def generate():
    file_name = input("Please enter a training set's filename:")
    file = open(file_name)
    n = int(input("Please input an n-gram value 'n':"))
    model = init(n)
    sequences = []
    for line in file:
        tokens = corpus.tokenize(line)
        sequences.append(tokens)
    '''
    train_result = lm.train(sequences)
    print(lm.train(sequences))
    '''
    model.train(sequences)
    # print(model.counts)
    new_text_list = model.generate()
    # print(lm.generate())
    new_text = corpus.detokenize(new_text_list)
    return new_text
Example #7
def translate(string, model):

    string = tokenize(string)
    seq = []
    for word in string:

        similar_words = []

        try:
            similar_words = model.wv.most_similar(positive=[word])
        except KeyError:
            pass

        if Corpus.is_stopword(word) or len(similar_words) == 0:
            new = word
        else:
            new = similar_words[0][0]

        seq.append(new)

    return untokenize(seq)
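
The translate() above relies on a trained word-embedding model exposing gensim's wv.most_similar API, plus the project's tokenize, untokenize and Corpus.is_stopword helpers. A hypothetical usage sketch (the toy corpus and parameters below are illustrative only):

from gensim.models import Word2Vec

# toy training data; a real model would be trained on a large corpus
sentences = [["the", "cat", "sat"], ["the", "dog", "sat"], ["a", "cat", "ran"]]
w2v = Word2Vec(sentences=sentences, vector_size=50, min_count=1, epochs=50)

# every non-stopword is replaced by its nearest neighbour in embedding space
print(translate("the cat sat", w2v))
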
Example #8
def generate_save():
    new_text = ''
    filename = input('Please input a filename:')
    number = int(input('Please input the number of desired texts:'))

    file_name = input("Please enter a training set's filename:")
    file = open(file_name)
    n = int(input("Please input an n-gram value 'n':"))
    model = init(n)
    sequences = []
    for line in file:
        tokens = corpus.tokenize(line)
        sequences.append(tokens)
    model.train(sequences)

    for i in range(0, number):
        new_text_list = model.generate()
        new_text += corpus.detokenize(new_text_list) + '\n'

    file = open(filename, 'w')
    file.write(new_text)

    file.close()
Example #9
    def start_preprocessing(self,
                            extra_whitespace=True,
                            lowercase=True,
                            numbers=True,
                            special_chars=True,
                            stop_words=True,
                            lemmatization=True):
        self.processed_text = self.input_text
        if lowercase:
            self.processed_text = self.processed_text.lower()
        if numbers:
            self.processed_text = self.replace_numbers(self.processed_text)
        if special_chars:
            self.processed_text = self.remove_special_chars(
                self.processed_text)
        if extra_whitespace:
            self.processed_text = self.remove_whitespace(self.processed_text)
        if stop_words:
            self.init_stop_words()

        tokens = tokenize(self.processed_text, self.stop_words, lemmatization)

        return tokens
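
A hypothetical usage sketch for the method above (the enclosing class name and its input_text attribute are assumptions; only start_preprocessing() itself is taken from the source):

pre = TextPreprocessor()          # hypothetical class name
pre.input_text = "  The 3 cats   SAT on the mat!  "
# defaults: lowercase, replace numbers, strip special chars and extra
# whitespace, drop stop words, lemmatize
tokens = pre.start_preprocessing()
print(tokens)
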
Example #10
 def test_tokenize_with_stop_words_drop_except(self):
     # stop_words must be a list of strings; a bare bool should raise TypeError
     self.assertRaises(TypeError, corpus.tokenize,
                       self.input_text, stop_words=True)
Example #11
 def test_tokenize(self):
     text = "Oh you can't help that said the Cat"
     lst = corpus.tokenize(text)
     self.assertEqual(len(lst), 8, "Should be same length ie 8")
Example #12
 def test_apple(self):
     self.assertEqual(corpus.tokenize('This is an apple.'),
                      ['This', 'is', 'an', 'apple', '.'])
Example #13
    'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does',
    'either', 'else', 'ever', 'every', 'for', 'from', 'get', 'got', 'had',
    'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however', 'i',
    'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like',
    'likely', 'may', 'me', 'might', 'most', 'must', 'my', 'neither', 'no',
    'nor', 'not', 'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our',
    'own', 'rather', 'said', 'say', 'says', 'she', 'should', 'since', 'so',
    'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there', 'these',
    'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants', 'was', 'we',
    'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why',
    'will', 'with', 'would', 'yet', 'you', 'your'
]
stopwordsEN = stopwordsEN + [x.capitalize() for x in stopwordsEN]
#print(stopwordsEN)

mytokens = tokenize(getTextFromFile("pg873.txt"))

# filter out empty string tokens
mytokens = [x for x in mytokens if x]
#print(mytokens)

# filter out stopwords
mytokens = [x for x in mytokens if x not in stopwordsEN]
#print(mytokens)

unigrams = getNGramModel(mytokens, 1)
bigrams = getNGramModel(mytokens, 2)

#print(unigrams)

# prettyPrintFRP(bigrams, myreverse=False)
Example #14
def main():
    while True:

        print("Press 1 : Create a new language model with a user-specified n")
        print(
            "Press 2 : Load texts from a file, and train the language model on those texts"
        )
        print(
            "Press 3 : Generate a text from the language model, and print it to the screen"
        )
        print(
            "Press 4 : Generate a user-specified number of texts from the language model, and write them to a file"
        )
        print(
            "Press 5 : Print the predicted next word's probability distribution"
        )
        print("Press 6 : Perplexity of language model")
        print("Press 7 : Exit")
        print("Enter your choice (integer) ")
        text = input()
        if text == "1":
            print()
            print("Enter the value of n(integer value)")
            n = int(input())
            c = lm.LanguageModel(n)
            print("The value for ngram language model is ", n, "gram model")

        elif text == "2":
            print()
            print("You have pressed 2")
            print("Enter the filename")
            filename = input()
            # filename = "dev_shakespeare.txt"
            # lst = c.load(filename)
            c.load(filename)
            # print(lst)
            # c.train(lst)
            # print((c.counts))

        elif text == "3":
            print()
            print("You have pressed 3 ")
            print("Generate a random text")
            print(corpus.detokenize(c.generate()))

        elif text == "4":
            print()
            print("You have pressed 4 ")
            print("Enter the number for how many random texts you want")
            number_random = int(input())
            print("Enter the filename you want to save for random text")
            filename = input()
            file = open(filename, "w")
            while True:
                if number_random == 0:
                    break
                file.write(corpus.detokenize(c.generate()) + "\n")
                number_random -= 1
            file.close()
            # print(c.generate())

        elif text == "5":
            print()
            print("You have pressed 5 ")
            print(
                "Enter the text and predict the next word's probability distribution"
            )
            # s = "venture forth, The better part of my affections"
            s = input().lower()
            print(c.p_next(corpus.tokenize(s)))

        elif text == "6":
            print()
            print("You have pressed 6 ")
            print("Perplexity of the current language model is ",
                  round(c.perplexity()))

        elif text == "7":
            print()
            print("You have pressed 7 for exit")
            # for x in c.pdf:
            #     print(x, c.pdf[x])
            #
            # print(len(c.pdf))
            print("Exiting the main program")
            sys.exit(0)

        else:
            print(
                "Incorrect input. Please enter correct input for selecting option"
            )
Example #15
 def test_empty(self):
     self.assertEqual(corpus.tokenize(''), [])
Example #16
def test_tokenize_returns_list_of_strings():
    tokens = tokenize('this string')
    assert type(tokens) == list
    for item in tokens:
        assert type(item) == str
Example #17
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from corpus import relativizeFP, getTextFromFile, tokenize, removeJunk
from operator import itemgetter



#mylist = [ "A", "B", "C", "D", "E", "A", "B", "C" ]

mytokens = tokenize(getTextFromFile("pg873.txt"))


# use this:

#junk = " \n\t"
#mynewtokens = []
#for x in mytokens:
#   if x in junk:
#      continue
#   mynewtokens.append(x)
#mytokens = mynewtokens[:]

# or this (junk needs to be defined first):
junk = " \n\t"
mytokens = [e for e in mytokens if e not in junk]



def getMeTheNGramModel(tokens, n):
   mydict = {}
   # count each space-joined n-gram in the token stream
   for position in range(len(tokens) - n + 1):
      ngram = " ".join(tokens[position:position + n])
      mydict[ngram] = mydict.get(ngram, 0) + 1
   return mydict
Example #18
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

import sys, codecs
sys.stdout = codecs.getwriter('utf-8')(sys.stdout.detach())

from corpus import loadTextFromFile, tokenize, getNGrams

myngrams = getNGrams(tokenize( loadTextFromFile("bbc-1.txt") ))

try:
    outfile = open("test2.dot", mode="w", encoding="utf-8")
    print("digraph g {", file=outfile)
    for bigram in myngrams:
        tokens = bigram.split()
        print('"' + tokens[0] + "\" -> \"" + tokens[1] + "\";", file=outfile)
    print("}", file=outfile)
    outfile.close()
except IOError:
    pass

Example #19
def correct_string(string):
    """
    Correct the spelling of a string of words.
    """
    return print_words(correct_words(corpus.tokenize(string)))
Example #20
def test_tokenize_produces_correct_tokens(string, tokens):
    assert tokenize(string) == tokens
Example #21
def test_tokenize_handles_arbitrary_texts(text):
    tokens = tokenize(text)
    assert (tokens is None) or tokens
Example #22
def test_data_has_no_empty_tokens_given_hypothesis(text):
    data = tokenize(text)
    if data is not None:
        for line in data:
            for token in line:
                assert len(token) != 0
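
Examples #21 and #22 (and #24 below) read like property-based tests, presumably driven by the hypothesis library. A sketch of the decorator wiring such tests usually assume (hypothetical; the actual projects may configure strategies differently):

from hypothesis import given, strategies as st

@given(text=st.text())   # hypothesis feeds arbitrary unicode strings into the test
def test_tokenize_handles_arbitrary_texts(text):
    tokens = tokenize(text)
    assert (tokens is None) or tokens
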
Example #23
import sys, codecs

sys.stdout = codecs.getwriter('utf-8')(sys.stdout.detach())

# use this and then prefix all function names with corpus.
#import corpus
# or use this
from corpus import loadTextFromFile, tokenize, getTokenCounts, prettyPrintFrequencyProfile, relativizeTokenCounts

from math import log


mytext = loadTextFromFile("pg873.txt")

# tokenize mytext and return list of tokens
tokens = tokenize(mytext)

# count tokens
mydict = getTokenCounts(tokens)
relativizeTokenCounts(mydict)

# pretty-print tokens and frequencies
#prettyPrintFrequencyProfile(mydict, sortbyfrq=True, myreverse=True)

mytext = loadTextFromFile("sports-bbc.txt")
mysportsdict = getTokenCounts(tokenize(mytext))
relativizeTokenCounts(mysportsdict)

unknowntext = """Yesterday we scored ten goals in the last 45 minutest of the game."""

Example #24
def test_data_has_no_empty_lists_given_hypothesis(text):
    data = tokenize(text)
    if data:
        for line in data:
            assert len(line) != 0
Example #25
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

import sys, codecs
sys.stdout = codecs.getwriter('utf-8')(sys.stdout.detach())

from corpus import loadTextFromFile, tokenize, getNGrams

mytext = loadTextFromFile("cnn-1.txt")
mytokens = tokenize(mytext)
myngrams = getNGrams(mytokens, 2)

try:
   outputfile = open("sample-a1.xml", mode='w', encoding='utf-8')

   print("<bigrams>", file=outputfile)
   for bigram in myngrams:
      lefttoken, righttoken = bigram.split()
      print("<bigram>", file=outputfile)
      print("<lefttoken>" + lefttoken + "</lefttoken>", file=outputfile)
      print("<righttoken>" + righttoken + "</righttoken>", file=outputfile)
      print("<frequency>" + str(myngrams[bigram]) + "</frequency>", file=outputfile)
      print("</bigram>", file=outputfile)
   print("</bigrams>", file=outputfile)

   outputfile.close()
except IOError:
   print("Cannot open file...")

Example #26
#!/usr/bin/env python3


from corpus import getTextFromFile, tokenize, makeFrequencyProfile, removeJunk, prettyPrintFRP


# loadSpam, spamPath and spamList are assumed to be defined earlier in the project
for x in range(1, 6):
    loadSpam.split_data(x, 5, spamPath)

# note: as written, only the tokens of the last file in spamList are kept
for file in spamList:
    mytokens = tokenize(getTextFromFile(file))

mydict = makeFrequencyProfile(mytokens)

junk = " ,;:-+=()[]'\"?!%.<>"

removeJunk(mydict, junk)

if "" in mydict:
   del mydict[""]

prettyPrintFRP(mydict)
Example #27
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

from math import log


#import corpus
from corpus import getTextFromFile, makeFrequencyProfile, tokenize, relativizeFP

mydict = makeFrequencyProfile( tokenize( getTextFromFile("pg873.txt") ) )   
relativizeFP(mydict)

#for key in mydict:
#   print(key, mydict[key], sep="\t")

mysportsdict = makeFrequencyProfile( tokenize( getTextFromFile("sports.txt") ) )
relativizeFP(mysportsdict)

unktokens = tokenize("""
The young King was eating pomegranates and talking about his soul and other emotional issues.
""")

probpomeg = 0.0
probsports = 0.0
for token in unktokens:
   probpomeg += log(mydict.get(token, 0.00000000000001))
   probsports += log(mysportsdict.get(token, 0.00000000000001))

if probpomeg > probsports:
   print("This text is probably House of Pomeg.")
else:
   print("This text is probably sports.")
Example #29
from corpus import loadTextFromFile, getNGrams, getTokenCounts, prettyPrintFrequencyProfile, relativizeTokenCounts, tokenize
from math import log



def getMIScore(bigramprob, unigramprobaA, unigramprobB):
    return bigramprob * log(bigramprob / (unigramprobaA * unigramprobB) )

def getMIScoreFromFQP( bigrams, unigrams, bigram):
    tokenA, tokenB = bigram.split()
    return bigrams[bigram] * log( bigrams[bigram] / (unigrams[tokenA] * unigrams[tokenB]) )



tokens = tokenize( loadTextFromFile("pg873.txt") )

unigrams = getNGrams(tokens, 1)
relativizeTokenCounts( unigrams )

bigrams = getNGrams(tokens, 2)
#prettyPrintFrequencyProfile(bigrams, myreverse=False)

relativizeTokenCounts( bigrams )


# young King: likelihood ratio
lhr = bigrams["young Fisherman"] / (unigrams["young"] * unigrams["Fisherman"])

# young King - pointwise Mutual Information
pmi = bigrams["young Fisherman"] * log( lhr )
Example #30
from corpus import loadTextFromFile, getNGrams, getTokenCounts, prettyPrintFrequencyProfile, relativizeTokenCounts, tokenize
from math import log


def getMIScore(bigramprob, unigramprobaA, unigramprobB):
    return bigramprob * log(bigramprob / (unigramprobaA * unigramprobB))


def getMIScoreFromFQP(bigrams, unigrams, bigram):
    tokenA, tokenB = bigram.split()
    return bigrams[bigram] * log(bigrams[bigram] /
                                 (unigrams[tokenA] * unigrams[tokenB]))


tokens = tokenize(loadTextFromFile("pg873.txt"))

unigrams = getNGrams(tokens, 1)
relativizeTokenCounts(unigrams)

bigrams = getNGrams(tokens, 2)
#prettyPrintFrequencyProfile(bigrams, myreverse=False)

relativizeTokenCounts(bigrams)

# young King: likelihood ratio
lhr = bigrams["young Fisherman"] / (unigrams["young"] * unigrams["Fisherman"])

# young King - pointwise Mutual Information
pmi = bigrams["young Fisherman"] * log(lhr)
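
As a quick check, the manual lhr/pmi computation above should agree with the getMIScoreFromFQP helper; a small, assumed usage example:

# both expressions compute p(xy) * log(p(xy) / (p(x) * p(y))) for the same bigram
score = getMIScoreFromFQP(bigrams, unigrams, "young Fisherman")
assert abs(score - pmi) < 1e-12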