Example #1
import nltk
from nltk.text import Text
from nltk.probability import LidstoneProbDist
from nltk.model.ngram import NgramModel

f = open("poem.txt")
# f = open("howl.txt")

data = f.read()

def removeNonAscii(s):
    # keep only ASCII characters so the tokenizer sees clean input
    return "".join(i for i in s if ord(i) < 128)

data = removeNonAscii(data)

tokens = nltk.word_tokenize(data)
t = Text(tokens)
# t.generate(30)

# Lidstone (add-0.2) smoothing for n-grams unseen in training
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
trigram_model = NgramModel(3, t, estimator=estimator)

# generate 150 tokens and drop the first 10
token_array = trigram_model.generate(150)[10:]

first_token = token_array[0].strip()

if first_token in [".", ",", "?", "(", ")"]:
    token_array = token_array[1:]

joined = " ".join(token_array)
joined = joined.replace(" . ", ".\n")
joined = joined.replace(". ", ".\n")
joined = joined.replace(" ? ", "?\n")
joined = joined.replace(" , ", ",\n")
joined = joined.replace(" ) ", ")\n")
joined = joined.replace(" ( ", "(")
joined = joined.replace(" ! ", "!\n")
Example #2
from nltk.corpus import brown, shakespeare
from nltk.probability import LidstoneProbDist
from nltk.model.ngram import NgramModel

##todo: try shakespeare corpus

NGRAM_MODEL_N = 3
#TRAIN = brown.words(categories='lore') ## just a list of strings
TRAIN = shakespeare.words()
ESTIMATOR = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

lm = NgramModel(NGRAM_MODEL_N, TRAIN, estimator=ESTIMATOR)
print(lm)

print(lm.generate(40))
print('done')
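
nltk.model.ngram.NgramModel only exists in old (pre-3.0) NLTK releases; current NLTK ships the nltk.lm package instead. A rough, minimal sketch of the same trigram training and generation on NLTK 3.x (MLE here rather than the Lidstone estimator; nltk.lm also provides a Lidstone model if add-gamma smoothing is wanted):

from nltk.corpus import shakespeare
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

N = 3
# padded_everygram_pipeline expects an iterable of tokenized "sentences";
# here the whole corpus is treated as one long token sequence
train_data, vocab = padded_everygram_pipeline(N, [list(shakespeare.words())])

lm = MLE(N)
lm.fit(train_data, vocab)
print(lm.generate(40, random_seed=1))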
Example #3

import nltk
from nltk.text import Text
from nltk.probability import LidstoneProbDist
from nltk.model.ngram import NgramModel

# read the raw corpus text (as in Example #1); point this at your own file
data = open("poem.txt").read()


def removeNonAscii(s):
    # keep only ASCII characters so the tokenizer sees clean input
    return "".join(i for i in s if ord(i) < 128)


data = removeNonAscii(data)

tokens = nltk.word_tokenize(data)
t = Text(tokens)
# t.generate(30)

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
trigram_model = NgramModel(3, t, estimator=estimator)

token_array = trigram_model.generate(150)[10:]

first_token = token_array[0].strip()

if first_token in [".", ",", "?", "(", ")"]:
    token_array = token_array[1:]

joined = " ".join(token_array)
joined = joined.replace(" . ", ".\n")
joined = joined.replace(". ", ".\n")
joined = joined.replace(" ? ", "?\n")
joined = joined.replace(" , ", ",\n")
joined = joined.replace(" ) ", ")\n")
joined = joined.replace(" ( ", "(")
joined = joined.replace(" ! ", "!\n")