Example #1
import itertools

from nltk.corpus import brown, shakespeare
from nltk.probability import LidstoneProbDist
from nltk.model.ngram import NgramModel  # NLTK 2.x API; removed in NLTK 3


def shakespeare_words():
    """
    Concatenate the words of all Shakespeare plays in the corpus.
    """
    return itertools.chain.from_iterable(
        shakespeare.words(fileid) for fileid in shakespeare.fileids())


NGRAM_MODEL_N = 3
# TRAIN = brown.words(categories='lore')  # just a list of strings
TRAIN = list(shakespeare_words())  # shakespeare.words() needs a fileid, so chain all files
ESTIMATOR = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

lm = NgramModel(NGRAM_MODEL_N, TRAIN, estimator=ESTIMATOR)
print(lm)

print(lm.generate(40))
print('done')
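NgramModel only exists in NLTK 2; in NLTK 3 the equivalent lives in the nltk.lm package. A minimal sketch of the same trigram-with-Lidstone setup under NLTK 3.4+, trained here on Brown sentences because the Shakespeare XML reader has no sents() method:

from nltk.corpus import brown
from nltk.lm import Lidstone
from nltk.lm.preprocessing import padded_everygram_pipeline

N = 3
sents = brown.sents(categories='lore')  # nltk.lm trains on sentences, not a flat word list
train, vocab = padded_everygram_pipeline(N, sents)

lm = Lidstone(0.2, N)  # gamma=0.2 matches the LidstoneProbDist above
lm.fit(train, vocab)
print(lm.generate(40, random_seed=3))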
Example #3
    def __init__(self, limit=1000):
        # TODO: Read all of Shakespeare into words?
        fileid = shakespeare.fileids()[0]
        words = remove_punctuation(shakespeare.words(fileid))
        self.finder = BigramCollocationFinder.from_words(words)
        self.bigrams = self.finder.nbest(bigram_measures.raw_freq, limit)
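This snippet is a method from a larger class; remove_punctuation and bigram_measures are defined elsewhere in that project. A plausible minimal version of the surrounding scaffolding (the helper's behavior is an assumption, not the original code):

import string

from nltk.corpus import shakespeare
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

bigram_measures = BigramAssocMeasures()


def remove_punctuation(words):
    # Hypothetical helper: drop tokens that are pure punctuation.
    return [w for w in words if w not in string.punctuation]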
Example #4
from nltk.corpus import brown
from nltk.corpus import shakespeare
import string


if __name__ == '__main__':

    num_simulations = 10000
    alphabet = list(string.ascii_lowercase)
    word_length = 5
    start_state = ['t']

    # words = [w.lower() for w in brown.words()]
    all_words = []
    for book in shakespeare.fileids():
        all_words.extend(shakespeare.words(book))
    words = {w.lower() for w in all_words}

    eval_function = lambda word: 1 if ''.join(word) in words else 0

    mcts = TextMCTS(alphabet, word_length, eval_function)  # TextMCTS is defined elsewhere in the project
    state = start_state

    while len(state) < word_length:
        state = mcts.search(state, num_simulations)
        print(state)

    generated_word = ''.join(state)
    print("generated word: %s" % generated_word)
    print("is in corpus: %s" % (generated_word in words))
Example #5
import nltk

hamlet = nltk.corpus.gutenberg.words("shakespeare-hamlet.txt")  # word tokens
hamlet_sentences = nltk.corpus.gutenberg.sents("shakespeare-hamlet.txt")  # sentence lists
hamlet_all = nltk.corpus.gutenberg.raw("shakespeare-hamlet.txt")  # entire raw text
print(hamlet)
print(hamlet_sentences)
print(hamlet_all)

# Bringing your own text in as an NLTK corpus object
from nltk.corpus import PlaintextCorpusReader
corpus_root = 'C:/Users/adam_/Documents/MSDA/Data Foundations/'
filename = 'IS6713_syllabus.txt'
syllabus = PlaintextCorpusReader(corpus_root, filename)
syllabus.words()  # errors if the root directory or file does not exist on your machine

# Conditional frequency distributions provide handy tables and plots
from nltk import ConditionalFreqDist as CFD
from nltk.corpus import shakespeare
cfd = CFD((fileid, len(word)) for fileid in shakespeare.fileids()
          for word in shakespeare.words(fileid)[:20000] if len(word) > 3)
# Loop through each file in the Shakespeare collection and build, per file, the
# frequency distribution of word lengths for words longer than 3 characters,
# truncating each file after its first 20,000 words.
cfd.tabulate()
cfd.plot()
cfd['a_and_c.xml']  # index by condition (a fileid); the u'' prefix is a Python 2 unicode literal, unneeded in Python 3

from nltk.corpus import stopwords  # NLTK stopword lists
stopwords.words('english')  # note: all stopwords are lowercase

from nltk.tokenize import word_tokenize
sentence = "This is an example showing off stop word filtration"
stop_words = set(stopwords.words('english'))  # avoid shadowing the stopwords module
words = word_tokenize(sentence)
print(words)
filtered_sentence = [w for w in words if w.lower() not in stop_words]
print(filtered_sentence)
Example #6
import itertools

from nltk.corpus import shakespeare


def words():
    return list(
        itertools.chain.from_iterable(
            shakespeare.words(fileid) for fileid in shakespeare.fileids()))
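A quick way to exercise this helper with standard NLTK (the lowercasing is just one sensible choice, not part of the original):

from nltk import FreqDist

fd = FreqDist(w.lower() for w in words())
print(fd.most_common(10))  # ten most frequent tokens across all plays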