def cross_validate(result_file, root_dir, folds, read_func, sent_num):
    if os.path.isfile(result_file):
        os.system('rm ' + result_file)
    print 'cross-validation started'

    for i, fold in enumerate(folds):
        print 'fold %d/%d started' % (i+1, len(folds))
        # read test
        test = read_func(root_dir, fold, sent_num, local_idx = True, full_info = True)
        # read train
        train_files = reduce(lambda a,b: a+b, folds[:i] + folds[i+1:])
        train = read_func(root_dir, train_files, sent_num)
        # open result file
        fw = open(result_file, 'ab')
        # train and compute
        count = 1
        for key in test.iterkeys():
            lm = NgramModel(3, train[key])
            for s in test[key]:
                if len(s[-1]) > 0:
                    e = lm.entropy(s[-1])
                    # s[0] is xml id
                    # s[1] is div index
                    # key is global index
                    # s[2] is turn index
                    # s[3] is local index
                    fw.write(','.join(map(str, (s[0], s[1], key, s[2], s[3], e))) + '\n')
            # print progress
            sys.stdout.write('\rkey = %d/%d done' % (count, len(test)))
            sys.stdout.flush()
            count += 1
        fw.close()
        print '\nfold %d/%d done' % (i+1, len(folds))
    def test_MLEEstimator(self):
        est = MLEEstimator()
        dapos_model = NGram(3, estimator=est)
        dapos_model.set_index(AuxiliaryIndex)

        nltk_model = NgramModel(3, self.corpus, estimator=MLEProbDist)
        phrase = 'Stop being stunned'.split()
        x = dapos_model.prob(phrase)
        y = nltk_model.prob(phrase[2], phrase[:2])
def train():
    # parse XML and load up words
    print("Loading words from XML files...")
    sentences = []
    files = glob.glob("data/*.xml")
    i = 0
    for file in files:
        if i > 0 and i % 500 == 0:
            print("%d/%d files loaded, #-sentences: %d" %
                  (i, len(files), len(sentences)))
            break
        dir, file = file.split("/")
        reader = XMLCorpusReader(dir, file)
        sentences.extend(nltk.sent_tokenize(" ".join(reader.words())))
        i += 1
    words = []
    for sentence in sentences:
        words.append(nltk.word_tokenize(sentence))
    # build a trigram language model (using Lidstone smoothing)
    # with the words array
    print("Building language model...")
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    langModel = NgramModel(3, words, estimator=est)
    #  langModel = NgramModel(3, words)
    #  cPickle.dump(langModel, open("lm.bin", 'wb'))
    return langModel
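
A possible usage sketch (added here, not part of the original snippet): train on the XML corpus above and sample a short passage from the resulting trigram model.

lm = train()
# generate() in the old NLTK NgramModel API returns a list of tokens
print(" ".join(lm.generate(25)))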
Example #4
    def build_lm(self, corpus, order=2):
        """
        Create a reasonable English language model on your training data.
        """

        self._lm_order = order
        if order > 0:
            tokens = []
            sentence_count = 0
            for e_sent, f_sent in corpus:
                if sentence_count % 100 == 0:
                    print("LM Sentence %i" % sentence_count)
                sentence_count += 1

                # Each sentence starts with an empty string
                tokens += [''] + e_sent

            estimator = lambda fdist, bins: \
                LidstoneProbDist(fdist, 0.1)
            self._lm = NgramModel(order,
                                  tokens,
                                  pad_left=False,
                                  pad_right=False,
                                  estimator=estimator)
        else:
            self._lm = StubLanguageModel()
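
A minimal standalone sketch (added; the toy parallel corpus is made up) of what build_lm does with its English side: prepend an empty-string pad to each sentence, concatenate the tokens, and fit a Lidstone-smoothed bigram model.

from nltk.model.ngram import NgramModel
from nltk.probability import LidstoneProbDist

corpus = [(["the", "house", "is", "small"], ["das", "haus", "ist", "klein"]),
          (["a", "small", "book"], ["ein", "kleines", "buch"])]
tokens = []
for e_sent, f_sent in corpus:
    tokens += [''] + e_sent  # each sentence starts with an empty string
lm = NgramModel(2, tokens, pad_left=False, pad_right=False,
                estimator=lambda fdist, bins: LidstoneProbDist(fdist, 0.1))
print(lm.prob("house", ["the"]))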
Example #5
def leftmodel(sequencesinfamilylist):
    print 'Learning left model...'
    model_left = NgramModel(3,
                            sequencesinfamilylist,
                            pad_left=False,
                            pad_right=True,
                            estimator=lid_estimator)
    print 'Done learning left model.'
    return model_left
Example #6
def store_name_brand_attribute_features(review_data, filepath, sheetnum,
                                        colnum, storepath):
    lm = NgramModel(1, review_data, estimator=None)
    data = tp.seg_fil_excel(filepath, sheetnum, colnum)
    ep = entropy_perplexity(lm, data)
    p = open(storepath, 'w')
    for j in ep:
        p.write(str(j[0]) + '\t' + str(j[1]) + '\n')
    p.close()
Example #7
def build_model(word_string):
	words = word_string.replace('\n',' ').replace('\t',' ')
	#split_delim = "|".join(["\%s" % s for s in string.punctuation + " "])
	#words = re.split(split_delim,words)
	words = re.findall('[a-zA-Z]+|[%s]+' % string.punctuation, words)
	words = [w.strip() for w in words]
	est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
	model = NgramModel(6, words, estimator=est)
	return model
def get_similarity(q, idx, tf_idf):
    COUNT_WORDS_IN_TEXT = 100
    # collect the candidate documents, de-duplicated by text_id
    docs_for_sim = {}
    for info in tf_idf:
        weight = info[0]
        ds2_lst = info[1]
        for ds in ds2_lst:
            docs_for_sim[ds.text_id] = ds

    rank_lm = []
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    q_sequence = wordpunct_tokenize(q)
    for ds in docs_for_sim.values():
        sequence = wordpunct_tokenize(ds.text)
        # lower entropy of the query under the document's trigram model
        # means the document is a better match
        lm = NgramModel(3, sequence, estimator=est)
        ds.weight = lm.entropy(q_sequence)
        # if len(ds.text) > COUNT_WORDS_IN_TEXT:
        rank_lm += [ds]

    return sorted(rank_lm, key=lambda ds: ds.weight)
def store_name_brand_attribute_features(review_data, filepath, sheetnum, colnum, storepath):
	# Building an ngram language model of a certain product category review
	lm = NgramModel(1, review_data, estimator=None)  # needs to be initialized

	# Read full review dataset
	data =  tp.seg_fil_excel(filepath, sheetnum, colnum)

	ep = entropy_perplexity(lm, data)

	p = open(storepath,'w')
	for j in ep:
	    p.write(str(j[0]) + '\t' + str(j[1]) + '\n')
	p.close()
Example #10
def buildmodel(textfn, onlywholewords=False):
    """Takes a filename for some input text, tokenizes, and builds a bigram
    model."""
    with open(textfn, "r") as f:
        text = f.read()
    words = nltk.word_tokenize(text)

    if onlywholewords:
        import string
        isletter = lambda c: c in string.ascii_letters
        words = [word for word in words if all(map(isletter, word))]

    model = NgramModel(2, words)

    return model
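
A possible usage sketch (added; the throwaway file and its contents are made up): write a tiny text file, build the bigram model on it, and sample a few tokens.

import tempfile

with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as tmp:
    tmp.write("the cat sat on the mat . the dog sat on the log .")

model = buildmodel(tmp.name, onlywholewords=True)
print(" ".join(model.generate(15)))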
Example #12
    def build_model(self, text):
        c = wordpunct_tokenize(text)
        m = NgramModel(1, c, self.discount)
        return m
Example #13
class Inflector(object):
    """A simple inflector based on a lemma bigram model."""

    def __init__(self, training_prefix):
        l_sentences = []
        f_sentences = []
        c_sentences = []
        trees = []
        # The set of possible inflections for each lemma.
        self.inflections = defaultdict(set)
        with utf8open(training_prefix + ".lemma") as lemma_file, utf8open(
            training_prefix + ".form"
        ) as form_file, utf8open(training_prefix + ".tree") as tree_file:
            for lemma_line, form_line, tree_line in izip(lemma_file, form_file, tree_file):
                l_sentence = lemma_line.split()
                f_sentence = form_line.split()
                c_sentence = []
                for lemma, form in izip(l_sentence, f_sentence):
                    c_sentence.append("{}~{}".format(lemma, form))
                    self.inflections[lemma].add(form)
                l_sentences.append(l_sentence)
                f_sentences.append(f_sentence)
                c_sentences.append(c_sentence)
                trees.append(DepTree(tree_line))
        self.lr_model = NgramModel(2, c_sentences, pad_left=True, estimator=lidstone_estimator)
        self.dp_model = DependencyNgramModel(2, l_sentences, f_sentences, trees)

    def inflect(self, testing_prefix, dp_weight=0.5):
        """Return a list containing inflected versions of the sentences
        described by the files under *testing_prefix*."""
        lr_weight = 1 - dp_weight
        inflected = []
        with utf8open(testing_prefix + ".lemma") as lemma_file, utf8open(testing_prefix + ".tree") as tree_file:
            for lemma_line, tree_line in izip(lemma_file, tree_file):
                l_sentence = lemma_line.split()
                tree = DepTree(tree_line)
                # Forms are unknown at test time, so the lemma sequence
                # stands in for the form argument here.
                ngrams = dep_ngrams(2, l_sentence, l_sentence, tree)
                forms = []
                last_lemma = None
                for lemma, dep_ngram in izip(l_sentence, ngrams):
                    if not self.inflections[lemma]:
                        # We've never seen this lemma before, so just
                        # output it as-is and move on.
                        forms.append(lemma)
                        continue
                    best_form = None
                    best_score = float("-inf")
                    for form in self.inflections[lemma]:
                        if last_lemma is None:
                            context = [""]
                        else:
                            context = ["{}~{}".format(last_lemma, forms[-1])]
                        score = lr_weight * self.lr_model.prob(
                            "{}~{}".format(lemma, form), context
                        ) + dp_weight * self.dp_model.prob(form, dep_ngram[:-1])
                        if score > best_score:
                            best_form = form
                            best_score = score
                    forms.append(best_form)
                    last_lemma = lemma
                inflected.append(" ".join(forms))
        return inflected
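
A possible usage sketch (added; the "data/train" and "data/test" prefixes are hypothetical paths to matching .lemma/.form/.tree files):

inflector = Inflector("data/train")
for sentence in inflector.inflect("data/test", dp_weight=0.5):
    print(sentence)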
Example #14
from random import randint
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
from nltk.probability import LidstoneProbDist
from nltk.model.ngram import NgramModel
import numpy as np

est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

noOfFiles = 3
fileids = ['bryant-stories.txt', 'carroll-alice.txt', 'shakespeare-hamlet.txt']
Authors = ['Bryant', 'Carroll', 'Shakespeare']
lenFirstSent = [
    len(gutenberg.sents(fileids[i])[0]) - 1 for i in range(noOfFiles)
]
C = [gutenberg.words(fileids[i])[lenFirstSent[i]:] for i in range(noOfFiles)]
lenC = [len(C[i]) for i in range(noOfFiles)]

unigram = [NgramModel(1, C[i], estimator=est) for i in range(noOfFiles)]
bigram = [
    NgramModel(2, C[i], True, True, estimator=est) for i in range(noOfFiles)
]
trigram = [
    NgramModel(3, C[i], True, True, estimator=est) for i in range(noOfFiles)
]


def generateText(model, train):
    pos = []
    for i in range(20):
        pos.append(train[randint(0, len(train) - 1)])
    return model.generate(50, set(pos))

from nltk.corpus import brown, shakespeare
from nltk.probability import LidstoneProbDist
from nltk.model.ngram import NgramModel

##todo: try shakespeare corpus

NGRAM_MODEL_N = 3
#TRAIN = brown.words(categories='lore') ## just a list of strings
TRAIN = shakespeare.words()
ESTIMATOR = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

lm = NgramModel(NGRAM_MODEL_N, TRAIN, estimator=ESTIMATOR)
print lm

print lm.generate(40)
print 'done'
    from nltk.model.ngram import NgramModel

    import re

    corpus = {"bill": [], "resolution": []}
    for b in Bill.objects.filter(congress__gte=109):
        title = b.title_no_number + " ###"
        if title.startswith("To "): continue
        title = re.sub(r" \d\d\d\d", " 2015", title)
        title = re.sub(r"\.$", "", title)
        corpus[b.noun].append(title.split(" "))

    # Generate a few separate models.
    models = {
        ("bill", 2): NgramModel(2, corpus["bill"]),
        ("bill", 3): NgramModel(3, corpus["bill"]),
        ("resolution", 2): NgramModel(2, corpus["resolution"]),
        ("resolution", 3): NgramModel(3, corpus["resolution"]),
    }

    def make_random_bill_title(bill_type):
        # Generate a sentence, one word at a time.
        sentence = []
        while True:
            model = models[(bill_type, 2 if (len(sentence) % 2) == 0 else 3)]
            wd = model.choose_random_word(sentence)

            if wd == "###":
                if len(sentence) > 6:
                    # finished
Example #19
    folds = tmp

    # remove the existing result file
    res_file_name = 'results_swbd_nltk_CV.txt'
    if os.path.isfile(res_file_name):
        os.system('rm ' + res_file_name)

    SAMPLE_SENTS_NUM = 100
    for fold_idx, fold in enumerate(folds): # takes 37 min
        # print progress
        print 'fold %d/%d started' % (fold_idx+1, len(folds))
        # read test sents
        test_sents = read_firstN_sents(cur, fold, SAMPLE_SENTS_NUM, include_loc = True)
        # read the train sents
        train_convIDs = reduce(lambda a,b: a+b, folds[:fold_idx] + folds[fold_idx+1:])
        train_sents = read_firstN_sents(cur, train_convIDs, SAMPLE_SENTS_NUM)
        # open the file to write the results
        fw = open(res_file_name, 'ab')
        # train and compute
        key_count = 1
        for key in test_sents.iterkeys():
            lm = NgramModel(3, train_sents[key])
            for s in test_sents[key]:
                e = lm.entropy(s[2])
                fw.write(','.join((str(s[0]), str(key), str(s[1]), str(e))) + '\n')
            # print progress
            sys.stdout.write('\rkey = %d/%d done' % (key_count, len(test_sents)))
            sys.stdout.flush()
            key_count += 1
        fw.close()
        # print progress
        print '\nfold %d/%d done\n' % (fold_idx+1, len(folds))
Example #20
import nltk
from nltk.text import Text
from nltk.probability import LidstoneProbDist
from nltk.model.ngram import NgramModel

f = open("poem.txt")
# f = open("howl.txt")

data = f.read()
def removeNonAscii(s): return "".join(i for i in s if ord(i)<128)
data = removeNonAscii(data)

tokens = nltk.word_tokenize(data)
t = Text(tokens)
# t.generate(30)

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
trigram_model = NgramModel(3, t, estimator = estimator)

token_array = trigram_model.generate(150)[10:]

first_token = token_array[0].strip()

if first_token in [".", ",", "?", "(", ")"]:
    token_array = token_array[1:]

joined = " ".join(token_array)
joined = joined.replace(" . ", ".\n")
joined = joined.replace(". ", ".\n")
joined = joined.replace(" ? ", "?\n")
joined = joined.replace(" , ", ",\n")
joined = joined.replace(" ) ", ")\n")
joined = joined.replace(" ( ", "(")
Example #21
# 
# An n-gram is a sequence of n words; we count n-grams in a text and
# calculate a conditional probability distribution like:
# 
# $$
# P(X_i \mid X_{i-1}, \ldots, X_{i-n+1})
# $$
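
# <markdowncell>

# A small added illustration (not from the original notebook), assuming the
# Brown corpus: this conditional distribution can be estimated directly from
# trigram counts with a `ConditionalFreqDist`.

# <codecell>

from nltk.util import trigrams
from nltk.corpus import brown
from nltk.probability import ConditionalFreqDist

# map each (w1, w2) context to a frequency distribution over the next word
cfd = ConditionalFreqDist(((w1, w2), w3)
                          for w1, w2, w3 in trigrams(brown.words()[:100000]))
cfd[('of', 'the')].freq('same')  # relative frequency of "same" after "of the"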

# <codecell>

from nltk.corpus import brown
from nltk.model.ngram import NgramModel
from nltk.probability import WittenBellProbDist, LidstoneProbDist

train_words = brown.words()[:-500]
test_words = brown.words()[-500:]
lm = NgramModel(2, train_words, estimator=lambda fd, b: LidstoneProbDist(fd, 0.2))

# <codecell>

lm.entropy(test_words)

# <markdowncell>

# ### Counting
# 
# For example, how many words in a corpus are not in WordNet?

# <codecell>

from nltk.corpus import wordnet
from nltk.probability import ConditionalFreqDist
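
# <markdowncell>

# A sketch added here (not from the original notebook) of one way to answer
# the question above: count Brown word types that have no WordNet synsets.

# <codecell>

from nltk.corpus import brown

brown_types = set(w.lower() for w in brown.words()[:100000] if w.isalpha())
not_in_wordnet = [w for w in brown_types if not wordnet.synsets(w)]
len(not_in_wordnet), len(brown_types)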