def train():
    # parse XML and load up words
    print("Loading words from XML files...")
    sentences = []
    files = glob.glob("data/*.xml")
    i = 0
    for file in files:
        # stop after 500 files, reporting progress before the break
        if i > 0 and i % 500 == 0:
            print("%d/%d files loaded, #-sentences: %d" %
                  (i, len(files), len(sentences)))
            break
        dirname, fname = os.path.split(file)
        reader = XMLCorpusReader(dirname, fname)
        sentences.extend(nltk.sent_tokenize(" ".join(reader.words())))
        i += 1
    words = []
    for sentence in sentences:
        words.append(nltk.word_tokenize(sentence))
    # build a trigram language model with Lidstone smoothing
    # over the tokenized sentences
    print("Building language model...")
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    langModel = NgramModel(3, words, estimator=est)
    #  langModel = NgramModel(3, words)
    #  cPickle.dump(langModel, open("lm.bin", 'wb'))
    return langModel
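A minimal usage sketch for the model returned by train(), assuming NLTK 2.x (where nltk.model.NgramModel provides entropy() and generate()) and relying on the module-level nltk import that train() itself uses; the sample sentence is made up for illustration:

if __name__ == "__main__":
    lm = train()
    # score a held-out sentence (lower entropy = better fit to the corpus)
    sample = nltk.word_tokenize("This is a short test sentence.")
    print("entropy: %.3f" % lm.entropy(sample))
    # sample 20 words from the trigram model
    print(" ".join(lm.generate(20)))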
def cross_validate(result_file, root_dir, folds, read_func, sent_num):
    if os.path.isfile(result_file):
        os.remove(result_file)
    print 'cross-validation started'

    for i, fold in enumerate(folds):
        print 'fold %d/%d started' % (i+1, len(folds))
        # read test
        test = read_func(root_dir, fold, sent_num, local_idx = True, full_info = True)
        # read train
        train_files = reduce(lambda a,b: a+b, folds[:i] + folds[i+1:])
        train = read_func(root_dir, train_files, sent_num)
        # open result file
        fw = open(result_file, 'ab')
        # train and compute
        count = 1
        for key in test.iterkeys():
            lm = NgramModel(3, train[key])
            for s in test[key]:
                if len(s[-1]) > 0:
                    e = lm.entropy(s[-1])
                    # s[0] is xml id
                    # s[1] is div index
                    # key is global index
                    # s[2] is turn index
                    # s[3] is local index
                    fw.write(','.join(map(str, (s[0], s[1], key, s[2], s[3], e))) + '\n')
            # print progress
            sys.stdout.write('\rkey = %d/%d done' % (count, len(test)))
            sys.stdout.flush()
            count += 1
        fw.close()
        print 'fold %d/%d done' % (i+1, len(folds))
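Each row appended to result_file is a comma-separated record (xml id, div index, global key, turn index, local index, entropy), so the output can be read back for analysis with the standard csv module; a minimal sketch, with the helper name chosen here for illustration:

import csv

def load_entropy_results(result_file):
    # columns: xml_id, div_idx, global_key, turn_idx, local_idx, entropy
    rows = []
    with open(result_file) as f:
        for row in csv.reader(f):
            rows.append(tuple(row[:5]) + (float(row[5]),))
    return rows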
Example #3
    def build_lm(self, corpus, order=2):
        """
        Create a reasonable English language model on your training data.
        """

        self._lm_order = order
        if order > 0:
            tokens = []
            sentence_count = 0
            for e_sent, f_sent in corpus:
                if sentence_count % 100 == 0:
                    print("LM Sentence %i" % sentence_count)
                sentence_count += 1

                # Each sentence starts with an empty string
                tokens += [''] + e_sent

            estimator = lambda fdist, bins: \
                LidstoneProbDist(fdist, 0.1)
            self._lm = NgramModel(order,
                                  tokens,
                                  pad_left=False,
                                  pad_right=False,
                                  estimator=estimator)
        else:
            self._lm = StubLanguageModel()
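StubLanguageModel is referenced above but not defined in this excerpt; a minimal placeholder along these lines is assumed here, answering probability queries neutrally so that order=0 effectively disables language-model scoring:

class StubLanguageModel(object):
    # Neutral fallback: every word gets probability 1 (log-probability 0),
    # so the LM term does not change downstream scores.
    def prob(self, word, context=()):
        return 1.0

    def logprob(self, word, context=()):
        return 0.0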
Example #4
def store_name_brand_attribute_features(review_data, filepath, sheetnum,
                                        colnum, storepath):
    lm = NgramModel(1, review_data, estimator=None)
    data = tp.seg_fil_excel(filepath, sheetnum, colnum)
    ep = entropy_perplexity(lm, data)
    with open(storepath, 'w') as p:
        for j in ep:
            p.write(str(j[0]) + '\t' + str(j[1]) + '\n')
Example #5
def leftmodel(sequencesinfamilylist):
    print 'Learning left model...'
    model_left = NgramModel(3,
                            sequencesinfamilylist,
                            pad_left=False,
                            pad_right=True,
                            estimator=lid_estimator)
    print 'Done learning left model.'
    return model_left
Example #6
def build_model(word_string):
	words = word_string.replace('\n',' ').replace('\t',' ')
	#split_delim = "|".join(["\%s" % s for s in string.punctuation + " "])
	#words = re.split(split_delim,words)
	words = re.findall('[a-zA-Z]+|[%s]+' % string.punctuation, words)
	words = [w.strip() for w in words]
	est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
	model = NgramModel(6, words, estimator=est)
	return model
def store_name_brand_attribute_features(review_data, filepath, sheetnum, colnum, storepath):
    # Build a unigram language model of a certain product category's reviews
    lm = NgramModel(1, review_data, estimator=None)  # needs to be initialized

    # Read the full review dataset
    data = tp.seg_fil_excel(filepath, sheetnum, colnum)

    ep = entropy_perplexity(lm, data)

    with open(storepath, 'w') as p:
        for j in ep:
            p.write(str(j[0]) + '\t' + str(j[1]) + '\n')
Example #8
def buildmodel(textfn, onlywholewords=False):
    """Takes a filename for some input text, tokenizes, and builds a bigram
    model."""
    with open(textfn, "r") as f:
        text = f.read()
    words = nltk.word_tokenize(text)

    if onlywholewords:
        import string
        isletter = lambda c: c in string.ascii_letters
        words = [word for word in words if all(map(isletter, word))]

    model = NgramModel(2, words)

    return model
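A short usage sketch for buildmodel(), assuming NLTK 2.x and a plain-text file named corpus.txt (the filename is made up for illustration):

model = buildmodel("corpus.txt", onlywholewords=True)
# conditional probability of "world" given the preceding word "hello"
print(model.prob("world", ["hello"]))
# sample a 15-word continuation from the bigram model
print(" ".join(model.generate(15)))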
Example #9
from random import randint

import numpy as np
from nltk.corpus import gutenberg
from nltk.model.ngram import NgramModel
from nltk.probability import LidstoneProbDist
from nltk.tokenize import word_tokenize

est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

noOfFiles = 3
fileids = ['bryant-stories.txt', 'carroll-alice.txt', 'shakespeare-hamlet.txt']
Authors = ['Bryant', 'Carroll', 'Shakespeare']
lenFirstSent = [
    len(gutenberg.sents(fileids[i])[0]) - 1 for i in range(noOfFiles)
]
C = [gutenberg.words(fileids[i])[lenFirstSent[i]:] for i in range(noOfFiles)]
lenC = [len(C[i]) for i in range(noOfFiles)]

unigram = [NgramModel(1, C[i], estimator=est) for i in range(noOfFiles)]
bigram = [
    NgramModel(2, C[i], True, True, estimator=est) for i in range(noOfFiles)
]
trigram = [
    NgramModel(3, C[i], True, True, estimator=est) for i in range(noOfFiles)
]


def generateText(model, train):
    pos = []
    for i in range(20):
        pos.append(train[randint(0, len(train) - 1)])
    return model.generate(50, set(pos))
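A usage sketch for generateText(): seed each model with words sampled from its own training stream and compare output across n-gram orders (index 0 corresponds to Bryant in the lists above):

print(Authors[0])
print(" ".join(generateText(unigram[0], C[0])))
print(" ".join(generateText(bigram[0], C[0])))
print(" ".join(generateText(trigram[0], C[0])))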

Example #10
    SAMPLE_SENTS_NUM = 100
    for fold_idx, fold in enumerate(folds):  # takes 37 min
        # print progress
        print 'fold %d/%d started' % (fold_idx + 1, len(folds))
        # read test sents
        test_sents = read_firstN_sents(cur,
                                       fold,
                                       SAMPLE_SENTS_NUM,
                                       include_loc=True)
        # read the train sents
        train_convIDs = reduce(lambda a, b: a + b,
                               folds[:fold_idx] + folds[fold_idx + 1:])
        train_sents = read_firstN_sents(cur, train_convIDs, SAMPLE_SENTS_NUM)
        # open the file to write the results
        fw = open(res_file_name, 'ab')
        # train and compute
        key_count = 1
        for key in test_sents.iterkeys():
            lm = NgramModel(3, train_sents[key])
            for s in test_sents[key]:
                e = lm.entropy(s[2])
                fw.write(','.join((str(s[0]), str(key), str(s[1]), str(e))) +
                         '\n')
            # print progress
            sys.stdout.write('\rkey = %d/%d done' %
                             (key_count, len(test_sents)))
            sys.stdout.flush()
            key_count += 1
        # print progress
        fw.close()
        print '\nfold %d/%d done\n' % (fold_idx + 1, len(folds))
    from nltk.model.ngram import NgramModel

    import re

    corpus = {"bill": [], "resolution": []}
    for b in Bill.objects.filter(congress__gte=109):
        title = b.title_no_number + " ###"
        if title.startswith("To "): continue
        title = re.sub(r" \d\d\d\d", " 2015", title)
        title = re.sub(r"\.$", "", title)
        corpus[b.noun].append(title.split(" "))

    # Generate a few separate models.
    models = {
        ("bill", 2): NgramModel(2, corpus["bill"]),
        ("bill", 3): NgramModel(3, corpus["bill"]),
        ("resolution", 2): NgramModel(2, corpus["resolution"]),
        ("resolution", 3): NgramModel(3, corpus["resolution"]),
    }

    def make_random_bill_title(bill_type):
        # Generate a sentence, one word at a time.
        sentence = []
        while True:
            model = models[(bill_type, 2 if (len(sentence) % 2) == 0 else 3)]
            wd = model.choose_random_word(sentence)

            if wd == "###":
                if len(sentence) > 6:
                    # finished
Example #12
data = f.read()


def removeNonAscii(s):
    return "".join(i for i in s if ord(i) < 128)


data = removeNonAscii(data)

tokens = nltk.word_tokenize(data)
t = Text(tokens)
# t.generate(30)

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
trigram_model = NgramModel(3, t, estimator=estimator)

token_array = trigram_model.generate(150)[10:]

first_token = token_array[0].strip()

if first_token in [".", ",", "?", "(", ")"]:
    token_array = token_array[1:]

joined = " ".join(token_array)
joined = joined.replace(" . ", ".\n")
joined = joined.replace(". ", ".\n")
joined = joined.replace(" ? ", "?\n")
joined = joined.replace(" , ", ",\n")
joined = joined.replace(" ) ", ")\n")
joined = joined.replace(" ( ", "(")
Example #13
    def build_model(self, text):
        c = wordpunct_tokenize(text)
        m = NgramModel(1, c, self.discount)
        return m