Example #1
    def test_update_empty_vocab(self):
        empty = Vocabulary(unk_cutoff=2)
        self.assertEqual(len(empty), 0)
        self.assertFalse(empty)
        self.assertIn(empty.unk_label, empty)

        empty.update(list("abcde"))
        self.assertIn(empty.unk_label, empty)
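The assertions above rely on two properties of nltk.lm.Vocabulary: membership compares a token's count against unk_cutoff, and the unk label always counts as a member, even before any counts exist. A minimal illustrative sketch (not part of the original test):

from nltk.lm import Vocabulary

vocab = Vocabulary(["a", "a", "b"], unk_cutoff=2)
print("a" in vocab)              # True: seen twice, meets the cutoff of 2
print("b" in vocab)              # False: seen once, below the cutoff
print(vocab.unk_label in vocab)  # True: the unk label is always a member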
Example #3
    def test_eqality(self):
        v1 = Vocabulary(["a", "b", "c"], unk_cutoff=1)
        v2 = Vocabulary(["a", "b", "c"], unk_cutoff=1)
        v3 = Vocabulary(["a", "b", "c"], unk_cutoff=1, unk_label="blah")
        v4 = Vocabulary(["a", "b"], unk_cutoff=1)

        self.assertEqual(v1, v2)
        self.assertNotEqual(v1, v3)
        self.assertNotEqual(v1, v4)
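As the test suggests, two vocabularies compare equal only when their counts, unk_cutoff and unk_label all match. A small restatement outside the test harness (illustrative only):

from nltk.lm import Vocabulary

v1 = Vocabulary(["a", "b"], unk_cutoff=1)
print(v1 == Vocabulary(["a", "b"], unk_cutoff=1))      # True: same counts, cutoff and label
print(v1 == Vocabulary(["a", "b"], unk_cutoff=2))      # False: different cutoff
print(v1 == Vocabulary(["a", "b"], unk_label="blah"))  # False: different unk label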
Example #4
def nltk_ngram_perplexity(train, test):
    # Unigram

    train_sentences = [line.strip() for line in open(train, 'r')]
    tokenized_text = [list(nltk.tokenize.word_tokenize(sent))
                      for sent in train_sentences]
    single_line = [list(itertools.chain.from_iterable(tokenized_text))]

    n = 1
    # train_data = [ngrams(sent, 1) for sent in tokenized_text]
    train_data = [ngrams(sent, 1) for sent in single_line]

    model = Laplace(n)
    words = [word for sent in tokenized_text for word in sent]
    padded_vocab = Vocabulary(words)
    model.fit(train_data, padded_vocab)

    test_sentences = [line.strip() for line in open(test, 'r')]
    tokenized_text = [list(nltk.tokenize.word_tokenize(sent))
                      for sent in test_sentences]
    single_line = [list(itertools.chain.from_iterable(tokenized_text))]

    # test_data = [ngrams(sent, 1) for sent in tokenized_text]
    test_data = [ngrams(sent, 1) for sent in single_line]

    for i, test_d in enumerate(test_data):
        print(f'unigram: {model.perplexity(test_d)}')
        # print(model.entropy(test_d))

    # Bigram
    train_sentences = [line.strip() for line in open(train, 'r')]
    tokenized_text = [list(nltk.tokenize.word_tokenize(sent))
                      for sent in train_sentences]
    single_line = [list(itertools.chain.from_iterable(tokenized_text))]

    n = 2
    # train_data = [ngrams_pad(sent, n) for sent in tokenized_text]
    train_data = [ngrams_pad(sent, n) for sent in single_line]

    model = Laplace(n)
    words = [word for sent in tokenized_text for word in sent]
    words.extend(["<s>", "</s>"])
    padded_vocab = Vocabulary(words)
    model.fit(train_data, padded_vocab)

    test_sentences = [line.strip() for line in open(test, 'r')]
    tokenized_text = [list(nltk.tokenize.word_tokenize(sent))
                      for sent in test_sentences]
    single_line = [list(itertools.chain.from_iterable(tokenized_text))]

    # test_data = [ngrams_pad(sent, n) for sent in tokenized_text]
    test_data = [ngrams_pad(sent, n) for sent in single_line]

    for i, test_d in enumerate(test_data):
        print(f'bigram: {model.perplexity(test_d)}')
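Both Laplace models above build Vocabulary(words) with the default unk_cutoff=1, so every training token stays in-vocabulary. With a higher cutoff, the vocabulary's lookup method (which the nltk.lm models use to mask out-of-vocabulary words when counting and scoring) maps rare tokens to the unk label. A rough, hypothetical sketch of that mapping:

from nltk.lm import Vocabulary

vocab = Vocabulary(["the", "the", "cat", "sat"], unk_cutoff=2)
print(vocab.lookup("the"))                  # 'the'
print(vocab.lookup("cat"))                  # '<UNK>': count 1 is below the cutoff
print(vocab.lookup(["the", "cat", "mat"]))  # ('the', '<UNK>', '<UNK>')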
Example #5
 def train(self):
     tokenizer = CharTokenizer()
     char_tokens = tokenizer.tokenize(self.text)
     vocabs = Vocabulary(char_tokens, unk_cutoff=self.unk_threshold)
     char_tokens = [token if token in vocabs else "<UNK>" for token in char_tokens]
     del vocabs  # we don't need it anymore
     self.len = len(char_tokens)
     self.vocabs = Vocabulary(char_tokens)
     # index start from 1 for the sake of simplicity
     for n in range(1, self.n + 1):
         self.multi_grams[n] = nltk.FreqDist(nltk.ngrams(char_tokens, n))
     self.deleted_interpolation()
Example #6
    def test_len_is_constant(self):
        # Given an obviously small and an obviously large vocabulary.
        small_vocab = Vocabulary("abcde")
        from nltk.corpus.europarl_raw import english

        large_vocab = Vocabulary(english.words())

        # If we time calling `len` on them.
        small_vocab_len_time = timeit("len(small_vocab)", globals=locals())
        large_vocab_len_time = timeit("len(large_vocab)", globals=locals())

        # The timing should be the same order of magnitude.
        self.assertAlmostEqual(small_vocab_len_time, large_vocab_len_time, places=1)
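The timing comparison holds because Vocabulary appears to precompute its size when it is updated rather than on each len() call, so the cost does not grow with the number of counted tokens; the reported size also reflects the cutoff. A quick illustrative check (not part of the original test):

from nltk.lm import Vocabulary

tokens = ["a", "a", "b", "c"]
print(len(Vocabulary(tokens, unk_cutoff=1)))  # every observed token is in-vocabulary
print(len(Vocabulary(tokens, unk_cutoff=2)))  # only tokens seen at least twice remain, plus the unk label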
Example #7
    def train(self):
        tokenizer = CharTokenizer()
        char_tokens = tokenizer.tokenize(self.text)
        vocabs = Vocabulary(char_tokens, unk_cutoff=self.unk_threshold)
        char_tokens = [token if token in vocabs else "<UNK>" for token in char_tokens]
        del vocabs  # we don't need it anymore
        char_grams = nltk.ngrams(char_tokens, self.n)
        self.len = len(char_tokens)
        self.vocabs = Vocabulary(char_tokens)
        self.dist = nltk.FreqDist(char_grams)

        if self.n > 1:
            self.char_counter = Counter(nltk.ngrams(char_tokens, self.n - 1))
        else:
            self.char_counter = Counter(char_tokens)
Example #8
def create_language_model(doc_ids: List[str], n: int = 3) -> MLE:
    sentences = []

    # Process each doc_id one at a time
    for doc_id in doc_ids:
        # Get the tokens associated with this doc_id
        all_tokens = datastore.get_annotation(doc_id, "token")

        # Get the sentences associated with this doc_id.
        # Use find_xs_in_y to keep only the tokens contained in each sentence and add them to sentences.
        for sentence in datastore.get_annotation(doc_id, "sentence"):
            tokens = find_xs_in_y(all_tokens, sentence)

            sentences.append(["__BOS__"] +
                             [token['lemma']
                              for token in tokens] + ["__EOS__"])

    # Build the vocabulary
    vocab = Vocabulary([word for sentence in sentences for word in sentence])

    # Build the n-grams (groups of n words) for each sentence
    ngram = [ngrams(sentence, n) for sentence in sentences]

    # Build the language model using MLE
    lm = MLE(order=n, vocabulary=vocab)
    lm.fit(ngram)

    return lm
Example #9
def buildVocab(trainFolders, numDocs):
    print('Number of training documents:', str(numDocs))
    words = []
    for folder in trainFolders:
        files = os.listdir(os.getcwd() + '/' + folder)
        for i in range(numDocs // 2):
            with open('{}/{}/{}'.format(os.getcwd(), folder, files[i]),
                      'r') as doc:
                words.extend(doc.read().split())
    vocab = Vocabulary(words, unk_cutoff=1)
    print(len(vocab), 'unique words')

    #Set the cutoff to the greatest value where len(vocab) is greater than 2500
    while len(vocab) > 2500:
        vocab._cutoff += 1
    vocab._cutoff -= 1

    print('Length of vocab before trimming:' + str(len(vocab)))
    #Remove words right above the cutoff until the length of the vocab is 2500
    rCount = 0
    for word in words:
        if (vocab[word] == vocab._cutoff):
            del vocab.counts[word]
        if rCount >= len(vocab) - 2500:
            break

    print('dictionary size:', len(vocab))
    return vocab
Example #10
def _prepare_test_data(ngram_order):
    return (
        Vocabulary(["a", "b", "c", "d", "z", "<s>", "</s>"], unk_cutoff=1),
        [
            list(padded_everygrams(ngram_order, sent))
            for sent in (list("abcd"), list("egadbe"))
        ],
    )
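padded_everygrams pads each sentence on both sides with '<s>'/'</s>' and then yields every n-gram up to the given order, which is why each call above is wrapped in list(). A tiny illustration (not from the original fixture):

from nltk.lm.preprocessing import padded_everygrams

# For order 2 and list("ab") the padded sequence is ['<s>', 'a', 'b', '</s>'];
# the generator yields all of its unigrams and bigrams.
print(list(padded_everygrams(2, list("ab"))))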
Example #11
 def build_vocab(self):
     out = []
     for col in self.text_cols:
         col_ = self.df[col]
         extend = [w for sent in col_ for w in sent]
         out.extend(extend)
     out = list(Vocabulary(out, unk_cutoff=100))
     out = {out[i]: len(out) - (i + 1) for i in range(len(out))}
     self.vocab = out
Example #12
 def test_creation_with_counter(self):
     self.assertEqual(
         self.vocab,
         Vocabulary(
             Counter(
                 ["z", "a", "b", "c", "f", "d", "e", "g", "a", "d", "b", "e", "w"]
             ),
             unk_cutoff=2,
         ),
     )
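The fixture being compared against is built in setUpClass (shown in Example #30) from the raw token list; this test shows that passing a collections.Counter instead yields an equal vocabulary. A standalone restatement (illustrative):

from collections import Counter
from nltk.lm import Vocabulary

tokens = ["a", "b", "a", "c"]
print(Vocabulary(Counter(tokens), unk_cutoff=2) == Vocabulary(tokens, unk_cutoff=2))  # True
print(Vocabulary(tokens).counts["a"])  # 2: counts are stored in a Counter either way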
Example #13
 def processWords(self):
     self.corpus = self.message_scrape(
         path=save_path + 'chat1.db', ids=100) + ' ' + self.message_scrape(
             path=save_path + 'chat2.db', ids=100)
     print("preprocessing words complete!")
     words = nltk.word_tokenize(self.corpus)
     #automate this later
     self.words = list(filter(lambda a: a != 'bet', words))
     self.vocab = list(Vocabulary(self.words))
     self.embeddings = self.runEmbeddings()
Example #14
 def create_model(self, model_nm):
     self.model = {
         "lidstone": Lidstone(0.5, self.ngram_order),
         "kneserney": KneserNeyInterpolated(self.ngram_order),
         "wittenbell": WittenBellInterpolated(self.ngram_order)
     }[model_nm]
     train, vocab = padded_everygram_pipeline(self.ngram_order, self.text)
     vocab = Vocabulary(vocab, unk_cutoff=2, unk_label="<UNK>")
     print("Creating ngram...")
     self.model.fit(train, vocab)
     print("done")
Example #15
    def train(self):
        tokenizer = CharTokenizer()
        char_tokens = tokenizer.tokenize(self.text)
        char_grams = nltk.ngrams(char_tokens, self.n)
        self.len = len(char_tokens)
        self.vocabs = Vocabulary(char_tokens)
        self.dist = nltk.FreqDist(char_grams)

        if self.n > 1:
            self.char_counter = Counter(nltk.ngrams(char_tokens, self.n - 1))
        else:
            self.char_counter = Counter(char_tokens)
Example #16
 def fit(self, steps):
     tokens = [step.tree.list() for step in steps]
     train_data = [
         nltk.bigrams(t,
                      pad_right=True,
                      pad_left=True,
                      left_pad_symbol="<s>",
                      right_pad_symbol="</s>") for t in tokens
     ]
     words = [word for sent in tokens for word in sent]
     words.extend(["<s>", "</s>"])
     padded_vocab = Vocabulary(words)
     self.ngram.fit(train_data, padded_vocab)
Example #17
def create_language_model(doc_ids, N=3):
    sents = []
    for doc_id in doc_ids:
        all_tokens = datastore.get_annotation(doc_id, 'token')
        for sent in datastore.get_annotation(doc_id, 'sentence'):
            tokens = find_xs_in_y(all_tokens, sent)
            sents.append(['__BOS__'] + [token['lemma']
                                        for token in tokens] + ['__EOS__'])
    vocab = Vocabulary([word for sent in sents for word in sent])
    text_ngrams = [ngrams(sent, N) for sent in sents]
    lm = MLE(order=N, vocabulary=vocab)
    lm.fit(text_ngrams)
    return lm
Example #18
    def __init__(self, savedir=None):
        self.train = {}
        self.test = {}
        self.classifier = {}
        self.vocab = Vocabulary(unk_cutoff=1)
        self.prepare_dataset(mode='train')
        self.prepare_dataset(mode="test")
        self.vocab_words = {
            w: 0
            for w in self.vocab.counts.keys() if w in self.vocab
        }
        self.vocab_words['UNK'] = 0  # initially add UNK feature section
        # vocab size is currently 20124
        # uncomment this and erase the below line for full training. Currently training only gender for speed issue
        for mode in [
                'gender', 'age_group', 'extroverted', 'stable', 'agreeable',
                'conscientious', 'openness'
        ]:
            self.run_train(mode)

        if savedir is not None:
            with open(savedir, 'wb') as f:
                pickle.dump(self, f)
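The dict comprehension above distinguishes vocab.counts, which keeps every observed token, from membership, which applies unk_cutoff; with unk_cutoff=1 the filter keeps everything seen. A small sketch of the distinction with a stricter cutoff (illustrative only):

from nltk.lm import Vocabulary

vocab = Vocabulary(["a", "a", "b"], unk_cutoff=2)
print(list(vocab.counts))                       # ['a', 'b']: every observed token is counted
print([w for w in vocab.counts if w in vocab])  # ['a']: membership applies the cutoff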
Example #19
def train_ngram_lm(tokenized_text, models, n=3, a=0.0015, unk_cutoff=10, discount=0.1):
    training_ngrams, padded_sents = padded_everygram_pipeline(n, tokenized_text)
    vocab = Vocabulary(padded_sents, unk_cutoff=unk_cutoff)
    lms = []
    for model in models:
        training_ngrams, padded_sents = padded_everygram_pipeline(n, tokenized_text)
        if model == 'Kneser Ney':
            lm = MKneserNeyInterpolated(order=n, discount=discount, vocabulary=vocab)
        elif model == 'WBI':
            lm = MWittenBellInterpolated(order=n, vocabulary=vocab)
        elif model == 'Lidstone':
            lm = MLidstone(gamma=a, order=n, vocabulary=vocab)
        lm.fit(training_ngrams)
        lms += [lm]
    return lms
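padded_everygram_pipeline is called again inside the loop because it returns lazy, single-use generators; once one model has consumed them during fitting they are empty. A quick demonstration of why the call is repeated (illustrative):

from nltk.lm.preprocessing import padded_everygram_pipeline

train, _ = padded_everygram_pipeline(2, [["a", "b"]])
first_pass = [list(g) for g in train]
second_pass = [list(g) for g in train]
print(len(first_pass), len(second_pass))  # 1 0: the generators are already exhausted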
Example #20
def get_data(n, text):
    train_ngrams = [
        ng(t,
           n,
           pad_right=True,
           pad_left=True,
           left_pad_symbol="<s>",
           right_pad_symbol="</s>") for t in text
    ]
    words = [word for sent in text for word in sent]
    words.extend(["<s>", "</s>"])
    train_vocab = Vocabulary(words)
    #print(sorted(train_vocab))
    #train_vocab =flatten(pad_both_ends(sent, n=n) for sent in text)
    return train_ngrams, train_vocab
Example #21
def fit_mle_model(text, text_dict):
    # text_dict maps index -> text; the tokenizer does not provide this mapping by default, so the dictionary has to be inverted
    model = Laplace(2)
    tokenized_text = [[text_dict[index] for index in sentence]
                      for sentence in text]
    train_data = [list(nltk.bigrams(t)) for t in tokenized_text]
    train_data_without_unk = []
    for bigrams in train_data:
        filtered_text = []
        for bigram in bigrams:
            if bigram[0] != 'UNK' and bigram[1] != 'UNK':
                filtered_text.append(bigram)
        train_data_without_unk.append(filtered_text)
    words = [word for sentence in tokenized_text for word in sentence]
    vocab = Vocabulary(words)
    model.fit(train_data_without_unk, vocab)
    return model
Example #22
def building_vocab(path_vocab_src, vocab_path_out):
    with open(path_vocab_src, "r") as vocab_src:
        raw = vocab_src.read()
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
    tokens = tokenizer.tokenize(raw)
    vocab = Vocabulary(tokens, unk_cutoff=8)

    sorted_vocab = sorted(vocab)
    sorted_vocab.remove('.')
    sorted_vocab.remove('<UNK>')
    with open(vocab_path_out, "w") as f:
        f.write('<pad>' + '\n')
        f.write('<unk>' + '\n')
        f.write('<s>' + '\n')
        f.write('</s>' + '\n')
        f.write('.' + '\n')
        for item in sorted_vocab:
            f.write(item + '\n')
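sorted(vocab) contains '<UNK>' because iterating a non-empty Vocabulary yields its in-vocabulary tokens plus the unk label, which is why the script removes it before writing the file. A minimal illustration (not from the original):

from nltk.lm import Vocabulary

vocab = Vocabulary(["cat", "cat", "dog"], unk_cutoff=2)
print(sorted(vocab))  # ['<UNK>', 'cat']: 'dog' falls below the cutoff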
Example #23
 def __init__(self,
              beam_width,
              lm=None,
              ngram=0,
              prune=0,
              trie=None,
              gamma=1):
     super().__init__()
     self.beam_width = beam_width
     self.gamma = gamma
     if lm:
         assert ngram
         file_path = PATH.LM_DATA_DIR + str(ngram) + "gram-p" + str(
             prune) + ".pkl"
         with open(file_path, 'rb') as fin:
             counter = pickle.load(fin)
             vocab = Vocabulary(DATA.CHARS)
         lm_switcher = {
             'mle':
             MLE(ngram, counter=counter, vocabulary=vocab),
             'sbo':
             StupidBackoff(ngram,
                           backoff=0.4,
                           counter=counter,
                           vocabulary=vocab),
             'kn':
             KneserNeyInterpolated(ngram, counter=counter,
                                   vocabulary=vocab),
             'knbo':
             KneserNeyBackoff(ngram,
                              backoff=0.4,
                              counter=counter,
                              vocabulary=vocab),
         }
         lm = lm_switcher[lm]
     self.lm = lm
     self.ngram = ngram
     if trie:
         trie_switcher = {
             '100k': "wiki-100k.txt",
             '10k': 'google-10000-english.txt',
         }
         trie = load_trie(PATH.LM_DATA_DIR + trie_switcher[trie])
     self.trie = trie
Example #24
    def fit(self, corpus: str = None, counts=None):
        from nltk import sent_tokenize
        sentences = sent_tokenize(corpus)

        from nltk import TweetTokenizer
        tweet_wt = TweetTokenizer()
        sentences = [tweet_wt.tokenize(sent) for sent in sentences]

        from nltk.lm import Vocabulary
        if (sentences is not None):
            counts = self.__generate_word_counts_from_corpus(sentences)

        if (counts is None):
            raise Exception("Invalid arguments exception")

        self._counts = counts
        self._vocabulary = Vocabulary(counts=self.counts,
                                      unk_cutoff=self._cutoff_thresshold,
                                      unk_label=self._cutoff_replacement)

        self._unique = list(self._vocabulary)
        self._size = len(self._unique)
Example #25
def generateReport(trainSize, output):
    global V, SPAM_CTS, NONSPAM_CTS
    V = Vocabulary(
        pickle.load(open('obj/vocab_{}.p'.format(str(trainSize)), 'rb')))
    SPAM_CTS = dict(
        pickle.load(open('obj/spam_cts_{}.p'.format(str(trainSize)), 'rb')))
    NONSPAM_CTS = dict(
        pickle.load(open('obj/nonspam_cts_{}.p'.format(str(trainSize)), 'rb')))

    (tp, fn) = classifyDocs('data/spam-test')
    (fp, tn) = classifyDocs('data/nonspam-test')
    r = recall(tp, tn, fn)
    p = precision(tp, tn, fp)
    f1 = f1score(p, r)
    output.write(
        '\nResults for model trained on {} documents:\n'.format(trainSize))
    output.write('True positives: ' + str(tp) + '\n')
    output.write('False negatives: ' + str(fn) + '\n')
    output.write('True negatives: ' + str(tn) + '\n')
    output.write('False positives: ' + str(fp) + '\n')
    output.write('Precision: ' + str(p) + '\n')
    output.write('Recall: ' + str(r) + '\n')
    output.write('F score: ' + str(f1) + '\n')
Example #26
import pandas as pd
import numpy as np

# use brown as training data
tokenized_text = list(brown.sents())
n = 2
train_data = [
    nltk.bigrams(t,
                 pad_right=True,
                 pad_left=True,
                 left_pad_symbol="<s>",
                 right_pad_symbol="</s>") for t in tokenized_text
]
words = [word for sent in tokenized_text for word in sent]
words.extend(["<s>", "</s>"])
padded_vocab = Vocabulary(words)
model = MLE(n)
model.fit(train_data, padded_vocab)

for p in range(1, 11):
    # select test data in certain prompt
    test_df = pd.read_csv(
        'data/asap/test_public_repaired.txt',
        encoding='utf-8',
        sep='\t',
        header=0,
        quoting=csv.QUOTE_NONE,
        names=['Id', 'EssaySet', 'essay_score1', 'essay_score2', 'EssayText'],
        dtype={
            'Id': str,
            'EssaySet': str,
Example #27
from nltk.lm.preprocessing import padded_everygram_pipeline
from preprocessing import processed_text
from nltk.lm import MLE
from nltk.lm import Vocabulary
import dill
import time

n = 4

training_data, padded_sents = padded_everygram_pipeline(n, processed_text)


pre_tim = time.time()

print('starting training')
print('#######################################')

vocab = Vocabulary(unk_cutoff=2)
model = MLE(n, vocabulary=vocab)
model.fit(training_data, padded_sents)


print('#######################################')
print('done training', time.time()-pre_tim)


filename = 'ngram_model.pkl' 
with open(filename, 'wb') as out:
	dill.dump(model, out)

print(model.vocab)
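Because the Vocabulary here is created empty (and an empty vocabulary is falsy), model.fit() populates it from padded_sents, and the unk_cutoff=2 then applies to the padded training tokens. A compact sketch of the same pattern on toy data (processed_text is project-specific, so a two-sentence stand-in is used):

from nltk.lm import MLE, Vocabulary
from nltk.lm.preprocessing import padded_everygram_pipeline

toy_text = [["a", "b"], ["a", "c"]]  # hypothetical stand-in for processed_text
train, padded = padded_everygram_pipeline(2, toy_text)

vocab = Vocabulary(unk_cutoff=2)  # empty for now
model = MLE(2, vocabulary=vocab)
model.fit(train, padded)          # fit fills the empty vocabulary from padded
print(len(model.vocab), sorted(model.vocab))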
Example #28
    return logprob


'''
Corpus perplexity
'''
train = df[df.type == 'answer'].reset_index()
test = df[df.type == 'title'].sample(100).text.values

for line in test:
    print(proba_sentence(line), line)

# ------------------------

train_data = [
    ngrams(t,
           n=n,
           pad_right=True,
           pad_left=True,
           left_pad_symbol="<s>",
           right_pad_symbol="</s>") for t in df.tokens
]

words = [word for sent in df.tokens for word in sent]
words.extend(["<s>", "</s>"])
vocab = Vocabulary(words, unk_cutoff=20)
model = MLE(n)
model.fit(train_data, vocab)

# -------
Example #29
 def test_cutoff_setter_checks_value(self):
     with self.assertRaises(ValueError) as exc_info:
         Vocabulary("abc", unk_cutoff=0)
     expected_error_msg = "Cutoff value cannot be less than 1. Got: 0"
     self.assertEqual(expected_error_msg, str(exc_info.exception))
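The same check can be seen outside the test harness; the message is the one asserted above:

from nltk.lm import Vocabulary

try:
    Vocabulary("abc", unk_cutoff=0)
except ValueError as err:
    print(err)  # Cutoff value cannot be less than 1. Got: 0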
Example #30
 def setUpClass(cls):
     cls.vocab = Vocabulary(
         ["z", "a", "b", "c", "f", "d", "e", "g", "a", "d", "b", "e", "w"],
         unk_cutoff=2,
     )
Example #31
from nltk.lm import NgramCounter, Vocabulary
from nltk.lm.preprocessing import padded_everygram_pipeline
import pickle

model_dir = '../../data/ngrams/'

with open(f'{model_dir}tokenized_text.pickle', 'rb') as file:
    tokenized_text = pickle.load(file)

training_ngrams, padded_sents = padded_everygram_pipeline(3, tokenized_text)
counter = NgramCounter(training_ngrams)
vocabulary = Vocabulary(padded_sents, unk_cutoff=10)

with open(f'{model_dir}counter.pickle', 'wb') as file:
    pickle.dump(counter, file)

with open(f'{model_dir}vocabulary.pickle', 'wb') as file:
    pickle.dump(vocabulary, file)
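As a hypothetical follow-up (mirroring the constructor usage in Example #23), the pickled counter and vocabulary could later be reloaded and combined into a model without re-reading the corpus:

from nltk.lm import MLE

# continues the script above: `pickle` and `model_dir` are already defined
with open(f'{model_dir}counter.pickle', 'rb') as file:
    counter = pickle.load(file)
with open(f'{model_dir}vocabulary.pickle', 'rb') as file:
    vocabulary = pickle.load(file)

lm = MLE(3, counter=counter, vocabulary=vocabulary)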