Example #1
def main(argv):
    """Trains an nltk language model.

    Loads in files of normalized text, partitions them into a train partition
    (3/4 of data) and a test partition (last 1/4 of data). Uses Laplace
    smoothing for unseen ngrams.
    """
    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")

    normalized_data = load_normalized_data(FLAGS.language, FLAGS.data_source,
                                           FLAGS.pass_valid, FLAGS.experiment)
    train_partition, test_partition = partition_data(normalized_data)
    train_ngrams, vocab = padded_everygram_pipeline(2, train_partition)
    test_ngrams, _ = padded_everygram_pipeline(2, test_partition)
    language_model = Laplace(2)
    language_model.fit(train_ngrams, vocab)

    avg_perp, count = compute_avg_perplexity(test_ngrams, language_model)
    print("\n----------------------------\n"
          "Language Model Parameters:\n"
          f"\tLanguage={FLAGS.language}\n"
          f"\tData Sources={FLAGS.data_source}\n"
          f"\tPass Valid={FLAGS.pass_valid}\n"
          f"\tExperiment={FLAGS.experiment}\n"
          "----------------------------\n")
    print(f"Average perplexity across {count} ngrams:\t{avg_perp}")
Example #2
def test_d2_1_gp():
    global food_corpus, natr_corpus

    food_corpus_tk = train.tokenize_corpus(food_corpus)
    natr_corpus_tk = train.tokenize_corpus(natr_corpus)

    food_train, food_vocab = padded_everygram_pipeline(
        3, food_corpus_tk[:int(0.8 * len(food_corpus_tk))])
    natr_train, natr_vocab = padded_everygram_pipeline(
        3, natr_corpus_tk[:int(0.8 * len(natr_corpus_tk))])

    food_test = sum([['<s>'] + x + ['</s>']
                     for x in food_corpus_tk[int(0.8 * len(food_corpus_tk)):]],
                    [])
    natr_test = sum([['<s>'] + x + ['</s>']
                     for x in natr_corpus_tk[int(0.8 * len(natr_corpus_tk)):]],
                    [])

    food_lm = Laplace(3)
    natr_lm = Laplace(3)

    food_lm.fit(food_train, food_vocab)
    natr_lm.fit(natr_train, natr_vocab)

    eq_(int(evaluate.get_perplexity(food_lm, food_test[:2500])), 7318)
    eq_(int(evaluate.get_perplexity(food_lm, natr_test[:2500])), 7309)
    eq_(int(evaluate.get_perplexity(natr_lm, natr_test[:2500])), 5222)
    eq_(int(evaluate.get_perplexity(natr_lm, food_test[:2500])), 5354)
Example #3
    def test_d2_1_gp(self):
        nltk.download('punkt')
        food_corpus_tk = lab3.tokenize_corpus(self.food_corpus)
        natr_corpus_tk = lab3.tokenize_corpus(self.natr_corpus)
        food_train, food_vocab = padded_everygram_pipeline(
            3, food_corpus_tk[:int(0.8 * len(food_corpus_tk))])
        natr_train, natr_vocab = padded_everygram_pipeline(
            3, natr_corpus_tk[:int(0.8 * len(natr_corpus_tk))])
        food_test = sum(
            [['<s>'] + x + ['</s>']
             for x in food_corpus_tk[int(0.8 * len(food_corpus_tk)):]], [])
        natr_test = sum(
            [['<s>'] + x + ['</s>']
             for x in natr_corpus_tk[int(0.8 * len(natr_corpus_tk)):]], [])
        food_lm = Laplace(3)
        natr_lm = Laplace(3)
        food_lm.fit(food_train, food_vocab)
        natr_lm.fit(natr_train, natr_vocab)
        self.assertEqual(int(lab3.get_perplexity(food_lm, food_test[:2500])),
                         7318)
        self.assertEqual(int(lab3.get_perplexity(food_lm, natr_test[:2500])),
                         7309)
        self.assertEqual(int(lab3.get_perplexity(natr_lm, natr_test[:2500])),
                         5222)
        self.assertEqual(int(lab3.get_perplexity(natr_lm, food_test[:2500])),
                         5354)
Example #4
class LaplaceBigramTests(unittest.TestCase):
    """unit tests for Laplace class"""

    score_tests = [
        # basic sanity-check:
        # count(d | c) = 1
        # *count(d | c) = 2
        # Count(w | c for w in vocab) = 1
        # *Count(w | c for w in vocab) = 9
        ("d", ["c"], 2.0 / 9),
        # Total unigrams: 14
        # Vocab size: 8
        # Denominator: 14 + 8 = 22
        # count("a") = 2
        # *count("a") = 3
        ("a", None, 3.0 / 22),
        # in vocabulary but unseen
        # count("z") = 0
        # *count("z") = 1
        ("z", None, 1.0 / 22),
        # out of vocabulary should use "UNK" score
        # count("<UNK>") = 3
        # *count("<UNK>") = 4
        ("y", None, 4.0 / 22),
    ]

    def setUp(self):
        vocab, training_text = _prepare_test_data(2)
        self.model = Laplace(2, vocabulary=vocab)
        self.model.fit(training_text)

    def test_gamma(self):
        # Make sure the gamma is set to 1
        self.assertEqual(1, self.model.gamma)

    def test_entropy_perplexity(self):
        text = [
            ("<s>", "a"),
            ("a", "c"),
            ("c", "<UNK>"),
            ("<UNK>", "d"),
            ("d", "c"),
            ("c", "</s>"),
        ]
        # Unlike MLE this should be able to handle completely novel ngrams
        # Ngram = score, log score
        # <s>, a    = 0.2, -2.3219
        # a, c      = 0.1, -3.3219
        # c, UNK    = 0.(1), -3.1699
        # UNK, d    = 0.(09), -3.4594
        # d, c      = 0.1, -3.3219
        # c, </s>   = 0.(1), -3.1699
        # Total logscores: -18.7651
        # Negated average logscore (entropy): 3.1275
        H = 3.1275
        perplexity = 8.7393
        self.assertAlmostEqual(H, self.model.entropy(text), places=4)
        self.assertAlmostEqual(perplexity,
                               self.model.perplexity(text),
                               places=4)
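
_prepare_test_data comes from the surrounding test module and is not shown here. A sketch that is consistent with the counts in the comments above (a vocabulary of 8 types, 14 training unigrams, count("a") = 2, count("<UNK>") = 3) could look like this; treat it as an illustration, not the exact original fixture.

from nltk.lm import Vocabulary
from nltk.lm.preprocessing import padded_everygrams


def _prepare_test_data(ngram_order):
    # 8 types once <UNK> is counted: a, b, c, d, z, <s>, </s>, <UNK>.
    vocab = Vocabulary(["a", "b", "c", "d", "z", "<s>", "</s>"], unk_cutoff=1)
    # Two character "sentences"; padded, they contain 14 unigrams in total,
    # and e/g fall outside the vocabulary, so they are mapped to <UNK>.
    training_text = [
        list(padded_everygrams(ngram_order, list(sent)))
        for sent in ("abcd", "egadbe")
    ]
    return vocab, training_text
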
Example #6
def nltk_ngram_perplexity(train, test):
    # Unigram

    train_sentences = [line.strip() for line in open(train, 'r')]
    tokenized_text = [list(nltk.tokenize.word_tokenize(sent))
                      for sent in train_sentences]
    single_line = [list(itertools.chain.from_iterable(tokenized_text))]

    n = 1
    # train_data = [ngrams(sent, 1) for sent in tokenized_text]
    train_data = [ngrams(sent, 1) for sent in single_line]

    model = Laplace(n)
    words = [word for sent in tokenized_text for word in sent]
    padded_vocab = Vocabulary(words)
    model.fit(train_data, padded_vocab)

    test_sentences = [line.strip() for line in open(test, 'r')]
    tokenized_text = [list(nltk.tokenize.word_tokenize(sent))
                      for sent in test_sentences]
    single_line = [list(itertools.chain.from_iterable(tokenized_text))]

    # test_data = [ngrams(sent, 1) for sent in tokenized_text]
    test_data = [ngrams(sent, 1) for sent in single_line]

    for i, test_d in enumerate(test_data):
        print(f'unigram: {model.perplexity(test_d)}')
        # print(model.entropy(test_d))

    # Bigram
    train_sentences = [line.strip() for line in open(train, 'r')]
    tokenized_text = [list(nltk.tokenize.word_tokenize(sent))
                      for sent in train_sentences]
    single_line = [list(itertools.chain.from_iterable(tokenized_text))]

    n = 2
    # train_data = [ngrams_pad(sent, n) for sent in tokenized_text]
    train_data = [ngrams_pad(sent, n) for sent in single_line]

    model = Laplace(n)
    words = [word for sent in tokenized_text for word in sent]
    words.extend(["<s>", "</s>"])
    padded_vocab = Vocabulary(words)
    model.fit(train_data, padded_vocab)

    test_sentences = [line.strip() for line in open(test, 'r')]
    tokenized_text = [list(nltk.tokenize.word_tokenize(sent))
                      for sent in test_sentences]
    single_line = [list(itertools.chain.from_iterable(tokenized_text))]

    # test_data = [ngrams_pad(sent, n) for sent in tokenized_text]
    test_data = [ngrams_pad(sent, n) for sent in single_line]

    for i, test_d in enumerate(test_data):
        print(f'bigram: {model.perplexity(test_d)}')
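
ngrams_pad is not defined in this snippet. Since the vocabulary is extended with "<s>" and "</s>" above, it presumably pads the token sequence before extracting n-grams; a minimal stand-in under that assumption:

from nltk.lm.preprocessing import pad_both_ends
from nltk.util import ngrams as nltk_ngrams


def ngrams_pad(tokens, n):
    # Pad with <s>/</s> and return the order-n n-grams of the padded sequence.
    return nltk_ngrams(list(pad_both_ends(tokens, n=n)), n)
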
Example #7
def treinando_modelo_Laplace(lista_de_textos):
    # Join all sentences into a single string
    todas_as_questoes = " ".join(lista_de_textos)
    # Split the text into words on whitespace
    todas_as_palavras = WhitespaceTokenizer().tokenize(todas_as_questoes)
    # Add the padding symbols to each word and build the vocabulary (here the units are the letters of each word)
    treino_bigram, vocab = padded_everygram_pipeline(2, todas_as_palavras)
    # Create a Laplace (add-one) bigram model
    modelo = Laplace(2)
    # Train the model
    modelo.fit(treino_bigram, vocab)

    return modelo
Example #8
    def compute_pp(self, n, tokenized_train, tokenized_test):
        train_data, padded_sents = padded_everygram_pipeline(
            n, tokenized_train)
        # Discard the test pipeline's vocabulary so the model is fit on the training vocabulary.
        test_data, _ = padded_everygram_pipeline(n, tokenized_test)
        model = Laplace(1)
        model.fit(train_data, padded_sents)

        s = 0
        for i, test in enumerate(test_data):
            p = model.perplexity(test)
            s += p

        perplexity = s / (i + 1)
        return perplexity
Example #9
def fit_mle_model(text, text_dict):
    # text_dict maps index -> text; the tokenizer does not provide this mapping by default, so the dictionary has to be inverted
    model = Laplace(2)
    tokenized_text = [[text_dict[index] for index in sentence]
                      for sentence in text]
    train_data = [list(nltk.bigrams(t)) for t in tokenized_text]
    train_data_without_unk = []
    for bigrams in train_data:
        filtered_text = []
        for bigram in bigrams:
            if bigram[0] != 'UNK' and bigram[1] != 'UNK':
                filtered_text.append(bigram)
        train_data_without_unk.append(filtered_text)
    words = [word for sentence in tokenized_text for word in sentence]
    vocab = Vocabulary(words)
    model.fit(train_data_without_unk, vocab)
    return model
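
A hypothetical call, assuming the imports this snippet relies on (nltk, Laplace, Vocabulary): sentences arrive as lists of indices, and text_dict maps each index back to its token, including the 'UNK' placeholder that the bigram filter removes.

text_dict = {0: "the", 1: "cat", 2: "sat", 3: "UNK"}
text = [[0, 1, 2], [0, 3, 2]]
model = fit_mle_model(text, text_dict)
print(model.score("cat", ["the"]))  # add-one smoothed P(cat | the)
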
Example #10
    def make_all_model(self):

        texts = []

        for e in self.db:
            for l in e.lexes:
                texts.append(l['text'])

        tokenized_texts = [normalize_text(x).split() for x in texts]

        n = 3
        train_data, padded_texts = padded_everygram_pipeline(
            n, tokenized_texts)

        model = Laplace(n)
        model.fit(train_data, padded_texts)

        return model
Example #11
    def __init__(self, n=3):
        tokens = []
        for book in shakespeare.fileids():
            elt = shakespeare.xml(book)
            iterator = elt.iter()  # iterate over all descendant elements
            for node in iterator:
                lines = node.findall("LINE")
                for line in lines:
                    line_tokens = list(str(line.text))
                    line_tokens.insert(0, "<L>")
                    line_tokens.append("</L>")
                    tokens.append(line_tokens)
        t = (everygrams(x, max_len=n) for x in tokens)
        v = flatten(tokens)
        lm = Laplace(order=n)  # add-one smoothing
        lm.fit(t, v)

        self._n = n
        self._lm = lm
        self._tokenize_pattern = re.compile(r'(<L>)|(</L>)')
Example #12
    def make_model(self, e):

        target_triples = set(e.triples)

        texts = []

        for e_ in self.db:
            if target_triples.intersection(e_.triples):
                for l in e_.lexes:
                    texts.append(l['text'])

        tokenized_texts = [normalize_text(x).split() for x in texts]

        n = 3
        train_data, padded_texts = padded_everygram_pipeline(
            n, tokenized_texts)

        model = Laplace(n)
        model.fit(train_data, padded_texts)

        return model
Example #13
    def likelihoods_gen(ngrams_dir=NGRAMS_DIR, n_gram=N_GRAM):
        with open(ngrams_dir, 'rb') as pickle_in:
            ngrams = pickle.load(pickle_in, encoding='utf8')

        tokenized_train_corpus = Tokenizer.job_data_tokenizer(TRAIN_CORPUS_DIR)
        train_data, padded_sents = padded_everygram_pipeline(
            N_GRAM, tokenized_train_corpus)
        # Language model with Laplace (add-one) smoothing; gamma is always 1.
        lm = Laplace(n_gram)
        lm.fit(train_data, padded_sents)
        likelihoods = defaultdict(list)

        # Likelihood estimator for ngrams
        for k in ngrams:
            for ng in ngrams[k]:
                # the n-gram order is determined by the number of tokens in the string
                tokens = ng.split(' ')
                # Score a word given some optional context; with Laplace smoothing the score is never exactly 0.
                x, y = tokens[-1], tuple(tokens[:-1])
                score = lm.unmasked_score(x, context=y)  # P('x'|'y')
                # we create a mapping of given word y to a list of possible next words and their scores
                if score != 0:
                    likelihoods[y].append((score, x))

        with open('bin/likelihoods.pkl', 'wb') as output:
            pickle.dump(likelihoods, output)
            output.close()

        def evaluate():
            with open(TEST_CORPUS_DIR, 'rb') as pickle_in:
                test_corpus = pickle.load(pickle_in, encoding='utf8')

            # Evaluate the cross-entropy of the test corpus under the model:
            # the average negative log2 probability of each item in the test corpus.
            file = open('bin/model_evaluation.txt', 'w')
            file.write('Model Evaluation Score (Entropy): {}'.format(
                lm.entropy(test_corpus)))
            file.close()

        evaluate()
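
A hypothetical follow-up showing how the pickled mapping might be consumed: each key is a context tuple y and each value a list of (score, next_word) pairs, so the most likely continuation is simply the max of that list.

import pickle

with open('bin/likelihoods.pkl', 'rb') as f:
    likelihoods = pickle.load(f)


def suggest_next(context_tokens):
    # Return the highest-scoring next word for the given context, if any was recorded.
    candidates = likelihoods.get(tuple(context_tokens), [])
    return max(candidates)[1] if candidates else None
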
Example #14
def vary_ngram(train_corpus, test_corpus, n_gram_orders):
    '''
    Use the nltk.lm.Laplace for training.
    Returns a dictionary of perplexity values at different order n-gram LMs

    :param train_corpus: list of list of str, corpus to train language model on.
    :param test_corpus: list of list of str, corpus to test language model on.
    :param n_gram_orders: list of ints, orders of n-grams desired.
    :returns: a dictionary of perplexities at different orders, key=order, value=perplexity.
    :rtype: dict.

    Hint: Follow the same LM training procedure as in the notebook at the end of Exercise 1.
    '''

    test = sum([['<s>'] + x + ['</s>'] for x in test_corpus], [])
    ret = {}
    for order in n_gram_orders:
        train, vocab = padded_everygram_pipeline(order, train_corpus)
        lm = Laplace(order)
        lm.fit(train, vocab)
        ret[order] = lm.perplexity(test)
    return ret
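
A hypothetical usage sketch with a small slice of the Brown corpus (any tokenized train/test split works; the corpus choice here is only for illustration):

from nltk.corpus import brown

sents = [[w.lower() for w in sent] for sent in brown.sents()[:2000]]
train_corpus, test_corpus = sents[:1800], sents[1800:]
print(vary_ngram(train_corpus, test_corpus, [1, 2]))  # {order: perplexity, ...}
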
Example #15
class NGramSentences:
    def __init__(self, n=3, filename='cache/book.txt'):
        with open(filename) as file:
            text = file.read()

        tokens = [
            list(map(str.lower, word_tokenize(sent)))
            for sent in sent_tokenize(text)
        ]
        train, vocab = padded_everygram_pipeline(n, tokens)

        self.model = Laplace(n)
        self.model.fit(train, vocab)

    def generate(self, prev_word='<s>', max_words=25):
        return detokenize(
            list(
                itertools.takewhile(
                    lambda word: word != '</s>',
                    itertools.dropwhile(
                        lambda word: word == '<s>',
                        (word for word in self.model.generate(
                            max_words, text_seed=[prev_word]))))))
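
detokenize is not shown above; a minimal stand-in using NLTK's Treebank detokenizer would be:

from nltk.tokenize.treebank import TreebankWordDetokenizer


def detokenize(tokens):
    # Join a list of tokens back into a readable sentence string.
    return TreebankWordDetokenizer().detokenize(tokens)
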
Example #16
    entropy = -1 * mean
    perplexity = pow(2.0, entropy)
    return perplexity


def avg_sent_perplexity(corpus, lm):
    perplexities = []
    for sent in corpus:
        ngrams = [ngram for ngram in sent]
        perplexities.append(lm.perplexity(ngrams))
    return sum(perplexities) / len(perplexities)


if __name__ == '__main__':
    args = parse_args()

    lm = Laplace(args.n)  # smoothing

    if args.train is not None:
        train_corpus = load_corpus(args.train)
    else:
        train_corpus = brown.sents()
    train, vocab = padded_everygram_pipeline(args.n, train_corpus)
    lm.fit(train, vocab)

    for test_file in args.corpora:
        test_corpus = load_corpus(test_file)
        test, vocab = padded_everygram_pipeline(args.n, test_corpus)
        perplexity = avg_sent_perplexity(test, lm)
        print('{}: {}'.format(test_file, perplexity))
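
The opening of this example is cut off; the surviving lines suggest a per-sentence helper that turns a mean log2 score into perplexity. A hypothetical reconstruction for illustration only, not the original code:

def sent_perplexity(sent_ngrams, lm):
    # Perplexity of one sentence: negate the mean log2 score and exponentiate.
    scores = [lm.logscore(ngram[-1], ngram[:-1]) for ngram in sent_ngrams]
    mean = sum(scores) / len(scores)
    entropy = -1 * mean
    perplexity = pow(2.0, entropy)
    return perplexity
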
Example #17
class Viterbi():
    def __init__(self, num_of_tags, num_of_vocab, tags, bigram_tags,
                 sents_tags: List[Tuple[List, List]]):
        self.num_of_tags = num_of_tags
        self.num_of_vocab = num_of_vocab
        self.tags = list(tags)
        self.bigram_tags = bigram_tags
        self.sents_tags = sents_tags
        self.laplace = Laplace(2)
        self.laplace.fit(text=[self.bigram_tags], vocabulary_text=self.tags)

        self.cache = {}

    def _get_prob_wi_ti(self, word, tag):
        key = word + tag
        if key in self.cache:
            return self.cache[key]
        tag_cnt = 0
        word_tag_cnt = 0
        for s, t in self.sents_tags:
            if len(s) != len(t):
                raise Exception(
                    f'sentence and tag are not aligned.\n sentence:{s}\n tag:{t}'
                )
            tag_cnt += t.count(tag)
            word_tag_cnt += len(
                [i for i in range(len(s)) if s[i] == word and t[i] == tag])

        self.cache[key] = (word_tag_cnt + 1) / (tag_cnt + self.num_of_vocab)
        return self.cache[key]

    def _get_prob_ti_1_and_ti(self, ti_1: str, ti: str):
        return self.laplace.score(ti, [ti_1])

    def viterbi(self, sentence):
        """
        sentence = ['من','به','مدرسه','رفتم']
        """
        # init viterbi table

        viterbi = [[0 for _ in range(len(sentence) + 1)]
                   for _ in range(self.num_of_tags)]
        backtrace = [[0 for _ in range(len(sentence) + 1)]
                     for _ in range(self.num_of_tags)]

        for index in range(len(self.tags)):
            p_trans = self._get_prob_ti_1_and_ti('<s>', self.tags[index])
            p_emis = self._get_prob_wi_ti(sentence[0], self.tags[index])
            viterbi[index][0] = p_trans * p_emis
            backtrace[index][0] = 0

        for w_index in range(1, len(sentence)):
            cur_word = sentence[w_index]
            for t_index in range(len(self.tags)):
                cur_tag = self.tags[t_index]
                p_emis = self._get_prob_wi_ti(cur_word, cur_tag)
                tmp = [
                    viterbi[i][w_index - 1] *
                    self._get_prob_ti_1_and_ti(self.tags[i], cur_tag) * p_emis
                    for i in range(len(self.tags))
                ]
                viterbi[t_index][w_index] = max(tmp)
                backtrace[t_index][w_index] = np.argmax([
                    viterbi[i][w_index - 1] *
                    self._get_prob_ti_1_and_ti(self.tags[i], cur_tag)
                    for i in range(len(self.tags))
                ])

        viterbi[-1][-1] = max([
            viterbi[i][len(sentence) - 1] *
            self._get_prob_ti_1_and_ti(self.tags[i], '</s>')
            for i in range(len(self.tags))
        ])
        backtrace[-1][-1] = np.argmax([
            viterbi[i][len(sentence) - 1] *
            self._get_prob_ti_1_and_ti(self.tags[i], '</s>')
            for i in range(len(self.tags))
        ])

        result = backtrace[-1][1:]  # the last row contains the tag indices
        return ['<s>'] + [self.tags[result[i]]
                          for i in range(0, len(result))] + ['</s>']
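
The emission probability in _get_prob_wi_ti is add-one smoothed, P(w | t) = (count(w, t) + 1) / (count(t) + |V|), while the transition probability comes from the Laplace bigram model over tags. A quick sanity check of the emission formula with made-up counts:

# Word seen twice with the tag, tag seen 10 times, vocabulary of 4 words.
word_tag_cnt, tag_cnt, num_of_vocab = 2, 10, 4
assert abs((word_tag_cnt + 1) / (tag_cnt + num_of_vocab) - 3 / 14) < 1e-12
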
Example #18
def laplace_bigram_model(bigram_training_data, vocabulary):
    model = Laplace(2, vocabulary=vocabulary)
    model.fit(bigram_training_data)
    return model
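
A hypothetical way to prepare the two arguments, using padded bigram data and an explicit vocabulary (sentence data invented for illustration):

from nltk.lm import Vocabulary
from nltk.lm.preprocessing import flatten, pad_both_ends, padded_everygram_pipeline

sentences = [["the", "cat", "sat"], ["the", "dog", "ran"]]
bigram_training_data, _ = padded_everygram_pipeline(2, sentences)
vocabulary = Vocabulary(flatten(pad_both_ends(s, n=2) for s in sentences), unk_cutoff=1)
model = laplace_bigram_model(bigram_training_data, vocabulary)
print(model.score("cat", ["the"]))  # (1 + 1) / (2 + 8) = 0.2
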
Example #19
from nltk.lm import Laplace, Vocabulary
from nltk.lm.preprocessing import flatten, pad_both_ends
from nltk.util import everygrams, trigrams

# `y` and `text` are assumed to be lists of tokenized sentences (not shown in this snippet).
# Note that, despite its name, bigramsList actually holds trigrams.
bigramsList = list(map(lambda x: list(trigrams(x)), y))
bigramsList = list(flatten(bigramsList))
#list(everygrams(bigramsList, max_len=2))

vocab = list(flatten(pad_both_ends(sent, n=2) for sent in text))
vocab = list(Vocabulary(vocab, unk_cutoff=1))
'''
from nltk.lm.preprocessing import padded_everygram_pipeline
train, vocab = padded_everygram_pipeline(2, text)
'''

lm = Laplace(3)
lm.fit([bigramsList], vocabulary_text=list(vocab))

lm.generate(4, text_seed=["government", "had"])


def generateSentences(v):
    sent = v
    v = [lm.generate(1, text_seed=v)]
    sent = sent + v
    while v[0] != '</s>':
        l = len(sent)
        v = [lm.generate(1, text_seed=[sent[l - 2], sent[l - 1]])]
        sent = sent + v
    return sent
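
For example, starting from a two-word seed (the output depends on the fitted corpus, and sampling continues until '</s>' is drawn, which can take a while for a large vocabulary):

print(generateSentences(["government", "had"]))
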

Example #20
def ngram_model(train_data, vocab, n):
    model = Laplace(n)
    model.fit(train_data, vocab)
    return model
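
A hypothetical usage, with the training n-grams and vocabulary coming from padded_everygram_pipeline (toy sentences for illustration):

from nltk.lm.preprocessing import padded_everygram_pipeline

sentences = [["a", "b", "c"], ["a", "c", "d"]]
train_data, vocab = padded_everygram_pipeline(2, sentences)
model = ngram_model(train_data, vocab, 2)
print(model.score("b", ["a"]))  # (1 + 1) / (2 + 7) ≈ 0.222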