コード例 #1
0
def train_ngram_lm(dataset, data, ngram=3, gamma=0.5):
    """Fit a Lidstone-smoothed n-gram language model and pickle it.

    Args:
        dataset: Name of the directory under ``./data`` that receives
            ``lm.pkl``.
        data: Text handed to ``padded_everygram_pipeline``.
        ngram: Highest n-gram order used for the pipeline and the model.
        gamma: Lidstone smoothing constant.
    """
    # Report the values actually in use rather than the defaults.
    print(f'[!] max {ngram}-gram, Lidstone smoothing with gamma {gamma}')
    train, vocab = padded_everygram_pipeline(ngram, data)
    lm = Lidstone(gamma, ngram)
    lm.fit(train, vocab)
    path = f'./data/{dataset}/lm.pkl'
    with open(path, 'wb') as f:
        pickle.dump(lm, f)
    print(f'[!] ngram language model saved into {path}')
コード例 #2
0
class LidstoneBigramTests(unittest.TestCase):
    """Checks for a bigram Lidstone model with gamma = 0.1."""

    # Each entry is (word, context, expected score).
    score_tests = [
        # Bigram "c d" was seen once: smoothed count 1 + 0.1 = 1.1.
        # Context "c" has a total count of 1; with a vocabulary of 8 the
        # smoothed total is 1 + 8 * 0.1 = 1.8.
        ("d", ["c"], 1.1 / 1.8),
        # Unigram case: 14 tokens in total and a vocabulary of 8, so the
        # denominator is 14 + 0.8 = 14.8.
        # "a" occurs twice -> 2.1.
        ("a", None, 2.1 / 14.8),
        # "z" is in the vocabulary but never observed -> 0.1.
        ("z", None, 0.1 / 14.8),
        # "y" is out of vocabulary, so it is scored as "<UNK>",
        # which occurs 3 times -> 3.1.
        ("y", None, 3.1 / 14.8),
    ]

    def setUp(self):
        vocabulary, sentences = _prepare_test_data(2)
        self.model = Lidstone(0.1, 2, vocabulary=vocabulary)
        self.model.fit(sentences)

    def test_gamma(self):
        self.assertEqual(0.1, self.model.gamma)

    def test_entropy_perplexity(self):
        # Consecutive pairs of this token sequence are the bigrams to score.
        tokens = ["<s>", "a", "c", "<UNK>", "d", "c", "</s>"]
        text = list(zip(tokens, tokens[1:]))
        # Smoothing means bigrams never seen in training still get a score,
        # which plain MLE could not provide.
        #
        #   bigram     score    log2 score
        #   <s>, a     0.3929   -1.3479
        #   a, c       0.0357   -4.8074
        #   c, UNK     0.0(5)   -4.1699
        #   UNK, d     0.0263   -5.2479
        #   d, c       0.0357   -4.8074
        #   c, </s>    0.0(5)   -4.1699
        #
        # Sum of log scores: -24.5504; negated mean: 4.0917.
        expected_entropy = 4.0917
        expected_perplexity = 17.0504
        self.assertAlmostEqual(
            expected_entropy, self.model.entropy(text), places=4)
        self.assertAlmostEqual(
            expected_perplexity, self.model.perplexity(text), places=4)
コード例 #3
0
class TestLidstoneBigram(metaclass=ParametrizedTests):
    """Checks for a bigram Lidstone model with gamma = 0.1."""

    # Each entry is (word, context, expected score).
    # NOTE(review): presumably consumed by the ParametrizedTests metaclass,
    # which is defined outside this excerpt - confirm.
    score_tests = [
        # Bigram "c d" was seen once: smoothed count 1 + 0.1 = 1.1.
        # Context "c" has a total count of 1; with a vocabulary of 8 the
        # smoothed total is 1 + 8 * 0.1 = 1.8.
        ("d", ["c"], 1.1 / 1.8),
        # Unigram case: 14 tokens in total and a vocabulary of 8, so the
        # denominator is 14 + 0.8 = 14.8.
        # "a" occurs twice -> 2.1.
        ("a", None, 2.1 / 14.8),
        # "z" is in the vocabulary but never observed -> 0.1.
        ("z", None, 0.1 / 14.8),
        # "y" is out of vocabulary, so it is scored as "<UNK>",
        # which occurs 3 times -> 3.1.
        ("y", None, 3.1 / 14.8),
    ]

    @classmethod
    def setup_method(cls):
        # Declared as a classmethod, so the fitted model is stored on the class.
        vocabulary, sentences = _prepare_test_data(2)
        cls.model = Lidstone(0.1, 2, vocabulary=vocabulary)
        cls.model.fit(sentences)

    def test_gamma(self):
        assert 0.1 == self.model.gamma

    def test_entropy_perplexity(self):
        # Consecutive pairs of this token sequence are the bigrams to score.
        tokens = ["<s>", "a", "c", "<UNK>", "d", "c", "</s>"]
        text = list(zip(tokens, tokens[1:]))
        # Smoothing means bigrams never seen in training still get a score,
        # which plain MLE could not provide.
        #
        #   bigram     score    log2 score
        #   <s>, a     0.3929   -1.3479
        #   a, c       0.0357   -4.8074
        #   c, UNK     0.0(5)   -4.1699
        #   UNK, d     0.0263   -5.2479
        #   d, c       0.0357   -4.8074
        #   c, </s>    0.0(5)   -4.1699
        #
        # Sum of log scores: -24.5504; negated mean: 4.0917.
        expected_entropy = 4.0917
        expected_perplexity = 17.0504
        assert pytest.approx(self.model.entropy(text), 1e-4) == expected_entropy
        assert (
            pytest.approx(self.model.perplexity(text), 1e-4)
            == expected_perplexity
        )
コード例 #4
0
ファイル: test_models.py プロジェクト: rmalouf/nltk
class LidstoneBigramTests(unittest.TestCase):
    """Checks for a bigram Lidstone model with gamma = 0.1."""

    # Each entry is (word, context, expected score).
    score_tests = [
        # Bigram "c d" was seen once: smoothed count 1 + 0.1 = 1.1.
        # Context "c" has a total count of 1; with a vocabulary of 8 the
        # smoothed total is 1 + 8 * 0.1 = 1.8.
        ("d", ["c"], 1.1 / 1.8),
        # Unigram case: 14 tokens in total and a vocabulary of 8, so the
        # denominator is 14 + 0.8 = 14.8.
        # "a" occurs twice -> 2.1.
        ("a", None, 2.1 / 14.8),
        # "z" is in the vocabulary but never observed -> 0.1.
        ("z", None, 0.1 / 14.8),
        # "y" is out of vocabulary, so it is scored as "<UNK>",
        # which occurs 3 times -> 3.1.
        ("y", None, 3.1 / 14.8),
    ]

    def setUp(self):
        vocabulary, sentences = _prepare_test_data(2)
        self.model = Lidstone(0.1, 2, vocabulary=vocabulary)
        self.model.fit(sentences)

    def test_gamma(self):
        self.assertEqual(0.1, self.model.gamma)

    def test_entropy_perplexity(self):
        # Consecutive pairs of this token sequence are the bigrams to score.
        tokens = ["<s>", "a", "c", "<UNK>", "d", "c", "</s>"]
        text = list(zip(tokens, tokens[1:]))
        # Smoothing means bigrams never seen in training still get a score,
        # which plain MLE could not provide.
        #
        #   bigram     score    log2 score
        #   <s>, a     0.3929   -1.3479
        #   a, c       0.0357   -4.8074
        #   c, UNK     0.0(5)   -4.1699
        #   UNK, d     0.0263   -5.2479
        #   d, c       0.0357   -4.8074
        #   c, </s>    0.0(5)   -4.1699
        #
        # Sum of log scores: -24.5504; negated mean: 4.0917.
        expected_entropy = 4.0917
        expected_perplexity = 17.0504
        self.assertAlmostEqual(
            expected_entropy, self.model.entropy(text), places=4)
        self.assertAlmostEqual(
            expected_perplexity, self.model.perplexity(text), places=4)
コード例 #5
0
def lidstone(v: int, n: int, gamma: float, train_file: str, test_file: str):
    """
    Train one Lidstone-smoothed model per language and evaluate on test tweets.

    A separate model is fitted for every language found in the training file;
    each test tweet is then passed to ``argmax`` together with all models, and
    the outcome is formatted and printed.

    :param v: Vocabulary choice
    :param n: ngram choice
    :param gamma: Smoothing choice
    :param train_file: Path to the tab-separated training data
    :param test_file: Path to the tab-separated testing data
    :return: None
    """
    validate_params(v, n, gamma, train_file, test_file)

    column_names = [DF_COLUMN_ID, DF_COLUMN_NAME, DF_COLUMN_LANG, DF_COLUMN_TWEET]

    # ---- Training set: load, keep language + tweet, tokenize ----
    train_data = pd.read_csv(train_file, delimiter='\t', names=column_names)
    train_data.drop(labels=[DF_COLUMN_ID, DF_COLUMN_NAME], inplace=True, axis=1)
    transform_to_vocab(train_data, v)

    def tokenize_for_training(tweet):
        return tokenize(tweet, LIDSTONE_TOKENIZE_SPAN)

    train_data[DF_COLUMN_TWEET] = train_data[DF_COLUMN_TWEET].map(tokenize_for_training)

    # ---- One model per language ----
    models_by_lang = {}
    tweets_per_language = train_data.groupby(DF_COLUMN_LANG)[DF_COLUMN_TWEET]
    for language, tweets in tweets_per_language:
        train_ngrams, padded_vocab = padded_everygram_pipeline(n, tweets.tolist())
        language_model = Lidstone(gamma=gamma, order=n)
        language_model.fit(train_ngrams, padded_vocab)
        models_by_lang[language] = language_model

    # ---- Test set: load, tokenize into n-grams, rewrite padding symbols ----
    test_data = pd.read_csv(test_file, delimiter='\t', names=column_names)
    transform_to_vocab(test_data, v)

    def tokenize_for_testing(tweet):
        return tokenize(tweet=tweet, span=n, extended_func=True)

    def fix_padding(tweet_ngrams):
        fixed = []
        for ngram in tweet_ngrams:
            fixed.append([modify_padding(ngram_char) for ngram_char in list(ngram)])
        return fixed

    def best_language(tweet_ngrams):
        return argmax(models_by_lang, tweet_ngrams)

    test_data[DF_COLUMN_TWEET] = test_data[DF_COLUMN_TWEET].map(tokenize_for_testing)
    test_data[DF_COLUMN_TWEET] = test_data[DF_COLUMN_TWEET].map(fix_padding)

    # ---- Scoring: the tweet column now holds what argmax returned ----
    test_data[DF_COLUMN_TWEET] = test_data[DF_COLUMN_TWEET].map(best_language)
    score_lang_df = pd.DataFrame(
        test_data[DF_COLUMN_TWEET].tolist(),
        columns=[DF_COLUMN_SCORE, DF_COLUMN_GUESS],
    )

    # ---- Results ----
    results = finalize_result_df(prepare_result_df(test_data, score_lang_df))

    # ---- Evaluation stats ----
    banner = "Evaluating Lidstone with parameters: [vocabulary = {}, ngram size = {}, delta = {}]"
    print(banner.format(v, n, gamma))
    format_results(results)
コード例 #6
0
ファイル: test_models.py プロジェクト: rmalouf/nltk
class LidstoneTrigramTests(unittest.TestCase):
    """Score checks for a trigram Lidstone model with gamma = 0.1."""

    # Each entry is (word, context, expected score).
    score_tests = [
        # One-word context: same reasoning as for the bigram model.
        ("d", ["c"], 1.1 / 1.8),
        # A word that has not been seen in this context only gets gamma.
        ("e", ["c"], 0.1 / 1.8),
        # Two-word context, i.e. the actual trigram scores.
        ("d", ["b", "c"], 1.1 / 1.8),
        ("e", ["b", "c"], 0.1 / 1.8),
    ]

    def setUp(self):
        vocabulary, sentences = _prepare_test_data(3)
        self.model = Lidstone(0.1, 3, vocabulary=vocabulary)
        self.model.fit(sentences)
コード例 #7
0
class LidstoneTrigramTests(unittest.TestCase):
    """Score checks for a trigram Lidstone model with gamma = 0.1."""

    # Each entry is (word, context, expected score).
    score_tests = [
        # One-word context: same reasoning as for the bigram model.
        ("d", ["c"], 1.1 / 1.8),
        # A word that has not been seen in this context only gets gamma.
        ("e", ["c"], 0.1 / 1.8),
        # Two-word context, i.e. the actual trigram scores.
        ("d", ["b", "c"], 1.1 / 1.8),
        ("e", ["b", "c"], 0.1 / 1.8),
    ]

    def setUp(self):
        vocabulary, sentences = _prepare_test_data(3)
        self.model = Lidstone(0.1, 3, vocabulary=vocabulary)
        self.model.fit(sentences)
コード例 #8
0
class TestLidstoneTrigram(metaclass=ParametrizedTests):
    """Score checks for a trigram Lidstone model with gamma = 0.1."""

    # Each entry is (word, context, expected score).
    # NOTE(review): presumably consumed by the ParametrizedTests metaclass,
    # which is defined outside this excerpt - confirm.
    score_tests = [
        # One-word context: same reasoning as for the bigram model.
        ("d", ["c"], 1.1 / 1.8),
        # A word that has not been seen in this context only gets gamma.
        ("e", ["c"], 0.1 / 1.8),
        # Two-word context, i.e. the actual trigram scores.
        ("d", ["b", "c"], 1.1 / 1.8),
        ("e", ["b", "c"], 0.1 / 1.8),
    ]

    @classmethod
    def setup_method(cls):
        # Declared as a classmethod, so the fitted model is stored on the class.
        vocabulary, sentences = _prepare_test_data(3)
        cls.model = Lidstone(0.1, 3, vocabulary=vocabulary)
        cls.model.fit(sentences)
コード例 #9
0
 def create_model(self, model_nm):
     """Build the language model named ``model_nm`` and fit it on ``self.text``.

     The fitted model is stored in ``self.model``.

     Args:
         model_nm: One of ``"lidstone"``, ``"kneserney"`` or ``"wittenbell"``.

     Raises:
         KeyError: If ``model_nm`` is not one of the names above.
     """
     # Map names to constructors so that only the requested model is
     # instantiated instead of building all three and discarding two.
     factories = {
         "lidstone": lambda: Lidstone(0.5, self.ngram_order),
         "kneserney": lambda: KneserNeyInterpolated(self.ngram_order),
         "wittenbell": lambda: WittenBellInterpolated(self.ngram_order),
     }
     self.model = factories[model_nm]()
     train, vocab = padded_everygram_pipeline(self.ngram_order, self.text)
     # Wrap the vocabulary text with a count cutoff of 2 and "<UNK>" as the
     # unknown-word label.
     vocab = Vocabulary(vocab, unk_cutoff=2, unk_label="<UNK>")
     print("Creating ngram...")
     self.model.fit(train, vocab)
     print("done")
コード例 #10
0
def train_lm_models(n, text):
    """Fit the enabled language models on ``text`` and return them by key.

    Args:
        n: N-gram order, passed to ``get_data`` and to each model.
        text: Training text, passed to ``get_data``.

    Returns:
        dict: ``{"2": <fitted Lidstone, gamma=0.5>, "1": <fitted MLE>}``.
        Keys ``"3"`` and ``"4"`` are not produced by this function.
    """
    gamma = 0.5
    models = {}

    # The training data is fetched again for every model, as before.
    # NOTE(review): presumably get_data returns single-use iterators - confirm.
    model2 = Lidstone(order=n, gamma=gamma)
    train_ngrams, train_vocab = get_data(n, text)
    model2.fit(train_ngrams, train_vocab)
    models["2"] = model2

    model1 = MLE(order=n)
    train_ngrams, train_vocab = get_data(n, text)
    model1.fit(train_ngrams, train_vocab)
    models["1"] = model1

    return models
コード例 #11
0
 def setup_method(self):
     """Fit a bigram Lidstone model (gamma = 0.1) on the shared test data."""
     vocabulary, sentences = _prepare_test_data(2)
     self.model = Lidstone(0.1, 2, vocabulary=vocabulary)
     self.model.fit(sentences)
コード例 #12
0
def trainModel(n, lines):
    """Fit an order-``n`` Lidstone model (gamma = 0.01) on ``lines``.

    Returns the fitted model; its vocabulary cutoff is set to 2 after fitting.
    """
    train_ngrams, vocab_text = pep(n, lines)
    lm = Lidstone(0.01, n)
    lm.fit(train_ngrams, vocab_text)
    # The cutoff is changed on the private attribute only once fitting is done.
    lm.vocab._cutoff = 2
    return lm
コード例 #13
0
    min(inverse_vocabulary_state_union)[1],
    min(inverse_vocabulary_state_union)
    [0])  #poorest vocabulary for Johnson in 1963

# Exercise 2

# Bigram model with maximum-likelihood estimates (no smoothing).
train, vocab = padded_everygram_pipeline(2, state_union.sents())
lm = MLE(2)
lm.fit(train, vocab)
print(lm.counts['America'])
print(lm.counts[['bless']]['America'])
print(lm.score('the'))
print(lm.score("America", ["bless"]))

# Bigram model with Lidstone smoothing, gamma = 0.5.
# Lidstone takes gamma as its first argument and the order second, so the
# arguments are passed by keyword to avoid swapping them.
train, vocab = padded_everygram_pipeline(2, state_union.sents())
lm = Lidstone(gamma=0.5, order=2)
lm.fit(train, vocab)
print(lm.counts['America'])
print(lm.counts[['bless']]['America'])
print(lm.score('the'))
print(lm.score("America", ["bless"]))

# Bigram model with Laplace smoothing.
train, vocab = padded_everygram_pipeline(2, state_union.sents())
lm = Laplace(2)
lm.fit(train, vocab)
print(lm.counts['America'])
print(lm.counts[['bless']]['America'])
print(lm.score('the'))
print(lm.score("America", ["bless"]))

# Fresh pipeline for the next model; the iterators above were handed to fit().
train, vocab = padded_everygram_pipeline(2, state_union.sents())
コード例 #14
0
# Turn every Taiwan document into a list of its elements.
# NOTE(review): presumably each doc is a string, making this a character
# split - confirm against where taiwan_dataset is built.
taiwan_dataset = [list(doc) for doc in taiwan_dataset]

# Highest n-gram order, shared by both pipelines and both models.
N = 3

print(f"Process: Building n-grams with N={N}")
# Build the padded everygram training data and vocabulary text for each
# dataset (China first, then Taiwan).
china_train, china_vocab = padded_everygram_pipeline(order=N,
                                                     text=china_dataset)
taiwan_train, taiwan_vocab = padded_everygram_pipeline(order=N,
                                                       text=taiwan_dataset)

print("Process: Train the model")
from nltk.lm import Lidstone

# Both models use the same Lidstone smoothing constant.
gamma_param = 0.5
china_model = Lidstone(gamma=gamma_param, order=N)
china_model.fit(china_train, china_vocab)
taiwan_model = Lidstone(gamma=gamma_param, order=N)
taiwan_model.fit(taiwan_train, taiwan_vocab)

import math


def log_score(model, N, sentence):
    """Accumulate the natural-log score of every order-N n-gram of a sentence.

    The sentence is padded at both ends, split into n-grams of length ``N``,
    and each n-gram's last element is scored given the preceding elements.

    NOTE(review): this excerpt ends inside the loop; no return statement is
    visible here.
    """
    # The local accumulator reuses the function's own name inside the body.
    log_score = 0.0
    sentence = pad_both_ends(list(sentence), n=N)
    ngram_sents = list(ngrams(sentence, n=N))
    for ngram_sent in ngram_sents:
        # word = last element of the n-gram, context = everything before it.
        log_score += math.log(
            model.unmasked_score(word=ngram_sent[-1],
                                 context=ngram_sent[0:-1]))
コード例 #15
0
def lidstone_trigram_model(trigram_training_data, vocabulary):
    """Return a trigram Lidstone model (gamma = 0.1) fitted on the given data."""
    trigram_lm = Lidstone(0.1, order=3, vocabulary=vocabulary)
    trigram_lm.fit(trigram_training_data)
    return trigram_lm
コード例 #16
0
ファイル: test_models.py プロジェクト: rmalouf/nltk
 def setUp(self):
     """Fit a trigram Lidstone model (gamma = 0.1) on the shared test data."""
     vocabulary, sentences = _prepare_test_data(3)
     self.model = Lidstone(0.1, 3, vocabulary=vocabulary)
     self.model.fit(sentences)
コード例 #17
0
 def setUp(self):
     """Fit a trigram Lidstone model (gamma = 0.1) on the shared test data."""
     vocabulary, sentences = _prepare_test_data(3)
     self.model = Lidstone(0.1, 3, vocabulary=vocabulary)
     self.model.fit(sentences)
コード例 #18
0
def main():
    """
    Train a Lidstone language model on tagged statuses and export entropies.

    Steps:
      1. Load the tab-separated data file and keep statuses with more than
         ``min_status_len`` space-separated tokens.
      2. Split every status into sentences and fit a Lidstone model
         (gamma = 0.1) on the n-grams of all sentences.
      3. Compute the entropy of each sentence in a worker pool, and the mean
         sentence entropy per status.
      4. Emit one row per entity token found in a sentence (or one row with a
         null-entity placeholder) and write the table next to the input file
         with the suffix ``_entropy.tsv``.

    Command-line arguments:
        --data_file: path to the tab-separated input file.
        --ngram_order: n-gram order of the language model (int, default 3).
    """
    parser = ArgumentParser()
    parser.add_argument(
        '--data_file',
        default='../../data/facebook-maria/combined_group_data_es_tagged.tsv')
    # type=int: without it a value supplied on the command line arrives as a
    # str, which breaks the '%d' formatting below and the model order.
    parser.add_argument('--ngram_order', type=int, default=3)
    args = vars(parser.parse_args())

    ## load data
    combined_data = pd.read_csv(args['data_file'], sep='\t', index_col=False)
    # remove short statuses (e.g. URL-only ones): keep those with more than
    # min_status_len space-separated tokens
    min_status_len = 3
    combined_data = combined_data[
        combined_data.loc[:, 'status_message_ne_tagged_stemmed'].apply(
            lambda x: len(str(x).split(' '))) > min_status_len]

    ## compute entropy
    ## (1) per-post
    ## (2) per-mention (within sentence)
    ## (3) per-mention (within fixed window?)
    ## train language model
    ngram_order = args['ngram_order']
    tokenizer = ToktokTokenizer(
    )  # use TokTok for tokens because it's multilingual
    sent_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
    combined_data.loc[:, 'status_message_sents'] = combined_data.loc[
        :, 'status_message_ne_tagged_stemmed'].apply(sent_tokenizer.tokenize)
    # Flatten the per-status sentence lists in a single linear pass
    # (repeated list concatenation is quadratic in the number of statuses).
    combined_data_sents = [
        sent
        for sents in combined_data.loc[:, 'status_message_sents'].values
        for sent in sents
    ]
    # Lazily yields one n-gram stream per sentence.
    combined_data_ngrams = ((
        ngram for ngram in generate_ngrams(sent, tokenizer, n=ngram_order))
                            for sent in combined_data_sents)
    # train Lidstone language model
    gamma = 0.1
    combined_data_lm = Lidstone(order=ngram_order, gamma=gamma)
    # Vocabulary = every distinct token over all statuses.
    vocab = {
        token
        for txt in
        combined_data.loc[:, 'status_message_ne_tagged_stemmed'].values
        for token in tokenizer.tokenize(txt)
    }
    print('about to fit LM with order=%d and vocab=%d' %
          (ngram_order, len(vocab)))
    combined_data_lm.fit(combined_data_ngrams, vocabulary_text=vocab)

    ## split data to per-sentence for easier handling
    print('about to compute entropy for all sentences')
    # multi-threading for slightly faster performance
    MAX_THREADS = 10
    combined_data_sents_ordered = combined_data.loc[:,
                                                    'status_message_sents'].values
    # The context manager releases the workers once starmap has returned.
    with Pool(MAX_THREADS) as pool:
        combined_data_entropy = pool.starmap(
            compute_entropy_sent,
            zip(combined_data_sents_ordered, repeat(combined_data_lm),
                repeat(tokenizer), repeat(ngram_order)))
    print('combined data entropy shape %d' % (len(combined_data_entropy)))
    combined_data.loc[:, 'sent_entropy'] = combined_data_entropy
    ## compute entropy for each (1) post (2) entity
    ## format: status ID, publish time, status message, entity name, post entropy, sentence entropy
    combined_data.loc[:, 'post_entropy'] = combined_data.loc[
        :, 'sent_entropy'].apply(lambda x: np.mean(x))
    combined_data_flat = []
    # Raw string: "\w" and "\." are invalid escapes in a normal string literal.
    ne_matcher = re.compile(r'\w+\.<ne\.\w+>')
    ne_null = 'NULL_ENTITY.<ne>'
    for _, combined_data_i in combined_data.iterrows():
        status_id_i = combined_data_i.loc['status_id']
        status_time_i = combined_data_i.loc['status_published']
        status_message_i = combined_data_i.loc[
            'status_message_ne_tagged_stemmed']
        entropy_i = combined_data_i.loc['post_entropy']
        # Walk the sentences of this status together with their entropies.
        for sent_j, entropy_j in zip(*combined_data_i.loc[
            ['status_message_sents', 'sent_entropy']].values):
            print('processing sent %s' % (sent_j))
            sent_tokens = tokenizer.tokenize(sent_j)
            sent_tokens_ne = list(
                filter(lambda x: ne_matcher.search(x) is not None,
                       sent_tokens))
            if (len(sent_tokens_ne) > 0):
                # one row per entity token in the sentence
                data_j = pd.DataFrame([[
                    status_id_i, status_time_i, status_message_i, sent_j,
                    entropy_i, sent_token_ne, entropy_j
                ] for sent_token_ne in sent_tokens_ne])
            else:
                # no entity: a single row with the null-entity placeholder
                data_j = pd.DataFrame([[
                    status_id_i, status_time_i, status_message_i, sent_j,
                    entropy_i, ne_null, entropy_j
                ]])
            print('adding data with shape %s' % (len(data_j)))
            combined_data_flat.append(data_j)
    combined_data_flat_cols = [
        'status_id', 'status_published', 'status_message', 'sent',
        'post_entropy', 'entity', 'sent_entropy'
    ]
    combined_data_flat = pd.concat(combined_data_flat, axis=0)
    combined_data_flat.columns = combined_data_flat_cols

    ## get examples of low/high entropy statuses
    combined_data_flat.sort_values('sent_entropy',
                                   inplace=True,
                                   ascending=False)
    print(combined_data_flat.loc[:, 'status_message'].head(5))
    print(combined_data_flat.loc[:, 'status_message'].tail(5))

    ## save flat data
    out_file = args['data_file'].replace('.tsv', '_entropy.tsv')
    combined_data_flat.to_csv(out_file, sep='\t', index=False)