import pickle

from nltk.lm import Lidstone
from nltk.lm.preprocessing import padded_everygram_pipeline


def train_ngram_lm(dataset, data, ngram=3, gamma=0.5):
    print(f'[!] max {ngram}-gram, Lidstone smoothing with gamma {gamma}')
    train, vocab = padded_everygram_pipeline(ngram, data)
    lm = Lidstone(gamma, ngram)
    lm.fit(train, vocab)
    with open(f'./data/{dataset}/lm.pkl', 'wb') as f:
        pickle.dump(lm, f)
    print(f'[!] ngram language model saved into ./data/{dataset}/lm.pkl')
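# A minimal usage sketch for train_ngram_lm. The dataset name "example"
# and the toy sentences are illustrative assumptions (not from the source),
# and ./data/example/ must already exist for the pickle dump to succeed.
toy_data = [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']]
train_ngram_lm('example', toy_data, ngram=3, gamma=0.5)

with open('./data/example/lm.pkl', 'rb') as f:
    lm = pickle.load(f)
print(lm.score('b', ('a',)))  # Lidstone-smoothed P(b | a)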
class LidstoneBigramTests(unittest.TestCase):
    """unit tests for Lidstone class"""

    score_tests = [
        # count(d | c) = 1
        # *count(d | c) = 1.1
        # Count(w | c for w in vocab) = 1
        # *Count(w | c for w in vocab) = 1.8
        ("d", ["c"], 1.1 / 1.8),
        # Total unigrams: 14
        # Vocab size: 8
        # Denominator: 14 + 0.8 = 14.8
        # count("a") = 2
        # *count("a") = 2.1
        ("a", None, 2.1 / 14.8),
        # in vocabulary but unseen
        # count("z") = 0
        # *count("z") = 0.1
        ("z", None, 0.1 / 14.8),
        # out of vocabulary should use "UNK" score
        # count("<UNK>") = 3
        # *count("<UNK>") = 3.1
        ("y", None, 3.1 / 14.8),
    ]

    def setUp(self):
        vocab, training_text = _prepare_test_data(2)
        self.model = Lidstone(0.1, 2, vocabulary=vocab)
        self.model.fit(training_text)

    def test_gamma(self):
        self.assertEqual(0.1, self.model.gamma)

    def test_entropy_perplexity(self):
        text = [
            ("<s>", "a"),
            ("a", "c"),
            ("c", "<UNK>"),
            ("<UNK>", "d"),
            ("d", "c"),
            ("c", "</s>"),
        ]
        # Unlike MLE this should be able to handle completely novel ngrams
        # Ngram = score, log score
        # <s>, a  = 0.3929, -1.3479
        # a, c    = 0.0357, -4.8074
        # c, UNK  = 0.0(5), -4.1699
        # UNK, d  = 0.0263, -5.2479
        # d, c    = 0.0357, -4.8074
        # c, </s> = 0.0(5), -4.1699
        # TOTAL logscore: -24.5504
        # - AVG logscore: 4.0917
        H = 4.0917
        perplexity = 17.0504
        self.assertAlmostEqual(H, self.model.entropy(text), places=4)
        self.assertAlmostEqual(perplexity, self.model.perplexity(text), places=4)
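# The expected scores above all follow from the Lidstone formula
#   P(w | ctx) = (count(w, ctx) + gamma) / (count(ctx) + gamma * V)
# with gamma = 0.1 and vocabulary size V = 8, per the counts quoted in
# the comments. A quick standalone check of the arithmetic:
import math

gamma, V = 0.1, 8
assert math.isclose((1 + gamma) / (1 + gamma * V), 1.1 / 1.8)    # P(d | c)
assert math.isclose((2 + gamma) / (14 + gamma * V), 2.1 / 14.8)  # P(a)
assert math.isclose((0 + gamma) / (14 + gamma * V), 0.1 / 14.8)  # P(z), in vocab but unseen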
class TestLidstoneBigram(metaclass=ParametrizedTests):
    """Unit tests for Lidstone class"""

    score_tests = [
        # count(d | c) = 1
        # *count(d | c) = 1.1
        # Count(w | c for w in vocab) = 1
        # *Count(w | c for w in vocab) = 1.8
        ("d", ["c"], 1.1 / 1.8),
        # Total unigrams: 14
        # Vocab size: 8
        # Denominator: 14 + 0.8 = 14.8
        # count("a") = 2
        # *count("a") = 2.1
        ("a", None, 2.1 / 14.8),
        # in vocabulary but unseen
        # count("z") = 0
        # *count("z") = 0.1
        ("z", None, 0.1 / 14.8),
        # out of vocabulary should use "UNK" score
        # count("<UNK>") = 3
        # *count("<UNK>") = 3.1
        ("y", None, 3.1 / 14.8),
    ]

    @classmethod
    def setup_method(self):
        vocab, training_text = _prepare_test_data(2)
        self.model = Lidstone(0.1, 2, vocabulary=vocab)
        self.model.fit(training_text)

    def test_gamma(self):
        assert 0.1 == self.model.gamma

    def test_entropy_perplexity(self):
        text = [
            ("<s>", "a"),
            ("a", "c"),
            ("c", "<UNK>"),
            ("<UNK>", "d"),
            ("d", "c"),
            ("c", "</s>"),
        ]
        # Unlike MLE this should be able to handle completely novel ngrams
        # Ngram = score, log score
        # <s>, a  = 0.3929, -1.3479
        # a, c    = 0.0357, -4.8074
        # c, UNK  = 0.0(5), -4.1699
        # UNK, d  = 0.0263, -5.2479
        # d, c    = 0.0357, -4.8074
        # c, </s> = 0.0(5), -4.1699
        # TOTAL logscore: -24.5504
        # - AVG logscore: 4.0917
        H = 4.0917
        perplexity = 17.0504
        assert pytest.approx(self.model.entropy(text), 1e-4) == H
        assert pytest.approx(self.model.perplexity(text), 1e-4) == perplexity
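# `ParametrizedTests` is not shown in this excerpt; presumably it expands
# each (word, context, expected_score) triple in `score_tests` into its
# own test method. A minimal sketch of such a metaclass, under that
# assumption (the real helper in the test suite may differ):
import pytest


class ParametrizedTestsSketch(type):
    def __new__(mcs, name, bases, namespace):
        for i, (word, context, expected) in enumerate(namespace.get("score_tests", [])):
            def make_test(word=word, context=context, expected=expected):
                def test(self):
                    assert pytest.approx(expected, 1e-4) == self.model.score(word, context)
                return test
            # Generate one test method per score expectation.
            namespace[f"test_score_{i}"] = make_test()
        return super().__new__(mcs, name, bases, namespace)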
def lidstone(v: int, n: int, gamma: float, train_file: str, test_file: str):
    """
    Provides Lidstone-smoothed scores.

    In addition to the initialization arguments from BaseNgramModel, this
    also requires gamma: a number by which to increase the counts.

    :param v: Vocabulary choice
    :param n: ngram choice
    :param gamma: Smoothing choice
    :param train_file: Path to training data
    :param test_file: Path to testing data
    :return:
    """
    validate_params(v, n, gamma, train_file, test_file)

    # Process train data
    train_data = pd.read_csv(train_file, delimiter='\t',
                             names=[DF_COLUMN_ID, DF_COLUMN_NAME, DF_COLUMN_LANG, DF_COLUMN_TWEET])
    train_data.drop(labels=[DF_COLUMN_ID, DF_COLUMN_NAME], inplace=True, axis=1)
    transform_to_vocab(train_data, v)
    train_data[DF_COLUMN_TWEET] = train_data[DF_COLUMN_TWEET].map(
        lambda tweet: tokenize(tweet, LIDSTONE_TOKENIZE_SPAN))

    # Train one model per language
    models_by_lang = {}
    for language, tweets in train_data.groupby(DF_COLUMN_LANG)[DF_COLUMN_TWEET]:
        tweet_list = tweets.tolist()
        train_ngrams, padded_vocab = padded_everygram_pipeline(n, tweet_list)
        model = Lidstone(gamma=gamma, order=n)
        model.fit(train_ngrams, padded_vocab)
        models_by_lang[language] = model

    # Process test data
    test_data = pd.read_csv(test_file, delimiter='\t',
                            names=[DF_COLUMN_ID, DF_COLUMN_NAME, DF_COLUMN_LANG, DF_COLUMN_TWEET])
    transform_to_vocab(test_data, v)
    test_data[DF_COLUMN_TWEET] = test_data[DF_COLUMN_TWEET].map(
        lambda tweet: tokenize(tweet=tweet, span=n, extended_func=True))
    test_data[DF_COLUMN_TWEET] = test_data[DF_COLUMN_TWEET].map(
        lambda tweet_ngrams: [[modify_padding(ngram_char) for ngram_char in list(ngram)]
                              for ngram in tweet_ngrams])

    # Calculate scores: pick the language whose model scores each tweet highest
    test_data[DF_COLUMN_TWEET] = test_data[DF_COLUMN_TWEET].map(
        lambda tweet_ngrams: argmax(models_by_lang, tweet_ngrams))
    score_lang_df = pd.DataFrame(test_data[DF_COLUMN_TWEET].tolist(),
                                 columns=[DF_COLUMN_SCORE, DF_COLUMN_GUESS])

    # Finalize results
    results = prepare_result_df(test_data, score_lang_df)
    results = finalize_result_df(results)

    # Evaluation stats
    print("Evaluating Lidstone with parameters: "
          "[vocabulary = {}, ngram size = {}, gamma = {}]".format(v, n, gamma))
    format_results(results)
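# `argmax` is not shown in this excerpt. From its use above it must map
# (models_by_lang, tweet_ngrams) to a [score, guessed_language] pair.
# A plausible sketch under that assumption (the scoring rule here, a sum
# of per-ngram log scores, is illustrative, not the project's verified code):
def argmax(models_by_lang, tweet_ngrams):
    best_score, best_lang = float('-inf'), None
    for lang, model in models_by_lang.items():
        # Total log probability of the tweet's ngrams under this language model.
        score = sum(model.logscore(ngram[-1], ngram[:-1]) for ngram in tweet_ngrams)
        if score > best_score:
            best_score, best_lang = score, lang
    return [best_score, best_lang]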
class LidstoneTrigramTests(unittest.TestCase):
    score_tests = [
        # Logic behind this is the same as for bigram model
        ("d", ["c"], 1.1 / 1.8),
        # if we choose a word that hasn't appeared after (b, c)
        ("e", ["c"], 0.1 / 1.8),
        # Trigram score now
        ("d", ["b", "c"], 1.1 / 1.8),
        ("e", ["b", "c"], 0.1 / 1.8),
    ]

    def setUp(self):
        vocab, training_text = _prepare_test_data(3)
        self.model = Lidstone(0.1, 3, vocabulary=vocab)
        self.model.fit(training_text)
class TestLidstoneTrigram(metaclass=ParametrizedTests):
    score_tests = [
        # Logic behind this is the same as for bigram model
        ("d", ["c"], 1.1 / 1.8),
        # if we choose a word that hasn't appeared after (b, c)
        ("e", ["c"], 0.1 / 1.8),
        # Trigram score now
        ("d", ["b", "c"], 1.1 / 1.8),
        ("e", ["b", "c"], 0.1 / 1.8),
    ]

    @classmethod
    def setup_method(self):
        vocab, training_text = _prepare_test_data(3)
        self.model = Lidstone(0.1, 3, vocabulary=vocab)
        self.model.fit(training_text)
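# The same expectations can be checked interactively with nltk.lm's
# score API once the model is fitted (assuming the test suite's
# _prepare_test_data helper is importable):
import pytest

vocab, training_text = _prepare_test_data(3)
model = Lidstone(0.1, 3, vocabulary=vocab)
model.fit(training_text)
assert model.score("d", ["b", "c"]) == pytest.approx(1.1 / 1.8)
assert model.score("e", ["b", "c"]) == pytest.approx(0.1 / 1.8)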
def train_lm_models(n, text):
    models = {}
    discount = 0.75  # only used by the commented-out KneserNey model below
    gamma = 0.5
    # train_ngrams, train_vocab = get_data(n, text)

    # model4 = KneserNeyInterpolated(order=n, discount=discount)
    # train_ngrams, train_vocab = get_data(n, text)
    # model4.fit(train_ngrams, train_vocab)
    # models["4"] = model4

    # model3 = WittenBellInterpolated(order=n)
    # train_ngrams, train_vocab = get_data(n, text)
    # model3.fit(train_ngrams, train_vocab)
    # models["3"] = model3

    model2 = Lidstone(order=n, gamma=gamma)
    train_ngrams, train_vocab = get_data(n, text)
    model2.fit(train_ngrams, train_vocab)
    models["2"] = model2

    model1 = MLE(order=n)
    train_ngrams, train_vocab = get_data(n, text)
    model1.fit(train_ngrams, train_vocab)
    models["1"] = model1

    return models
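# Usage sketch: `get_data` is assumed to wrap padded_everygram_pipeline
# and return (ngrams, vocab); the toy corpus and test bigrams below are
# illustrative.
text = [['a', 'b', 'c'], ['a', 'c', 'd']]
models = train_lm_models(2, text)
test_bigrams = [('a', 'b'), ('b', 'c')]
# Lidstone assigns nonzero probability to unseen ngrams, so its
# perplexity stays finite where MLE's can blow up to inf.
print(models["2"].perplexity(test_bigrams))
print(models["1"].perplexity(test_bigrams))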
import math

from nltk.lm import Lidstone
from nltk.lm.preprocessing import pad_both_ends, padded_everygram_pipeline
from nltk.util import ngrams

N = 3
print(f"Process: Building n-grams with N={N}")

# Build padded everygrams for the China and Taiwan corpora
china_train, china_vocab = padded_everygram_pipeline(order=N, text=china_dataset)
taiwan_train, taiwan_vocab = padded_everygram_pipeline(order=N, text=taiwan_dataset)

print("Process: Train the model")
gamma_param = 0.5

china_model = Lidstone(gamma=gamma_param, order=N)
china_model.fit(china_train, china_vocab)

taiwan_model = Lidstone(gamma=gamma_param, order=N)
taiwan_model.fit(taiwan_train, taiwan_vocab)


def log_score(model, N, sentence):
    total = 0.0
    sentence = pad_both_ends(list(sentence), n=N)
    ngram_sents = list(ngrams(sentence, n=N))
    for ngram_sent in ngram_sents:
        total += math.log(
            model.unmasked_score(word=ngram_sent[-1], context=ngram_sent[0:-1]))
    return total
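# Usage sketch: classify a tokenized sentence by comparing its log score
# under the two models. The sample tokens are illustrative; note that
# math.log raises ValueError if unmasked_score ever returns 0, which
# Lidstone smoothing avoids for in-vocabulary contexts.
sample = ['some', 'tokenized', 'sentence']
label = ('china'
         if log_score(china_model, N, sample) >= log_score(taiwan_model, N, sample)
         else 'taiwan')
print(label)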
def lidstone_trigram_model(trigram_training_data, vocabulary):
    model = Lidstone(0.1, order=3, vocabulary=vocabulary)
    model.fit(trigram_training_data)
    return model
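# Usage sketch, assuming the training sentences are already tokenized.
# The training data must be an iterable of ngram iterables, and since
# this helper passes `vocabulary` straight to the constructor, we build
# an nltk.lm.Vocabulary from the padded words explicitly:
from nltk.lm import Vocabulary
from nltk.lm.preprocessing import padded_everygram_pipeline

sents = [['a', 'b', 'c'], ['a', 'c', 'd']]
trigram_training_data, padded_words = padded_everygram_pipeline(3, sents)
vocabulary = Vocabulary(padded_words, unk_cutoff=1)
model = lidstone_trigram_model(trigram_training_data, vocabulary)
print(model.score('c', ('a', 'b')))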
def trainModel(n, lines):
    # pep is assumed to be an alias for padded_everygram_pipeline
    ngrams, phrases = pep(n, lines)
    model = Lidstone(0.01, n)
    model.fit(ngrams, phrases)
    # Raise the unknown-word cutoff after fitting. This reaches into the
    # private _cutoff attribute, so words seen fewer than 2 times are
    # mapped to <UNK> at lookup time.
    model.vocab._cutoff = 2
    return model
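# A cleaner alternative to poking the private `_cutoff` attribute is to
# build the vocabulary with the desired unknown-word cutoff up front.
# A sketch under the same assumptions as trainModel above; note that
# counts then accumulate against <UNK> during fitting, unlike the
# post-hoc cutoff change:
from nltk.lm import Vocabulary


def trainModelWithCutoff(n, lines, cutoff=2):
    ngrams, phrases = pep(n, lines)
    vocab = Vocabulary(phrases, unk_cutoff=cutoff)
    model = Lidstone(0.01, n, vocabulary=vocab)
    model.fit(ngrams)
    return model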
import re
from argparse import ArgumentParser
from functools import reduce
from itertools import repeat
from multiprocessing import Pool

import nltk
import numpy as np
import pandas as pd
from nltk.lm import Lidstone
from nltk.tokenize.toktok import ToktokTokenizer


def main():
    parser = ArgumentParser()
    parser.add_argument(
        '--data_file',
        default='../../data/facebook-maria/combined_group_data_es_tagged.tsv')
    parser.add_argument('--ngram_order', type=int, default=3)
    args = vars(parser.parse_args())

    ## load data
    combined_data = pd.read_csv(args['data_file'], sep='\t', index_col=False)
    # remove URL-only statuses
    min_status_len = 3
    combined_data = combined_data[
        combined_data.loc[:, 'status_message_ne_tagged_stemmed'].apply(
            lambda x: len(str(x).split(' '))) > min_status_len]
    # tmp debugging
    # combined_data = combined_data.head(100)

    ## compute entropy
    ## (1) per-post
    ## (2) per-mention (within sentence)
    ## (3) per-mention (within fixed window?)

    ## train language model
    ngram_order = args['ngram_order']
    # use TokTok for tokens because it's multilingual
    tokenizer = ToktokTokenizer()
    sent_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
    sent_tokenize = lambda x: sent_tokenizer.tokenize(x)
    combined_data.loc[:, 'status_message_sents'] = combined_data.loc[:, 'status_message_ne_tagged_stemmed'].apply(sent_tokenize)
    combined_data_sents = reduce(
        lambda x, y: x + y,
        combined_data.loc[:, 'status_message_sents'].values)
    combined_data_ngrams = (
        (ngram for ngram in generate_ngrams(sent, tokenizer, n=ngram_order))
        for sent in combined_data_sents)

    # train Lidstone language model
    gamma = 0.1
    combined_data_lm = Lidstone(order=ngram_order, gamma=gamma)
    vocab = set(
        reduce(lambda x, y: x + y, [
            tokenizer.tokenize(txt)
            for txt in combined_data.loc[:, 'status_message_ne_tagged_stemmed'].values
        ]))
    print('about to fit LM with order=%d and vocab=%d' % (ngram_order, len(vocab)))
    combined_data_lm.fit(combined_data_ngrams, vocabulary_text=vocab)

    ## split data to per-sentence for easier handling
    print('about to compute entropy for all sentences')
    # multiprocessing pool for slightly faster performance
    MAX_THREADS = 10
    pool = Pool(MAX_THREADS)
    combined_data_sents_ordered = combined_data.loc[:, 'status_message_sents'].values
    combined_data_entropy = pool.starmap(
        compute_entropy_sent,
        zip(combined_data_sents_ordered, repeat(combined_data_lm),
            repeat(tokenizer), repeat(ngram_order)))
    print('combined data entropy shape %d' % (len(combined_data_entropy)))
    combined_data.loc[:, 'sent_entropy'] = combined_data_entropy
    # serial alternative (slower):
    # combined_data.loc[:, 'sent_entropy'] = combined_data.loc[:, 'status_message_sents'].apply(
    #     lambda x: [combined_data_lm.entropy(generate_ngrams(sent, tokenizer, n=ngram_order)) for sent in x])

    ## compute entropy for each (1) post (2) entity
    ## format: status ID, publish time, status message, entity name, post entropy, sentence entropy
    combined_data.loc[:, 'post_entropy'] = combined_data.loc[:, 'sent_entropy'].apply(lambda x: np.mean(x))
    combined_data_flat = []
    ne_matcher = re.compile(r'\w+\.<ne\.\w+>')
    ne_null = 'NULL_ENTITY.<ne>'
    for i, combined_data_i in combined_data.iterrows():
        status_id_i = combined_data_i.loc['status_id']
        status_time_i = combined_data_i.loc['status_published']
        status_message_i = combined_data_i.loc['status_message_ne_tagged_stemmed']
        entropy_i = combined_data_i.loc['post_entropy']
        for sent_j, entropy_j in zip(*combined_data_i.loc[
                ['status_message_sents', 'sent_entropy']].values):
            print('processing sent %s' % (sent_j))
            sent_tokens = tokenizer.tokenize(sent_j)
            sent_tokens_ne = list(
                filter(lambda x: ne_matcher.search(x) is not None, sent_tokens))
            if (len(sent_tokens_ne) > 0):
                data_j = pd.DataFrame([[
                    status_id_i, status_time_i, status_message_i, sent_j,
                    entropy_i, sent_token_ne, entropy_j
                ] for sent_token_ne in sent_tokens_ne])
            else:
                data_j = pd.DataFrame([[
                    status_id_i, status_time_i, status_message_i, sent_j,
                    entropy_i, ne_null, entropy_j
                ]])
            print('adding data with shape %s' % (len(data_j)))
            combined_data_flat.append(data_j)
    combined_data_flat_cols = [
        'status_id', 'status_published', 'status_message', 'sent',
        'post_entropy', 'entity', 'sent_entropy'
    ]
    combined_data_flat = pd.concat(combined_data_flat, axis=0)
    combined_data_flat.columns = combined_data_flat_cols

    ## get examples of low/high entropy statuses
    combined_data_flat.sort_values('sent_entropy', inplace=True, ascending=False)
    print(combined_data_flat.loc[:, 'status_message'].head(5))
    print(combined_data_flat.loc[:, 'status_message'].tail(5))

    ## save flat data
    out_file = args['data_file'].replace('.tsv', '_entropy.tsv')
    combined_data_flat.to_csv(out_file, sep='\t', index=False)
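# `compute_entropy_sent` and `generate_ngrams` are project helpers not
# shown in this excerpt. From the starmap call and the commented-out
# serial alternative, compute_entropy_sent must map
# (sentences, lm, tokenizer, ngram_order) to a list of per-sentence
# entropies; a sketch consistent with that usage:
def compute_entropy_sent(sents, lm, tokenizer, ngram_order):
    # Entropy of each sentence's ngrams under the fitted language model.
    return [lm.entropy(generate_ngrams(sent, tokenizer, n=ngram_order))
            for sent in sents]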