def train_n_gram(data, ids, n):
    """
    Returns an n-gram model.

    Args:
        data - the original data frame containing all the data
        ids  - the IDs of the dialogues the n-gram model will train on
        n    - the length of the n-gram model

    Returns:
        An NLTK MLE language model fitted on the counts of all the n-grams
        in the training set.
    """
    # Extracts all the dialogue act classes.
    unique_dialogue_acts = sorted(list(set(data['dialogue_act'])))

    # Makes n-grams of all the dialogues with the given ids.
    training_dialogues = []
    for ID in ids:
        training_dialogue = list(
            data[data['dialogue_id'] == ID]['dialogue_act'])
        training_dialogues.append(training_dialogue)

    # Gets every n-gram up to order n from the training dialogues.
    n_grams = []
    for i in range(n):
        n_grams = n_grams + [
            list(ngrams(dialogue, n - i)) for dialogue in training_dialogues
        ]

    # Trains the n-gram model on the dialogue n-grams and the unique dialogue acts.
    lm = MLE(n)
    lm.fit(n_grams, unique_dialogue_acts)
    return lm
def makeModel():
    # sentences = webtext.raw() + brown.raw() + reuters.raw()
    sentences = webtext.raw() + reuters.raw()

    # Tokenize the sentences
    try:
        # Use the default NLTK tokenizer.
        from nltk import word_tokenize, sent_tokenize
        # Test whether it works; it sometimes fails on machines with setup issues.
        word_tokenize(
            sent_tokenize("This is a foobar sentence. Yes it is.")[0])
    except Exception:
        # Fall back to a naive sentence tokenizer and toktok.
        import re
        from nltk.tokenize import ToktokTokenizer
        # See https://stackoverflow.com/a/25736515/610569
        sent_tokenize = lambda x: re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)
        # Use the toktok tokenizer, which requires no extra dependencies.
        toktok = ToktokTokenizer()
        word_tokenize = toktok.tokenize

    tokenized_text = [
        list(map(str.lower, word_tokenize(sent)))
        for sent in sent_tokenize(sentences)
    ]

    # Prepare the data for an n-gram model of order n = 5.
    n = 5
    train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

    model = MLE(n)  # Train an MLE model of the order n set above.
    model.fit(train_data, padded_sents)
    # print(model.vocab)
    return model
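# A minimal evaluation sketch (an assumption, not part of makeModel above): a
# model fitted with padded_everygram_pipeline also stores lower-order counts,
# so a held-out sentence can be scored as padded bigrams.
from nltk.lm.preprocessing import pad_both_ends
from nltk.util import bigrams

def sentence_perplexity(model, tokenized_sentence):
    # Pad the sentence and score its bigrams; with plain MLE, any bigram never
    # seen in training makes the perplexity infinite.
    padded = list(pad_both_ends(tokenized_sentence, n=2))
    return model.perplexity(list(bigrams(padded)))

# Example:
# model = makeModel()
# print(sentence_perplexity(model, ["this", "is", "a", "test", "."]))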
class FormatJudge:
    """Detects format errors on a tabular data set."""

    def __init__(self, generator: PatternGenerator, n: int = 3, dim: int = 1):
        self.generator = generator
        self.n = n
        self.dim = dim

    def __call__(self, o: Any) -> list:
        return self.judge(o)

    def fit(self, values: List[Any]):
        patterns = [self.generator(v) for v in values]
        padded_patterns = [pad_both_ends(p, n=self.n) for p in patterns]
        ngrams_ = [ngrams(pp, n=self.n) for pp in padded_patterns]
        self.vocab = list(flatten(
            pad_both_ends(p, n=self.n) for p in patterns))
        self.model = MLE(self.n)
        self.model.fit(ngrams_, self.vocab)

    def judge(self, o: Any) -> list:
        scores = []
        p = self.generator(o)
        p = list(pad_both_ends(p, n=self.n))
        for i, v in enumerate(p):
            if i < self.n - 1:
                continue
            letters = []
            for j in range(i - (self.n - 1), i):
                letters.append(p[j])
            scores.append(self.model.score(v, letters))
        return heapq.nsmallest(self.dim, scores)
class LM_nGram(LM_BaseModel):
    def __init__(self, args, generator, config=None):
        LM_BaseModel.__init__(self, args, generator, config)
        self.ngram = MLE(2)

    def fit(self, steps):
        tokens = [step.tree.list() for step in steps]
        train_data = [
            nltk.bigrams(t,
                         pad_right=True,
                         pad_left=True,
                         left_pad_symbol="<s>",
                         right_pad_symbol="</s>") for t in tokens
        ]
        words = [word for sent in tokens for word in sent]
        words.extend(["<s>", "</s>"])
        padded_vocab = Vocabulary(words)
        self.ngram.fit(train_data, padded_vocab)

    def forward(self, steps):
        probs = []
        for step in steps:
            words = step.tree.list()
            ngrams = nltk.bigrams(words,
                                  pad_right=True,
                                  pad_left=True,
                                  left_pad_symbol="<s>",
                                  right_pad_symbol="</s>")
            prob = [
                self.ngram.score(ngram[-1], ngram[:-1]) for ngram in ngrams
            ]
            probs.append(sum(prob) / len(prob))
            if len(probs) % 100 == 0:
                print(len(probs), sum(probs) / len(probs))
        return probs
def fit(self, sequences: List[List]):
    train, vocab = padded_everygram_pipeline(self.config.GRAM_SIZE,
                                             sequences)
    model = MLE(self.config.GRAM_SIZE)
    model.fit(train, vocab)
    self.model = model
    if self.config.SAVE_PATH:
        self.save_model(self.config.SAVE_PATH)
def train_ngram_model(src_dict: dict, ngram_order=N_GRAM_ORDER):
    print(f"Training {ngram_order}-gram model on train dataset...")
    train_data, padded_sents = padded_everygram_pipeline(
        ngram_order, src_dict["train"])
    model = MLE(ngram_order)
    model.fit(train_data, padded_sents)
    return model
def fit(self, values: List[Any]):
    patterns = [self.generator(v) for v in values]
    padded_patterns = [pad_both_ends(p, n=self.n) for p in patterns]
    ngrams_ = [ngrams(pp, n=self.n) for pp in padded_patterns]
    self.vocab = list(flatten(
        pad_both_ends(p, n=self.n) for p in patterns))
    self.model = MLE(self.n)
    self.model.fit(ngrams_, self.vocab)
def getEveryModel(n: int, text: List, ngrams):
    """ Train a mixed-order (everygram) MLE model up to order n.
    The `ngrams` argument is unused here. """
    lm = MLE(n)
    train, vocab = padded_everygram_pipeline(n, text)
    lm.fit(train, vocab)
    return lm
def create_LanguageModel(Docs, model_type, ngram):
    text = " ".join(Docs)
    text = text.replace("\\n", " ")
    tokenized_text = [
        list(map(str.lower, word_tokenize(sent)))
        for sent in sent_tokenize(text)
    ]
    train_data, padded_sents = padded_everygram_pipeline(ngram, tokenized_text)
    model = MLE(ngram)
    if model_type != "MLE":
        model = KneserNeyInterpolated(ngram)
    model.fit(train_data, padded_sents)
    return model
def train_model(self, corpus, n):
    """
    Train a statistical (count-based) language model with MLE.
    :param corpus: tokenized sentences, e.g. [['a','b'], ['a','b','c']]
    """
    train_data, padded_sents = padded_everygram_pipeline(n, corpus)
    # train model
    t1 = time.time()
    self.model = MLE(n)
    self.model.fit(train_data, padded_sents)
    print('training the LM took {:.2f}s'.format(time.time() - t1))
class UrlNGram:
    def __init__(self, urls, n=2):
        self.ngram = MLE(n)
        train_data, padded_sents = padded_everygram_pipeline(n, urls)
        self.ngram.fit(train_data, padded_sents)

    def get_entropy(self, url):
        return self.ngram.entropy(list(url))

    def get_perplexity(self, url):
        return self.ngram.perplexity(list(url))
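# A minimal usage sketch for UrlNGram (the sample URLs below are illustrative
# assumptions): the model is character-level, since padded_everygram_pipeline
# iterates over each URL string character by character.
known_urls = ["example.com/home", "example.com/about", "example.com/contact"]
url_model = UrlNGram(known_urls, n=2)

# Lower entropy suggests a URL looks like the training URLs; with a plain MLE
# model, a URL containing characters never seen in training scores infinite
# entropy and perplexity.
print(url_model.get_entropy("example.com/home"))
print(url_model.get_perplexity("example.com/zzz"))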
def create_and_fit_model(corpus):
    # Takes a corpus already tokenized into sentences and words.
    train_data, padded_sents = padded_everygram_pipeline(NGRAM, corpus)
    # Create the model
    model = MLE(NGRAM)
    # Fit it to the data
    model.fit(train_data, padded_sents)
    return model
def ngram_language_model(corpus, n):
    # form of corpus: [[word, word, word], [word, word, ..., word]]
    # n-gram language model
    train_data, padded_sents = padded_everygram_pipeline(n, corpus)
    print(padded_sents)
    model = MLE(n)
    model.fit(train_data, padded_sents)
    print(len(model.vocab))
    with open('ngram_model.pkl', 'wb') as fout:
        pickle.dump(model, fout)
    return model
def get_MLELM(self, tokens, n_gram=2) -> MLE:
    '''
    Trains an MLE language model and caches it in the class so it can be reused.
    '''
    key = tuple(tokens)  # use a tuple so the cache key is hashable even when tokens is a list
    if key not in self.lms:
        paddedLine = [list(pad_both_ends(tokens, n=n_gram))]
        train, vocab = padded_everygram_pipeline(n_gram, paddedLine)
        lm = MLE(n_gram)
        lm.fit(train, vocab)
        self.lms[key] = lm
    return self.lms[key]
def treinando_modelo_MLE(lista_de_textos):
    # Join all the sentences into a single string.
    todas_as_questoes = " ".join(lista_de_textos)
    # Split the text into words on whitespace.
    todas_as_palavras = WhitespaceTokenizer().tokenize(todas_as_questoes)
    # Add the padding symbols to each word and build the vocabulary
    # (here the "tokens" are the letters of each word).
    treino_bigram, vocab = padded_everygram_pipeline(2, todas_as_palavras)
    # Create an MLE model for bigrams.
    modelo = MLE(2)
    # Train the model.
    modelo.fit(treino_bigram, vocab)
    return modelo
def main() -> None:
    """Main entrypoint."""
    # Create an argument parser for parsing CLI arguments
    parser = ArgumentParser(
        description="A tool to train an AI to predict the probability of a word in a sentence"
    )

    # Add parameters for the input and output files and the language
    parser.add_argument("-i", "--input", required=True, type=str,
                        help="The input file to read from")
    parser.add_argument("-o", "--output", required=True, type=str,
                        help="The output file to serialize the model to")
    parser.add_argument("-l", "--language", required=True, type=str,
                        help="The name of the language to use")

    # Parse the arguments
    options = parser.parse_args()

    # Read and extract tokens
    tokens = []
    with open(options.input, "r") as file:
        raw_text = file.read()
        # Tokenize the text.
        tokens = [
            list(map(str.lower, word_tokenize(sentence)))
            for sentence in sent_tokenize(raw_text, language=options.language)
        ]

    # n-gram size (trigram)
    n = 3

    # Prepare train data
    train_data, padded_sentences = padded_everygram_pipeline(n, tokens)

    # Train a Maximum Likelihood Estimation (MLE) model
    model = MLE(n)
    model.fit(train_data, padded_sentences)

    with open(options.output, "wb") as file:
        pickle.dump(model, file)
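# A minimal follow-up sketch (assumed file name, not part of the tool above):
# load the pickled MLE model written by main() and sample a few tokens from it.
import pickle

def load_and_sample(model_path, num_words=15, seed=0):
    with open(model_path, "rb") as file:
        model = pickle.load(file)
    # generate() returns a list of tokens; drop the padding symbols.
    return [token for token in model.generate(num_words, random_seed=seed)
            if token not in ("<s>", "</s>")]

# Example (hypothetical output path passed via --output):
# print(load_and_sample("model.pkl"))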
def train_model(text, n_gram=3):
    """
    Train a statistical (count-based) language model with MLE.
    :param text: [['a','b'],['a','b','c']]
    """
    print('train size={}'.format(len(text)))
    train_data, padded_sents = padded_everygram_pipeline(n_gram, text)
    # Train the model (a trigram model with the default n_gram=3).
    model = MLE(n_gram)
    model.fit(train_data, padded_sents)
    print('vocabulary size={}'.format(len(model.vocab)))
    return model
class ngram_language_model:
    def __init__(self):
        self.model = None
        self.valid_text = None

    def build_and_train_model(self):
        text = brown.sents(categories=[
            'adventure', 'belles_lettres', 'editorial', 'fiction',
            'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery',
            'news', 'religion', 'reviews', 'romance', 'science_fiction'
        ])
        valid_text = []
        for sentence in text:
            words = []
            for word in sentence:
                words.extend(nltk.word_tokenize(word))
            valid_text.append(words)
        self.valid_text = valid_text

        n = 3  # order of the largest everygram
        train_data, padded_sents = padded_everygram_pipeline(n, valid_text)
        self.model = MLE(n)
        self.model.fit(train_data, padded_sents)
        return

    def make_predictions(self, msg, number_of_predictions=5):
        """
        Predicts the most likely next words given the words typed so far.
        """
        sentence = []
        for x in msg.strip().split():
            sentence.extend(nltk.word_tokenize(x))

        # Interpolation weights for the unigram, bigram and trigram scores.
        alpha = 0.1
        beta = 0.3
        gamma = 0.6

        predictions = []
        prediction_dict = {}
        for word in self.model.vocab:
            alpha_prob = alpha * self.model.score(word)
            beta_prob = beta * self.model.score(word, sentence[-1:])
            gamma_prob = gamma * self.model.score(word, sentence[-2:])
            prob = alpha_prob + beta_prob + gamma_prob
            predictions.append((word, prob))

        predictions.sort(key=lambda x: x[1], reverse=True)
        for word, prob in predictions[:number_of_predictions]:
            prediction_dict[word] = prob
        return prediction_dict
def trainNGramModelForWords():
    newsListOne = []
    text = ''
    with open("combined.txt", 'r', encoding='utf-8', errors='ignore') as outfile:
        newslist = json.load(outfile)
        for news in newslist:
            newsListOne.extend(news)
        text = ' '.join([str(elem) for elem in newsListOne])

    tokenized_text = [
        list(map(str.lower, word_tokenize(sent)))
        for sent in sent_tokenize(text)
    ]
    n = 3
    train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)
    model = MLE(n)  # Train a 3-gram maximum likelihood estimation model.
    model.fit(train_data, padded_sents)
    return model
def create_and_fit_model(self, corpus):
    # Takes a corpus already tokenized into sentences and words.
    print('first step')
    train_data, padded_sents = padded_everygram_pipeline(
        self.ngram, corpus)
    print('creating model')
    # Create the model
    model = MLE(self.ngram)
    print('fitting data')
    # Fit it to the data
    model.fit(train_data, padded_sents)
    print('fitted')
    return model
def train_ngram(corpus, n, words=True):
    """ Train an n-gram (POS) language model from a corpus. """
    # Read the corpus file
    if corpus[-4:] == '.txt':
        with open(os.path.join('data', 'corpora', corpus), encoding='utf8') as f:
            text = f.read()
    elif corpus[-4:] == '.pkl':
        with open(os.path.join('data', 'corpora', corpus), 'rb') as f:
            text = pickle.load(f)

    # Lowercase if the model will be trained on words (to be skipped for
    # POS tags)
    if words:
        if type(text) is list:
            for sent_idx, sent in enumerate(text):
                for word_idx, word in enumerate(sent):
                    text[sent_idx][word_idx] = word.lower()
        elif type(text) is str:
            text = text.lower()

    # Tokenize
    if type(text) is str:
        text = sent_tokenize(text)
        if words:
            text = [word_tokenize(sent) for sent in text]
        else:
            text = [sent.split() for sent in text]

    # Train the n-gram language model
    # Do not apply any n-gram smoothing techniques to the model
    train_data, vocab = padded_everygram_pipeline(n, text)
    lm = MLE(n)
    lm.fit(train_data, vocab)

    # Save the model
    with open(
            os.path.join('data', 'models',
                         corpus[:-4] + '_' + str(n) + 'gram' + '.pkl'),
            'wb') as f:
        pickle.dump(lm, f, 4)

    return lm
def create_LanguageModel(docs, model_type="MLE", ngram=3):
    global _ngram
    _ngram = ngram
    tokenized_text = []
    new_docs = preprocess(docs)
    for d in new_docs:
        text = sent_tokenize(d, language="turkish")
        for sent in text:
            temp = []
            for i in word_tokenize(sent, language="turkish"):
                temp.append(i.lower())
            tokenized_text.append(temp)

    training_ngrams, vocab = padded_everygram_pipeline(ngram, tokenized_text)

    if model_type == "MLE":
        model = MLE(ngram)  # , vocabulary=Vocabulary(vocab))
        model.fit(training_ngrams, vocab)
        # print(model.vocab)
        return model
    elif model_type == "KneserNeyInterpolated":
        model = KneserNeyInterpolated(ngram)
        model.fit(training_ngrams, vocab)  # padded_sents)
        # print(model.vocab)
        return model
    else:
        print("Unknown Model Type")
        return 0
def _train(n: int, texts: List):
    """ texts: a list of already tokenized texts """
    lm = MLE(n)
    train, vocab = [], set([])
    for t in texts:
        g = ngrams(t, n,
                   pad_left=True, pad_right=True,
                   left_pad_symbol='<s>', right_pad_symbol='</s>')
        g = list(g)
        vocab = vocab | set(t)
        train.append(g)
    lm.fit(train, vocabulary_text=list(vocab))
    return lm
def generate_sentence(model: MLE, length: int, seed=None):
    # A default argument is only evaluated once, so draw the random seed inside
    # the function to get a different seed on every call.
    if seed is None:
        seed = random.randint(0, 10**10)
    content = []
    for token in model.generate(length, random_seed=seed):
        if token == '<s>':
            continue
        if token == '</s>':
            break
        content.append(token)
    return detokenize(content)
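# `detokenize` above is a helper from the original code that is not shown here;
# a possible stand-in (an assumption, not the original implementation) is
# NLTK's Treebank detokenizer.
from nltk.tokenize.treebank import TreebankWordDetokenizer

def detokenize(tokens):
    # Join tokens back into a sentence, re-attaching punctuation.
    return TreebankWordDetokenizer().detokenize(tokens)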
def create_model(self, corpus_name):
    print('reading corpus')
    reader = PlaintextCorpusReader(CORPUS_DIR, corpus_name)
    print('padded everygram pipeline')
    train_data, vocab = padded_everygram_pipeline(self.ngram,
                                                  reader.sents())
    print('creating model')
    # Create the model
    model = MLE(self.ngram)
    print('fitting data')
    # Fit it to the data
    model.fit(train_data, vocab)
    print('fitted')
    return model
def get_tweet(model: MLE):
    tweet = []
    for word in model.generate(30, text_seed=["<s>"]):
        if word == "<s>":
            continue
        elif word == "</s>":
            break
        tweet.append(word)
    return tweet
def generateNGram(tweets, lexicon):
    n = 4
    train_data, padded_sents = padded_everygram_pipeline(n, lexicon)
    model = MLE(n)
    model.fit(train_data, padded_sents)

    # Map each lexicon entry to its unigram count in the fitted model, so that
    # membership can be tested against the lexicon words below.
    ngramdict = {}
    for i in lexicon:
        ngramdict[i] = model.counts[i]

    featureset = []
    lemmatizer = WordNetLemmatizer()
    for t in tweets:
        words = word_tokenize(t)
        words = [lemmatizer.lemmatize(i) for i in words]
        features = np.zeros(len(lexicon))
        for w in words:
            if w in ngramdict:
                features[lexicon.index(w)] += 1
        featureset.append(list(features))
    return featureset
def getModel(n: int, text: List):
    """ Train the n-gram model here. """
    lm = MLE(n)
    # Build the training n-grams and the vocabulary.
    train, vocab = [], set([])
    for t in text:
        g = ngrams(t, n,
                   pad_left=True, pad_right=True,
                   left_pad_symbol='<s>', right_pad_symbol='</s>')
        g = list(g)
        vocab = vocab | set(t)
        train.append(g)
    lm.fit(train, vocabulary_text=list(vocab))
    return lm
def build_word_model(corpus, order=3):
    """ Creates a character-level n-gram word model. """
    words = ' '.join(corpus).split()  # Flatten corpus into words
    words = [w for w in words if re.match(r'[a-z]', w)]  # Use clean words only
    vocab = set()
    data = []
    for word in words:
        # `unigrams` and `ngramize` are helpers defined elsewhere in the
        # original code (presumably the character unigrams of a word and its
        # padded n-grams, respectively).
        w = unigrams(word)
        vocab.update(w)
        data.append(ngramize(w, order))
    model = MLE(order)
    model.fit(data, vocabulary_text=vocab)
    return model
class MleTrigramTests(unittest.TestCase):
    """MLE trigram model tests"""

    score_tests = [
        # count(d | b, c) = 1
        # count(b, c) = 1
        ("d", ("b", "c"), 1),
        # count(d | c) = 1
        # count(c) = 1
        ("d", ["c"], 1),
        # total number of tokens is 18, of which "a" occurred 2 times
        ("a", None, 2.0 / 18),
        # in vocabulary but unseen
        ("z", None, 0),
        # out of vocabulary should use "UNK" score
        ("y", None, 3.0 / 18),
    ]

    def setUp(self):
        vocab, training_text = _prepare_test_data(3)
        self.model = MLE(3, vocabulary=vocab)
        self.model.fit(training_text)
class NgramModelTextGenerationTests(unittest.TestCase):
    """Using MLE estimator, generate some text."""

    def setUp(self):
        vocab, training_text = _prepare_test_data(3)
        self.model = MLE(3, vocabulary=vocab)
        self.model.fit(training_text)

    def test_generate_one_no_context(self):
        self.assertEqual(self.model.generate(random_seed=3), "<UNK>")

    def test_generate_one_limiting_context(self):
        # We don't need random_seed for contexts with only one continuation
        self.assertEqual(self.model.generate(text_seed=["c"]), "d")
        self.assertEqual(self.model.generate(text_seed=["b", "c"]), "d")
        self.assertEqual(self.model.generate(text_seed=["a", "c"]), "d")

    def test_generate_one_varied_context(self):
        # When context doesn't limit our options enough, seed the random choice
        self.assertEqual(
            self.model.generate(text_seed=("a", "<s>"), random_seed=2), "a"
        )

    def test_generate_cycle(self):
        # Add a cycle to the model: bd -> b, db -> d
        more_training_text = [list(padded_everygrams(self.model.order, list("bdbdbd")))]
        self.model.fit(more_training_text)
        # Test that we can escape the cycle
        self.assertEqual(
            self.model.generate(7, text_seed=("b", "d"), random_seed=5),
            ["b", "d", "b", "d", "b", "d", "</s>"],
        )

    def test_generate_with_text_seed(self):
        self.assertEqual(
            self.model.generate(5, text_seed=("<s>", "e"), random_seed=3),
            ["<UNK>", "a", "d", "b", "<UNK>"],
        )

    def test_generate_oov_text_seed(self):
        self.assertEqual(
            self.model.generate(text_seed=("aliens",), random_seed=3),
            self.model.generate(text_seed=("<UNK>",), random_seed=3),
        )

    def test_generate_None_text_seed(self):
        # should crash with a TypeError when we try to look it up in the vocabulary
        with self.assertRaises(TypeError):
            self.model.generate(text_seed=(None,))

        # This will work
        self.assertEqual(
            self.model.generate(text_seed=None, random_seed=3),
            self.model.generate(random_seed=3),
        )
def setUp(self):
    vocab, training_text = _prepare_test_data(3)
    self.model = MLE(3, vocabulary=vocab)
    self.model.fit(training_text)
class MleBigramTests(unittest.TestCase):
    """unit tests for MLENgramModel class"""

    score_tests = [
        ("d", ["c"], 1),
        # Unseen ngrams should yield 0
        ("d", ["e"], 0),
        # Unigrams should also be 0
        ("z", None, 0),
        # N unigrams = 14
        # count('a') = 2
        ("a", None, 2.0 / 14),
        # count('y') = 3
        ("y", None, 3.0 / 14),
    ]

    def setUp(self):
        vocab, training_text = _prepare_test_data(2)
        self.model = MLE(2, vocabulary=vocab)
        self.model.fit(training_text)

    def test_logscore_zero_score(self):
        # logscore of unseen ngrams should be -inf
        logscore = self.model.logscore("d", ["e"])
        self.assertTrue(math.isinf(logscore))

    def test_entropy_perplexity_seen(self):
        # ngrams seen during training
        trained = [
            ("<s>", "a"),
            ("a", "b"),
            ("b", "<UNK>"),
            ("<UNK>", "a"),
            ("a", "d"),
            ("d", "</s>"),
        ]
        # Ngram = Log score
        # <s>, a   = -1
        # a, b     = -1
        # b, UNK   = -1
        # UNK, a   = -1.585
        # a, d     = -1
        # d, </s>  = -1
        # TOTAL logscores = -6.585
        # - AVG logscores = 1.0975
        H = 1.0975
        perplexity = 2.1398
        self.assertAlmostEqual(H, self.model.entropy(trained), places=4)
        self.assertAlmostEqual(perplexity, self.model.perplexity(trained), places=4)

    def test_entropy_perplexity_unseen(self):
        # In MLE, even one unseen ngram should make entropy and perplexity infinite
        untrained = [("<s>", "a"), ("a", "c"), ("c", "d"), ("d", "</s>")]
        self.assertTrue(math.isinf(self.model.entropy(untrained)))
        self.assertTrue(math.isinf(self.model.perplexity(untrained)))

    def test_entropy_perplexity_unigrams(self):
        # word = score, log score
        # <s>  = 0.1429, -2.8074
        # a    = 0.1429, -2.8074
        # c    = 0.0714, -3.8073
        # UNK  = 0.2143, -2.2224
        # d    = 0.1429, -2.8074
        # c    = 0.0714, -3.8073
        # </s> = 0.1429, -2.8074
        # TOTAL logscores = -21.6243
        # - AVG logscores = 3.0095
        H = 3.0095
        perplexity = 8.0529
        text = [("<s>",), ("a",), ("c",), ("-",), ("d",), ("c",), ("</s>",)]
        self.assertAlmostEqual(H, self.model.entropy(text), places=4)
        self.assertAlmostEqual(perplexity, self.model.perplexity(text), places=4)