def train(train_file):
    """Return the required language models trained from a file."""
    unigram = NgramModel(1)
    bigram_left = NgramModel(2)
    bigram_right = NgramModel(2)
    for line in train_file:
        tokens = line.rstrip().split()
        unigram.update(tokens)
        bigram_left.update(tokens)
        bigram_right.update(reversed(tokens))
    return (unigram, bigram_left, bigram_right)
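# These snippets assume an NgramModel class whose definition is not shown.
# A minimal count-based sketch of the interface used above (a constructor
# taking the n-gram order, plus an update() that accumulates counts) might
# look like the following; it is an illustrative assumption, not the
# original class.
from collections import defaultdict

class NgramModel:
    """Minimal count-based n-gram model (illustrative sketch only)."""

    def __init__(self, n):
        self.n = n
        # Maps an (n-1)-token context tuple to a counter of next tokens.
        self.counts = defaultdict(lambda: defaultdict(int))

    def update(self, tokens):
        """Accumulate n-gram counts from a single token sequence."""
        tokens = list(tokens)
        for i in range(len(tokens) - self.n + 1):
            context = tuple(tokens[i:i + self.n - 1])
            nxt = tokens[i + self.n - 1]
            self.counts[context][nxt] += 1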
def gen_disassociated_press(file=KJBIBLE, order=3, len=100):
    """Generate some autocorrelated text."""
    tokens = [k for k in tokenize(file) if k.isalpha()]
    model = NgramModel(order, tokens, MLEProbDist)
    ret = [''] * (order - 1)
    for i in range(len):
        tail = ret[-(order - 1):]
        ret.append(model.generate_one(tail))
    return ' '.join(ret[(order - 1):])
def _make_models(self, tuples):
    self._word_ids = WordIdDictionary()

    # Extract sequence of words, lemmas, and tags
    words, lemmas, tags = tuple(
        map(lambda tokens: list(self._word_ids.add_words_transform(tokens)),
            zip(*tuples)))
    self._tags = tags

    # Create models for words, lemmas, and tags
    self._words_ngram = NgramModel(words, self._n)
    self._lemmas_ngram = NgramModel(lemmas, self._n)
    self._tags_ngram = NgramModel(
        tags, 2 * self._n)  # Can afford to use 2 * n-gram size for grammar

    # Map tag and (tag, lemma) to valid lemmas and vocabulary, respectively
    # It's faster to use a list than predicate on unigrams during backoff search
    self._tag_lemmas = ConditionalFreqDist(zip(tags, lemmas))
    self._tag_lemma_words = ConditionalFreqDist(
        zip(zip(tags, lemmas), words))
def train_ngram(self, target_data_path, days, kakao_data_path):
    results = []
    ngramModel = NgramModel(self.dataLoader)
    self.dataLoader.ngram_data_loader(days)
    self.dataLoader.kakao_data_loader(kakao_data_path)
    target_datas = self.dataLoader.target_data_loader(target_data_path)
    target_data_len = len(target_datas)
    for i, target_data in enumerate(target_datas):
        # Report progress roughly every 1% of the targets
        if i % (target_data_len // 100) == 0:
            print("{}% complete".format((i / target_data_len) * 100))
        re = ngramModel.detect_rule_recommend(target_data)
        results.append(target_data + " " + " ".join(re))
    self.dataLoader.write_result(self.write_file_path, results)
def initialize_bot(chars, nicks):
    n = 3
    # intros = ["So", "Hi", "In fact", "For what it's worth", "Think about it",
    # "Conversely", "On the other hand", "Debatably", "Especially", "Not to mention", "Although", "Moreover", "Equally",
    # "But", "Yes",
    # "See here", "Ultimately", "Rather", "Nevertheless", "As you said", "Mind you", "Even so"]
    char_corps = load_corpora(chars)
    est = lambda fdist, bins: MLEProbDist(fdist)
    # est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # est = lambda fdist, bins: WittenBellProbDist(fdist)
    # est = lambda fdist, bins: KneserNeyProbDist(fdist)
    models = {character: NgramModel(n, corp, estimator=est)
              for character, corp in char_corps.iteritems()}
    # return ChatBot(chars, nicks, intros, models, ngram=n, debug=False)
    return ChatBot(chars, nicks, models, ngram=n, debug=False)
with open(file=path, mode="wb") as fp: fp.write(pickle.dumps(obj=params_dict)) @classmethod def load(cls, path): """ 加载模型 :param path: 保存路径 :return: """ params_dict = pickle.load(open(file=path, mode="rb")) lookup_table = params_dict['_lookup_table'] ngram_model = pickle.loads(params_dict['_ngram_model_pickle'], fix_imports=True) return cls(ngram_model=ngram_model, lookup_table=lookup_table) if __name__ == '__main__': from nltk.text import Text from nltk.corpus import gutenberg text1 = Text(gutenberg.words('melville-moby_dick.txt')) # ngramCounter = NgramCounter(order=2, train=text1) ngramModel = NgramModel(ngram_counter=ngramCounter) corrector = NgramCorrector(ngram_model=ngramModel) print(corrector.correct(['I', 'dooo', 'think', 'you', 'rre', 'goooood'])) corrector2 = NgramCorrector.load("123") print(corrector2.correct(['I', 'don', 'think', 'you', 'rre', 'goooood']))
        test_text.extend(sentences)
    else:
        test_text.append(txt)

#print test_files
print len(test_files)

total_train_files = []
TOTAL = INCREMENT
UPPER_LIMIT = 500
while len(total_train_files) < UPPER_LIMIT:
    total_train_files = train_files[:TOTAL]
    data_set_corpus = PlaintextCorpusReader(sys.argv[1], total_train_files)
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    lm = NgramModel(3, data_set_corpus.words(), estimator)
    #lm = NgramModel(2, data_set_corpus.words(), estimator)
    P = []
    for s in test_text:
        s_tokens = nltk.word_tokenize(s)
        if SENTENCE:
            #if len(s_tokens) > 3:
            if len(s_tokens) > 10:
                p = lm.perplexity(s_tokens)
                P.append(p)
        else:
            p = lm.perplexity(s_tokens)
            P.append(p)
    TOTAL += INCREMENT
if __name__ == "__main__": if len(sys.argv) < 2: print "Usage: %s <corpus-root> <tweets-file>" % (sys.argv[0]) sys.exit(1) corpus_root = sys.argv[1] estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) ignored_words = nltk.corpus.stopwords.words('english') pos_movie_reviews = PlaintextCorpusReader(corpus_root + "/pos", ".*\.txt") neg_movie_reviews = PlaintextCorpusReader(corpus_root + "/neg", ".*\.txt") print "Corpora built." pos_unigram_lm = NgramModel(1, pos_movie_reviews.words(), estimator) print "Positive unigram model complete." pos_bigram_lm = NgramModel(2, pos_movie_reviews.words(), estimator) print "Positive bigram model complete." #pos_trigram_lm = NgramModel(3, pos_movie_reviews.words(), estimator) neg_unigram_lm = NgramModel(1, neg_movie_reviews.words(), estimator) print "Negative unigram model complete." neg_bigram_lm = NgramModel(2, neg_movie_reviews.words(), estimator) print "Negative bigram model complete." #neg_trigram_lm = NgramModel(3, neg_movie_reviews.words(), estimator) #read in the tweets tweets = [] tokenizer = utils.Tokenizer()
def main():
    """
    Trains and evaluates neural language models on the Microsoft
    Sentence Completion Challenge dataset.

    Allowed cmd-line flags:
        -s TS_FILES : Uses the reduced trainset (TS_FILES trainset files)
        -o MIN_OCCUR : Only uses terms that occur MIN_OCCUR or more times
            in the trainset. Other terms are replaced with a special token.
        -f MIN_FILES : Only uses terms that occur in MIN_FILES or more files
            in the trainset. Other terms are replaced with a special token.
        -n : n-gram length (default 4)
        -t : Use tree-grams (default does not use tree-grams)
        -u FTRS : Features to use. FTRS must be a string composed of zeros
            and ones, of length 5. Ones indicate usage of following features:
            (word, lemma, google_pos, penn_pos, dependency_type), respectively.

    Neural-net specific cmd-line flags:
        -ep EPOCHS : Number of training epochs, defaults to 20.
        -eps EPS : Learning rate, defaults to 0.005.
        -mnb MNB_SIZE : Size of the minibatch, defaults to 2000.
    """
    logging.basicConfig(level=logging.INFO)
    log.info("Evaluating model")

    # get the data handling parameters
    ts_reduction = util.argv('-s', None, int)
    min_occ = util.argv('-o', 5, int)
    min_files = util.argv('-f', 2, int)
    n = util.argv('-n', 4, int)
    use_tree = '-t' in sys.argv
    bool_format = lambda s: s.lower() in ["1", "true", "yes", "t", "y"]
    ft_format = lambda s: map(bool_format, s)
    ftr_use = np.array(util.argv('-u', ft_format("001000"), ft_format))
    val_per_epoch = util.argv('-v', 10, int)

    # nnets only support one-feature ngrams
    assert ftr_use.sum() == 1

    # get nnet training parameters
    use_lbl = '-l' in sys.argv
    epochs = util.argv('-ep', 20, int)
    eps = util.argv('-eps', 0.002, float)
    mnb_size = util.argv('-mnb', 2000, int)
    n_hid = util.argv('-h', 1000, int)
    d = util.argv('-d', 100, int)

    # load data
    ngrams, q_groups, answers, feature_sizes = data.load_ngrams(
        n, ftr_use, use_tree, subset=ts_reduction,
        min_occ=min_occ, min_files=min_files)
    used_ftr_sizes = feature_sizes[ftr_use]
    # remember, we only use one feature
    vocab_size = used_ftr_sizes[0]
    log.info("Data loaded, %d ngrams", ngrams.shape[0])

    # split data into sets
    x_train, x_valid, x_test = util.dataset_split(ngrams, 0.05, 0.05, rng=456)

    # generate a version of the validation set that has
    # the first term (the conditioned one) randomized
    # w.r.t. the unigram distribution
    # so first create the unigram distribution, no smoothing
    unigrams_data = data.load_ngrams(1, ftr_use, False, subset=ts_reduction,
                                     min_occ=min_occ, min_files=min_files)[0]
    unigrams_data = NgramModel(1, False, ftr_use, feature_sizes, ts_reduction,
                               min_occ, min_files, 0.0, 0.0, unigrams_data)
    unigrams_dist = unigrams_data.probability_additive(
        np.arange(vocab_size).reshape(vocab_size, 1))
    unigrams_dist /= unigrams_dist.sum()
    # finally, generate validation sets with randomized term
    x_valid_r = random_ngrams(x_valid, vocab_size, False, unigrams_dist)

    # the directory for this model
    dir = "%s_%s_%d-gram_features-%s_data-subset_%r-min_occ_%r-min_files_%r"\
        % ("llbl" if use_lbl else "lmlp",
           "tree" if use_tree else "linear",
           n,
           "".join([str(int(b)) for b in ftr_use]),
           ts_reduction, min_occ, min_files)
    dir = os.path.join(_DIR, dir)
    if not os.path.exists(dir):
        os.makedirs(dir)

    # filename base for this model
    file = "nhid-%d_d-%d_train_mnb-%d_epochs-%d_eps-%.5f" % (
        n_hid, d, mnb_size, epochs, eps)

    # store the logs
    if False:
        log_file_handler = logging.FileHandler(
            os.path.join(dir, file + ".log"))
        log_file_handler.setLevel(logging.INFO)
        logging.root.addHandler(log_file_handler)

    # we will plot log-lik ratios for every _VALIDATE_MNB minibatches
    # we will also plot true mean log-lik
    valid_on = {"x_valid": x_valid[:_LL_SIZE],
                "x_valid_r": x_valid_r[:_LL_SIZE],
                "x_train": x_train[:_LL_SIZE]}
    valid_ll = {k: [] for k in valid_on.keys()}
    valid_p_mean = {k: [] for k in valid_on.keys()}

    # how often we validate
    mnb_count = (x_train.shape[0] - 1) / mnb_size + 1
    _VALIDATE_MNB = mnb_count / val_per_epoch

    def mnb_callback(net, epoch, mnb):
        """
        Callback function called after every minibatch.
        """
        if (mnb + 1) % _VALIDATE_MNB:
            return

        # calculate log likelihood using the exact probability
        probability_f = theano.function([net.input], net.probability)
        for name, valid_set in valid_on.iteritems():
            p = probability_f(valid_set)
            valid_ll[name].append(np.log(p).mean())
            valid_p_mean[name].append(p.mean())

        log.info('Epoch %d, mnb: %d, x_valid mean-log-lik: %.5f'
                 ' , x_valid p-mean: %.5f'
                 ' , ln(p(x_valid) / p(x_valid_r).mean(): %.5f',
                 epoch, mnb, valid_ll["x_valid"][-1],
                 valid_p_mean["x_valid"][-1],
                 valid_ll["x_valid"][-1] - valid_ll["x_valid_r"][-1])

    # track if the model progresses on the sentence completion challenge
    # sent_challenge = []

    def epoch_callback(net, epoch):

        # log some info about the parameters, just so we know
        param_mean_std = [(k, v.mean(), v.std())
                          for k, v in net.params().iteritems()]
        log.info("Epoch %d: %s", epoch, "".join(
            ["\n\t%s: %.5f +- %.5f" % pms for pms in param_mean_std]))

        # evaluate model on the sentence completion challenge
        # probability_f = theano.function([net.input], net.probability)
        # qg_log_lik = [[np.log(probability_f(q)).sum() for q in q_g]
        #               for q_g in q_groups]
        # predictions = map(lambda q_g: np.argmax(q_g), qg_log_lik)
        # sent_challenge.append((np.array(predictions) == answers).mean())
        # log.info('Epoch %d sentence completion eval score: %.4f',
        #          epoch, sent_challenge[-1])

    log.info("Creating model")
    if use_lbl:
        net = LLBL(n, vocab_size, d, 12345)
    else:
        net = LMLP(n, vocab_size, d, 12345)
    net.mnb_callback = mnb_callback
    net.epoch_callback = epoch_callback
    train_cost, valid_cost, _ = net.train(
        x_train, x_valid, mnb_size, epochs, eps)

    # plot training progress info
    # first we need values for the x-axis (minibatch count)
    mnb_count = (x_train.shape[0] - 1) / mnb_size + 1
    mnb_valid_ep = mnb_count / _VALIDATE_MNB
    x_axis_mnb = np.tile((np.arange(mnb_valid_ep) + 1) * _VALIDATE_MNB, epochs)
    x_axis_mnb += np.repeat(np.arange(epochs) * mnb_count, mnb_valid_ep)
    x_axis_mnb = np.hstack(([0], x_axis_mnb))

    plt.figure(figsize=(16, 12))

    plt.subplot(221)
    plt.plot(mnb_count * (np.arange(epochs) + 1), train_cost, 'b-',
             label='train')
    plt.plot(mnb_count * (np.arange(epochs) + 1), valid_cost, 'g-',
             label='valid')
    plt.axhline(min(valid_cost), linestyle='--', color='g')
    plt.yticks(list(plt.yticks()[0]) + [min(valid_cost)])
    plt.title('cost')
    plt.grid()
    plt.legend(loc=1)

    plt.subplot(222)
    for name, valid_set in valid_ll.items():
        plt.plot(x_axis_mnb, valid_set, label=name)
    plt.ylim((np.log(0.5 / vocab_size),
              max([max(v) for v in valid_ll.values()]) + 0.5))
    plt.axhline(max(valid_ll["x_valid"]), linestyle='--', color='g')
    plt.yticks(list(plt.yticks()[0]) + [max(valid_ll["x_valid"])])
    plt.title('log-likelihood(x)')
    plt.grid()
    plt.legend(loc=4)

    plt.subplot(224)
    for name, valid_set in valid_p_mean.items():
        plt.plot(x_axis_mnb, valid_set, label=name)
    plt.title('p(x).mean()')
    plt.grid()
    plt.legend(loc=4)

    # plt.subplot(224)
    # plt.plot(mnb_count * np.arange(epochs + 1), sent_challenge, 'g-')
    # plt.title('sent_challenge')
    # plt.grid()

    plt.savefig(os.path.join(dir, file + ".pdf"))