def test_models_have_correct_lambda_size():
    lm = LanguageModel(4)
    data = open_file('kn_test.txt')
    lm.train(data)
    for i in range(0, lm.n - 2):
        model = lm.models[i]
        assert len(model.lambdas) == len(model.hist_words_dct)
def test_models_have_correct_n():
    lm = LanguageModel(4)
    data = open_file('kn_test.txt')
    lm.train(data)
    for i in range(0, lm.n - 2):
        model = lm.models[i]
        assert model.n == i + 2
def test_perplexity_produces_expected_values():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    perp = round(lm.perplexity(2, math.log(0.5)), 5)
    correct = round(math.sqrt(2), 5)
    assert perp == correct
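# A minimal sketch of the perplexity computation the test above implies,
# assuming perplexity(num_tokens, total_log_prob) returns the geometric-mean
# inverse probability (an assumption, not the class's actual code):
import math

def perplexity(num_tokens, total_log_prob):
    # exp(-(1/N) * sum(log p)) == (prod p) ** (-1/N)
    return math.exp(-total_log_prob / num_tokens)

# perplexity(2, math.log(0.5)) == math.sqrt(2), matching the assertion above.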
class State:
    keywords = {}

    def __init__(self):
        logging.info("Instantiating State class: %s" % self.__class__.__name__)
        # Add keywords from superclasses
        self.keywords = State.fold_keywords(self.__class__, self.keywords)
        # If the State doesn't have a LanguageModel set, automatically create a
        # LanguageModel specific to the keywords of this State
        if not hasattr(self, 'lm'):
            logging.info("We need to create a LanguageModel for this State")
            commands_array = list(self.keywords.keys())
            self.lm = LanguageModel(self.__class__.__name__, commands_array)
            self.lm.update_all()
            logging.info("LanguageModel created")

    @staticmethod
    def fold_keywords(clazz, keywords):
        for base in clazz.__bases__:
            # object (and other bases) may not define keywords
            if hasattr(base, 'keywords'):
                keywords.update(State.fold_keywords(base, base.keywords))
        return keywords

    def process(self, text):
        state_change = []
        if text in self.keywords:
            state_change = self.keywords[text]
            if type(state_change) not in (list, tuple):
                state_change = [state_change]
        logging.info('Processed text = %s with result = %s' % (text, state_change))
        return state_change
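# Hypothetical usage of State: a subclass maps recognized command text to a
# state-change result, and the constructor builds a keyword-specific
# LanguageModel automatically (the names below are illustrative only):
#
# class ExampleIdle(State):
#     keywords = {
#         'PLAY': 'ExamplePlaying',   # command text -> resulting state name
#     }
#
# ExampleIdle().process('PLAY')  # -> ['ExamplePlaying']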
def test_models_have_correct_beginning_grams():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert sorted(lm.models[0].beginning_grams) \
        == sorted(['this', 'shall', 'PAD'])
    assert sorted(lm.models[1].beginning_grams) \
        == sorted(['PAD this', 'this text', 'PAD PAD', 'shall train'])
def test_train_creates_expected_hist_words_dict():
    lm = LanguageModel(2)
    data = open_file('kn_test.txt')
    lm.train(data)
    model = lm.models[-1]
    assert sorted(list(model.hist_words_dct.keys())) \
        == sorted(['PAD', 'this', 'text', 'shall', 'train', '.'])
    assert list(model.hist_words_dct['this'].keys()) == ['text']
    assert list(model.hist_words_dct['text'].keys()) == ['.']
    assert list(model.hist_words_dct['shall'].keys()) == ['train']
    assert list(model.hist_words_dct['train'].keys()) == ['text']
    assert list(model.hist_words_dct['PAD'].keys()) == ['this']
    assert sorted(list(model.hist_words_dct['.'].keys())) \
        == sorted(['PAD', 'shall'])
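# A minimal sketch of the history -> next-word counting that would produce a
# hist_words_dct like the one asserted above, assuming train() walks adjacent
# token pairs (a hypothetical helper, not the class's actual method):
from collections import defaultdict

def count_hist_words(tokens):
    hist_words = defaultdict(dict)
    for hist, word in zip(tokens, tokens[1:]):
        hist_words[hist][word] = hist_words[hist].get(word, 0) + 1
    return hist_words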
def test_kn_produces_expected_values():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.kn_evaluate(['text', 'shall', 'train']) == -2.0770634192748685
    assert lm.kn_evaluate(['this', 'text', 'dog']) == -3.1656313103493887
    assert lm.kn_evaluate(['the', 'brown', 'cat']) == -2.4724841297894433
def test_laplace_produces_expected_values():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.laplace_evaluate(['this', 'shall', 'train', 'PAD']) \
        == -2.890371757896165
    assert lm.laplace_evaluate(['dog', 'text', '.', 'PAD']) \
        == (math.log(1 / 9) + math.log(1 / 2))
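# A sketch of the add-one (Laplace) estimate consistent with the expected
# values above and in the unigram test further down; a hypothetical helper,
# not the class's actual method:
import math

def laplace_log_prob(count_hw, count_h, vocab_size):
    # P(w | h) = (count(h, w) + 1) / (count(h) + V)
    return math.log((count_hw + 1) / (count_h + vocab_size))

# Unigram case on kn_test.txt: count('text') = 2, 7 tokens, V = 5
# -> log(3 / 12), matching test_laplace_produces_expected_values2.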
class DialogBackendLocal(DialogBackend):

    def __init__(self):
        super().__init__()
        self.model_lm = LanguageModel()
        self.model_ct = ContentTransfer()
        self.kb = KnowledgeBase()
        self.ranker = Ranker(self.model_lm)
        self.local = True

    def predict(self, context, max_n=1):
        print('backend running, context = %s' % context)
        query = self.get_query(context)

        # get results from different models
        results = self.model_lm.predict(context)
        passages = []
        url_snippet = []
        for line in open('args/kb_sites.txt', encoding='utf-8'):
            cust = line.strip('\n')
            kb_args = {'domain': 'cust', 'cust': cust, 'must_include': []}
            url_snippet.append(self.kb.predict(query, args=kb_args)[0])
        passage = ' ... '.join([snippet for _, snippet in url_snippet])
        passages.append((passage, query))
        for passage, kb_query in passages:
            results += self.model_ct.predict(kb_query, passage)

        # rank hyps from different models
        hyps = [hyp for _, _, hyp in results]
        scored = self.ranker.predict(context, hyps)
        ret = []
        for i, d in enumerate(scored):
            d['way'], _, d['hyp'] = results[i]
            ret.append((d['score'], d))
        # sort on the score alone; comparing the dicts on tied scores
        # would raise a TypeError
        ranked = [d for _, d in sorted(ret, key=lambda t: t[0], reverse=True)]
        if max_n > 0:
            ranked = ranked[:min(len(ranked), max_n)]
        return ranked, url_snippet
def main():
    p = get_argparser()
    args = p.parse_args()
    lm = LanguageModel()
    lm.configure_logger(level=logging.DEBUG if args.DEBUG else logging.INFO,
                        write_file=True)
    if args.train and args.data_path:
        lm.train(args.data_path,
                 output_path=args.train,
                 learning_rate=args.learning_rate,
                 hidden_size=args.hidden_size,
                 batch_size=args.batch_size,
                 max_epoch=args.max_epoch)
    elif args.test and args.data_path:
        lm.predict(args.test, args.data_path)
    else:
        # Neither mode was fully specified; show usage and bail out.
        p.print_help()
        exit(2)
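# Hypothetical invocations (script name, flag spellings, and paths are
# illustrative; the real flags come from get_argparser, which is not shown):
#   python main.py --train model_dir --data_path corpus.txt --max_epoch 5
#   python main.py --test model_dir --data_path heldout.txt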
def test_subsequent_training():
    lm = LanguageModel(2)
    data = open_file('kn_test.txt')
    lm.train(data)
    model = lm.models[-1]
    wh1_len = len(model.word_hists_dct)
    hw1_len = len(model.hist_words_dct)
    data = tokenize('This sample.')
    lm.train(data)
    model = lm.models[-1]
    wh2_len = len(model.word_hists_dct)
    hw2_len = len(model.hist_words_dct)
    assert wh2_len - wh1_len == 1
    assert hw2_len - hw1_len == 1
    assert sorted(list(model.word_hists_dct['.'].keys())) \
        == sorted(['text', 'sample'])
    assert sorted(list(model.hist_words_dct['this'].keys())) \
        == sorted(['text', 'sample'])
# -*- coding: utf-8 -*-
from lm import LanguageModel
from memoize import Memoize

lm = LanguageModel()


def splits(text, max_len=10):
    return [(text[:i + 1], text[i + 1:])
            for i in range(min(len(text), max_len))]


@Memoize
def segment(text):
    text = text.strip()
    if not text:
        return []
    candidates = [[left] + segment(right) for left, right in splits(text)]
    return max(candidates, key=lm.get_words_prob)


if __name__ == '__main__':
    test = [
        'colorlessgreenideassleepfuriously.',
        'ihaveadream.',
        'howtotrainadragon.',
        'canwetakeaphotoofyou?'
    ]
    for text in test:
        words = segment(text)
        # show the input together with its segmentation
        print(text, '->', ' '.join(words))
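# A minimal sketch of the Memoize decorator imported above, caching results
# by positional arguments (an assumption about memoize.py, not its source):
class Memoize:
    def __init__(self, fn):
        self.fn = fn
        self.cache = {}

    def __call__(self, *args):
        if args not in self.cache:
            self.cache[args] = self.fn(*args)
        return self.cache[args]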
def test_ngram(self):
    result = LanguageModel(2).get_ngrams(["hello", "world", "lmao"])
    self.assertEqual(result, [(None, 'hello'), ('hello', 'world'),
                              ('world', 'lmao'), ('lmao', None)])
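# A minimal sketch consistent with the expected ngrams above (and with the
# 4-gram test in LanguageModelTests further down), assuming get_ngrams pads
# with n - 1 None tokens on each side; a hypothetical stand-in, not the real
# method:
def get_ngrams_sketch(tokens, n=2):
    padded = [None] * (n - 1) + tokens + [None] * (n - 1)
    return [tuple(padded[i:i + n]) for i in range(len(padded) - n + 1)]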
def test_update_all(self):
    lm = LanguageModel('playing')
    lm.update_all(True)
    lm.reset_files()
def test_lm_has_correct_number_tokens_and_unigram_types():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.num_tokens == 7
    assert len(lm.unigrams) == 5
def test_discount():
    lm = LanguageModel(2)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.discount == 0.75
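# Note: 0.75 is the discount value conventionally used for Kneser-Ney
# smoothing, which is presumably why the model fixes it here.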
else:
    source = Reader(options.input)
if options.output == '-':
    writer = sys.stdout
else:
    writer = Writer(options.output)
if debug:
    rules.DEBUG = 1
config = Config(options.config)
if logger.level <= logging.INFO:
    config.write(sys.stderr)
lm = LanguageModel(config.lm_file, config.lm_order)
rule_table = RuleTable.load(config.rule_table_file, lm, config)
extra_feature_funcs = build_extra_feature_funcs(config)
recombination_checker = CombinedRecombinationChecker(extra_feature_funcs)
decoder = CKYDecoder(config, rule_table, lm,
                     recombination_checker=recombination_checker,
                     extra_feature_funcs=extra_feature_funcs,
                     checking_hypo=checking,
                     expend_loser=expend_loser)
logger.info('Start decoding...')

def translate(data):
parser.add_argument('--vocab_len', type=int, default=19800, dest='vocab_len')
parser.add_argument('--lr', type=float, default=1e-3, dest='lr')
parser.add_argument('--minibatch_size', type=int, default=64, dest='minibatch_size')
parser.add_argument('--num_epochs', type=int, default=30, dest='num_epochs')
parser.add_argument('--models_folder', default='../lm_models', dest='folder')
parser.add_argument('--graph_folder', default='../lm_graph', dest='graphs')
args = parser.parse_args()

# Fit the model
if args.mode == 'train':
    # Read the initial word vectors (binary mode is required for .npy files)
    train_data = np.load(open('lm_train_data.npy', 'rb'))
    train_labels = np.load(open('lm_train_labels.npy', 'rb'))
    lm = LanguageModel(args.lr, args.num_steps, args.vocab_len,
                       args.minibatch_size)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        lm.fit(sess, train_data, train_labels, num_epochs=args.num_epochs,
               folder=args.folder, graph_folder=args.graphs)
else:
    tweets = dill.load(open("tweets", "rb"))
    w2i = dill.load(open("w2i", "rb"))
    i2w = dill.load(open("i2w", "rb"))
    word_vector = dill.load(open("word_vecs", "rb"))
    start_wd = ["president", "@netanyahu", "democrats", "gop", "congress",
                "white", "my", "the", "#makeamericagreatagain",
                "republicans", "wall", "@realdonaldtrump", "crooked"]
    input_list = [np.array([[word_vector[w2i[item]]]]) for item in start_wd]
    model = LanguageModel(args.lr, args.num_steps, args.vocab_len,
                          args.minibatch_size)
class DPSplit(object):
    """Dynamic-programming word segmentation."""

    def __init__(self):
        self.lm = LanguageModel('RenMinData.txt')
        self.dict = {}
        self.words = []
        self.max_len_word = 0
        self.load_dict('dict.txt')
        self.graph = None
        self.viterbi_cache = {}

    def get_key(self, t, k):
        return '_'.join([str(t), str(k)])

    def load_dict(self, file):
        # Each line of the dictionary file is one word
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                word = line.strip()
                if word:
                    self.dict[word] = 1
                    if len(word) > self.max_len_word:
                        self.max_len_word = len(word)

    def createGraph(self):
        """Build a directed graph from the input sentence."""
        self.graph = Graph()
        for i in range(len(self.words)):
            self.graph.sequence.append({})
        word_length = len(self.words)
        # For each character position, collect the set of candidate words
        for i in range(word_length):
            for j in range(self.max_len_word):
                if i + j + 1 > len(self.words):
                    break
                word = ''.join(self.words[i:i + j + 1])
                if word in self.dict:
                    node = Node(word)
                    # Index the node by the position of the word's last character
                    self.graph.sequence[i + j][word] = node
        # Append an empty end node to simplify the computation
        end = Node('#')
        self.graph.sequence.append({'#': end})

    def split(self, sentence):
        self.words = list(sentence)
        self.createGraph()
        # Compute the best score of every node via Viterbi dynamic programming
        self.viterbi(len(self.words), '#')
        # Trace back from the end node to emit the best segmentation
        end = self.graph.sequence[-1]['#']
        node = end.prev_node
        result = []
        while node:
            result.insert(0, node.word)
            node = node.prev_node
        print(''.join(self.words))
        print(' '.join(result))

    def viterbi(self, t, k):
        """Best-path probability of word k ending at position t."""
        if self.get_key(t, k) in self.viterbi_cache:
            return self.viterbi_cache[self.get_key(t, k)]
        node = self.graph.sequence[t][k]
        # t == 0: the first character of the sentence
        if t == 0:
            node.max_score = self.lm.get_init_prop(k)
            self.viterbi_cache[self.get_key(t, k)] = node.max_score
            return node.max_score
        prev_t = t - len(k)
        # If the previous node would fall before the sentence start,
        # there is no further probability to compute
        if prev_t == -1:
            return 1.0
        # All candidate words ending at the previous position
        pre_words = self.graph.sequence[prev_t].keys()
        for l in pre_words:
            # Transition probability from l to k
            state_transfer = self.lm.get_trans_prop(k, l)
            # Score of the current state: best previous path probability
            # times the transition probability
            cur_score = self.viterbi(prev_t, l) * state_transfer
            if cur_score > node.max_score:
                node.max_score = cur_score
                # Remember the best predecessor for the traceback
                node.prev_node = self.graph.sequence[prev_t][l]
        self.viterbi_cache[self.get_key(t, k)] = node.max_score
        return node.max_score
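# Hypothetical usage (requires RenMinData.txt and dict.txt alongside the script):
# dp = DPSplit()
# dp.split('今天天气不错')  # prints the sentence and its best segmentation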
        waited += 1
        if waited >= patience:
            break
        era_index += 1
        era_loss = 0.
        era_samples = 0
        torch.save(checkpoint, os.path.join(save_dir, f"{era_index}_eras.pt"))
    return checkpoint


if __name__ == "__main__":
    vocab_path = "data/vocab.txt"
    in_tokens = 2
    embedding_size = 128
    with open(vocab_path) as r:
        vocab = list(map(lambda l: l.strip(), r.readlines()))
    assert len(vocab) == len(set(vocab))
    vocab_size = len(vocab) + 1
    model = LanguageModel(in_tokens, vocab_size, embedding_size)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    train(model, optimizer, vocab,
          ["data/parted/0.txt"], ["data/parted/1.txt"],
          batch_size=32, max_train_eras=100,
          batches_per_era=100, max_val_batches=10)
def test_models_have_correct_vocab_size():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.models[0].ngram_vocab_size == 7
    assert lm.models[1].ngram_vocab_size == 9
def test_kn_produces_expected_values_n4():
    lm = LanguageModel(4)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.kn_evaluate(['shall', 'train', 'text', '.']) == -0.7742507185722116
def test_create_new(self):
    lm = LanguageModel('playing')
    self.assertFalse(lm.is_ready())
    'CANCEL': 'Idle'
}


class PlayingMedia(Base):
    keywords = {
        'STOP': (lambda: rc.stop_playing(), 'Idle'),
        'PAUSE': lambda: rc.pause()
    }
    context = {
        'menu': 'Idle'
    }


# Create a LanguageModel that supports all the keywords defined in all the States
keywords = []
for state in [Base, Idle, SelectMedia, PlayingMedia]:
    keywords += state.keywords.keys()
all_state_lm = LanguageModel('all_state_lm', keywords)
all_state_lm.update_all()
Base.lm = all_state_lm

#########################################
# Old states - OBSOLETE
#########################################
#class InitialState(State):
#    lm = ManualLanguageModel('initial')  # Overrides automatic creation of language model
#    keywords = {
#        'MARY': 'Listening'
#    }
#
#class Listening(State):
#    keywords = {
def test_get_input_commands(self):
    lm = LanguageModel('playing')
    self.assertEqual(len(lm.get_input_commands()), 7)
    # Exercise reading again, in case the second read triggers a caching error
    lm.get_input_commands()
def test_p_next_sums_to_one():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert sum(lm.p_next(['this', 'text']).values()) == 1
def test_laplace_produces_expected_values2():
    lm = LanguageModel(1)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.laplace_evaluate(['text']) == math.log(3 / 12)
    assert lm.laplace_evaluate(['dog']) == math.log(1 / 12)
def segment_noise(dataset, summary):
    chunk_dict = {}
    grammar_set = []
    rands = np.random.rand(10000)
    rand_idx = 0
    gidx = 0
    batch_size = 128
    file_dir = 'data/' + dataset + '/'
    model_file = 'model/%s/lm.model' % dataset
    dict_file = 'model/%s/lm.dict.p' % dataset
    train_file = 'data/%s/train.json' % dataset
    tokens_list, tags_list = chunk_text(train_file)
    token_dict = pickle.load(open(dict_file, 'rb'))
    word_size = len(token_dict)
    word_dim = 256
    hidden_dim = 512
    model = Model(word_size, word_dim, hidden_dim)
    model.cuda()
    if os.path.exists(model_file):
        best_point = torch.load(model_file)
        state_dict = best_point['state_dict']
        # Strip any DataParallel 'module.' prefixes before loading
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            temp = state_dict[k]
            if k.startswith('module.'):
                k = k[7:]
            new_state_dict[k] = temp
        model.load_state_dict(new_state_dict)
    model.eval()
    shuffle_indices = np.random.permutation(np.arange(len(tokens_list)))
    tokens_list = np.array(tokens_list)[shuffle_indices]
    tags_list = np.array(tags_list)[shuffle_indices]
    noised_data = []
    rev_token_dict = {token_dict[token]: token for token in token_dict}
    for _ in range(1):
        for idx in tqdm(range(0, len(tokens_list), batch_size)):
            tokens_batch = tokens_list[idx:idx + batch_size]
            tags_batch = tags_list[idx:idx + batch_size]
            probs_batch = []
            probs_indices_batch = []
            for tokens in tqdm(tokens_batch):
                if not check_sentence([rev_token_dict[token] for token in tokens]):
                    continue
                x_batch, x_mask = utils.pad([tokens])
                x_batch = to_tensor(x_batch)
                x_mask = to_tensor(x_mask).float()
                ps_batch = model(x_batch, x_mask, ps_only=True)
                ps_batch = F.softmax(ps_batch, dim=-1)
                ps_batch = list(ps_batch.cpu().detach().numpy())
                probs_sequence = []
                probs_indices_sequence = []
                for ps in ps_batch[0]:
                    probs, probs_indices = nuclear_filter(ps)
                    probs_sequence.append(probs)
                    probs_indices_sequence.append(probs_indices)
                probs_batch.append(probs_sequence)
                probs_indices_batch.append(probs_indices_sequence)
            chunk_dict = {}
            grammar_set = []
            chunks_batch = []
            ctags_batch = []
            for tokens, tags in zip(tokens_batch, tags_batch):
                chunks, ctags = split_to_chunks(tokens[1:-1], tags,
                                                chunk_dict, grammar_set)
                chunks_batch.append(chunks)
                ctags_batch.append(ctags)
            for chunk in chunk_dict:
                chunk_dict[chunk] = list(set(chunk_dict[chunk]))
            grammar_set = list(set(grammar_set))
            np.random.shuffle(grammar_set)
            ps_idx = 0
            for j, (tokens, chunks, ctags) in enumerate(
                    tqdm(zip(tokens_batch, chunks_batch, ctags_batch),
                         total=len(chunks_batch))):
                if not check_sentence([rev_token_dict[token] for token in tokens]):
                    continue
                lm_chunk_inputs = []
                probs = probs_batch[ps_idx]
                probs_indices = probs_indices_batch[ps_idx]
                ps_idx += 1
                N = 20 if dataset == 'rotten' else 8
                for _ in tqdm(range(N)):
                    try:
                        new_chunks = replace_tokens(chunks, probs, probs_indices)
                        new_chunks, new_ctags = remove_chunks(new_chunks, ctags)
                        lm_chunk_input = insert_chunks(new_chunks, new_ctags,
                                                       chunk_dict, grammar_set,
                                                       rands, rand_idx, gidx)
                        lm_chunk_input = ' '.join(
                            [rev_token_dict[token] for token in lm_chunk_input])
                        lm_chunk_inputs.append(lm_chunk_input)
                    except Exception:
                        pass
                inst = {}
                inst['summary'] = ' '.join(
                    [rev_token_dict[token] for token in tokens[1:-1]])
                inst['segment_reviews'] = lm_chunk_inputs
                noised_data.append(inst)
    return noised_data
class LanguageModelTests(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        print("\nLanguageModelTests starts")
        print("==========")

    @classmethod
    def tearDownClass(cls):
        print("==========")
        print("LanguageModelTests has ended")

    def setUp(self):
        self.lm = LanguageModel(3)
        self.token_sequences = [['the', 'cat', 'runs'],
                                ['the', 'dog', 'runs']]
        self.lm.train(self.token_sequences)

    def test_get_ngrams(self):
        print("id: " + self.id())
        self.lm.n = 4
        input_tokens = ['the', 'cat', 'in', 'the', 'hat']
        result_ngrams = [
            (None, None, None, 'the'),
            (None, None, 'the', 'cat'),
            (None, 'the', 'cat', 'in'),
            ('the', 'cat', 'in', 'the'),
            ('cat', 'in', 'the', 'hat'),
            ('in', 'the', 'hat', None),
            ('the', 'hat', None, None),
            ('hat', None, None, None)
        ]
        self.assertEqual(self.lm.get_ngrams(input_tokens), result_ngrams)

    def test_train_vocabulary_and_counts(self):
        print("id: " + self.id())
        self.assertEqual(self.lm.vocabulary,
                         {None, 'the', 'cat', 'runs', 'dog'})
        result_counts = {
            (None, None): {'the': 2},
            (None, 'the'): {'cat': 1, 'dog': 1},
            ('the', 'cat'): {'runs': 1},
            ('cat', 'runs'): {None: 1},
            ('runs', None): {None: 2},
            ('the', 'dog'): {'runs': 1},
            ('dog', 'runs'): {None: 1}
        }
        self.assertEqual(self.lm.counts, result_counts)

    def test_normalize(self):
        print("id: " + self.id())
        input_words = {'cat': 1, 'dog': 1}
        result_probabilities = {'cat': 0.5, 'dog': 0.5}
        self.assertEqual(self.lm.normalize(input_words), result_probabilities)

    def test_normalize_sum_probabilities(self):
        print("id: " + self.id())
        input_words = {'cat': 1, 'dog': 1}
        probabilities = self.lm.normalize(input_words)
        prob_sum = 0
        for key in probabilities:
            prob_sum += probabilities[key]
        self.assertEqual(prob_sum, 1)

    def test_predict_next(self):
        print("id: " + self.id())
        input_tokens = [None, "zero", None, 'the', 'dog']
        result_probabilities = {'runs': 1}
        self.assertEqual(self.lm.p_next(input_tokens), result_probabilities)

    def test_sample(self):
        print("id: " + self.id())
        input_probability_distribution = {'heads': 0.5, 'tails': 0.5}
        predicted_word = self.lm.sample(input_probability_distribution)[0]
        self.assertIn(predicted_word, input_probability_distribution)
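# Hypothetical sketches consistent with the normalize and sample tests above
# (assumptions about the implementation, not its actual source):
import random

def normalize(word_counts):
    total = sum(word_counts.values())
    return {word: count / total for word, count in word_counts.items()}

def sample(distribution):
    words = list(distribution)
    weights = [distribution[w] for w in words]
    return random.choices(words, weights=weights, k=1)  # a one-element list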
def test_create_existing(self):
    lm = LanguageModel('2503')
    self.assertTrue(lm.is_ready())
def main(args):
    """
    Main function of the program; operates based on the arguments provided.

    Train
        - Ask for ngram
        - Ask for training file path
        - Train language model
        - Save the trained model
    Generate
        - Load the saved model from pickle file
        - Ask for a beam search (y/n)
        - Ask for beam length
        - Print one generated sentence in terminal
        - Ask for number of sentences to be generated on file
        - Save the input number of sentences in a file
          (Default: new_shakespeare.txt)
    Perplexity
        - Load pickle file
        - Ask for the test set file path
        - Print perplexity value
    Common
        - Load pickle
        - Ask for number of most common ngrams
        - Print the most common ngrams with their occurrence counts
    """
    if args['train']:
        if not args['--n']:
            ngram = input("Please enter n for n-gram (Default: 3)-\n")
            if not ngram:
                ngram = 3
        else:
            ngram = args['--n']
        lm = LanguageModel(int(ngram))
        if not args['--path']:
            path = input("Please enter path of the file-\n")
        else:
            path = args['--path']
        lm.train(readFile(path))
        print("N-gram training completed")
        print("Saving the model")
        with open('trained_model_ngram.pkl', 'wb') as f:
            pickle.dump(lm, f)
        print("Model saved")
    if args['generate']:
        lm = loadPickle()
        if click.confirm('Do you want to generate with Beam search?', default=True):
            lm.beam_flag = True
            beam_size = input("Enter beam size (Default: 20)-\n")
            # only override the default width when a size was entered
            if beam_size:
                lm.beam_width = int(beam_size)
        else:
            lm.beam_flag = False
        print("Generating one sentence in terminal...")
        print(detokenize(lm.generate()))
        if not args['--lines']:
            noOfText = input("Enter number of generated texts you want to save (Default: 10)-\n")
            if not noOfText:
                noOfText = 10
        else:
            noOfText = args['--lines']
        generated = []
        for g in range(0, int(noOfText)):
            generated.append(detokenize(lm.generate()))
        with open('new_shakespeare.txt', 'w') as f:
            for g in generated:
                f.write("%s\n" % g)
        print("Sentence file generated in current folder")
    if args['perplexity']:
        lm = loadPickle()
        if not args['--path']:
            path = input("Please enter path of the test file-\n")
        else:
            path = args['--path']
        print("Perplexity for {}-gram is {}".format(lm.ngram,
                                                    lm.perplexity(readFile(path))))
    if args['common']:
        lm = loadPickle()
        if args['--number']:
            number = args['--number']
        else:
            number = 5
        lm.count_common_ngram(int(number))
import pickle

from lm import LanguageModel

train_filename = "train_sequence.pkl"
model_filename = "model.pkl"

dataset = pickle.load(open(train_filename, "rb"))

lm = LanguageModel(lidstone_param=3e-4)
lm.fit(dataset)

pickle.dump(lm, open(model_filename, "wb"))
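# For reference, a sketch of the Lidstone-smoothed estimate that
# lidstone_param (alpha = 3e-4) presumably controls; an assumption about
# lm.fit, not its actual implementation:
#
#   P(w | h) = (count(h, w) + alpha) / (count(h) + alpha * V)
#
# where V is the vocabulary size. Alpha = 1 recovers Laplace smoothing.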