def sample():
    X, y = load_data_and_labels()
    vocab_list, vocab_dict, rev_vocab_dict = create_vocabulary(
        X, FLAGS.en_vocab_size)
    X, seq_lens = data_to_token_ids(X, vocab_dict)

    test_sentence = "It was the best movie I have ever seen."
    test_sentence = get_tokens(clean_str(test_sentence))
    test_sentence, seq_len = data_to_token_ids([test_sentence], vocab_dict)
    test_sentence = test_sentence[0]
    # Right-pad the sample sentence with PAD_ID up to the longest training sentence.
    test_sentence = test_sentence + (
        [PAD_ID] * (max(len(sentence) for sentence in X) - len(test_sentence)))
    test_sentence = np.array(test_sentence).reshape([1, -1])

    FLAGS.max_sequence_length = len(test_sentence[0])

    with tf.Session() as sess:
        model = create_model(sess, FLAGS)
        probability = model.step(sess,
                                 batch_X=test_sentence,
                                 batch_seq_lens=np.array(seq_len),
                                 forward_only=True,
                                 sampling=True)

        print probability
        print np.argmax(probability)

def do_word2vec():
    my_len = 15000000
    data_utils.create_vocabulary('data/topic/topic_index.vocal',
                                 'data/topic/topic_index.txt', my_len)
    data_utils.data_to_token_ids('data/topic/topic_index.txt',
                                 'data/topic/topic_index.vec',
                                 'data/topic/topic_index.vocal')
    data_utils.create_vocabulary('data/topic/topic_group.vocal',
                                 'data/topic/topic_group.txt', my_len)
    data_utils.data_to_token_ids('data/topic/topic_group.txt',
                                 'data/topic/topic_group.vec',
                                 'data/topic/topic_group.vocal')

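# Note: the data_utils helper signatures assumed above are inferred from the call
# sites in this file and the TensorFlow seq2seq tutorial, i.e. roughly
#   create_vocabulary(vocabulary_path, data_path, max_vocabulary_size, ...)
#   data_to_token_ids(data_path, target_path, vocabulary_path, ...)
# Minimal sketch for inspecting an id file produced above, assuming the usual
# one-line-of-space-separated-token-ids output format (illustrative only):
def peek_token_ids(path, n=3):
    with open(path) as f:
        for _, line in zip(range(n), f):
            print([int(tok) for tok in line.split()])

# peek_token_ids('data/topic/topic_index.vec')
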
def gen_cut_file_jieba(input_file, cut_outputfile, vocabulary_outfile,
                       vec_outputfile, start_header=[], appendword_file=''):
    '''
    input_file: raw input file
    cut_outputfile: the segmented text produced from input_file
    vocabulary_outfile: vocabulary file produced by the segmentation
    start_header: list; entries forced onto the head of the vocabulary,
        used for markers such as __UNK___
    vec_outputfile: token_id file built from the vocabulary above, using the
        helper in data_utils. At training time vec_outputfile is loaded to build
        the in-memory lookup table vocab
        # that part is data_utils (initialize_vocabulary)
    '''
    print("going to cut file...")
    vocabulary = []
    outfileobj = open(cut_outputfile, 'w+', encoding='utf-8')

    # First put the leading tags into the vocabulary.
    for word in start_header:
        vocabulary.append(word)

    print("going to read input file " + input_file)
    # Read the raw file.
    progress_line = 0
    with open(input_file, "r", encoding="utf8") as f:
        for line in f:
            if progress_line % 2500 == 0:
                print("proc for " + str(progress_line) + " line(s)...")
                print("vocabulary size: " + str(len(vocabulary)))
            seg_list = jieba.lcut(line.strip(), cut_all=False)  # lcut returns a list directly
            # Write the segmented line.
            outfileobj.write(" ".join(seg_list))
            outfileobj.write("\n")
            # Maintain the vocabulary.
            for single_seg in seg_list:
                if single_seg not in vocabulary:
                    vocabulary.append(single_seg)
            progress_line = progress_line + 1

    # Append the 3500 common Chinese characters from appendword_file.
    if not appendword_file == "":
        print("going to open appendfile:" + str(appendword_file))
        with open(appendword_file, 'r', encoding="utf8") as apf:
            for line in apf:
                vocabulary.append(line.strip())

    print("output vocabulary to file:" + vocabulary_outfile)
    # Write out the vocabulary.
    vocabulary_fileobj = open(vocabulary_outfile, 'w+', encoding='utf-8')
    for word in vocabulary:
        vocabulary_fileobj.write(word)
        vocabulary_fileobj.write("\n")

    # Finally, turn the segmentation result and the vocabulary produced above
    # into the actual id list used for computation.
    # def data_to_token_ids(data_path, target_path, vocabulary_path, tokenizer=None, normalize_digits=True):
    data_utils.data_to_token_ids(cut_outputfile, vec_outputfile, vocabulary_outfile)

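# Minimal usage sketch for gen_cut_file_jieba; every path and the start_header
# tags below are hypothetical placeholders, not files or symbols from this repo:
gen_cut_file_jieba(
    input_file='data/ask_raw.txt',           # raw corpus, one sentence per line
    cut_outputfile='data/ask_cut.txt',       # jieba-segmented text
    vocabulary_outfile='data/ask.vocab',     # vocabulary, one token per line
    vec_outputfile='data/ask.vec',           # token-id file written via data_utils
    start_header=['__PAD__', '__GO__', '__EOS__', '__UNK__'],
    appendword_file='data/common_3500.txt')  # optional list of common characters
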
def build_ids():
    data_to_token_ids(MODERN_TRAIN_PATH, MODERN_TRAIN_IDS_PATH,
                      MODERN_VOCAB_PATH, tokenizer=tokenizer)
    data_to_token_ids(MODERN_DEV_PATH, MODERN_DEV_IDS_PATH,
                      MODERN_VOCAB_PATH, tokenizer=tokenizer)
    data_to_token_ids(ORIGINAL_TRAIN_PATH, ORIGINAL_TRAIN_IDS_PATH,
                      ORIGINAL_VOCAB_PATH, tokenizer=tokenizer)
    data_to_token_ids(ORIGINAL_DEV_PATH, ORIGINAL_DEV_IDS_PATH,
                      ORIGINAL_VOCAB_PATH, tokenizer=tokenizer)

    print(subprocess.check_output(['wc', '-l', MODERN_TRAIN_IDS_PATH]))
    print(subprocess.check_output(['wc', '-l', MODERN_DEV_IDS_PATH]))
    print(subprocess.check_output(['wc', '-l', ORIGINAL_TRAIN_IDS_PATH]))
    print(subprocess.check_output(['wc', '-l', ORIGINAL_DEV_IDS_PATH]))

def main(_):
    datasets_path = FLAGS.datasets_path
    vocab_path = FLAGS.vocab_path
    tfrecords_path = FLAGS.tfrecords_path

    words_vocab, labels_vocab = data_utils.initialize_vocabulary(vocab_path)

    train_word_ids_list, train_label_ids_list = data_utils.data_to_token_ids(
        os.path.join(datasets_path, 'train.txt'), words_vocab, labels_vocab)
    validation_word_ids_list, validation_label_ids_list = data_utils.data_to_token_ids(
        os.path.join(datasets_path, 'validation.txt'), words_vocab, labels_vocab)
    test_word_ids_list, test_label_ids_list = data_utils.data_to_token_ids(
        os.path.join(datasets_path, 'test.txt'), words_vocab, labels_vocab)

    create_record(train_word_ids_list, train_label_ids_list,
                  os.path.join(tfrecords_path, 'train.tfrecords'))
    create_record(validation_word_ids_list, validation_label_ids_list,
                  os.path.join(tfrecords_path, 'validate.tfrecords'))
    create_record(test_word_ids_list, test_label_ids_list,
                  os.path.join(tfrecords_path, 'test.tfrecords'))

    print_all(os.path.join(tfrecords_path, 'train.tfrecords'))

def sample(FLAGS):
    # Load the data needed to convert your sentence
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/en.p', max_vocab_size=5000, target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/sp.p', max_vocab_size=5000, target_lang=True)

    # Change FLAGS parameters
    FLAGS.batch_size = 1
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)
    FLAGS.sp_max_len = max(sp_seq_lens) + 1  # GO token

    # Process sample sentence
    inference_sentence = ["I like to play tennis and eat sandwiches."]

    # Split into tokens
    tokenized = []
    for i in xrange(len(inference_sentence)):
        tokenized.append(basic_tokenizer(inference_sentence[i]))

    # Convert data to token ids
    data_as_tokens, sample_en_seq_lens = data_to_token_ids(
        tokenized, en_vocab_dict, target_lang=False, normalize_digits=True)

    # Make dummy_sp_inputs
    dummy_sp_inputs = np.array([[GO_ID] * FLAGS.sp_max_len])
    sample_sp_seq_lens = np.array([len(dummy_sp_inputs)])

    print data_as_tokens
    print sample_en_seq_lens
    print dummy_sp_inputs
    print sample_sp_seq_lens

    with tf.Session() as sess:
        # Load trained model
        model = create_model(sess, FLAGS, forward_only=True)

        y_pred = model.step(sess, FLAGS,
                            batch_encoder_inputs=data_as_tokens,
                            batch_decoder_inputs=dummy_sp_inputs,
                            batch_targets=None,
                            batch_en_seq_lens=sample_en_seq_lens,
                            batch_sp_seq_lens=sample_sp_seq_lens,
                            dropout=0.0,
                            forward_only=True,
                            sampling=True)

        # Compose the predicted sp sentence
        sp_sentence = []
        for idx in y_pred[0]:
            sp_sentence.append(sp_rev_vocab_dict[idx])
        print " ".join([word for word in sp_sentence])

def decode_file(test_file):
    print('Applying Parameters:')
    for k, v in FLAGS.__dict__['__flags'].items():
        print('%s: %s' % (k, str(v)))

    vocab_path = FLAGS.data_dir + '/in_vocab_1000.txt'
    tag_vocab_path = FLAGS.data_dir + '/out_vocab_1000.txt'
    label_vocab_path = FLAGS.data_dir + '/label.txt'

    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)
    tag_vocab, rev_tag_vocab = data_utils.initialize_vocabulary(tag_vocab_path)
    label_vocab, rev_label_vocab = data_utils.initialize_vocabulary(label_vocab_path)

    LM_vocab = vocab.copy()
    assert LM_vocab[data_utils._BOS] == data_utils.BOS_ID
    del LM_vocab[data_utils._BOS]
    LM_vocab[data_utils._BOS] = data_utils.BOS_ID
    rev_LM_vocab = [x for x in rev_vocab]
    rev_LM_vocab[data_utils.BOS_ID] = data_utils._EOS

    data_utils.data_to_token_ids(test_file, test_file + '.ids', vocab_path,
                                 tokenizer=data_utils.naive_tokenizer)
    test_set = read_test_data(test_file + '.ids')

    lm_test_output_file = FLAGS.test_output_file + '.ppl'
    intent_test_output_file = FLAGS.test_output_file + '.intent.hyp'
    tagging_test_output_file = FLAGS.test_output_file + '.tag.hyp'

    config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.23),
    )
    with tf.Session(config=config) as sess:
        print("Loading model...")
        _, model_test = create_model(sess, len(vocab), len(tag_vocab),
                                     len(label_vocab), len(LM_vocab))
        print("Loaded model with source_vocab_size=%d, target_vocab_size=%d, "
              "and label_vocab_size=%d, and lm_vocab_size=%d."
              % (len(vocab), len(tag_vocab), len(label_vocab), len(LM_vocab)))

        def run_eval(data_set, decode_output_file):
            with open(lm_test_output_file, 'wb') as f_lm:
                with open(intent_test_output_file, 'wb') as f_intent:
                    with open(tagging_test_output_file, 'wb') as f_tagging:
                        eval_loss = 0.0
                        bucket_id = 0
                        count = 0
                        total_word_count = 0
                        for i in xrange(len(data_set[bucket_id])):
                            count += 1
                            if count % 1000 == 0:
                                print("Decoding utterance No. %d..." % count)
                            eval_encoder_inputs, eval_encoder_inputs_shiftByOne, eval_tags, \
                                eval_tag_weights, eval_intent_weights, eval_lm_weights, \
                                eval_sequence_length, eval_labels = model_test.get_one(
                                    data_set, bucket_id, i)
                            eval_intent_weights = eval_tag_weights
                            tagging_logits = []
                            classification_logits = []
                            _, step_loss, tagging_logits, classification_logits = model_test.joint_step(
                                sess, eval_encoder_inputs, eval_encoder_inputs_shiftByOne,
                                eval_lm_weights, eval_tags, eval_tag_weights, eval_labels,
                                eval_intent_weights, eval_sequence_length, bucket_id, True,
                                use_attention=FLAGS.use_attention)
                            f_lm.write('%.2f\n' % (-step_loss * (eval_sequence_length[0] - 1)))
                            f_lm.flush()
                            eval_loss += step_loss * (eval_sequence_length[0])
                            total_word_count += eval_sequence_length[0]
                            hyp_label = None
                            # intent results
                            hyp_label = np.argmax(classification_logits[0], 0)
                            f_intent.write('%s\n' % rev_label_vocab[hyp_label])
                            f_intent.flush()
                            # tagging results
                            f_tagging.write('%s\n' % ' '.join([
                                rev_tag_vocab[np.argmax(x, 1)]
                                for x in tagging_logits[1:eval_sequence_length[0]]
                            ]))
                            f_tagging.flush()
                        eval_perplexity = math.exp(float(eval_loss) / total_word_count)
            return eval_perplexity

        valid_perplexity = run_eval(test_set, FLAGS.test_output_file)
        print(" Eval perplexity: %.2f" % valid_perplexity)
        sys.stdout.flush()

def decode():
    with tf.Session() as sess:
        # Load dictionaries.
        srce_vocab_path = os.path.join(FLAGS.data_dir, "train",
                                       "vocab%d.srce" % FLAGS.srce_vocab_min)
        trgt_vocab_path = os.path.join(FLAGS.data_dir, "train",
                                       "vocab%d.trgt" % FLAGS.trgt_vocab_min)
        _, re_srce_vocab = data_utils.initialize_vocabulary(srce_vocab_path)
        _, re_trgt_vocab = data_utils.initialize_vocabulary(trgt_vocab_path)

        # Load test data.
        if FLAGS.decode_test:
            srce_test_ids_path = os.path.join(FLAGS.data_dir, "test",
                                              "ids%d.srce" % FLAGS.srce_vocab_min)
            trgt_test_ids_path = os.path.join(FLAGS.data_dir, "test", "ids.trgt")
            srce_test_data_path = os.path.join(FLAGS.data_dir, "test/data.srce")
            trgt_test_data_path = os.path.join(FLAGS.data_dir, "test/data.trgt")

            # Prepare test data.
            data_utils.data_to_token_ids(srce_test_data_path, srce_test_ids_path, srce_vocab_path)
            data_utils.data_to_token_ids(trgt_test_data_path, trgt_test_ids_path, trgt_vocab_path)

            trgt_test_pos = os.path.join(FLAGS.data_dir, "test", "positions.trgt")
            trgt_test_map = os.path.join(FLAGS.data_dir, "test", "map.srce")
            test_set = read_data(srce_test_ids_path, trgt_test_ids_path,
                                 trgt_test_pos, trgt_test_map)
        elif FLAGS.decode_dev:
            srce_dev_ids_path = os.path.join(FLAGS.data_dir, "dev",
                                             "ids%d.srce" % FLAGS.srce_vocab_min)
            trgt_dev_ids_path = os.path.join(FLAGS.data_dir, "dev",
                                             "ids%d.trgt" % FLAGS.trgt_vocab_min)
            trgt_dev_pos = os.path.join(FLAGS.data_dir, "dev", "positions.trgt")
            trgt_dev_map = os.path.join(FLAGS.data_dir, "dev", "map.srce")
            test_set = read_data(srce_dev_ids_path, trgt_dev_ids_path,
                                 trgt_dev_pos, trgt_dev_map)
        else:
            raise ValueError(" Please set decode_test or decode_dev to True! ")

        # Create model and load parameters.
        model = create_model(sess, len(re_srce_vocab), len(re_trgt_vocab), True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Decode test data ---> read from files.
        decode_result_path = os.path.join(FLAGS.data_dir,
            ("result/result_size%d_dropout%.2f" % (FLAGS.size, FLAGS.keep_prob)))
        decode_data_path = os.path.join(FLAGS.data_dir,
            ("result/gold_size%d_dropout%.2f" % (FLAGS.size, FLAGS.keep_prob)))

        test_bucket_sizes = [len(test_set[b]) for b in xrange(len(_buckets))]
        print("test bucket size: ", test_bucket_sizes)

        count = 0
        correct = 0
        with open(decode_result_path, 'w') as fpred:
            with open(decode_data_path, 'w') as fgold:
                # Note that the test data has been sorted by bucket size.
                for b in xrange(len(_buckets)):
                    print("bucket%d:" % b)
                    if len(test_set[b]) == 0:  # empty bucket
                        continue
                    for sent in test_set[b]:
                        encoder_input, decoder_input, target_weight, pos, maps = \
                            model.get_batch({b: [sent]}, b)
                        # Get output_logits.
                        _, _, output_logits, _, _, _ = model.step(
                            sess, encoder_input, decoder_input, target_weight, b, True,
                            decoder_inputs_positions=pos, decoder_inputs_maps=maps)
                        # Greedy decoder: outputs are argmax of output_logits.
                        outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
                        # If there is an EOS symbol in outputs, cut them at that point.
                        if data_utils.EOS_ID in outputs:
                            outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                        # Write to file.
                        fpred.write(data_utils.token_ids_to_sentence(outputs, re_trgt_vocab) + '\n')
                        gold = sent[1]
                        if data_utils.EOS_ID in sent[1]:
                            gold = sent[1][:sent[1].index(data_utils.EOS_ID)]
                        fgold.write(data_utils.token_ids_to_sentence(gold, re_trgt_vocab) + '\n')
                        if gold == outputs:
                            correct += 1
                        # else:
                        #     print("source: ", data_utils.token_ids_to_sentence(sent[0], re_srce_vocab), '\t', pos, '\t', maps)
                        #     print("target: ", data_utils.token_ids_to_sentence(gold, re_trgt_vocab))
                        #     print("predict: ", data_utils.token_ids_to_sentence(outputs, re_trgt_vocab) + '\n')
                        count += 1
        print("count = %d, correct = %d, accuracy = %f" % (count, correct, float(correct) / count))

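# Toy illustration of the greedy step above (values and shapes are illustrative
# only, not from this repo): with batch_size == 1 each logit has shape
# [1, vocab_size], so int(np.argmax(logit, axis=1)) yields one token id per
# decoder position.
import numpy as np
toy_logits = [np.array([[0.1, 2.0, 0.3]]), np.array([[1.5, 0.2, 0.1]])]
toy_outputs = [int(np.argmax(logit, axis=1)) for logit in toy_logits]
print(toy_outputs)  # [1, 0]
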
        ff.write(word + "\n")
        vocabulary_count = vocabulary_count + 1

    # Append the 3500 common Chinese characters from appendword.
    if not appendword == "":
        print("going to open appendfile:" + str(appendword))
        with open(appendword, 'r', encoding="utf8") as apf:
            for line in apf:
                ff.write(line.strip() + "\n")
                vocabulary_count = vocabulary_count + 1

    # Then write the words themselves.
    for word in tags:
        ff.write(word + "\n")
        vocabulary_count = vocabulary_count + 1

    print("outputfile " + outputfile + " with " + str(vocabulary_count) + " line(s)...")
    return True


# Buckle up, here we go.
gen_cut_file_jieba(encoding_in_filename, encoding_cut_out_filename)
gen_cut_file_jieba(decoding_in_filename, decoding_cut_out_filename)
gen_cut_file_jieba(test_encoding_in_filename, test_encoding_cut_out_filename)
gen_cut_file_jieba(test_decoding_in_filename, test_decoding_cut_out_filename)

gen_vocabulary_file_jieba(encoding_in_filename, encoding_out_filename,
                          START_VOCABULART, vocabulary_size, appendword)
gen_vocabulary_file_jieba(decoding_in_filename, decoding_out_filename,
                          START_VOCABULART, vocabulary_size, appendword)
gen_vocabulary_file_jieba(test_encoding_in_filename, test_encoding_out_filename,
                          START_VOCABULART, vocabulary_size, appendword)
gen_vocabulary_file_jieba(test_decoding_in_filename, test_decoding_out_filename,
                          START_VOCABULART, vocabulary_size, appendword)

# After all of the steps above, convert the segmentation output (the *_cut files)
# into token-id vectors according to the vocabulary.
data_utils.data_to_token_ids(encoding_cut_out_filename, encoding_vec_filename, encoding_out_filename)
data_utils.data_to_token_ids(decoding_cut_out_filename, decoding_vec_filename, decoding_out_filename)
data_utils.data_to_token_ids(test_encoding_cut_out_filename, test_encoding_vec_filename, test_encoding_out_filename)
data_utils.data_to_token_ids(test_decoding_cut_out_filename, test_decoding_vec_filename, test_decoding_out_filename)

def train():
    X, y = load_data_and_labels()
    vocab_list, vocab_dict, rev_vocab_dict = create_vocabulary(
        X, FLAGS.en_vocab_size)
    X, seq_lens = data_to_token_ids(X, vocab_dict)
    train_X, train_y, train_seq_lens, valid_X, valid_y, valid_seq_lens = \
        split_data(X, y, seq_lens)
    FLAGS.max_sequence_length = len(train_X[0])

    with tf.Session() as sess:
        # Load old model or create new one
        model = create_model(sess, FLAGS)

        # Train results
        for epoch_num, epoch in enumerate(
                generate_epoch(train_X, train_y, train_seq_lens,
                               FLAGS.num_epochs, FLAGS.batch_size)):
            print "EPOCH:", epoch_num

            sess.run(tf.assign(model.lr, FLAGS.learning_rate *
                (FLAGS.learning_rate_decay_factor ** epoch_num)))

            train_loss = []
            train_accuracy = []
            for batch_num, (batch_X, batch_y, batch_seq_lens) in enumerate(epoch):
                _, loss, accuracy = model.step(
                    sess, batch_X, batch_seq_lens, batch_y,
                    dropout_keep_prob=FLAGS.dropout_keep_prob,
                    forward_only=False, sampling=False)

                train_loss.append(loss)
                train_accuracy.append(accuracy)

            print
            print "EPOCH %i SUMMARY" % epoch_num
            print "Training loss %.3f" % np.mean(train_loss)
            print "Training accuracy %.3f" % np.mean(train_accuracy)
            print "----------------------"

            # Validation results
            for valid_epoch_num, valid_epoch in enumerate(
                    generate_epoch(valid_X, valid_y, valid_seq_lens,
                                   num_epochs=1, batch_size=FLAGS.batch_size)):
                valid_loss = []
                valid_accuracy = []
                for valid_batch_num, \
                    (valid_batch_X, valid_batch_y, valid_batch_seq_lens) in \
                        enumerate(valid_epoch):
                    loss, accuracy = model.step(
                        sess, valid_batch_X, valid_batch_seq_lens, valid_batch_y,
                        dropout_keep_prob=1.0, forward_only=True, sampling=False)

                    valid_loss.append(loss)
                    valid_accuracy.append(accuracy)

                print "Validation loss %.3f" % np.mean(valid_loss)
                print "Validation accuracy %.3f" % np.mean(valid_accuracy)
                print "----------------------"

            # Save checkpoint every epoch.
            if not os.path.isdir(FLAGS.ckpt_dir):
                os.makedirs(FLAGS.ckpt_dir)
            checkpoint_path = os.path.join(FLAGS.ckpt_dir, "model.ckpt")
            print "Saving the model."
            model.saver.save(sess, checkpoint_path, global_step=model.global_step)

        wfp1.write(l1)
        wfp1.write("\n")
        wfp2.write(l2)
        wfp2.write("\n")

    # Close wfp1 and wfp2.
    wfp1.close()
    wfp2.close()
    print("drop line:" + str(drop_count))
    print("total line:" + str(total_line))


# Pre-processing: filter out duplicated inputs.
do_filter_two(encoding_in_filename_before_filter, decoding_in_filename_before_filter,
              encoding_in_filename, decoding_in_filename)

# Buckle up, here we go.
gen_cut_file_jieba(encoding_in_filename, encoding_cut_out_filename,
                   encoding_vocab_filename, START_VOCABULART, appendword)
gen_cut_file_jieba(decoding_in_filename, decoding_cut_out_filename,
                   decoding_vocab_filename, START_VOCABULART, appendword)
gen_cut_file_jieba(test_encoding_in_filename, test_encoding_cut_out_filename,
                   test_encoding_vocab_filename, START_VOCABULART, appendword)
gen_cut_file_jieba(test_decoding_in_filename, test_decoding_cut_out_filename,
                   test_decoding_vocab_filename, START_VOCABULART, appendword)

# Using the new vocabulary, break any "word" in the already-cut files that cannot
# be found in vocab back down into individual characters.
data_ap(encoding_cut_out_filename, encoding_vocab_filename, encoding_cut_out_fixed_filename)
data_ap(decoding_cut_out_filename, decoding_vocab_filename, decoding_cut_out_fixed_filename)

# After all of the steps above, convert the segmentation output (the *_cut files)
# into token-id vectors according to the vocabulary.
data_utils.data_to_token_ids(encoding_cut_out_fixed_filename, encoding_vec_filename, encoding_vocab_filename)
data_utils.data_to_token_ids(decoding_cut_out_fixed_filename, decoding_vec_filename, decoding_vocab_filename)
data_utils.data_to_token_ids(test_encoding_cut_out_filename, test_encoding_vec_filename, test_encoding_vocab_filename)
data_utils.data_to_token_ids(test_decoding_cut_out_filename, test_decoding_vec_filename, test_decoding_vocab_filename)