Example #1
def sample():

    X, y = load_data_and_labels()
    vocab_list, vocab_dict, rev_vocab_dict = create_vocabulary(
        X, FLAGS.en_vocab_size)
    X, seq_lens = data_to_token_ids(X, vocab_dict)

    test_sentence = "It was the best movie I have ever seen."
    test_sentence = get_tokens(clean_str(test_sentence))
    test_sentence, seq_len = data_to_token_ids([test_sentence], vocab_dict)
    test_sentence = test_sentence[0]
    test_sentence = test_sentence + ([PAD_ID] * (max(len(sentence) \
        for sentence in X) - len(test_sentence)))
    test_sentence = np.array(test_sentence).reshape([1, -1])
    FLAGS.max_sequence_length = len(test_sentence[0])

    with tf.Session() as sess:
        model = create_model(sess, FLAGS)

        probability = model.step(sess,
                                 batch_X=test_sentence,
                                 batch_seq_lens=np.array(seq_len),
                                 forward_only=True,
                                 sampling=True)

        print probability
        print np.argmax(probability)
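In this example (and in Examples #6 and #10), data_to_token_ids works on in-memory data rather than on files: it takes tokenized sentences plus the vocab_dict from create_vocabulary and returns token ids together with the original sequence lengths; padding to a fixed length is then done by hand with PAD_ID, as above. The helper itself is not reproduced on this page; a minimal sketch of that in-memory variant, assuming the sentences are already split into tokens and that unknown words map to an UNK id, could look like:

UNK_ID = 3  # assumes the usual _PAD, _GO, _EOS, _UNK ordering of these vocabularies

def data_to_token_ids(tokenized_sentences, vocab_dict, unk_id=UNK_ID):
    """Map each token to its id, falling back to unk_id for out-of-vocabulary tokens."""
    token_ids, seq_lens = [], []
    for tokens in tokenized_sentences:
        ids = [vocab_dict.get(token, unk_id) for token in tokens]
        token_ids.append(ids)
        seq_lens.append(len(ids))
    return token_ids, seq_lens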
Example #2
def do_word2vec():
    my_len = 15000000
    data_utils.create_vocabulary('data/topic/topic_index.vocal',
                                 'data/topic/topic_index.txt', my_len)
    data_utils.data_to_token_ids('data/topic/topic_index.txt',
                                 'data/topic/topic_index.vec',
                                 'data/topic/topic_index.vocal')

    data_utils.create_vocabulary('data/topic/topic_group.vocal',
                                 'data/topic/topic_group.txt', my_len)
    data_utils.data_to_token_ids('data/topic/topic_group.txt',
                                 'data/topic/topic_group.vec',
                                 'data/topic/topic_group.vocal')
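Examples #2 and #3 use file-based data_utils helpers in the style of the old TensorFlow seq2seq tutorial: create_vocabulary(vocabulary_path, data_path, max_vocabulary_size) writes the vocabulary file (so my_len above is presumably the maximum vocabulary size), and data_to_token_ids reads data_path, looks each token up in the vocabulary stored at vocabulary_path, and writes one line of space-separated ids to target_path. That module is not reproduced on this page; the following is a minimal sketch of the behavior, using the signature quoted in the comment inside Example #3, with the one-token-per-line vocabulary format, the special-token id, and the digit normalization all treated as assumptions.

import re

UNK_ID = 3  # assumes the usual _PAD, _GO, _EOS, _UNK ordering of these vocabularies

def initialize_vocabulary(vocabulary_path):
    # One token per line; a token's id is its line number.
    with open(vocabulary_path, "r", encoding="utf-8") as f:
        rev_vocab = [line.strip() for line in f]
    vocab = {token: idx for idx, token in enumerate(rev_vocab)}
    return vocab, rev_vocab

def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with open(data_path, "r", encoding="utf-8") as data_file, \
         open(target_path, "w", encoding="utf-8") as tokens_file:
        for line in data_file:
            tokens = tokenizer(line) if tokenizer else line.split()
            if normalize_digits:
                tokens = [re.sub(r"\d", "0", tok) for tok in tokens]
            ids = [str(vocab.get(tok, UNK_ID)) for tok in tokens]
            tokens_file.write(" ".join(ids) + "\n")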
Example #3
def gen_cut_file_jieba(input_file, cut_outputfile, vocabulary_outfile, vec_outputfile, start_header = [], appendword_file = ''):
    '''
    input_file: raw input file
    cut_outputfile: output file with the original text after word segmentation
    vocabulary_outfile: vocabulary file produced from the segmented text
    vec_outputfile: token_id file built from the segmented text and the vocabulary above,
        using the helpers in data_utils. At training time, this file is loaded to build
        the in-memory lookup table (vocab).
    start_header: list; lets you force a series of entries onto the head of the
        vocabulary, used for markers such as __UNK___
    appendword_file: optional file whose lines (e.g. the 3500 most common Chinese
        characters) are appended to the vocabulary
    # The vocab-loading step above is data_utils.initialize_vocabulary.
    '''
    print ("going to cut file...")
    vocabulary = []
    outfileobj = open(cut_outputfile, 'w+', encoding = 'utf-8')
    # First add the header tags to the vocabulary
    for word in start_header:
        vocabulary.append(word)
    print ("goting to read input file " + input_file)
    #读取原始文件
    progress_line = 0 
    with open(input_file, "r", encoding = "utf8") as f:
        for line in f:
            if(progress_line % 2500 == 0):
                print ("proc for " + str(progress_line) + " line(s)...")
                print ("vocabulary size: " + str(len(vocabulary)))
            seg_list = jieba.lcut(line.strip(), cut_all=False) # lcut returns a list directly
            # Write the segmented line to the output file
            outfileobj.write(" ".join(seg_list))
            outfileobj.write("\n")
            # Maintain the vocabulary
            for single_seg in seg_list:
                if(not single_seg in vocabulary):
                    vocabulary.append(single_seg)
            progress_line = progress_line + 1
    outfileobj.close()
    # Append the 3500 most common Chinese characters via appendword_file.
    if (not appendword_file == "" ):
        print ("going to open appendfile:" + str(appendword_file))
        with open(appendword_file, 'r', encoding = "utf8") as apf:
            for line in apf:
                vocabulary.append(line.strip())
    print ("output vocabulary to file:" + vocabulary_outfile)
    # Write out the vocabulary
    vocabulary_fileobj = open(vocabulary_outfile, 'w+', encoding = 'utf-8')
    for word in vocabulary:
        vocabulary_fileobj.write(word)
        vocabulary_fileobj.write("\n")
        
    vocabulary_fileobj.close()
    # After the steps above, convert the segmentation output and the vocabulary from the previous step into an actual id list for computation.
    # def data_to_token_ids(data_path, target_path, vocabulary_path, tokenizer=None, normalize_digits=True):
    data_utils.data_to_token_ids(cut_outputfile, vec_outputfile, vocabulary_outfile)
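One performance note on gen_cut_file_jieba: the vocabulary is a plain Python list, so `single_seg in vocabulary` is a linear scan and the segmentation loop becomes quadratic on large corpora. A minimal sketch of the same order-preserving bookkeeping using a dict as an ordered set (Python 3.7+), with the same jieba segmentation as the example:

import jieba

def build_vocabulary(input_file, start_header=()):
    vocab = dict.fromkeys(start_header)            # header tags first, in order
    with open(input_file, "r", encoding="utf8") as f:
        for line in f:
            for seg in jieba.lcut(line.strip(), cut_all=False):
                vocab.setdefault(seg, None)        # O(1) membership vs. O(n) list scan
    return list(vocab)                             # insertion order preserved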
Example #4
def build_ids():
    data_to_token_ids(MODERN_TRAIN_PATH, MODERN_TRAIN_IDS_PATH, MODERN_VOCAB_PATH, tokenizer=tokenizer)
    data_to_token_ids(MODERN_DEV_PATH, MODERN_DEV_IDS_PATH, MODERN_VOCAB_PATH, tokenizer=tokenizer)
    data_to_token_ids(ORIGINAL_TRAIN_PATH, ORIGINAL_TRAIN_IDS_PATH, ORIGINAL_VOCAB_PATH, tokenizer=tokenizer)
    data_to_token_ids(ORIGINAL_DEV_PATH, ORIGINAL_DEV_IDS_PATH, ORIGINAL_VOCAB_PATH, tokenizer=tokenizer)

    print( subprocess.check_output(['wc', '-l', MODERN_TRAIN_IDS_PATH]) )
    print( subprocess.check_output(['wc', '-l', MODERN_DEV_IDS_PATH]) )
    print( subprocess.check_output(['wc', '-l', ORIGINAL_TRAIN_IDS_PATH]) )
    print( subprocess.check_output(['wc', '-l', ORIGINAL_DEV_IDS_PATH]) )
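The wc calls are only a line-count sanity check on the generated id files; a pure-Python equivalent, for machines without the coreutils binary, is simply:

def count_lines(path):
    # Roughly equivalent to `wc -l` for newline-terminated text files.
    with open(path, "r", encoding="utf-8") as f:
        return sum(1 for _ in f)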
Example #5
def main(_):
    datasets_path = FLAGS.datasets_path
    vocab_path = FLAGS.vocab_path
    tfrecords_path = FLAGS.tfrecords_path

    words_vocab, labels_vocab = data_utils.initialize_vocabulary(vocab_path)

    train_word_ids_list, train_label_ids_list = data_utils.data_to_token_ids(
        os.path.join(datasets_path, 'train.txt'), words_vocab, labels_vocab)
    validation_word_ids_list, validation_label_ids_list = data_utils.data_to_token_ids(
        os.path.join(datasets_path, 'validation.txt'), words_vocab,
        labels_vocab)
    test_word_ids_list, test_label_ids_list = data_utils.data_to_token_ids(
        os.path.join(datasets_path, 'test.txt'), words_vocab, labels_vocab)

    create_record(train_word_ids_list, train_label_ids_list,
                  os.path.join(tfrecords_path, 'train.tfrecords'))
    create_record(validation_word_ids_list, validation_label_ids_list,
                  os.path.join(tfrecords_path, 'validate.tfrecords'))
    create_record(test_word_ids_list, test_label_ids_list,
                  os.path.join(tfrecords_path, 'test.tfrecords'))

    print_all(os.path.join(tfrecords_path, 'train.tfrecords'))
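create_record and print_all are not shown in this example. A hedged sketch of create_record, assuming each (word_ids, label_ids) pair is serialized as a tf.train.SequenceExample with parallel int64 feature lists (a common layout for sequence-labelling data in TF 1.x; the feature names below are assumptions):

import tensorflow as tf

def create_record(word_ids_list, label_ids_list, tfrecords_file):
    writer = tf.python_io.TFRecordWriter(tfrecords_file)
    for word_ids, label_ids in zip(word_ids_list, label_ids_list):
        example = tf.train.SequenceExample()
        example.context.feature["length"].int64_list.value.append(len(word_ids))
        words = example.feature_lists.feature_list["word_ids"]
        labels = example.feature_lists.feature_list["label_ids"]
        for word_id, label_id in zip(word_ids, label_ids):
            words.feature.add().int64_list.value.append(word_id)
            labels.feature.add().int64_list.value.append(label_id)
        writer.write(example.SerializeToString())
    writer.close()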
Example #6
def sample(FLAGS):

    # Load the data needed to convert your sentence
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/en.p', max_vocab_size=5000, target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/sp.p', max_vocab_size=5000, target_lang=True)

    # Change FLAGS parameters
    FLAGS.batch_size = 1
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)
    FLAGS.sp_max_len = max(sp_seq_lens) + 1 # GO token

    # Process sample sentence
    inference_sentence = ["I like to play tennis and eat sandwiches."]
    # Split into tokens
    tokenized = []
    for i in xrange(len(inference_sentence)):
        tokenized.append(basic_tokenizer(inference_sentence[i]))
    # Convert data to token ids
    data_as_tokens, sample_en_seq_lens = data_to_token_ids(
        tokenized, en_vocab_dict, target_lang=False, normalize_digits=True)

    # make dummy_sp_inputs
    dummy_sp_inputs = np.array([[GO_ID]*FLAGS.sp_max_len])
    sample_sp_seq_lens = np.array([len(dummy_sp_inputs)])

    print data_as_tokens
    print sample_en_seq_lens
    print dummy_sp_inputs
    print sample_sp_seq_lens

    with tf.Session() as sess:

        # Load trained model
        model = create_model(sess, FLAGS, forward_only=True)

        y_pred = model.step(sess, FLAGS, batch_encoder_inputs=data_as_tokens,
            batch_decoder_inputs=dummy_sp_inputs, batch_targets=None,
            batch_en_seq_lens=sample_en_seq_lens,
            batch_sp_seq_lens=sample_sp_seq_lens,
            dropout=0.0, forward_only=True, sampling=True)

        # compose the predicted sp sentence
        sp_sentence = []
        for idx in y_pred[0]:
            sp_sentence.append(sp_rev_vocab_dict[idx])
        print " ".join([word for word in sp_sentence])
Example #7
def decode_file(test_file):
    print('Applying Parameters:')
    for k, v in FLAGS.__dict__['__flags'].items():
        print('%s: %s' % (k, str(v)))

    vocab_path = FLAGS.data_dir + '/in_vocab_1000.txt'
    tag_vocab_path = FLAGS.data_dir + '/out_vocab_1000.txt'
    label_vocab_path = FLAGS.data_dir + '/label.txt'

    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)
    tag_vocab, rev_tag_vocab = data_utils.initialize_vocabulary(tag_vocab_path)
    label_vocab, rev_label_vocab = data_utils.initialize_vocabulary(
        label_vocab_path)
    LM_vocab = vocab.copy()
    assert LM_vocab[data_utils._BOS] == data_utils.BOS_ID
    del LM_vocab[data_utils._BOS]
    LM_vocab[data_utils._BOS] = data_utils.BOS_ID
    rev_LM_vocab = [x for x in rev_vocab]
    rev_LM_vocab[data_utils.BOS_ID] = data_utils._EOS

    data_utils.data_to_token_ids(test_file,
                                 test_file + '.ids',
                                 vocab_path,
                                 tokenizer=data_utils.naive_tokenizer)

    test_set = read_test_data(test_file + '.ids')
    lm_test_output_file = FLAGS.test_output_file + '.ppl'
    intent_test_output_file = FLAGS.test_output_file + '.intent.hyp'
    tagging_test_output_file = FLAGS.test_output_file + '.tag.hyp'

    config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.23), )

    with tf.Session(config=config) as sess:
        print("Loading model...")
        _, model_test = create_model(sess, len(vocab), len(tag_vocab),
                                     len(label_vocab), len(LM_vocab))
        print(
            "Loaded model with source_vocab_size=%d, target_vocab_size=%d, and label_vocab_size=%d, and lm_vocab_size=%d."
            % (len(vocab), len(tag_vocab), len(label_vocab), len(LM_vocab)))

        def run_eval(data_set, decode_output_file):
            with open(lm_test_output_file, 'wb') as f_lm:
                with open(intent_test_output_file, 'wb') as f_intent:
                    with open(tagging_test_output_file, 'wb') as f_tagging:
                        eval_loss = 0.0
                        bucket_id = 0
                        count = 0
                        total_word_count = 0
                        for i in xrange(len(data_set[bucket_id])):
                            count += 1
                            if count % 1000 == 0:
                                print("Decoding utterance No. %d..." % count)
                            eval_encoder_inputs, eval_encoder_inputs_shiftByOne, eval_tags, eval_tag_weights, eval_intent_weights, eval_lm_weights, eval_sequence_length, eval_labels = model_test.get_one(
                                data_set, bucket_id, i)
                            eval_intent_weights = eval_tag_weights

                            tagging_logits = []
                            classification_logits = []

                            _, step_loss, tagging_logits, classification_logits = model_test.joint_step(
                                sess,
                                eval_encoder_inputs,
                                eval_encoder_inputs_shiftByOne,
                                eval_lm_weights,
                                eval_tags,
                                eval_tag_weights,
                                eval_labels,
                                eval_intent_weights,
                                eval_sequence_length,
                                bucket_id,
                                True,
                                use_attention=FLAGS.use_attention)

                            f_lm.write('%.2f\n' %
                                       (-step_loss *
                                        (eval_sequence_length[0] - 1)))
                            f_lm.flush()
                            eval_loss += step_loss * (eval_sequence_length[0])
                            total_word_count += eval_sequence_length[0]

                            hyp_label = None
                            # intent results
                            hyp_label = np.argmax(classification_logits[0], 0)
                            f_intent.write('%s\n' % rev_label_vocab[hyp_label])
                            f_intent.flush()
                            # tagging results
                            f_tagging.write('%s\n' % ' '.join([
                                rev_tag_vocab[np.argmax(x, 1)] for x in
                                tagging_logits[1:eval_sequence_length[0]]
                            ]))
                            f_tagging.flush()

                eval_perplexity = math.exp(float(eval_loss) / total_word_count)
            return eval_perplexity

        valid_perplexity = run_eval(test_set, FLAGS.test_output_file)
        print("  Eval perplexity: %.2f" % valid_perplexity)
        sys.stdout.flush()
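A note on the perplexity reported by run_eval: taking step_loss to be the average per-word cross-entropy that joint_step returns for one utterance of length len_i, the loop accumulates

    eval_loss = sum_i step_loss_i * len_i        total_word_count = sum_i len_i

and the final value exp(eval_loss / total_word_count) is the standard corpus-level perplexity.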
Example #8
def decode():
  with tf.Session() as sess:
    # load dictionary
    srce_vocab_path = os.path.join(FLAGS.data_dir, "train", "vocab%d.srce" % FLAGS.srce_vocab_min)
    trgt_vocab_path = os.path.join(FLAGS.data_dir, "train", "vocab%d.trgt" % FLAGS.trgt_vocab_min)
    
    _, re_srce_vocab = data_utils.initialize_vocabulary(srce_vocab_path)
    _, re_trgt_vocab = data_utils.initialize_vocabulary(trgt_vocab_path)

    # Load test data.
    if FLAGS.decode_test:
      srce_test_ids_path = os.path.join(FLAGS.data_dir, "test", "ids%d.srce" % FLAGS.srce_vocab_min)
      trgt_test_ids_path = os.path.join(FLAGS.data_dir, "test", "ids.trgt")
      srce_test_data_path = os.path.join(FLAGS.data_dir, "test/data.srce")
      trgt_test_data_path = os.path.join(FLAGS.data_dir, "test/data.trgt")

      # Prepare test data
      data_utils.data_to_token_ids(srce_test_data_path, srce_test_ids_path, srce_vocab_path)
      data_utils.data_to_token_ids(trgt_test_data_path, trgt_test_ids_path, trgt_vocab_path)
      trgt_test_pos = os.path.join(FLAGS.data_dir, "test", "positions.trgt")
      trgt_test_map = os.path.join(FLAGS.data_dir, "test", "map.srce")
      test_set = read_data(srce_test_ids_path, trgt_test_ids_path, trgt_test_pos, trgt_test_map)

    elif FLAGS.decode_dev:
      srce_dev_ids_path = os.path.join(FLAGS.data_dir, "dev", "ids%d.srce" % FLAGS.srce_vocab_min)
      trgt_dev_ids_path = os.path.join(FLAGS.data_dir, "dev", "ids%d.trgt" % FLAGS.trgt_vocab_min)
      trgt_dev_pos = os.path.join(FLAGS.data_dir, "dev", "positions.trgt")
      trgt_dev_map = os.path.join(FLAGS.data_dir, "dev", "map.srce")
      test_set = read_data(srce_dev_ids_path, trgt_dev_ids_path, trgt_dev_pos, trgt_dev_map)

    else:
      raise ValueError(" Please set decode_test or decode_dev to True! ")

    # Create model and load parameters.
    model = create_model(sess, len(re_srce_vocab), len(re_trgt_vocab), True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Decode test data.  ---> read from files

    decode_result_path = os.path.join(FLAGS.data_dir, ("result/result_size%d_dropout%.2f" % (FLAGS.size, FLAGS.keep_prob)))
    decode_data_path = os.path.join(FLAGS.data_dir, ("result/gold_size%d_dropout%.2f" % (FLAGS.size, FLAGS.keep_prob)))
    
    test_bucket_sizes = [len(test_set[b]) for b in xrange(len(_buckets))]
    print ("test bucket size: ", test_bucket_sizes)

    count = 0
    correct = 0

    with open(decode_result_path, 'w') as fpred:
      with open(decode_data_path, 'w') as fgold: # note that the test data has been sorted by bucket size
        for b in xrange(len(_buckets)):
          print ("bucket%d:" % b)
          
          if len(test_set[b]) == 0: # empty bucket
            continue
          
          for sent in test_set[b]:
            
            encoder_input, decoder_input, target_weight, pos, maps = model.get_batch({b: [sent]}, b)
            # get output_logits
            _, _, output_logits, _, _, _= model.step(sess, encoder_input, decoder_input, target_weight, b, True, 
                  decoder_inputs_positions=pos, decoder_inputs_maps=maps)
            # greedy decoder: outputs are argmax of output_logits
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
              outputs = outputs[:outputs.index(data_utils.EOS_ID)]

            # write to file
            fpred.write(data_utils.token_ids_to_sentence(outputs, re_trgt_vocab) + '\n')
            gold = sent[1]
            if data_utils.EOS_ID in sent[1]:
              gold = sent[1][:sent[1].index(data_utils.EOS_ID)]
            fgold.write(data_utils.token_ids_to_sentence(gold, re_trgt_vocab) + '\n')

            if gold == outputs:
              correct += 1
            # else:
            #   print ("source: ", data_utils.token_ids_to_sentence(sent[0], re_srce_vocab), '\t', pos, '\t', maps)
            #   print ("target: ", data_utils.token_ids_to_sentence(gold, re_trgt_vocab))
            #   print ("predict: ", data_utils.token_ids_to_sentence(outputs, re_trgt_vocab) + '\n')

            count += 1
    print("count = %d, correct = %d, accuracy = %f" % (count, correct, float(correct)/count))
Example #9
            ff.write(word + "\n")
            vocabulary_count = vocabulary_count + 1
        # Append the 3500 most common Chinese characters via appendword.
        if (not appendword == "" ):
            print ("going to open appendfile:" + str(appendword))
            with open(appendword, 'r', encoding = "utf8") as apf:
                for line in apf:
                    ff.write(line.strip() + "\n")
                    vocabulary_count = vocabulary_count + 1
        # Then write the words
        for word in tags:
            ff.write(word + "\n")
            vocabulary_count = vocabulary_count + 1
    print ("outputfile " + outputfile + " with " + str(vocabulary_count) + " line(s)...")
    return True

# Buckle up, here we go
gen_cut_file_jieba(encoding_in_filename, encoding_cut_out_filename)
gen_cut_file_jieba(decoding_in_filename, decoding_cut_out_filename)
gen_cut_file_jieba(test_encoding_in_filename, test_encoding_cut_out_filename)
gen_cut_file_jieba(test_decoding_in_filename, test_decoding_cut_out_filename)
gen_vocabulary_file_jieba(encoding_in_filename,encoding_out_filename, START_VOCABULART, vocabulary_size, appendword)
gen_vocabulary_file_jieba(decoding_in_filename,decoding_out_filename, START_VOCABULART, vocabulary_size, appendword)
gen_vocabulary_file_jieba(test_encoding_in_filename,test_encoding_out_filename, START_VOCABULART, vocabulary_size, appendword)
gen_vocabulary_file_jieba(test_decoding_in_filename,test_decoding_out_filename, START_VOCABULART, vocabulary_size, appendword)

# After all the steps above are done, convert the segmentation results (the _cut output) into id vectors according to the vocabulary
data_utils.data_to_token_ids(encoding_cut_out_filename, encoding_vec_filename, encoding_out_filename)
data_utils.data_to_token_ids(decoding_cut_out_filename, decoding_vec_filename, decoding_out_filename)
data_utils.data_to_token_ids(test_encoding_cut_out_filename, test_encoding_vec_filename, test_encoding_out_filename)
data_utils.data_to_token_ids(test_decoding_cut_out_filename, test_decoding_vec_filename, test_decoding_out_filename)
Example #10
def train():

    X, y = load_data_and_labels()
    vocab_list, vocab_dict, rev_vocab_dict = create_vocabulary(
        X, FLAGS.en_vocab_size)
    X, seq_lens = data_to_token_ids(X, vocab_dict)
    train_X, train_y, train_seq_lens, valid_X, valid_y, valid_seq_lens = \
        split_data(X, y, seq_lens)
    FLAGS.max_sequence_length = len(train_X[0])

    with tf.Session() as sess:

        # Load old model or create new one
        model = create_model(sess, FLAGS)

        # Train results
        for epoch_num, epoch in enumerate(
                generate_epoch(train_X, train_y, train_seq_lens,
                               FLAGS.num_epochs, FLAGS.batch_size)):
            print "EPOCH:", epoch_num

            sess.run(tf.assign(model.lr, FLAGS.learning_rate * \
                (FLAGS.learning_rate_decay_factor ** epoch_num)))

            train_loss = []
            train_accuracy = []
            for batch_num, (batch_X, batch_y,
                            batch_seq_lens) in enumerate(epoch):

                _, loss, accuracy = model.step(
                    sess,
                    batch_X,
                    batch_seq_lens,
                    batch_y,
                    dropout_keep_prob=FLAGS.dropout_keep_prob,
                    forward_only=False,
                    sampling=False)

                train_loss.append(loss)
                train_accuracy.append(accuracy)

            print
            print "EPOCH %i SUMMARY" % epoch_num
            print "Training loss %.3f" % np.mean(train_loss)
            print "Training accuracy %.3f" % np.mean(train_accuracy)
            print "----------------------"

            # Validation results
            for valid_epoch_num, valid_epoch in enumerate(
                    generate_epoch(valid_X,
                                   valid_y,
                                   valid_seq_lens,
                                   num_epochs=1,
                                   batch_size=FLAGS.batch_size)):
                valid_loss = []
                valid_accuracy = []

                for valid_batch_num, \
                    (valid_batch_X, valid_batch_y, valid_batch_seq_lens) in \
                        enumerate(valid_epoch):

                    loss, accuracy = model.step(sess,
                                                valid_batch_X,
                                                valid_batch_seq_lens,
                                                valid_batch_y,
                                                dropout_keep_prob=1.0,
                                                forward_only=True,
                                                sampling=False)

                    valid_loss.append(loss)
                    valid_accuracy.append(accuracy)

            print "Validation loss %.3f" % np.mean(valid_loss)
            print "Validation accuracy %.3f" % np.mean(valid_accuracy)
            print "----------------------"

            # Save checkpoint every epoch.
            if not os.path.isdir(FLAGS.ckpt_dir):
                os.makedirs(FLAGS.ckpt_dir)
            checkpoint_path = os.path.join(FLAGS.ckpt_dir, "model.ckpt")
            print "Saving the model."
            model.saver.save(sess,
                             checkpoint_path,
                             global_step=model.global_step)
                wfp1.write(l1)
                wfp1.write("\n")
                wfp2.write(l2)
                wfp2.write("\n")
                
    # Close wfp1 and wfp2
    wfp1.close()
    wfp2.close()
    
    print ("drop line:" + str(drop_count))
    print ("total line:" + str(total_line))
    

# Preprocessing: filter out duplicate inputs.
do_filter_two(encoding_in_filename_before_filter, decoding_in_filename_before_filter, encoding_in_filename, decoding_in_filename)
    
# Buckle up, here we go
gen_cut_file_jieba(encoding_in_filename, encoding_cut_out_filename, encoding_vocab_filename, START_VOCABULART, appendword)
gen_cut_file_jieba(decoding_in_filename, decoding_cut_out_filename, decoding_vocab_filename, START_VOCABULART, appendword)
gen_cut_file_jieba(test_encoding_in_filename, test_encoding_cut_out_filename, test_encoding_vocab_filename, START_VOCABULART, appendword)
gen_cut_file_jieba(test_decoding_in_filename, test_decoding_cut_out_filename, test_decoding_vocab_filename, START_VOCABULART, appendword)

# Using the new vocabulary, break any "word" in the already-segmented files that cannot be found in the vocab back into individual characters.
data_ap(encoding_cut_out_filename, encoding_vocab_filename, encoding_cut_out_fixed_filename)
data_ap(decoding_cut_out_filename, decoding_vocab_filename, decoding_cut_out_fixed_filename)

# After all the steps above are done, convert the segmentation results (the _cut output) into id vectors according to the vocabulary
data_utils.data_to_token_ids(encoding_cut_out_fixed_filename, encoding_vec_filename, encoding_vocab_filename)
data_utils.data_to_token_ids(decoding_cut_out_fixed_filename, decoding_vec_filename, decoding_vocab_filename)
data_utils.data_to_token_ids(test_encoding_cut_out_filename, test_encoding_vec_filename, test_encoding_vocab_filename)
data_utils.data_to_token_ids(test_decoding_cut_out_filename, test_decoding_vec_filename, test_decoding_vocab_filename)
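data_ap above is not shown either. Going by the comment that precedes its calls, a minimal sketch would re-read the segmented file and split any token that is missing from the vocabulary into individual characters (the function name comes from the example; the body below is an assumption):

def data_ap(cut_file, vocab_file, fixed_file):
    with open(vocab_file, "r", encoding="utf-8") as vf:
        vocab = set(line.strip() for line in vf)
    with open(cut_file, "r", encoding="utf-8") as inf, \
         open(fixed_file, "w", encoding="utf-8") as outf:
        for line in inf:
            out_tokens = []
            for token in line.strip().split():
                if token in vocab:
                    out_tokens.append(token)
                else:
                    out_tokens.extend(list(token))   # fall back to single characters
            outf.write(" ".join(out_tokens) + "\n")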