Example #1
def main(_):
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = tf.train.latest_checkpoint(
            FLAGS.checkpoint_path)

    model = CharRNN(converter.vocab_size,
                    None,
                    sampling=True,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)

    # start = converter.text_to_arr(FLAGS.seed_for_generating)
    seeds = [
        'var a = fun', 'function a(', 'this.', 'document.', 'window.',
        'var a = document.g', 'var a;', 'jQuery'
    ]
    for seed in seeds:
        start = converter.text_to_arr(seed)
        for i in range(0, FLAGS.num_to_generate):
            print('Generating: ' + seed + ' -> ' + str(i))
            file_name = str(uuid.uuid1())
            file_path = '../../BrowserFuzzingData/generated/' + FLAGS.file_type + '/' + file_name + '.' + FLAGS.file_type
            arr = model.sample(FLAGS.max_length_of_generated, start,
                               converter.vocab_size, converter.word_to_int)
            with open(file_path, "wb") as f:
                f.write(converter.arr_to_text(arr).encode('utf-8'))
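
These main(_) functions are driven by TensorFlow 1.x command-line flags and started via tf.app.run(). A minimal sketch of the wiring Example #1 assumes is shown below; the flag names mirror the ones used above, but the defaults are only illustrative assumptions, not the original values.

import tensorflow as tf

FLAGS = tf.flags.FLAGS
tf.flags.DEFINE_string('converter_path', 'model/converter.pkl', 'path of the saved TextConverter')
tf.flags.DEFINE_string('checkpoint_path', 'model/', 'checkpoint file or directory')
tf.flags.DEFINE_integer('lstm_size', 128, 'hidden size of the LSTM cells')
tf.flags.DEFINE_integer('num_layers', 2, 'number of LSTM layers')
tf.flags.DEFINE_boolean('use_embedding', False, 'whether to use an embedding layer')
tf.flags.DEFINE_integer('embedding_size', 128, 'size of the embedding layer')
tf.flags.DEFINE_integer('num_to_generate', 10, 'how many samples to generate per seed')
tf.flags.DEFINE_integer('max_length_of_generated', 1000, 'maximum length of a generated sample')
tf.flags.DEFINE_string('file_type', 'js', 'extension of the generated files')

if __name__ == '__main__':
    tf.app.run()  # parses the flags and calls main(_)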
Example #2
def main(_):
    FLAGS.start_string = FLAGS.start_string  #.decode('utf-8')
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path =\
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model = CharRNN(converter.vocab_size,
                    sampling=True,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)

    start_string = FLAGS.start_string
    sys.stdout.write("> ")
    sys.stdout.flush()
    start_string = sys.stdin.readline()
    while start_string:
        start = converter.text_to_arr(start_string)
        arr = model.sample(FLAGS.max_length, start, converter.vocab_size)
        print(converter.arr_to_text(arr))

        sys.stdout.write("> ")
        sys.stdout.flush()
        start_string = sys.stdin.readline()  # read the next seed; EOF ends the loop
Example #3
def main(_):
    model_path = os.path.join('model', FLAGS.file_type)
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    # Read and Load Corpus for Train and Validation.
    training_corpus, validating_corpus = read_corpus()

    # Build Text Converter
    print(
        "---------------------------- Initializing Text Converter ----------------------------"
    )
    start_time = time.time()
    converter = TextConverter(training_corpus, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))
    print('Initialize Text Converter Finished in %.3f Seconds.\n' %
          (time.time() - start_time))

    # Vectorize Content of Corpus
    vectroize_corpus(converter)

    # Build Char RNN Model
    model = CharRNN(converter.vocab_size,
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    # Train Model
    model.train(FLAGS.max_steps, model_path, FLAGS.validate_every_n_steps,
                FLAGS.log_every_n_steps)
Example #4
def main(_):
    model_path = os.path.join('model', FLAGS.name)  # path for saving the model
    if os.path.exists(model_path) is False:
        os.makedirs(model_path)
    # use codecs.open to specify the file encoding; the content is decoded
    # to unicode automatically on read
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()  # read the training text
    converter = TextConverter(text, FLAGS.max_vocab)  # build a converter for the text
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))

    arr = converter.text_to_arr(text)  # convert the text to an array of ids
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)  # batch generator
    print(converter.vocab_size)
    model = CharRNN(converter.vocab_size,  # build the model
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size
                    )
    model.train(g,  # train the model
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n,
                )
Example #5
def main(_):
    model_path = os.path.join('model', FLAGS.name)
    #print(model_path)
    if os.path.exists(model_path) is False:
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))

    arr = converter.text_to_arr(text)
    g = batch_generator(arr, FLAGS.num_seq, FLAGS.num_step)
    print(converter.vocab_size)
    model = CharModel(
        converter.vocab_size,
        num_seq=FLAGS.num_seq,
        num_step=FLAGS.num_step,
        lstm_size=FLAGS.lstm_size,
        num_layers=FLAGS.num_layers,
        #learning_rate=FLAGS.learning_rate,
        train_keep_prob=FLAGS.train_keep_prob,
        #use_embedding=FLAGS.use_embedding,
        embedding_size=FLAGS.embedding_size,
        is_Training=True)
    #model.add_placeholder()
    #model.build_lstm()
    #model.build_loss()
    #model.build_optimizer()
    model.train(g, FLAGS.max_steps, model_path)
Example #6
def main(_):
    model_path = os.path.join('model', FLAGS.name)
    if os.path.exists(model_path) is False:
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))

    arr = converter.text_to_arr(text)
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)
    print(converter.vocab_size)
    model = CharRNN(converter.vocab_size,
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.train(
        g,
        FLAGS.max_steps,
        model_path,
        FLAGS.save_every_n,
        FLAGS.log_every_n,
    )
Example #7
def main(_):
    model_path = os.path.join('model', FLAGS.name)  # build the model path string
    if os.path.exists(model_path) is False:  # create the directory if it does not exist
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()  # read the whole file as one string
    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))

    arr = converter.text_to_arr(text)  # serialize the text into an id array
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)  # 100, 100
    print(converter.vocab_size)
    model = CharRNN(converter.vocab_size,  # build the model; num_classes equals the vocabulary size because the next char is predicted
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size
                    )
    model.train(g,  # train the model
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n,
                )
Example #8
def main(_):

    model_path = os.path.join('models', Config.file_name)

    converter = TextConverter(vocab_dir='data/vocabs',
                              max_vocab=Config.vocab_size,
                              seq_length=Config.seq_length)
    print('vocab lens:', converter.vocab_size)

    # load the last saved model
    model = Model(Config)
    checkpoint_path = tf.train.latest_checkpoint(model_path)
    if checkpoint_path:
        model.load(checkpoint_path)

    while True:

        english_speek = input("上联:")  # prompt for the first line of the couplet
        english_speek = ' '.join(english_speek)
        english_speek = english_speek.split()
        en_arr, arr_len = converter.text_en_to_arr(english_speek)

        test_g = [np.array([
            en_arr,
        ]), np.array([
            arr_len,
        ])]
        output_ids = model.test(test_g, model_path, converter)
        strs = converter.arr_to_text(output_ids)
        print('下联:', strs)  # print the generated second line of the couplet
Example #9
def main(_):
    model_path = os.path.join('model', FLAGS.name)
    if os.path.exists(model_path) is False:
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))

    arr = converter.text_to_arr(text)
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)
    print(converter.vocab_size)
    model = CharRNN(converter.vocab_size,
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size
                    )
    model.train(g,
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n,
                )
Example #10
def main(_):
    script_path = os.path.abspath(os.path.dirname(__file__))
    model_path = os.path.join(script_path, 'model', FLAGS.name)
    if os.path.exists(model_path) is False:
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    print("corpus size " + str(len(text)))

    if os.path.exists(FLAGS.whitelist_file):
        with codecs.open(FLAGS.whitelist_file, encoding='utf-8') as f:
            whitelist = f.read()
        text = remove_non_matching_chars(text, whitelist)

    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))

    arr = converter.text_to_arr(text)
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)
    model = CharRNN(converter.vocab_size,
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.train(
        g,
        FLAGS.max_steps,
        model_path,
        FLAGS.save_every_n,
        FLAGS.log_every_n,
    )
Example #11
def main(_):

    model_path = os.path.join('models', FLAGS.file_name)
    if os.path.isdir(model_path):
        FLAGS.checkpoint_path = tf.train.latest_checkpoint(model_path)

    converter = TextConverter(
        filename=os.path.join(model_path, 'converter.pkl'))
    # QAs = converter.load_obj(filename=os.path.join(model_path, 'QAs.pkl'))
    QAs, text = load_origin_data('data/task3_dev.txt')
    testQAs_to_arrs = converter.testQAs_to_arrs(QAs, FLAGS.num_steps)

    test_samples = testQAs_to_arrs

    print('use embedding:', FLAGS.use_embedding)
    print('vocab size:', converter.vocab_size)

    from model3 import Model

    with open(model_path + '/submission.csv', 'w') as file:
        file.write('test_id,result' + '\n')
    batchsize = 1000
    for i in range(0, len(test_samples), batchsize):  # test in batches to avoid running out of memory
        print('>>>>:', i, '/', len(test_samples))
        test_g = test_samples_generator(test_samples[i:i + batchsize])

        model = Model(converter.vocab_size, FLAGS, test=False, embeddings=None)

        model.load(FLAGS.checkpoint_path)

        model.test(test_g, model_path)
    print('finished!')
Example #12
def main(_):
    model_path = os.path.join('models', Config.file_name)

    vocab_file = os.path.join(model_path, 'vocab_tuples.pkl')

    # load the test questions
    sens_tags_test = get_sens_tags('data/test.txt')

    # data processing
    converter = TextConverter(None,
                              vocab_file,
                              max_vocab=Config.vocab_max_size)
    print('vocab size:', converter.vocab_size)

    # build the test samples
    test_QA_arrs = converter.QAs_to_arr(sens_tags_test, Config.seq_length)

    # load the last saved model
    model = Model(Config, converter.vocab_size)
    checkpoint_path = tf.train.latest_checkpoint(model_path)
    if checkpoint_path:
        model.load(checkpoint_path)

    # test
    print('start testing...')
    n = len(test_QA_arrs)
    for i in range(n):
        y_pre, y_cos = model.test(test_QA_arrs[i])
        tags = [converter.int_to_tag(id) for id in y_pre[:test_QA_arrs[i][2]]]
        print('\nword / tag / pre')
        for j in range(test_QA_arrs[i][2]):
            print("{} / {} / {}".format(sens_tags_test[i][0][j],
                                        sens_tags_test[i][1][j], tags[j]))
Example #13
def main(_):
    word_char = 'word'  # 'word' or 'char'
    print('use word or char:',word_char)

    FLAGS.file_name = word_char+'_'+FLAGS.file_name
    print('model_path:',FLAGS.file_name)

    model_path = os.path.join('models', FLAGS.file_name)
    if os.path.exists(model_path) is False:
        os.makedirs(model_path)

    if FLAGS.file_name[-1] == '2':
        from model2 import Model
    elif FLAGS.file_name[-1] == '3':
        from model3 import Model
    elif FLAGS.file_name[-1] == '4':
        from model4 import Model
    elif FLAGS.file_name[-1] == '5':
        from model5 import Model
    else:
        from model1 import Model

    data_path,save_path = 'data','process_data1'

    converter = TextConverter(word_char, data_path, save_path,  FLAGS.num_steps)
    embeddings = converter.embeddings

    if word_char == 'word':
        train_pkl = 'train_word.pkl'
        val_pkl = 'val_word.pkl'
    if word_char == 'char':
        train_pkl = 'train_char.pkl'
        val_pkl = 'val_char.pkl'

    train_samples = converter.load_obj(os.path.join(save_path, train_pkl))
    train_g = batch_generator(train_samples, FLAGS.batch_size)

    val_samples = converter.load_obj(os.path.join(save_path, val_pkl))
    val_g = val_samples_generator(val_samples)


    print('use embedding:', FLAGS.use_embedding)
    print('vocab size:', converter.vocab_size)


    model = Model(converter.vocab_size,FLAGS,test=False, embeddings=embeddings)

    # resume training from the last checkpoint
    FLAGS.checkpoint_path = tf.train.latest_checkpoint(model_path)
    if FLAGS.checkpoint_path:
        model.load(FLAGS.checkpoint_path)

    model.train(train_g,
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n,
                val_g
                )
Example #14
 def test_save_file(self):
     testConverter = TextConverter(text=[
         "We", "are", "accounted", "poor", "citizens,", "the", "patricians",
         "goodare", "accounted", "poor", "citizens,", "the", "patricians",
         "good"
     ],
                                   max_vocab=10)
     testConverter.save_to_file('test.pcl')
Example #15
def main(_):
    converter = TextConverter(filename=FLAGS.converter_path)

    model = charRNN(converter.vocab_size, train=False)
    model.load(tf.train.latest_checkpoint(FLAGS.checkpoint_path))

    start = converter.text_to_arr(FLAGS.start_string)
    arr = model.generate(FLAGS.max_length, start, converter.vocab_size)
    print(converter.arr_to_text(arr))
Example #16
 def test_batch_generator(self):
     with codecs.open('data/shakespeare.txt', encoding='utf-8') as f:
         text = f.read()
     converter = TextConverter(text, 35000)
     arr = converter.text_to_arr(text)
     g = batch_generator(arr, 32, 50)
     count = 0
     for x, y in g:
         count += 1
         print(count)
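
The test above only consumes (x, y) pairs; batch_generator itself is not defined in this listing. Below is a minimal sketch under the usual char-RNN convention that the target y is the input x shifted left by one position, with the first element wrapped to the end; the real implementation may differ, for example in shuffling or epoch handling.

import numpy as np


def batch_generator(arr, num_seqs, num_steps):
    # Yield (x, y) batches of shape (num_seqs, num_steps); the generator is
    # endless, the training loop decides how many batches to consume.
    arr = np.asarray(arr)
    batch_size = num_seqs * num_steps
    n_batches = len(arr) // batch_size
    arr = arr[:batch_size * n_batches].reshape((num_seqs, -1))
    while True:
        np.random.shuffle(arr)  # shuffle the rows (sequences)
        for n in range(0, arr.shape[1], num_steps):
            x = arr[:, n:n + num_steps]
            y = np.zeros_like(x)
            y[:, :-1], y[:, -1] = x[:, 1:], x[:, 0]  # next-token targets
            yield x, y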
Example #17
 def model_built(self):#,vocab_size,sampling,lstm_size,num_layers,use_embedding,embedding_size):
     FLAGS.start_string = FLAGS.start_string.decode('utf-8')
     self.converter = TextConverter(filename=FLAGS.converter_path)
     if os.path.isdir(FLAGS.checkpoint_path):
         FLAGS.checkpoint_path =\
             tf.train.latest_checkpoint(FLAGS.checkpoint_path)
     self.tfmodel = CharRNN(self.converter.vocab_size, sampling=True,
                 lstm_size=FLAGS.lstm_size, num_layers=FLAGS.num_layers,
                 use_embedding=FLAGS.use_embedding,
                 embedding_size=FLAGS.embedding_size)
     self.tfmodel.load(FLAGS.checkpoint_path)
Example #18
 def test_vocab_size(self):
     testConverter = TextConverter(text=[
         "We", "are", "accounted", "poor", "citizens,", "the", "patricians",
         "goodare", "accounted", "poor", "citizens,", "the", "patricians",
         "good"
     ],
                                   max_vocab=10)
     print(testConverter.vocab_size)
     print(testConverter.int_to_word(4))
     print(testConverter.text_to_arr(['the']))
     print(testConverter.arr_to_text([3, 4]))
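
The tests above exercise TextConverter only through its public interface (vocab_size, int_to_word, text_to_arr, arr_to_text, save_to_file). A minimal sketch of a class with that interface follows, assuming the conventional char-RNN design in which the vocabulary keeps the max_vocab most frequent tokens and reserves one extra index for unknown tokens; the original class may differ in details.

import pickle
from collections import Counter

import numpy as np


class TextConverter(object):
    def __init__(self, text=None, max_vocab=5000, filename=None):
        if filename is not None:
            with open(filename, 'rb') as f:
                self.vocab = pickle.load(f)  # restore a previously saved vocabulary
        else:
            counts = Counter(text)  # token frequencies (characters or words)
            self.vocab = [w for w, _ in counts.most_common(max_vocab)]
        self.word_to_int_table = {w: i for i, w in enumerate(self.vocab)}
        self.int_to_word_table = dict(enumerate(self.vocab))

    @property
    def vocab_size(self):
        return len(self.vocab) + 1  # +1 for the unknown-token index

    def word_to_int(self, word):
        return self.word_to_int_table.get(word, len(self.vocab))

    def int_to_word(self, index):
        if index == len(self.vocab):
            return '<unk>'
        return self.int_to_word_table[index]

    def text_to_arr(self, text):
        return np.array([self.word_to_int(w) for w in text])

    def arr_to_text(self, arr):
        return ''.join(self.int_to_word(i) for i in arr)

    def save_to_file(self, filename):
        with open(filename, 'wb') as f:
            pickle.dump(self.vocab, f)  # only the vocabulary is persisted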
Example #19
def main(_):
    # # FLAGS.start_string = FLAGS.start_string#.decode('utf-8')
    word_char = 'word'  # 'word' or 'char'
    print('use word or char:', word_char)

    FLAGS.file_name = word_char + '_' + FLAGS.file_name
    print('model_path:', FLAGS.file_name)

    model_path = os.path.join('models', FLAGS.file_name)
    if os.path.isdir(model_path):
        FLAGS.checkpoint_path = tf.train.latest_checkpoint(model_path)

    if FLAGS.file_name[-1] == '2':
        from model2 import Model
    elif FLAGS.file_name[-1] == '3':
        from model3 import Model
    elif FLAGS.file_name[-1] == '4':
        from model4 import Model
    elif FLAGS.file_name[-1] == '5':
        from model5 import Model
    else:
        from model1 import Model

    data_path, save_path = 'data', 'process_data1'

    converter = TextConverter(word_char, data_path, save_path, FLAGS.num_steps)
    embeddings = converter.embeddings

    if word_char == 'word':
        test_pkl = 'test_word.pkl'
    if word_char == 'char':
        test_pkl = 'test_char.pkl'

    test_samples = converter.load_obj(os.path.join(save_path, test_pkl))

    print('use embedding:', FLAGS.use_embedding)
    print('vocab size:', converter.vocab_size)

    with open(model_path + '/submission.csv', 'w') as file:
        file.write(str('y_pre') + '\n')
    for i in range(0, len(test_samples), 5000):  # test in batches to avoid running out of memory
        print('>>>>:', i, '/', len(test_samples))
        test_g = test_samples_generator(test_samples[i:i + 5000])

        model = Model(converter.vocab_size,
                      FLAGS,
                      test=False,
                      embeddings=embeddings)

        model.load(FLAGS.checkpoint_path)

        model.test(test_g, model_path)
    print('finished!')
Example #20
def main(_):
    model_path = os.path.join('model', 'en')
    if os.path.exists(model_path) is False:
        os.makedirs(model_path)
    with open("data/shakespeare.txt") as f:
        text = f.read()
    print("=====>", len(text))
    converter = TextConverter(text)
    converter.save(os.path.join(model_path, "converter.pkl"))

    arr = converter.text_to_arr(text)
    g = batch_generator(arr, batch_size, seq_len, converter=None)

    model = charRNN(converter.vocab_size)
    
    model.train(g, model_path)
Example #21
def main(_):
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path =\
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model = CharRNN(converter.vocab_size, sampling=True,
                    lstm_size=FLAGS.lstm_size, num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)

    start = converter.text_to_arr(FLAGS.start_string)
    arr = model.sample(FLAGS.max_length, start, converter.vocab_size)
    print(converter.arr_to_text(arr))
Example #22
def sample():

    with tf.Session() as sess:
        model_path = os.path.join(FLAGS.train_dir, FLAGS.model_name)
        converter = TextConverter(None, FLAGS.max_vocab_size,
                                  os.path.join(model_path, 'converter.pkl'))
        model = create_model(sess, converter.vocab_size, True, model_path)

        sys.stdout.write("> ")
        sys.stdout.flush()
        start_str = sys.stdin.readline().decode('utf-8')
        while start_str:
            start = converter.text_to_arr(start_str)

            samples = [c for c in start]
            initial_state = sess.run(model.initial_state)
            x = np.zeros((1, 1))
            for c in start:
                x[0, 0] = c
                feed = {model.inputs: x, model.initial_state: initial_state}
                preds, final_state = sess.run(
                    [model.proba_prediction, model.final_state],
                    feed_dict=feed)
                initial_state = final_state

            c = pick_top_n(preds, converter.vocab_size)
            while c == converter.vocab_size - 1:
                c = pick_top_n(preds, converter.vocab_size)
            samples.append(c)

            for i in range(FLAGS.sample_length):
                x[0, 0] = c
                feed = {model.inputs: x, model.initial_state: initial_state}
                preds, final_state = sess.run(
                    [model.proba_prediction, model.final_state],
                    feed_dict=feed)
                initial_state = final_state
                c = pick_top_n(preds, converter.vocab_size)
                while c == converter.vocab_size - 1:
                    c = pick_top_n(preds, converter.vocab_size)
                samples.append(c)

            print(converter.arr_to_text(np.array(samples)))

            sys.stdout.write("> ")
            sys.stdout.flush()
            start_str = sys.stdin.readline().decode('utf-8')
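
pick_top_n is called throughout sample() but is not defined in this listing. A minimal sketch under the common assumption that it samples one index from the top_n most probable entries of the prediction vector; the original helper may use a different top_n or tie-breaking.

import numpy as np


def pick_top_n(preds, vocab_size, top_n=5):
    # Sample a single index from the top_n highest-probability entries of preds.
    p = np.squeeze(preds).copy()   # copy so the caller's preds are not modified
    p[np.argsort(p)[:-top_n]] = 0  # zero out everything outside the top_n
    p = p / np.sum(p)              # renormalize to a probability distribution
    return int(np.random.choice(vocab_size, 1, p=p)[0])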
Example #23
def main(_):
    FLAGS.start_string = FLAGS.start_string.decode('utf-8')
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path =\
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model = CharRNN(converter.vocab_size, sampling=True,
                    lstm_size=FLAGS.lstm_size, num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)

    start = converter.text_to_arr(FLAGS.start_string)
    arr = model.sample(FLAGS.max_length, start, converter.vocab_size)
    print(converter.arr_to_text(arr))
Example #24
def main(_):
    model_path = os.path.join('model', FLAGS.name)
    print(model_path)
    if os.path.exists(model_path) is False:
        os.makedirs(model_path)
        path_exist = False
    else:
        path_exist = True
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))

    arr = converter.text_to_arr(text)
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)
    print(converter.vocab_size)
    model = CharRNN(converter.vocab_size,
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size
                    )
    model_file_path = tf.train.latest_checkpoint(model_path)
    if path_exist:
        model.load(model_file_path)
        indexes = []
        for dirpath, dirnames, filenames in os.walk(model_path):
            for name in filenames:
                filepath = os.path.join(dirpath, name)
                if filepath.endswith(".index"):
                    indexes.append(int(name[6:-6]))
        indexes.sort()
        last_index = indexes[-1]
        model.step = last_index

    model.train(g,
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n,
                )
Example #25
def generate():
    tf.compat.v1.disable_eager_execution()
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path =\
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model = CharRNN(converter.vocab_size,
                    sampling=True,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)

    start = converter.text_to_arr(FLAGS.start_string)
    arr = model.sample(FLAGS.max_length, start, converter.vocab_size)

    return converter.arr_to_text(arr)
Example #26
def main(_):
    FLAGS.start_string = FLAGS.start_string.decode('utf-8')
    converter = TextConverter(filename=FLAGS.converter_path)  # create the text converter
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = tf.train.latest_checkpoint(
            FLAGS.checkpoint_path)  # use the latest checkpoint

    model = CharRNN(converter.vocab_size,
                    sampling=True,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)  # load the model

    start = converter.text_to_arr(FLAGS.start_string)  # convert the input text to ids
    arr = model.sample(FLAGS.max_length, start,
                       converter.vocab_size)  # the output is the generated sequence
    print(converter.arr_to_text(arr))
Example #27
def main(_):
    model_path = os.path.join('models', FLAGS.file_name)
    if os.path.exists(model_path) is False:
        os.makedirs(model_path)

    if not (os.path.exists(os.path.join(model_path, 'converter.pkl'))
            and os.path.exists(os.path.join(model_path, 'QAs.pkl'))):
        print('Vocabulary files not found, creating...')
        QAs, text = load_origin_data('data/task3_train.txt')
        converter = TextConverter(text, 5000)
        converter.save_to_file(converter.vocab,
                               os.path.join(model_path, 'converter.pkl'))
        converter.save_to_file(QAs, os.path.join(model_path, 'QAs.pkl'))
    else:
        converter = TextConverter(
            filename=os.path.join(model_path, 'converter.pkl'))
        QAs = converter.load_obj(filename=os.path.join(model_path, 'QAs.pkl'))

    QA_arrs = converter.QAs_to_arrs(QAs, FLAGS.num_steps)

    thres = int(len(QA_arrs) * 0.9)
    train_samples = QA_arrs[:thres]
    val_samples = QA_arrs[thres:]

    train_g = batch_generator(train_samples, FLAGS.batch_size)
    val_g = val_samples_generator(val_samples)

    print('use embedding:', FLAGS.use_embedding)
    print('vocab size:', converter.vocab_size)

    from model3 import Model
    model = Model(converter.vocab_size, FLAGS, test=False, embeddings=None)

    # resume training from the last checkpoint
    FLAGS.checkpoint_path = tf.train.latest_checkpoint(model_path)
    if FLAGS.checkpoint_path:
        model.load(FLAGS.checkpoint_path)

    model.train(train_g, FLAGS.max_steps, model_path, FLAGS.save_every_n,
                FLAGS.log_every_n, val_g)
Example #28
def main(_):

    model_path = os.path.join('models', Config.file_name)

    et = TextConverter(text=None, save_dir='models/en_vocab.pkl',
                       max_vocab=Config.en_vocab_size, seq_length=Config.seq_length)
    zt = TextConverter(text=None, save_dir='models/zh_vocab.pkl',
                       max_vocab=Config.zh_vocab_size, seq_length=Config.seq_length + 1)
    # +1 because the decoder sequence is split into input=[:-1] and label=[1:]
    print('english vocab lens:', et.vocab_size)
    print('chinese vocab lens:', zt.vocab_size)


    # load the last saved model
    model = Model(Config)
    checkpoint_path = tf.train.latest_checkpoint(model_path)
    if checkpoint_path:
        model.load(checkpoint_path)

    while True:
        # english_speek = 'what can i help you ?'
        # print('english:', english_speek)
        english_speek = input("english:")

        english_speek = english_speek.split()
        en_arr, arr_len = et.text_to_arr(english_speek)

        test_g = [np.array([en_arr,]), np.array([arr_len,])]
        output_ids = model.test(test_g, model_path, zt)
        strs = zt.arr_to_text(output_ids)
        print('chinese:',strs)
Example #29
def main(_):
    model_path = os.path.join('model', Config.file_name)
    if os.path.exists(model_path) is False:
        os.makedirs(model_path)

    et = TextConverter(text=None,
                       save_dir='model/en_vocab.pkl',
                       max_vocab=Config.en_vocab_size,
                       seq_length=Config.seq_length)
    zt = TextConverter(text=None,
                       save_dir='model/zh_vocab.pkl',
                       max_vocab=Config.zh_vocab_size,
                       seq_length=Config.seq_length + 1)
    # +1 because the decoder sequence is split into input=[:-1] and label=[1:]
    print('english vocab lens:', et.vocab_size)
    print('chinese vocab lens:', zt.vocab_size)

    en_arrs = et.get_en_arrs('data/train.tags.data.en_clear')
    zh_arrs = zt.get_en_arrs('data/train.tags.data.zh_clear')

    train_g = batch_generator(en_arrs, zh_arrs, Config.batch_size)

    # load the last saved model
    model = Model(Config)
    checkpoint_path = tf.train.latest_checkpoint(model_path)
    if checkpoint_path:
        model.load(checkpoint_path)

    print('start training...')
    model.train(train_g, model_path)
Example #30
def main(_):
    model_path = os.path.join('models', Config.file_name)

    input_file = 'data/去除2和null.xlsx'
    vocab_file = os.path.join(model_path, 'vocab_label.pkl')

    # data processing
    converter = TextConverter(None,
                              vocab_file,
                              max_vocab=Config.vocab_max_size,
                              seq_length=Config.seq_length)
    print('vocab size:', converter.vocab_size)

    # load the last saved model
    model = Model(Config, converter.vocab_size)
    checkpoint_path = tf.train.latest_checkpoint(model_path)
    if checkpoint_path:
        model.load(checkpoint_path)

    # load the test library data
    # test_libs = get_excel_libs('data/tianlong_libs.xlsx')  # use the whole library (30k+ entries)
    QAs = get_excel_QAs(input_file)
    thres = int(0.8 * len(QAs))
    test_QAs = QAs[thres:]
    test_libs = [r for q, r, y in test_QAs]  # use the QAs

    test_libs_arrs = converter.libs_to_arrs(test_libs)

    # build the vectors for the matching library
    save_file = checkpoint_path + '_matul_state_QAs.pkl'
    if os.path.exists(save_file) is False:
        response_matul_state = model.test_to_matul(test_libs_arrs)
        with open(save_file, 'wb') as f:
            pickle.dump(response_matul_state, f)
    else:
        with open(save_file, 'rb') as f:
            response_matul_state = pickle.load(f)

    # test
    print('start testing...')
    QAY = []
    k, n = 0, 0
    for query, y_response, label in test_QAs:
        input_arr, input_len = converter.text_to_arr(query)
        indexs = model.test(input_arr, input_len, response_matul_state)
        responses = converter.index_to_response(indexs, test_libs)

        QAY.append((query, y_response, responses))
        if responses[0] == y_response:
            k += 1
            print(k, '/', n)
        n += 1
    print('accuracy:', k / float(n))
    result_xls = checkpoint_path + '_Q_for_QAs.xls'
    converter.save_to_excel(QAY, result_xls)
Example #31
class Dianpin(Singleton):
    def __init__(self):
        self.text = ''
        self.tfmodel = None
        self.converter = None

    def model_built(self):#,vocab_size,sampling,lstm_size,num_layers,use_embedding,embedding_size):
        FLAGS.start_string = FLAGS.start_string.decode('utf-8')
        self.converter = TextConverter(filename=FLAGS.converter_path)
        if os.path.isdir(FLAGS.checkpoint_path):
            FLAGS.checkpoint_path =\
                tf.train.latest_checkpoint(FLAGS.checkpoint_path)
        self.tfmodel = CharRNN(self.converter.vocab_size, sampling=True,
                    lstm_size=FLAGS.lstm_size, num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
        self.tfmodel.load(FLAGS.checkpoint_path)
        
    def final_predict(self):
        start = self.converter.text_to_arr(FLAGS.start_string)
        arr = self.tfmodel.sample(FLAGS.max_length, start, self.converter.vocab_size)
        return self.converter.arr_to_text(arr)
Example #32
def main(_):

    model_path = os.path.join('models', Config.file_name)
    if os.path.exists(model_path) is False:
        os.makedirs(model_path)

    converter = TextConverter(vocab_dir='data/vocabs',
                              max_vocab=Config.vocab_size,
                              seq_length=Config.seq_length)
    print('vocab lens:', converter.vocab_size)

    en_arrs = converter.get_en_arrs('data/train/in.txt')
    de_arrs = converter.get_de_arrs('data/train/out.txt')

    train_g = batch_generator(en_arrs, de_arrs, Config.batch_size)

    # load the last saved model
    model = Model(Config)
    checkpoint_path = tf.train.latest_checkpoint(model_path)
    if checkpoint_path:
        model.load(checkpoint_path)

    print('start training...')
    model.train(train_g, model_path)