Example No. 1
def main(_):
    model_path = os.path.join('model', FLAGS.name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))

    arr = converter.text_to_arr(text)
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)
    print(converter.vocab_size)
    model = CharRNN(converter.vocab_size,
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.train(
        g,
        FLAGS.max_steps,
        model_path,
        FLAGS.save_every_n,
        FLAGS.log_every_n,
    )
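The script above reads every hyperparameter from FLAGS, so it needs flag definitions that the snippet does not show. A minimal sketch of such definitions with the TensorFlow 1.x flags API follows; the flag names mirror the FLAGS.* references in the example, while the default values are purely illustrative.

# Hypothetical flag definitions for the training script above (TF 1.x style).
# Names match the FLAGS.* references in Example No. 1; defaults are illustrative.
import tensorflow as tf

FLAGS = tf.flags.FLAGS

tf.flags.DEFINE_string('name', 'default', 'model name, used for the model/ subdirectory')
tf.flags.DEFINE_string('input_file', '', 'utf-8 encoded text file to train on')
tf.flags.DEFINE_integer('max_vocab', 3500, 'maximum vocabulary size')
tf.flags.DEFINE_integer('num_seqs', 100, 'number of sequences per batch')
tf.flags.DEFINE_integer('num_steps', 100, 'number of time steps per sequence')
tf.flags.DEFINE_integer('lstm_size', 128, 'size of the LSTM hidden state')
tf.flags.DEFINE_integer('num_layers', 2, 'number of LSTM layers')
tf.flags.DEFINE_float('learning_rate', 0.001, 'learning rate')
tf.flags.DEFINE_float('train_keep_prob', 0.5, 'dropout keep probability during training')
tf.flags.DEFINE_boolean('use_embedding', False, 'whether to use an embedding layer')
tf.flags.DEFINE_integer('embedding_size', 128, 'size of the embedding layer')
tf.flags.DEFINE_integer('max_steps', 100000, 'maximum number of training steps')
tf.flags.DEFINE_integer('save_every_n', 1000, 'save a checkpoint every n steps')
tf.flags.DEFINE_integer('log_every_n', 10, 'log the training loss every n steps')

if __name__ == '__main__':
    tf.app.run()  # parses the flags and calls main(_)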
Example No. 2
def sample(model: CharRNN, vocab: Vocab) -> str:
    model.eval()

    with torch.no_grad():
        hidden = None

        input = torch.from_numpy(
            np.array([
                convert_idx_to_one_hot(vocab, vocab.char_to_idx(SOS_TOKEN))
            ])).unsqueeze(0)
        input = input.float().to(config.device)

        pred = ''

        for _ in range(8):
            out, hidden = model(input, hidden)

            if out.view(-1).div(0.8).exp().sum() == 0:
                continue

            topi = torch.multinomial(out.view(-1).div(0.8).exp(), 1)[0]

            if topi.item() == vocab.char_to_idx(EOS_TOKEN):
                break

            pred += vocab.idx_to_char(topi.item())

            input = torch.from_numpy(
                np.array([convert_idx_to_one_hot(vocab,
                                                 topi.item())])).unsqueeze(0)
            input = input.float().to(config.device)

        return pred
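The sampling loop above depends on a convert_idx_to_one_hot helper and a Vocab object that the snippet does not show. A minimal sketch of what that helper could look like follows; the vocab.size attribute used here is an assumption about the Vocab interface, not part of the original example.

# Hypothetical helper assumed by the sampling loop above: turn a vocabulary
# index into a one-hot numpy vector. vocab.size is an assumed attribute.
import numpy as np

def convert_idx_to_one_hot(vocab, idx):
    one_hot = np.zeros(vocab.size, dtype=np.float32)
    one_hot[idx] = 1.0
    return one_hot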
Example No. 3
def main(_):
    model_path = os.path.join('model', FLAGS.name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    tokenizer = Tokenizer(text, FLAGS.num_words)
    tokenizer.save_to_file(os.path.join(model_path, 'tokenizer.pkl'))

    arr = tokenizer.texts_to_sequences(text)
    batch = batch_generator(arr, FLAGS.batch_size, FLAGS.num_steps)
    print(tokenizer.vocab_size)
    model = CharRNN(tokenizer.vocab_size,
                    batch_size=FLAGS.batch_size,
                    num_steps=FLAGS.num_steps,
                    n_neurons=FLAGS.n_neurons,
                    n_layers=FLAGS.n_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    embedding=FLAGS.embedding,
                    embedding_size=FLAGS.embedding_size
                    )
    model.train(batch,
                FLAGS.n_iterations,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n,
                )
Example No. 4
def main(_):
    model_path = os.path.join('model', FLAGS.name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))

    arr = converter.text_to_arr(text)
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)
    print(converter.vocab_size)
    model = CharRNN(converter.vocab_size,
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size
                    )
    model.train(g,
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n,
                )
Example No. 5
def main(_):
    script_path = os.path.abspath(os.path.dirname(__file__))
    model_path = os.path.join(script_path, 'model', FLAGS.name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    print("corpus size " + str(len(text)))

    if os.path.exists(FLAGS.whitelist_file):
        with codecs.open(FLAGS.whitelist_file, encoding='utf-8') as f:
            whitelist = f.read()
        text = remove_non_matching_chars(text, whitelist)

    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))

    arr = converter.text_to_arr(text)
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)
    model = CharRNN(converter.vocab_size,
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.train(
        g,
        FLAGS.max_steps,
        model_path,
        FLAGS.save_every_n,
        FLAGS.log_every_n,
    )
Example No. 6
def main(_):
    model_path = os.path.join('model', FLAGS.name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    convert = TextConvert(text, FLAGS.max_vocab)
    convert.save_vocab(os.path.join(model_path, 'text_convert.pkl'))

    arr = convert.text2arr(text)
    g = batch_generate(arr, FLAGS.num_seqs, FLAGS.num_steps)
    
    model = CharRNN(
        convert.vocab_size,
        num_seqs=FLAGS.num_seqs,
        num_steps=FLAGS.num_steps,
        lstm_size=FLAGS.lstm_size,
        num_layers=FLAGS.num_layers,
        learning_rate=FLAGS.learning_rate,
        train_keep_prob=FLAGS.train_keep_prob,
        use_embedding=FLAGS.use_embedding,
        embedding_size=FLAGS.embedding_size    
    )
    model.train(
        g,
        FLAGS.max_steps,
        model_path,
        FLAGS.save_n,
        FLAGS.print_n,
    )
Example No. 7
def train(opt, th):
    ''' Train the model.
    Args:
        opt -- options / hyperparameters
        th -- a TextConverter object
    Returns:
        None
    '''
    # 1. Training data
    data_set = TextDataset(opt.train_data_path, th)
    train_data = DataLoader(data_set,
                            opt.batch_size,
                            shuffle=True,
                            num_workers=opt.num_workers)
    # 2. Initialize the model
    model = CharRNN(th.vocab_size, opt.embed_size, opt.hidden_size,
                    opt.n_layers, opt.dropout_p, opt.bidir)
    if USE_CUDA:
        model = model.cuda(DEVICE_ID)

    # 3. Optimizer setup
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.learning_rate)

    # 4. Training loop
    for e in range(opt.max_epochs):
        epoch_loss = 0
        hidden = None
        for input_seqs, labels in train_data:
            # input_seqs and labels are both [b, seq_len]; the last batch may hold fewer than b sequences
            # prepare input and hidden state
            b = input_seqs.shape[0]
            if hidden is not None:
                hidden = hidden[:, :b, :]
            labels = labels.long().view(-1)
            input_seqs, labels = get_variable(input_seqs), get_variable(labels)

            # forward pass
            probs, hidden = model(input_seqs, hidden)
            probs = probs.view(-1, th.vocab_size)

            # loss and backward pass
            loss = criterion(probs, labels)
            optimizer.zero_grad()
            loss.backward(retain_graph=True)

            # gradient clipping and optimizer step
            nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()

            epoch_loss += loss.item()
        # average cross-entropy over the epoch
        entropy_loss = epoch_loss / len(train_data)
        perplexity = np.exp(entropy_loss)
        info = "epoch: {}, perp: {:.3f}".format(e + 1, perplexity)
        print(info)
        if perplexity <= opt.min_perplexity or e == opt.max_epochs - 1:
            print("best model")
            torch.save(model, opt.model_path)
            break
Example No. 8
def main(_):
    model_path = os.path.join('model', FLAGS.name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    print(
        "---------------------------- Reading Corpus ----------------------------"
    )
    start_time = time.time()
    read_corpus()
    print("Read Corpus Finished in " + str(time.time() - start_time) +
          ' Seconds.')
    converter = TextConverter(corpus, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))

    arr = converter.text_to_arr(corpus)
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)
    model = CharRNN(converter.vocab_size,
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.train(
        g,
        FLAGS.max_steps,
        model_path,
        FLAGS.save_every_n,
        FLAGS.log_every_n,
    )
Example No. 9
def main(_):
    model_path = os.path.join('model', FLAGS.name)  # build the path model/<name>
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()  # read the text
    converter = TextConverter(text,
                              FLAGS.max_vocab)  # build the vocabulary, keeping at most FLAGS.max_vocab words
    converter.save_to_file(os.path.join(model_path,
                                        'converter.pkl'))  # pickle the vocabulary

    data = converter.text_to_data(text)  # convert the text to integer ids (word_to_int)
    g = batch_generator(data, FLAGS.n_seqs, FLAGS.n_steps)  # get the batch generator
    print(converter.vocab_size)
    # Initialize the model parameters
    model = CharRNN(converter.vocab_size,
                    n_seqs=FLAGS.n_seqs,
                    n_steps=FLAGS.n_steps,
                    state_size=FLAGS.state_size,
                    n_layers=FLAGS.n_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.train(g, FLAGS.max_steps, model_path, FLAGS.save_every_n,
                FLAGS.log_every_n)
Example No. 10
def main(_):
    FLAGS.start_string = FLAGS.start_string  #.decode('utf-8')
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path =\
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model = CharRNN(converter.vocab_size,
                    sampling=True,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)

    start_string = FLAGS.start_string
    sys.stdout.write("> ")
    sys.stdout.flush()
    start_string = sys.stdin.readline()
    while start_string:
        start = converter.text_to_arr(start_string)
        arr = model.sample(FLAGS.max_length, start, converter.vocab_size)
        print(converter.arr_to_text(arr))

        sys.stdout.write("> ")
        sys.stdout.flush()
        start_string = sys.stdin.readline()
Example No. 11
def main(_):
    model_path = os.path.join('model', FLAGS.name)  # build the path string
    if not os.path.exists(model_path):  # create the directory if it does not exist
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()  # read the whole file as one string
    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))

    arr = converter.text_to_arr(text)  # encode the text as an integer sequence
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)  # e.g. 100 sequences x 100 steps
    print(converter.vocab_size)
    model = CharRNN(converter.vocab_size,  # build the model; num_classes equals the vocabulary size because the next char is predicted
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size
                    )
    model.train(g,  # train the model
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n,
                )
Example No. 12
def main(_):
    model_path = os.path.join('model', FLAGS.file_type)
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    # Read and Load Corpus for Train and Validation.
    training_corpus, validating_corpus = read_corpus()

    # Build Text Converter
    print(
        "---------------------------- Initializing Text Converter ----------------------------"
    )
    start_time = time.time()
    converter = TextConverter(training_corpus, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))
    print('Initialize Text Converter Finished in %.3f Seconds.\n' %
          (time.time() - start_time))

    # Vectorize Content of Corpus
    vectroize_corpus(converter)

    # Build Char RNN Model
    model = CharRNN(converter.vocab_size,
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    # Train Model
    model.train(FLAGS.max_steps, model_path, FLAGS.validate_every_n_steps,
                FLAGS.log_every_n_steps)
Example No. 13
def main(_):
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = tf.train.latest_checkpoint(
            FLAGS.checkpoint_path)

    model = CharRNN(converter.vocab_size,
                    None,
                    sampling=True,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)

    # start = converter.text_to_arr(FLAGS.seed_for_generating)
    seeds = [
        'var a = fun', 'function a(', 'this.', 'document.', 'window.',
        'var a = document.g', 'var a;', 'jQuery'
    ]
    for seed in seeds:
        start = converter.text_to_arr(seed)
        for i in range(0, FLAGS.num_to_generate):
            print('Generating: ' + seed + ' -> ' + str(i))
            file_name = str(uuid.uuid1())
            file_path = '../../BrowserFuzzingData/generated/' + FLAGS.file_type + '/' + file_name + '.' + FLAGS.file_type
            arr = model.sample(FLAGS.max_length_of_generated, start,
                               converter.vocab_size, converter.word_to_int)
            f = open(file_path, "wb")
            f.write(converter.arr_to_text(arr).encode('utf-8'))
            f.close()
Example No. 14
def main(_):
    model_path = os.path.join('model', FLAGS.name)  # path where the model is saved
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    # codecs.open lets us specify the file encoding; the content is decoded to unicode automatically on read
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()  # read the training text
    converter = TextConverter(text, FLAGS.max_vocab)  # build the text converter
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))

    arr = converter.text_to_arr(text)  # convert the text to an integer array
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)  # batch generator
    print(converter.vocab_size)
    model = CharRNN(converter.vocab_size,  # build the model
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size
                    )
    model.train(g,  # train
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n,
                )
Example No. 15
def main():
    inputs, token_to_idx, idx_to_token = load_dataset(file_name=sys.argv[2])

    #coloredlogs.install(level='DEBUG')
    num_layers = 2
    rnn_type = 'lstm'
    dropout = 0.5
    emb_size = 50
    hidden_size = 256
    learning_rate = 0.001
    n_tokens = len(idx_to_token)

    model = CharRNN(num_layers=num_layers,
                    rnn_type=rnn_type,
                    dropout=dropout,
                    n_tokens=n_tokens,
                    emb_size=emb_size,
                    hidden_size=hidden_size,
                    pad_id=token_to_idx[PAD_TOKEN])
    if torch.cuda.is_available():
        model = model.cuda()

    optimiser = optim.Adam(model.parameters(), lr=learning_rate)

    try:
        model, optimiser, epoch, valid_loss_min = load_ckp(
            checkpoint_fpath=sys.argv[1], model=model, optimiser=optimiser)
        generate_sample(model, token_to_idx, idx_to_token, n_tokens=20)
    except KeyboardInterrupt:
        print('Aborted!')
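Example No. 16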
def main(_):
    model_path = os.path.join("model", FLAGS.name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    with open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    converter = TextConverter(text=text,
                              filename=None,
                              max_vocab=FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))

    arr = converter.text_to_arr(text=text)
    print(converter.vocab_size())
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)
    model = CharRNN(num_class=converter.vocab_size(),
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learn_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.train(
        batch_generate=g,
        max_steps=FLAGS.max_steps,
        save_path=model_path,
        save_per_n=FLAGS.save_per_n,
        print_per_n=FLAGS.print_per_n,
    )
Example No. 17
def load_model(model_filename):
    with open(model_filename, 'rb') as f:
        checkpoint = torch.load(f)

    n_hidden, n_layers, state_dict, chars = checkpoint['n_hidden'], checkpoint['n_layers'], \
                                            checkpoint['state_dict'], checkpoint['chars']

    model = CharRNN(chars=chars, n_hidden=n_hidden, n_layers=n_layers)
    model.load_state_dict(state_dict=state_dict)
    return model
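load_model expects a checkpoint dictionary containing n_hidden, n_layers, state_dict and chars. A plausible counterpart that writes such a file is sketched below; the n_hidden, n_layers and chars attributes on the model are assumptions about the CharRNN class, not code from the original repository.

# Hypothetical save-side counterpart of load_model above.
import torch

def save_model(model, model_filename):
    checkpoint = {
        'n_hidden': model.n_hidden,    # assumed attribute on CharRNN
        'n_layers': model.n_layers,    # assumed attribute on CharRNN
        'state_dict': model.state_dict(),
        'chars': model.chars,          # assumed attribute on CharRNN
    }
    with open(model_filename, 'wb') as f:
        torch.save(checkpoint, f)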
Example No. 18
    def model_built(self):  #,vocab_size,sampling,lstm_size,num_layers,use_embedding,embedding_size):
        FLAGS.start_string = FLAGS.start_string.decode('utf-8')
        self.converter = TextConverter(filename=FLAGS.converter_path)
        if os.path.isdir(FLAGS.checkpoint_path):
            FLAGS.checkpoint_path =\
                tf.train.latest_checkpoint(FLAGS.checkpoint_path)
        self.tfmodel = CharRNN(self.converter.vocab_size, sampling=True,
                               lstm_size=FLAGS.lstm_size, num_layers=FLAGS.num_layers,
                               use_embedding=FLAGS.use_embedding,
                               embedding_size=FLAGS.embedding_size)
        self.tfmodel.load(FLAGS.checkpoint_path)
Example No. 19
def main(args):

    # configure GPU datatype
    use_gpu = torch.cuda.is_available()
    if not use_gpu:
        raise Exception('error: CUDA library unavailable')

    global gpu_dtype
    gpu_dtype = torch.cuda.FloatTensor

    # load train, val, test data
    with open(args.data_dir + '/data.pkl', 'rb') as f:
        url_array, label_array = pickle.load(f)

    # partition dataset (this must sum to <50K)
    num_train = 10000
    num_val = 2000
    num_test = 8000

    data_train = url_array[:, :num_train, :]
    labels_train = label_array[:num_train]
    data_val = url_array[:, num_train:num_train + num_val, :]
    labels_val = label_array[num_train:num_train + num_val]
    data_test = url_array[:, num_train + num_val:num_train + num_val +
                          num_test, :]
    labels_test = label_array[num_train + num_val:num_train + num_val +
                              num_test]

    # initialize model and configure for GPU
    model = CharRNN()
    model = model.type(gpu_dtype)

    # train model on training data, reporting accuracy on held out validation set
    train(model, (data_train, labels_train), (data_val, labels_val),
          args.num_epochs, args.batch_size)

    # convert model to CPU for use on GPU-less AWS instance
    model = model.type(torch.FloatTensor)

    # get test accuracy
    print('Final results on held-out test set: ')
    check_accuracy(model, (data_test, labels_test), use_gpu=False)
    check_accuracy(model, (data_train, labels_train), use_gpu=False)

    # save model to disk for use in prediction
    path = 'models/char_rnn.pk'
    print('Saving model to %s' % path)
    torch.save(model, path)
Example No. 20
def main(_):
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path =\
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model = CharRNN(converter.vocab_size, sampling=True,
                    lstm_size=FLAGS.lstm_size, num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)

    start = converter.text_to_arr(FLAGS.start_string)
    arr = model.sample(FLAGS.max_length, start, converter.vocab_size)
    print(converter.arr_to_text(arr))
Example No. 21
def sample(checkpoint):
    samples = [c for c in prime]
    int_to_vocab, vocab_to_int, no_classes = pickle.load(
        open("./saves/data.p", "rb"))

    # Initialize the model
    model = CharRNN(no_classes=no_classes, sampling=True)
    saver = tf.train.Saver()

    with tf.Session() as sess:

        # Load the checkpoint
        saver.restore(sess, checkpoint)
        new_state = sess.run(model.initial_state)

        # Feed the prime word to the model and predict the next character
        for c in prime:
            x = np.zeros((1, 1))
            x[0, 0] = vocab_to_int[c]
            feed = {model.inputs: x, model.initial_state: new_state}
            preds, new_state = sess.run(
                [model.prediction, model.final_state], feed_dict=feed)
        c = pick_top_n(preds, no_classes)
        samples.append(int_to_vocab[c])

        # Generate new samples
        for i in range(n_samples):
            x[0, 0] = c
            feed = {model.inputs: x, model.initial_state: new_state}
            preds, new_state = sess.run(
                [model.prediction, model.final_state], feed_dict=feed)
            c = pick_top_n(preds, no_classes)
            samples.append(int_to_vocab[c])

    return ''.join(samples)
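This example (and Example No. 26 below) relies on a pick_top_n helper that is not shown. A common implementation of that style of top-n sampling is sketched here; it is an assumption about the helper, not code taken from the original repository.

# Hypothetical top-n sampler: keep only the n most likely characters,
# renormalise, and draw one index at random.
import numpy as np

def pick_top_n(preds, vocab_size, top_n=5):
    p = np.squeeze(preds)
    p[np.argsort(p)[:-top_n]] = 0   # zero out everything but the top_n probabilities
    p = p / np.sum(p)               # renormalise to a valid distribution
    return np.random.choice(vocab_size, 1, p=p)[0]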
Example No. 22
def main(_):
    tc = TextConverter("", -1, byte_file=FLAGS.vocab_path)
    output_size = tc.vocab_size
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    model = CharRNN(output_size=output_size,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    sampling=True)
    model.load(FLAGS.checkpoint_path)
    start = tc.text_to_arr(FLAGS.start_string)
    generate_arr = model.sample(FLAGS.length, start, output_size)
    generate_text = tc.arr_to_text(generate_arr)
    with open(FLAGS.save_path, 'w', encoding='utf-8') as f:
        f.write(generate_text)
    print(generate_text)
Example No. 23
def main():
    parser = argparse.ArgumentParser("Char-RNN on the complete works of Shakespeare")
    parser.add_argument("--test", type=bool, default=False,
            help = "if true, keep only a thousand lines from the Shakespeare corpus")

    args = parser.parse_args()

    seed(1616)
    
    text = extract_shakespeare_data("data/t8.shakespeare.txt")
    char_encoder = CharEncoder(text)
    #get sequences of 100 characters
    sequences = make_sequences(text)
    #vectorize with numeric labeling
    #each character gets mapped to an integer & vice versa
    sequences = char_encoder.label_sequences(sequences)
    if args.test:
        print("Test: downsizing data to 1,000 sequences...")
        sequences = sequences[:1000]

    shuffle(sequences)
    n_training_sequences = int(.9 * len(sequences))
    #split the dataset into training and validation sets
    training = sequences[:n_training_sequences]
    validation = sequences[n_training_sequences:]

    hidden_size = 128 
    rnn = CharRNN(char_encoder.n_chars, hidden_size)
    train(rnn, training, validation, epochs = 4, lr = 0.01, evaluate_per = 2, batch_size = 20)
    
    print(sample(rnn, prime_str = "Macbeth", size = 100, encoder = char_encoder,
            temperature=.9))
Example No. 24
def main(_):
    tokenizer = Tokenizer(vocab_path=FLAGS.tokenizer_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = \
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model = CharRNN(tokenizer.vocab_size, sampling=True,
                    n_neurons=FLAGS.n_neurons, n_layers=FLAGS.n_layers,
                    embedding=FLAGS.embedding,
                    embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)

    start = tokenizer.texts_to_sequences(FLAGS.start_string)
    arr = model.sample(FLAGS.max_length, start, tokenizer.vocab_size)
    print(tokenizer.sequences_to_texts(arr))
Example No. 25
def composePotery():
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path =\
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model = CharRNN(converter.vocab_size, sampling=True,
                    lstm_size=FLAGS.lstm_size, num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.load(FLAGS.checkpoint_path)

    start = []
    arr = model.sample(FLAGS.max_length, start, converter.vocab_size)
    rawText = converter.arr_to_text(arr)
    return selectPoetry(rawText)
Example No. 26
def sample(checkpoint, n_samples, lstm_size, vocab_size, prime="The "):
    samples = [c for c in prime]
    model = CharRNN(vocab_size, lstm_size=lstm_size, sampling=True)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, checkpoint)
        new_state = sess.run(model.initial_state)
        for c in prime:
            x = np.zeros((1, 1))
            x[0, 0] = vocab_to_int[c]
            feed = {
                model.inputs: x,
                model.keep_prob: 1.,
                model.initial_state: new_state
            }
            preds, new_state = sess.run([model.prediction, model.final_state],
                                        feed_dict=feed)

        c = pick_top_n(preds, vocab_size)
        samples.append(int_to_vocab[c])

        for i in range(n_samples):
            x[0, 0] = c
            feed = {
                model.inputs: x,
                model.keep_prob: 1.,
                model.initial_state: new_state
            }
            preds, new_state = sess.run([model.prediction, model.final_state],
                                        feed_dict=feed)
            c = pick_top_n(preds, vocab_size)
            samples.append(int_to_vocab[c])
    return ''.join(samples)
Example No. 27
def main(_):
    FLAGS.start_string = FLAGS.start_string.decode('utf-8')
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path =\
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model = CharRNN(converter.vocab_size, sampling=True,
                    lstm_size=FLAGS.lstm_size, num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)

    start = converter.text_to_arr(FLAGS.start_string)
    arr = model.sample(FLAGS.max_length, start, converter.vocab_size)
    print(converter.arr_to_text(arr))
Example No. 28
def train(model: CharRNN, optimizer: optim.Optimizer, criterion, inputs,
          targets):
    model.train()
    optimizer.zero_grad()

    hidden = None

    total_loss = 0

    for i in range(inputs.shape[0]):
        output, hidden = model(inputs[i].unsqueeze(0).unsqueeze(0).float(),
                               hidden)
        loss = criterion(output.squeeze(0), targets[i].unsqueeze(0).long())
        total_loss += loss

    total_loss.backward()
    optimizer.step()

    return output, total_loss.item() / inputs.shape[0]
Example No. 29
def start_text(start_text, n_words=250):
    # Here we have loaded in a model that trained over 20 epochs `rnn_20_epoch.net`
    with open('saved_model/rnn_20_epoch.net', 'rb') as f:
        checkpoint = torch.load(f)

    loaded = CharRNN(checkpoint['tokens'],
                     n_hidden=checkpoint['n_hidden'],
                     n_layers=checkpoint['n_layers'])
    loaded.load_state_dict(checkpoint['state_dict'])

    generated_text = sample(loaded,
                            n_words,
                            top_k=5,
                            prime='{} '.format(start_text))

    generated_text = generated_text.replace('\n', ' ')
    generated_text = '{}.'.format(generated_text.split('.')[0])

    return generated_text
Example No. 30
def generate():
    tf.compat.v1.disable_eager_execution()
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path =\
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model = CharRNN(converter.vocab_size,
                    sampling=True,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)

    start = converter.text_to_arr(FLAGS.start_string)
    arr = model.sample(FLAGS.max_length, start, converter.vocab_size)

    return converter.arr_to_text(arr)
Example No. 31
def main(_):
    FLAGS.start_string = FLAGS.start_string.decode('utf-8')
    converter = TextConverter(filename=FLAGS.converter_path)  # create the text converter
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = tf.train.latest_checkpoint(
            FLAGS.checkpoint_path)  # find the latest checkpoint

    model = CharRNN(converter.vocab_size,
                    sampling=True,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)  # load the trained model

    start = converter.text_to_arr(FLAGS.start_string)  # convert the input text to ids
    arr = model.sample(FLAGS.max_length, start,
                       converter.vocab_size)  # the output is the generated sequence
    print(converter.arr_to_text(arr))
Example No. 32
    def _make_estimator(self):
        params = tf.contrib.training.HParams(**Config.model.to_dict())
        run_config = tf.contrib.learn.RunConfig(
            model_dir=Config.train.model_dir)

        char_rnn = CharRNN()
        self.estimator = tf.estimator.Estimator(
                model_fn=char_rnn.model_fn,
                model_dir=Config.train.model_dir,
                params=params,
                config=run_config)