Example #1
def markov_chain_text():
    if request.method == "GET":
        return render_template("index.html", markov_chain_texts='文章を生成します。')  # i.e. "Generates text."
    elif request.method == "POST":
        keyword = request.form['keyword']
        generator = TextGenerator()
        markov_chain_texts = generator.generate(keyword)
        return render_template("index.html", keyword=keyword, markov_chain_texts=markov_chain_texts)
Example #2
    def __init__(self, author, path='', sequence_length=50):
        path = Path(path)
        with open(path / author, encoding='utf-8') as f:
            text = f.read()
        self.generator = TextGenerator(text)

        self.generator.load_model((path / author).name)
        self.author = author
Example #3
    def __init__(self, credentials, stalked_account):
        self.api = TwitterAPI(credentials['api_key'],
                              credentials['api_key_secret'],
                              credentials['access_token'],
                              credentials['access_token_secret'])

        self.text_generator = TextGenerator()
        self.stalked_account = stalked_account
        self.stalked_account_id = self.__get_stalked_account_id()
Example #4
def main():
    random.seed(SEED)
    np.random.seed(SEED)
    stringGenerator = TextGenerator('../corpus/index2word.pickle',
                                    '../corpus/word2index.pickle',
                                    '../corpus/all.code')

    assert START_TOKEN == 0

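    # Data loaders: one feeds generator pre-training batches, the other is used for oracle likelihood evaluation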
    gen_data_loader = Gen_Data_loader(BATCH_SIZE)
    likelihood_data_loader = Likelihood_data_loader(BATCH_SIZE)
    #vocab_size = 5000
    vocab_size = len(stringGenerator.index2Word)

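    # Build the trainable generator; re-initialize the target (oracle) LSTM's embedding and output layers to match the corpus vocabulary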
    generator = get_trainable_model(vocab_size)
    target_params = cPickle.load(open('save/target_params.pkl', 'rb'))
    target_params[0] = np.random.rand(vocab_size, 32).astype(np.float32)
    target_params[-2] = np.random.rand(32, vocab_size).astype(np.float32)
    target_params[-1] = np.random.rand(vocab_size).astype(np.float32)
    target_lstm = TARGET_LSTM(vocab_size, 64, 32, 32, 20, 0, target_params)

    config = tf.ConfigProto()
    # config.gpu_options.per_process_gpu_memory_fraction = 0.5
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.initialize_all_variables())

    #generate_samples(sess, target_lstm, 64, 10000, positive_file)
    stringGenerator.saveSamplesToFile(20, 10000, positive_file)
    gen_data_loader.create_batches(positive_file)

    log = open('log/experiment-log.txt', 'w')
    #  pre-train generator
    print 'Start pre-training...'
    log.write('pre-training...\n')
    for epoch in xrange(PRE_EPOCH_NUM):
        print 'pre-train epoch:', epoch
        loss = pre_train_epoch(sess, generator, gen_data_loader)
        if epoch % 5 == 0:
            #generate_samples(sess, generator, BATCH_SIZE, generated_num, eval_file)
            stringGenerator.saveSamplesToFile(20, generated_num, eval_file)
            likelihood_data_loader.create_batches(eval_file)
            test_loss = target_loss(sess, target_lstm, likelihood_data_loader)
            print 'pre-train epoch ', epoch, 'test_loss ', test_loss
            buffer = str(epoch) + ' ' + str(test_loss) + '\n'
            log.write(buffer)

    #generate_samples(sess, generator, BATCH_SIZE, generated_num, eval_file)
    stringGenerator.saveSamplesToFile(20, generated_num, eval_file)
    likelihood_data_loader.create_batches(eval_file)
    test_loss = target_loss(sess, target_lstm, likelihood_data_loader)
    buffer = 'After supervised-training:' + ' ' + str(test_loss) + '\n'
    log.write(buffer)

    log.close()
Example #5
def main(file_global_path):
    file_name = sys.argv[1:2]
    if not file_name:
        print('File name not provided')
        sys.exit(1)
    file_name = file_name[0]

    text_generator = TextGenerator()
    generate_text_from_file(text_generator, file_global_path + file_name)
    generate_text_from_previous_text(text_generator)
Example #6
def main():
    random.seed(SEED)
    np.random.seed(SEED)
    if (pos == 1):
        stringGenerator = TextGenerator(
            '../corpus_uncond_pos/index2word.pickle',
            '../corpus_uncond_pos/word2index.pickle',
            '../corpus_uncond_pos/input_file.txt',
            '../corpus_uncond_pos/target_file.txt',
            '../corpus_uncond_pos/vocab_creation_file.txt')
    else:
        stringGenerator = TextGenerator(
            '../corpus_uncond_neg/index2word.pickle',
            '../corpus_uncond_neg/word2index.pickle',
            '../corpus_uncond_neg/input_file.txt',
            '../corpus_uncond_neg/target_file.txt',
            '../corpus_uncond_neg/vocab_creation_file.txt')
    generated_num_inp = stringGenerator.sentencesCount_inp
    generated_num_test = stringGenerator.sentencesCount_test

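    # Write one seed line per starting token id, padded with zeros to SEQ_LENGTH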
    with open(starting_word_file, "w+") as op:
        for i in range(len(good_ids)):
            tokensSequence = [good_ids[i]]
            tokensSequence += [0] * (SEQ_LENGTH - 1)
            strSentence = " ".join([str(index)
                                    for index in tokensSequence]) + "\n"
            op.write(strSentence)

    assert START_TOKEN == 0

    gen_data_loader = Gen_Data_loader(BATCH_SIZE, SEQ_LENGTH)
    start_data_loader = Gen_Data_loader(BATCH_SIZE, SEQ_LENGTH)
    likelihood_data_loader = Likelihood_data_loader(BATCH_SIZE, SEQ_LENGTH)
    vocab_size = len(stringGenerator.index2Word)
    dis_data_loader = Dis_dataloader(SEQ_LENGTH)
    # Embedding matrix from pre-trained GloVe vectors:
    GLOVE_DIR = '../corpus_uncond_neg/glove.6B/'
    embeddings_index = {}
    f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()

    print('Found %s word vectors.' % len(embeddings_index))
    EmbeddingMatrix = np.zeros((vocab_size, EMB_DIM))

    #embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
    for i, word in stringGenerator.index2Word.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            EmbeddingMatrix[i] = embedding_vector
        else:
            # words not found in the embedding index get a random vector
            EmbeddingMatrix[i] = np.random.uniform(-1, 1, EMB_DIM)
    if (pos == 1):
        np.savez('embedding_pos.npz', EmbeddingMatrix)
    else:
        np.savez('embedding_neg.npz', EmbeddingMatrix)
    ###############################################################################

    best_score = 1000
    generator = get_trainable_model(vocab_size)

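    # WGAN critic inputs: one-hot encoded real sequences vs. the generator's soft predictions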
    real_inputs_discrete = tf.placeholder(tf.int32,
                                          shape=[BATCH_SIZE, SEQ_LENGTH])
    real_inputs = tf.one_hot(real_inputs_discrete, vocab_size)
    print(real_inputs)

    disc_real = Discriminator(real_inputs)
    disc_fake = Discriminator(generator.g_predictions_wgan)

    disc_cost = tf.reduce_mean(disc_fake) - tf.reduce_mean(disc_real)
    gen_cost = -tf.reduce_mean(disc_fake)

    # WGAN lipschitz-penalty
    alpha = tf.random_uniform(shape=[BATCH_SIZE, 1, 1], minval=0., maxval=1.)
    differences = generator.g_predictions_wgan - real_inputs
    interpolates = real_inputs + (alpha * differences)
    gradients = tf.gradients(Discriminator(interpolates), [interpolates])[0]
    slopes = tf.sqrt(
        tf.reduce_sum(tf.square(gradients), reduction_indices=[1, 2]))
    gradient_penalty = tf.reduce_mean((slopes - 1.)**2)
    disc_cost += LAMBDA * gradient_penalty

    gen_params = generator.g_params

    disc_params = lib.params_with_name('Discriminator')

    gen_train_op = tf.train.AdamOptimizer(learning_rate=1e-4,
                                          beta1=0.5,
                                          beta2=0.9).minimize(
                                              gen_cost, var_list=gen_params)
    disc_train_op = tf.train.AdamOptimizer(learning_rate=1e-4,
                                           beta1=0.5,
                                           beta2=0.9).minimize(
                                               disc_cost, var_list=disc_params)

    config = tf.ConfigProto()
    # config.gpu_options.per_process_gpu_memory_fraction = 0.5
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.initialize_all_variables())
    saver = tf.train.Saver()
    #generate_samples(sess, target_lstm, 64, 10000, positive_file)
    stringGenerator.saveSamplesToFile_inp(SEQ_LENGTH, generated_num_inp,
                                          positive_file)
    stringGenerator.saveSamplesToFile_inp_text(SEQ_LENGTH, generated_num_inp,
                                               inp_ref_file)
    stringGenerator.saveSamplesToFile_test_text(SEQ_LENGTH, generated_num_test,
                                                test_ref_file)

    stringGenerator.saveSamplesToFile_test(SEQ_LENGTH, generated_num_test,
                                           test_file)
    gen_data_loader.create_batches(positive_file)

    start_data_loader.create_batches(starting_word_file)

    if (pos == 1):
        log = open('log_pos_was/experiment-log.txt', 'w')
    else:
        log = open('log_neg_was/experiment-log.txt', 'w')
    #  pre-train generator
    print 'Start pre-training...'
    log.write('pre-training...\n')

    EPOCHS = 0
    load = 0
    if (load == 1):
        epoch = 5
        if (pos == 1):
            saver.restore(
                sess,
                "/target_generate_pos_was/pretrain" + str(epoch) + ".ckpt")
        else:
            saver.restore(
                sess,
                "/target_generate_neg_was/pretrain" + str(epoch) + ".ckpt")
        EPOCHS = EPOCHS + epoch

    for epoch in xrange(PRE_EPOCH_NUM):
        print 'pre-train epoch:', epoch
        if (pos == 1):
            eval_file2 = 'target_generate_pos_was/eval_file' + '_pretrain_gen_' + str(
                EPOCHS + epoch) + '.txt'
        else:
            eval_file2 = 'target_generate_neg_was/eval_file' + '_pretrain_gen_' + str(
                EPOCHS + epoch) + '.txt'

        loss = pre_train_epoch(sess, generator, gen_data_loader)
        if epoch % 5 == 0:
            generate_samples2(sess, generator, BATCH_SIZE, len(good_ids),
                              eval_file2, start_data_loader)
            generate_samples(sess, generator, BATCH_SIZE, len(good_ids),
                             eval_file, start_data_loader)

            likelihood_data_loader.create_batches(positive_file)
            train_loss = target_loss(sess, generator, likelihood_data_loader)
            likelihood_data_loader.create_batches(test_file)
            test_loss = target_loss(sess, generator, likelihood_data_loader)
            print 'pre-train epoch ', epoch, 'test_loss ', test_loss, 'train_loss', train_loss
            buffer = str(epoch) + ' test_loss : ' + str(
                test_loss) + ' train_loss : ' + str(train_loss) + '\n'
            log.write(buffer)
            if (pos == 1):
                saver.save(sess,
                           'target_generate_pos_was/pretrain',
                           global_step=EPOCHS + epoch)
            else:
                saver.save(sess,
                           'target_generate_neg_was/pretrain',
                           global_step=EPOCHS + epoch)

    if (pos == 1):
        eval_file2 = 'target_generate_pos_was/eval_file' + '_pretrain_gen_' + str(
            EPOCHS + epoch) + '.txt'
    else:
        eval_file2 = 'target_generate_neg_was/eval_file' + '_pretrain_gen_' + str(
            EPOCHS + epoch) + '.txt'

    generate_samples2(sess, generator, BATCH_SIZE, len(good_ids), eval_file2,
                      start_data_loader)
    likelihood_data_loader.create_batches(positive_file)
    train_loss = target_loss(sess, generator, likelihood_data_loader)
    likelihood_data_loader.create_batches(test_file)
    test_loss = target_loss(sess, generator, likelihood_data_loader)
    print 'pre-train epoch ', epoch, 'test_loss ', test_loss, 'train_loss', train_loss
    buffer = str(epoch) + ' test_loss : ' + str(
        test_loss) + ' train_loss : ' + str(train_loss) + '\n'
    log.write(buffer)

    def batch_iter(data, batch_size, num_epochs):
        """
        Generates a batch iterator for a dataset.
        """
        data = np.array(data)
        data_size = len(data)
        num_batches_per_epoch = int(len(data) / batch_size) + 1
        for epoch in range(num_epochs):
            # Shuffle the data at each epoch
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min((batch_num + 1) * batch_size, data_size)
                yield np.array(shuffled_data[start_index:end_index],
                               dtype='int32')

    def load_train_data(file):
        """
        Returns input vectors, labels, vocabulary, and inverse vocabulary.
        """
        examples = []
        with open(file) as fin:
            for line in fin:
                line = line.strip()
                line = line.split()
                parse_line = [int(x) for x in line]
                examples.append(parse_line)
        return np.array(examples)

    EPOCHS = EPOCHS + PRE_EPOCH_NUM
    print 'Start training discriminator...'
    for epoch in range(dis_alter_epoch):
        print('disctrainingepoch: ' + str(epoch))

        #  train discriminator
        pos_data = load_train_data(positive_file)
        pos_batches = batch_iter(pos_data, BATCH_SIZE, 1)
        for i in range(int(len(pos_data) / BATCH_SIZE) + 1):
            A = pos_batches.next()
            if (np.shape(A)[0] == BATCH_SIZE):
                _disc_cost, _ = sess.run([disc_cost, disc_train_op],
                                         feed_dict={real_inputs_discrete: A})
            else:
                break
        if (epoch % 30 == 0):
            if (pos == 1):
                saver.save(sess,
                           'target_generate_pos_was/disc',
                           global_step=EPOCHS + epoch)
            else:
                saver.save(sess,
                           'target_generate_neg_was/disc',
                           global_step=EPOCHS + epoch)

    EPOCHS = EPOCHS + dis_alter_epoch

    for iteration in xrange(ITERS):
        start_time = time.time()
        print 'training wgan...'
        #  train discriminator
        pos_data = load_train_data(positive_file)
        pos_batches = batch_iter(pos_data, BATCH_SIZE, 1)

        # Train generator

        for ii in range(int(len(pos_data) / BATCH_SIZE) + 1):
            A = pos_batches.next()
            if (np.shape(A)[0] == BATCH_SIZE):
                if iteration > 0:
                    _gen_cost, _ = sess.run(
                        [gen_cost, gen_train_op],
                        feed_dict={real_inputs_discrete: A})
                # Train critic
                for pp in xrange(CRITIC_ITERS):
                    _disc_cost, _ = sess.run(
                        [disc_cost, disc_train_op],
                        feed_dict={real_inputs_discrete: A})
            else:
                break

            if ii % 10 == 0:
                if (pos == 1):
                    eval_file2 = 'target_generate_pos_was/eval_file_reinforce_' + str(
                        EPOCHS + iteration) + '_' + str(ii) + '.txt'
                else:
                    eval_file2 = 'target_generate_neg_was/eval_file_reinforce_' + str(
                        EPOCHS + iteration) + '_' + str(ii) + '.txt'

                generate_samples2(sess, generator, BATCH_SIZE, len(good_ids),
                                  eval_file2, start_data_loader)
                generate_samples(sess, generator, BATCH_SIZE, len(good_ids),
                                 eval_file, start_data_loader)
                hyp = []

                likelihood_data_loader.create_batches(positive_file)
                train_loss = target_loss(sess, generator,
                                         likelihood_data_loader)
                likelihood_data_loader.create_batches(test_file)
                test_loss = target_loss(sess, generator,
                                        likelihood_data_loader)
                print 'reinf-train epoch ', iteration, 'test_loss ', test_loss, 'train_loss', train_loss, 'disc_cost', _disc_cost
                buffer = str(iteration) + ' test_loss : ' + str(
                    test_loss) + ' train_loss : ' + str(
                        train_loss) + ' _disc_cost ' + str(_disc_cost) + '\n'

                log.write(buffer)

    log.close()
Example #7
def train(run_name, start_epoch, stop_epoch, img_w):
    # Input Parameters
    img_h = 32
    words_per_epoch = 2
    val_split = 0.2
    val_words = int(words_per_epoch * (val_split))

    # Network parameters
    conv_filters = 16
    kernel_size = (3, 3)
    pool_size = 2
    time_dense_size = 32
    rnn_size = 512

    if K.image_data_format() == 'channels_first':
        input_shape = (1, img_w, img_h)
    else:
        input_shape = (img_w, img_h, 1)   

    lexicon = np.genfromtxt('../data/mnt/ramdisk/max/90kDICT32px/lexicon.txt', dtype='str')
    img_gen = TextGenerator(minibatch_size=32,
                            img_w=100,
                            img_h=32,
                            downsample_factor=4,
                            valid_class=320,
                            valid_examples=1,
                            lexicon=lexicon)
   
    act = 'relu'
    input_data = Input(name='the_input', shape=input_shape, dtype='float32')
    inner = Conv2D(conv_filters, kernel_size, padding='same',
                   activation=act, kernel_initializer='he_normal',
                   name='conv1')(input_data)
    inner = MaxPooling2D(pool_size=(pool_size, pool_size), name='max1')(inner)
    inner = Conv2D(conv_filters, kernel_size, padding='same',
                   activation=act, kernel_initializer='he_normal',
                   name='conv2')(inner)
    inner = MaxPooling2D(pool_size=(pool_size, pool_size), name='max2')(inner)

    conv_to_rnn_dims = (img_w // (pool_size ** 2), (img_h // (pool_size ** 2)) * conv_filters)
    inner = Reshape(target_shape=conv_to_rnn_dims, name='reshape')(inner)

    # cuts down input size going into RNN:
    inner = Dense(time_dense_size, activation=act, name='dense1')(inner)

    # Two layers of bidirectional GRUs
    # GRU seems to work as well as, if not better than, LSTM:
    gru_1 = GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru1')(inner)
    gru_1b = GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru1_b')(inner)
    gru1_merged = add([gru_1, gru_1b])
    gru_2 = GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru2')(gru1_merged)
    gru_2b = GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru2_b')(gru1_merged)

    # transforms RNN output to character activations:
    inner = Dense(img_gen.get_output_size(), kernel_initializer='he_normal',
                  name='dense2')(concatenate([gru_2, gru_2b]))
    y_pred = Activation('softmax', name='softmax')(inner)
    Model(inputs=input_data, outputs=y_pred).summary()

    labels = Input(name='the_labels', shape=[23], dtype='float32')
    input_length = Input(name='input_length', shape=[1], dtype='int64')
    label_length = Input(name='label_length', shape=[1], dtype='int64')
    # Keras doesn't currently support loss funcs with extra parameters
    # so CTC loss is implemented in a lambda layer
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length])

    # clipnorm seems to speed up convergence
    sgd = SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)

    model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)

    # the loss calc occurs elsewhere, so use a dummy lambda func for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=sgd)
    # if start_epoch > 0:
    #     weight_file = os.path.join(OUTPUT_DIR, os.path.join(run_name, 'weights%02d.h5' % (start_epoch - 1)))
    #     model.load_weights(weight_file)
    # captures output of softmax so we can decode the output during visualization
    test_func = K.function([input_data], [y_pred])

    viz_cb = VizCallback(test_func, img_gen.next_val())

    model.fit_generator(generator=img_gen.next_train(), steps_per_epoch=words_per_epoch,
                        epochs=stop_epoch, validation_data=img_gen.next_val(), validation_steps=320,
                        callbacks=[viz_cb, img_gen], initial_epoch=start_epoch)
Example #8
class TwitterBot:
    def __init__(self, credentials, stalked_account):
        self.api = TwitterAPI(credentials['api_key'],
                              credentials['api_key_secret'],
                              credentials['access_token'],
                              credentials['access_token_secret'])

        self.text_generator = TextGenerator()
        self.stalked_account = stalked_account
        self.stalked_account_id = self.__get_stalked_account_id()

    def __get_stalked_account_id(self):
        response = self.api.request('users/show',
                                    {'screen_name': self.stalked_account})

        if response.status_code == 200:
            return json.loads(response.text)['id_str']
        else:
            print('PROBLEM: ' + response.text)
            exit()

    def reply_tweets(self):
        stream = self.api.request('statuses/filter',
                                  {'follow': self.stalked_account_id})

        for tweet in stream:
            print(tweet['text'] if 'text' in tweet else tweet)

            if self.__is_tweet_valid(tweet):
                print(tweet['text'] if 'text' in tweet else tweet)
                self.__reply(tweet['id'])

    def fav_tweets(self):
        stream = self.api.request('statuses/filter',
                                  {'follow': self.stalked_account_id})

        for tweet in stream:
            print(tweet['text'] if 'text' in tweet else tweet)

            if self.__is_tweet_valid(tweet):
                print(tweet['text'] if 'text' in tweet else tweet)
                self.__fav(tweet['id'])

    def __is_tweet_valid(self, tweet):
        if 'user' in tweet:
            return (self.__tweeted_by_stalked_account(tweet) & ('id' in tweet))
        else:
            return False

    def __tweeted_by_stalked_account(self, tweet):
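        # Treat retweets (text starting with 'RT @') as not authored by the stalked account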
        return (tweet['text'][0:4] != 'RT @')

    def __reply(self, tweet_id):
        tweet_text = self.text_generator.get_random_text()
        response = self.api.request(
            'statuses/update', {
                'status': f'@{self.stalked_account} {tweet_text}',
                'in_reply_to_status_id': tweet_id
            })
        print('SUCCESS' if response.status_code == 200 else 'PROBLEM: ' +
              response.text)

    def __fav(self, tweet_id):
        response = self.api.request('favorites/create', {'id': tweet_id})
        print('SUCCESS' if response.status_code == 200 else 'PROBLEM: ' +
              response.text)
Example #9
def main():
    random.seed(SEED)
    np.random.seed(SEED)
    if (pos == 1):
        stringGenerator = TextGenerator(
            '../corpus_uncond_pos/index2word.pickle',
            '../corpus_uncond_pos/word2index.pickle',
            '../corpus_uncond_pos/input_file.txt',
            '../corpus_uncond_pos/target_file.txt',
            '../corpus_uncond_pos/vocab_creation_file.txt')
    else:
        stringGenerator = TextGenerator(
            '../corpus_uncond_neg/index2word.pickle',
            '../corpus_uncond_neg/word2index.pickle',
            '../corpus_uncond_neg/input_file.txt',
            '../corpus_uncond_neg/target_file.txt',
            '../corpus_uncond_neg/vocab_creation_file.txt')
    generated_num_inp = stringGenerator.sentencesCount_inp
    generated_num_test = stringGenerator.sentencesCount_test

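    # Write one seed line per starting token id, padded with zeros to SEQ_LENGTH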
    with open(starting_word_file, "w+") as op:
        for i in range(len(good_ids)):
            tokensSequence = [good_ids[i]]
            tokensSequence += [0] * (SEQ_LENGTH - 1)
            strSentence = " ".join([str(index)
                                    for index in tokensSequence]) + "\n"
            op.write(strSentence)

    assert START_TOKEN == 0

    gen_data_loader = Gen_Data_loader(BATCH_SIZE, SEQ_LENGTH)
    start_data_loader = Gen_Data_loader(BATCH_SIZE, SEQ_LENGTH)
    likelihood_data_loader = Likelihood_data_loader(BATCH_SIZE, SEQ_LENGTH)
    vocab_size = len(stringGenerator.index2Word)
    dis_data_loader = Dis_dataloader(SEQ_LENGTH)
    # Embedding matrix from pre-trained GloVe vectors:
    GLOVE_DIR = '../corpus_uncond_neg/glove.6B/'
    embeddings_index = {}
    f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()

    print('Found %s word vectors.' % len(embeddings_index))
    EmbeddingMatrix = np.zeros((vocab_size, EMB_DIM))

    #embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
    for i, word in stringGenerator.index2Word.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            EmbeddingMatrix[i] = embedding_vector
        else:
            # words not found in the embedding index get a random vector
            EmbeddingMatrix[i] = np.random.uniform(-1, 1, EMB_DIM)
    if (pos == 1):
        np.savez('embedding_pos.npz', EmbeddingMatrix)
    else:
        np.savez('embedding_neg.npz', EmbeddingMatrix)
    ###############################################################################

    best_score = 1000
    generator = get_trainable_model(vocab_size)

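    # Discriminator: an LSTM-based binary classifier over real vs. generated sequences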
    with tf.variable_scope('discriminator'):
        cnn = TextLSTM(sequence_length=SEQ_LENGTH,
                       num_classes=2,
                       vocab_size=vocab_size,
                       embedding_size=dis_embedding_dim,
                       num_hidden=dis_num_hidden,
                       num_layers=dis_num_layers,
                       pos=pos,
                       BATCH_SIZE=BATCH_SIZE,
                       start_token=START_TOKEN,
                       l2_reg_lambda=dis_l2_reg_lambda)

    cnn_params = [
        param for param in tf.trainable_variables()
        if 'discriminator' in param.name
    ]
    # Define Discriminator Training procedure
    dis_global_step = tf.Variable(0, name="global_step", trainable=False)
    dis_optimizer = tf.train.AdamOptimizer(1e-4)
    dis_grads_and_vars = dis_optimizer.compute_gradients(cnn.loss,
                                                         cnn_params,
                                                         aggregation_method=2)
    dis_train_op = dis_optimizer.apply_gradients(dis_grads_and_vars,
                                                 global_step=dis_global_step)

    config = tf.ConfigProto()
    # config.gpu_options.per_process_gpu_memory_fraction = 0.5
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.initialize_all_variables())
    saver = tf.train.Saver()
    #generate_samples(sess, target_lstm, 64, 10000, positive_file)
    stringGenerator.saveSamplesToFile_inp(SEQ_LENGTH, generated_num_inp,
                                          positive_file)
    stringGenerator.saveSamplesToFile_inp_text(SEQ_LENGTH, generated_num_inp,
                                               inp_ref_file)
    stringGenerator.saveSamplesToFile_test_text(SEQ_LENGTH, generated_num_test,
                                                test_ref_file)

    stringGenerator.saveSamplesToFile_test(SEQ_LENGTH, generated_num_test,
                                           test_file)
    gen_data_loader.create_batches(positive_file)

    start_data_loader.create_batches(starting_word_file)

    if (pos == 1):
        log = open('log_pos/experiment-log.txt', 'w')
    else:
        log = open('log_neg/experiment-log.txt', 'w')
    #  pre-train generator
    print 'Start pre-training...'
    log.write('pre-training...\n')

    EPOCHS = 0
    load = 0
    if (load == 1):
        epoch = 5
        if (pos == 1):
            path = tf.train.get_checkpoint_state('target_generate_pos')
            print(path)
            saver.restore(sess, path.model_checkpoint_path)
            #saver = tf.train.import_meta_graph("target_generate_pos/disc-90.meta")
            #saver.restore(sess,"target_generate_pos/disc-90.ckpt")
        else:
            path = tf.train.get_checkpoint_state('target_generate_neg')
            print(path)
            saver.restore(sess, path.model_checkpoint_path)
        EPOCHS = EPOCHS + PRE_EPOCH_NUM + dis_alter_epoch

    for epoch in xrange(0, PRE_EPOCH_NUM):
        print 'pre-train epoch:', epoch
        if (pos == 1):
            eval_file2 = 'target_generate_pos/eval_file' + '_pretrain_gen_' + str(
                EPOCHS + epoch) + '.txt'
        else:
            eval_file2 = 'target_generate_neg/eval_file' + '_pretrain_gen_' + str(
                EPOCHS + epoch) + '.txt'

        loss = pre_train_epoch(sess, generator, gen_data_loader)
        if epoch % 5 == 0:
            generate_samples2(sess, generator, BATCH_SIZE, len(good_ids),
                              eval_file2, start_data_loader)
            generate_samples(sess, generator, BATCH_SIZE, len(good_ids),
                             eval_file, start_data_loader)
            per_tri_test = 0
            per_di_test = 0
            per_quad_test = 0
            per_di = 0
            per_tri = 0
            per_quad = 0

            likelihood_data_loader.create_batches(positive_file)
            train_loss = target_loss(sess, generator, likelihood_data_loader)
            likelihood_data_loader.create_batches(test_file)
            test_loss = target_loss(sess, generator, likelihood_data_loader)
            print 'pre-train epoch ', epoch, 'test_loss ', test_loss, 'train_loss', train_loss, 'per_di', per_di, 'per_quad', per_quad, 'per_tri', per_tri, 'per_di_test', per_di_test, 'per_quad_test', per_quad_test, 'per_tri_test', per_tri_test
            buffer = ('%d test_loss : %s train_loss : %s per_di : %s per_quad : %s'
                      ' per_tri : %s per_di_test : %s per_quad_test : %s'
                      ' per_tri_test : %s\n' %
                      (epoch, test_loss, train_loss, per_di, per_quad, per_tri,
                       per_di_test, per_quad_test, per_tri_test))
            log.write(buffer)
            if (pos == 1):
                saver.save(sess,
                           'target_generate_pos/pretrain',
                           global_step=EPOCHS + epoch)
            else:
                saver.save(sess,
                           'target_generate_neg/pretrain',
                           global_step=EPOCHS + epoch)

    if (pos == 1):
        eval_file2 = 'target_generate_pos/eval_file' + '_pretrain_gen_' + str(
            EPOCHS + epoch) + '.txt'
    else:
        eval_file2 = 'target_generate_neg/eval_file' + '_pretrain_gen_' + str(
            EPOCHS + epoch) + '.txt'

    generate_samples2(sess, generator, BATCH_SIZE, len(good_ids), eval_file2,
                      start_data_loader)
    likelihood_data_loader.create_batches(positive_file)
    train_loss = target_loss(sess, generator, likelihood_data_loader)
    likelihood_data_loader.create_batches(test_file)
    test_loss = target_loss(sess, generator, likelihood_data_loader)
    print 'pre-train epoch ', epoch, 'test_loss ', test_loss, 'train_loss', train_loss
    buffer = str(epoch) + ' test_loss : ' + str(
        test_loss) + ' train_loss : ' + str(train_loss) + '\n'
    log.write(buffer)

    EPOCHS = EPOCHS + PRE_EPOCH_NUM

    print 'Start training discriminator...'
    for epoch in range(0):  # discriminator pre-training disabled here; use range(dis_alter_epoch) to enable
        print('disctrainingepoch: ' + str(epoch))
        #generate from start same as actual data
        generate_samples(sess, generator, BATCH_SIZE, len(good_ids),
                         negative_file, start_data_loader)

        #  train discriminator
        dis_x_train, dis_y_train = dis_data_loader.load_train_data(
            positive_file, negative_file)
        dis_batches = dis_data_loader.batch_iter(zip(dis_x_train, dis_y_train),
                                                 dis_batch_size,
                                                 dis_num_epochs)

        for batch in dis_batches:
            try:
                x_batch, y_batch = zip(*batch)
                feed = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: dis_dropout_keep_prob
                }
                _, step = sess.run([dis_train_op, dis_global_step], feed)
            except ValueError:
                pass
        if (epoch % 30 == 0):
            if (pos == 1):
                saver.save(sess,
                           'target_generate_pos/disc',
                           global_step=EPOCHS + epoch)
            else:
                saver.save(sess,
                           'target_generate_neg/disc',
                           global_step=EPOCHS + epoch)

    EPOCHS = EPOCHS + dis_alter_epoch

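    # Rollout module estimates per-token rewards via Monte Carlo search (SeqGAN-style); 0.8 is the parameter update rate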
    rollout = ROLLOUT(generator, 0.8)

    print '#########################################################################'
    print 'Start Reinforcement Training Generator...'
    log.write('Reinforcement Training...\n')
    per_tri_test = 0
    per_di_test = 0
    per_quad_test = 0
    per_di = 0
    per_tri = 0
    per_quad = 0

    for total_batch in range(TOTAL_BATCH):
        gen_data_loader.reset_pointer()

        for it in xrange(start_data_loader.num_batch):
            print('start gen training')
            batch = gen_data_loader.next_batch()
            samples = generator.generate(sess, batch)
            rewards = rollout.get_reward(sess, samples, 16, cnn)
            feed = {generator.x: samples, generator.rewards: rewards}
            _, g_loss = sess.run([generator.g_updates, generator.g_loss],
                                 feed_dict=feed)
            if (pos == 1):
                eval_file2 = 'target_generate_pos/eval_file_reinforce_' + str(
                    EPOCHS + total_batch) + '_' + str(it) + '.txt'
            else:
                eval_file2 = 'target_generate_neg/eval_file_reinforce_' + str(
                    EPOCHS + total_batch) + '_' + str(it) + '.txt'

            generate_samples2(sess, generator, BATCH_SIZE, len(good_ids),
                              eval_file2, start_data_loader)
            generate_samples(sess, generator, BATCH_SIZE, len(good_ids),
                             eval_file, start_data_loader)

            likelihood_data_loader.create_batches(positive_file)
            train_loss = target_loss(sess, generator, likelihood_data_loader)
            likelihood_data_loader.create_batches(test_file)
            test_loss = target_loss(sess, generator, likelihood_data_loader)
            print 'reinf-train epoch ', total_batch, 'test_loss ', test_loss, 'train_loss', train_loss, 'per_di', per_di, 'per_quad', per_quad, 'per_tri', per_tri, 'per_di_test', per_di_test, 'per_quad_test', per_quad_test, 'per_tri_test', per_tri_test
            buffer = ('%d test_loss : %s train_loss : %s per_di : %s per_quad : %s'
                      ' per_tri : %s per_di_test : %s per_quad_test : %s'
                      ' per_tri_test : %s\n' %
                      (total_batch, test_loss, train_loss, per_di, per_quad,
                       per_tri, per_di_test, per_quad_test, per_tri_test))

            log.write(buffer)

            rollout.update_params()
            #here i generate samples with start_data_loader
            generate_samples(sess, generator, BATCH_SIZE, len(good_ids),
                             negative_file, start_data_loader)
            dis_x_train, dis_y_train = dis_data_loader.load_train_data(
                positive_file, negative_file)
            dis_batches = dis_data_loader.batch_iter(
                zip(dis_x_train, dis_y_train), dis_batch_size, 3)
            for batch2 in dis_batches:
                try:
                    x_batch, y_batch = zip(*batch2)
                    feed = {
                        cnn.input_x: x_batch,
                        cnn.input_y: y_batch,
                        cnn.dropout_keep_prob: dis_dropout_keep_prob
                    }
                    _, step = sess.run([dis_train_op, dis_global_step], feed)
                except ValueError:
                    pass

    log.close()
Example #10
def main():
    random.seed(SEED)
    np.random.seed(SEED)

    stringGenerator = TextGenerator('../corpus/index2word.pickle',
                                    '../corpus/word2index.pickle',
                                    '../corpus/all.code')

    assert START_TOKEN == 0

    gen_data_loader = Gen_Data_loader(BATCH_SIZE)
    likelihood_data_loader = Likelihood_data_loader(BATCH_SIZE)
    vocab_size = len(stringGenerator.index2Word)
    dis_data_loader = Dis_dataloader()

    best_score = 1000
    generator = get_trainable_model(vocab_size)
    target_params = cPickle.load(open('save/target_params.pkl', 'rb'))
    target_params[0] = np.random.rand(vocab_size, 32).astype(np.float32)
    target_params[-2] = np.random.rand(32, vocab_size).astype(np.float32)
    target_params[-1] = np.random.rand(vocab_size).astype(np.float32)
    target_lstm = TARGET_LSTM(vocab_size, 64, 32, 32, 20, 0, target_params)

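    # Discriminator: a CNN text classifier distinguishing real from generated sequences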
    with tf.variable_scope('discriminator'):
        cnn = TextCNN(sequence_length=20,
                      num_classes=2,
                      vocab_size=vocab_size,
                      embedding_size=dis_embedding_dim,
                      filter_sizes=dis_filter_sizes,
                      num_filters=dis_num_filters,
                      l2_reg_lambda=dis_l2_reg_lambda)

    cnn_params = [
        param for param in tf.trainable_variables()
        if 'discriminator' in param.name
    ]
    # Define Discriminator Training procedure
    dis_global_step = tf.Variable(0, name="global_step", trainable=False)
    dis_optimizer = tf.train.AdamOptimizer(1e-4)
    dis_grads_and_vars = dis_optimizer.compute_gradients(cnn.loss,
                                                         cnn_params,
                                                         aggregation_method=2)
    dis_train_op = dis_optimizer.apply_gradients(dis_grads_and_vars,
                                                 global_step=dis_global_step)

    config = tf.ConfigProto()
    # config.gpu_options.per_process_gpu_memory_fraction = 0.5
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.initialize_all_variables())

    #generate_samples(sess, target_lstm, 64, 10000, positive_file)
    stringGenerator.saveSamplesToFile(20, 10000, positive_file)
    gen_data_loader.create_batches(positive_file)

    log = open('log/experiment-log.txt', 'w')
    #  pre-train generator
    print 'Start pre-training...'
    log.write('pre-training...\n')
    for epoch in xrange(PRE_EPOCH_NUM):
        print 'pre-train epoch:', epoch
        loss = pre_train_epoch(sess, generator, gen_data_loader)
        if epoch % 5 == 0:
            generate_samples(sess, generator, BATCH_SIZE, generated_num,
                             eval_file)
            likelihood_data_loader.create_batches(eval_file)
            test_loss = target_loss(sess, target_lstm, likelihood_data_loader)
            print 'pre-train epoch ', epoch, 'test_loss ', test_loss
            buffer = str(epoch) + ' ' + str(test_loss) + '\n'
            log.write(buffer)

    generate_samples(sess, generator, BATCH_SIZE, generated_num, eval_file)
    likelihood_data_loader.create_batches(eval_file)
    test_loss = target_loss(sess, target_lstm, likelihood_data_loader)
    buffer = 'After pre-training:' + ' ' + str(test_loss) + '\n'
    log.write(buffer)

    generate_samples(sess, generator, BATCH_SIZE, generated_num, eval_file)
    likelihood_data_loader.create_batches(eval_file)
    significance_test(sess, target_lstm, likelihood_data_loader,
                      'significance/supervise.txt')

    print 'Start training discriminator...'
    for _ in range(dis_alter_epoch):
        generate_samples(sess, generator, BATCH_SIZE, generated_num,
                         negative_file)

        #  train discriminator
        dis_x_train, dis_y_train = dis_data_loader.load_train_data(
            positive_file, negative_file)
        dis_batches = dis_data_loader.batch_iter(zip(dis_x_train, dis_y_train),
                                                 dis_batch_size,
                                                 dis_num_epochs)

        for batch in dis_batches:
            try:
                x_batch, y_batch = zip(*batch)
                feed = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: dis_dropout_keep_prob
                }
                _, step = sess.run([dis_train_op, dis_global_step], feed)
            except ValueError:
                pass

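    # Rollout module for Monte Carlo reward estimation; 0.8 is the parameter update rate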
    rollout = ROLLOUT(generator, 0.8)

    print '#########################################################################'
    print 'Start Reinforcement Training Generator...'
    log.write('Reinforcement Training...\n')

    for total_batch in range(TOTAL_BATCH):
        for it in range(TRAIN_ITER):
            samples = generator.generate(sess)
            rewards = rollout.get_reward(sess, samples, 16, cnn)
            feed = {generator.x: samples, generator.rewards: rewards}
            _, g_loss = sess.run([generator.g_updates, generator.g_loss],
                                 feed_dict=feed)

        if total_batch % 1 == 0 or total_batch == TOTAL_BATCH - 1:
            generate_samples(sess, generator, BATCH_SIZE, generated_num,
                             eval_file)
            likelihood_data_loader.create_batches(eval_file)
            test_loss = target_loss(sess, target_lstm, likelihood_data_loader)
            buffer = str(total_batch) + ' ' + str(test_loss) + '\n'
            print 'total_batch: ', total_batch, 'test_loss: ', test_loss
            log.write(buffer)

            if test_loss < best_score:
                best_score = test_loss
                print 'best score: ', test_loss
                significance_test(sess, target_lstm, likelihood_data_loader,
                                  'significance/seqgan.txt')

        rollout.update_params()

        # generate for discriminator
        print 'Start training discriminator'
        for _ in range(5):
            generate_samples(sess, generator, BATCH_SIZE, generated_num,
                             negative_file)

            dis_x_train, dis_y_train = dis_data_loader.load_train_data(
                positive_file, negative_file)
            dis_batches = dis_data_loader.batch_iter(
                zip(dis_x_train, dis_y_train), dis_batch_size, 3)

            for batch in dis_batches:
                try:
                    x_batch, y_batch = zip(*batch)
                    feed = {
                        cnn.input_x: x_batch,
                        cnn.input_y: y_batch,
                        cnn.dropout_keep_prob: dis_dropout_keep_prob
                    }
                    _, step = sess.run([dis_train_op, dis_global_step], feed)
                except ValueError:
                    pass

    log.close()
Example #11
from magic_dict import MagicDict
from text_generator import TextGenerator

with open("windows1251.txt", "r", encoding="windows-1251") as file:
    text = file.read()

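# Build the base model from the corpus with MagicDict, then generate text of length 100000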
base_text = MagicDict(text)
base_model = base_text.generate()
generated_text = TextGenerator(base_model, length=100000).create_text()
print(generated_text)
Example #12
                    type=bool,
                    help="If True, load the specified model")
parser.add_argument("--length",
                    type=int,
                    help="Длинна генерируемого текста",
                    default=20)
parser.add_argument("--encoding",
                    type=str,
                    help="Кодировка файла",
                    default="utf-8")

args = parser.parse_args()

if args.load_model:
    # Load a previously saved model
    with open(args.dir, "rb") as file:
        base_model = pickle.load(file)
else:
    # Read the text file and build the model
    with open(args.dir, "r", encoding=args.encoding) as file:
        text = file.read()
    base_text = MagicDict(text)
    base_model = base_text.generate()

# Model creation and text generation
if args.save_model:  # Save the model to a file (pickle)
    with open(args.save_dir, "wb") as file:
        pickle.dump(base_model, file, fix_imports=False)
generated_text = TextGenerator(base_model, args.length).create_text()
print(generated_text)