Example #1
    def toShakespeare(self):
        """Given a line of text, return that text in the indicated style.
        
        Args:
          modern_text: (string) The input.
          
        Returns:
          string: The translated text, if generated.
        """

        args = load_arguments()
        vocab = Vocabulary(self.vocab_path, args.embedding, args.dim_emb)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        with tf.Session(config=config) as sess:
            model = Model(args, vocab)
            model.saver.restore(sess, args.model)

            if args.beam > 1:
                decoder = beam_search.Decoder(sess, args, vocab, model)
            else:
                decoder = greedy_decoding.Decoder(sess, args, vocab, model)

            # Decode with whichever decoder was selected above.
            batch = get_batch([self.modern_text], [1], vocab.word2id)
            ori, tsf = decoder.rewrite(batch)

            out = ' '.join(w for w in tsf[0])

        return out
Example #2
def transform_text(text):
    tf.compat.v1.disable_eager_execution()
    args = load_arguments()
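    # Point the loaded arguments at the saved Yelp vocabulary and model, and decode one sentence at a time with beam search.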
    ah = vars(args)
    ah['vocab'] = '../model/yelp.vocab'
    ah['model'] = '../model/model'
    ah['load_model'] = True
    ah['beam'] = 8
    ah['batch_size'] = 1
    inp = [text]

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print('vocabulary size:', vocab.size)

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.compat.v1.Session(config=config) as sess:
        model = create_model(sess, args, vocab)
        decoder = beam_search.Decoder(sess, args, vocab, model)
        '''test_losses = transfer(model, decoder, sess, args, vocab,
                               test0, test1, args.output)'''

        batches, order0, order1 = get_batches(inp, inp, vocab.word2id,
                                              args.batch_size)

        data0_tsf, data1_tsf = [], []
        losses = Accumulator(len(batches), ['loss', 'rec', 'adv', 'd0', 'd1'])

        # rec, tsf = decoder.rewrite(inp)

        # print(rec)
        # print(tsf)
        for batch in batches:
            rec, tsf = decoder.rewrite(batch)
            half = batch['size'] // 2
            print("rec:")
            print(rec)
            print("tsf:")
            print(tsf)
            data0_tsf += tsf[:half]
            data1_tsf += tsf[half:]
        n0, n1 = len(inp), len(inp)
        data0_tsf = reorder(order0, data0_tsf)[:n0]
        data1_tsf = reorder(order1, data1_tsf)[:n1]
        print(data0_tsf)
        print(data1_tsf)
Example #3
def main():
    
    print("ARGUMENTS: ")
    args = load_arguments()
    print("Arguments loaded. ")

    if args.num_clusters_end < args.num_clusters_start:
        args.num_clusters_end = args.num_clusters_start
   
    if args.plot_extensions.lower() == 'y':
        extensions.plot_extensions(args.dataset_path,
                                  args.num_extensions)

    if args.convert.lower() == 'y':
        converting_utilities.convert(args.dataset_path, 
                                     args.num_extensions)

    start = args.num_clusters_start
    end = args.num_clusters_end
    num_clusters = start

    print("Clustering for all k: " + str(start) + "<=k<=" + str(end) + "...\n")
    while num_clusters <= end:
        if args.cluster_struct.lower() == 'y':
            schema_clustering.runflow(args.dataset_path, 
                                      num_clusters, 
                                      args.overwrite_distmat_struct, 
                                      args.overwrite_plot_struct,
                                      args.fill_threshold)

        if args.cluster_text.lower() == 'y':
            document_clustering.runflow(num_clusters, 
                                        args.overwrite_tokens_text,
                                        args.overwrite_clusters_text,
                                        args.dataset_path,
                                        args.minibatch_kmeans)
        num_clusters += 1
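Example #4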
def main():

    # Setting theano environment flags
    theano.config.floatX = 'float64'
    theano.config.optimizer = 'fast_run'
    theano.config.exception_verbosity = 'high'
    args = options.load_arguments()
    print(args)

    # Loading data
    print("Loading Review data.")
    prod_to_sent = load_reviews(args)
    flic = FLIC()
    load_flic_data(args, flic)
    sent_emb_dim = np.shape(prod_to_sent.values()[0][0])[1]

    # Evaluate model using cross validation
    if args.evaluate_model:

        # If training is cancelled with SIGINT, the model is still evaluated on the test sets.
        signal_handler = GracefullExit(parent=True)

        # The following lists store the errors, accuracies, and average sample sizes of the
        # k processes from the k-fold cross-validation.
        error_train_l = []
        error_valid_l = []
        error_test_l = []

        accuracy_train_l = []
        accuracy_valid_l = []
        accuracy_test_l = []

        num_samp_train_l = []
        num_samp_valid_l = []
        num_samp_test_l = []

        k = args.num_cross_validation  # The number of runs in the cross-validation loop (k-fold cross-validation)
        # The products in the test set are disjoint from the products in the training and validation sets.
        if args.testset_has_new_products:
            cross_validation = CrossValidationProducts(k=k, args=args, perc_validation_data=0.15)
        else:
            cross_validation = CrossValidationLinks(k=k, args=args, perc_validation_data=0.5)

        # Parallelize cross validation runs
        pool = NoDaemonPool(processes=k)
        rand_generator = random.Random()
        rand_generator.jumpahead(random.randint(1, 10000000))

        # A function to store the return values of the k processes (from the k-fold cross validation).
        def store_results(return_value):
            error_train, samples_train, accuracy_train, \
            error_valid, samples_valid, accuracy_valid, \
            error_test, samples_test, accuracy_test = return_value

            def average_sample_size(sampled_sents):
                return np.mean([len(samp) for samples in map(lambda x: x[2], sampled_sents) for samp in samples])

            error_train_l.append(error_train)
            num_samp_train_l.append(average_sample_size(samples_train))
            accuracy_train_l.append(accuracy_train)

            error_valid_l.append(error_valid)
            num_samp_valid_l.append(average_sample_size(samples_valid))
            accuracy_valid_l.append(accuracy_valid)

            error_test_l.append(error_test)
            num_samp_test_l.append(average_sample_size(samples_test))
            accuracy_test_l.append(accuracy_test)

        print("Starting Cross-Validation. I am process {}.".format(os.getpid()))
        k_0 = 0

        # Cross validation loop
        for D, D_valid, D_test, D_C, D_C_valid, D_C_test in cross_validation.next_sample():
            rand_generator.jumpahead(random.randint(1, 10000000))
            pool.apply_async(_train_and_evaluate,
                             (D, D_valid, D_test, D_C, D_C_valid, D_C_test,
                                   flic, prod_to_sent, args, sent_emb_dim, k_0),
                             callback=store_results)
            k_0 += 1

        pool.close()
        pool.join()
        sys.stdout = sys.__stdout__

        # Store the results in a Pandas DataFrame.
        store_results_in_DF(args, flic._dim_att, sent_emb_dim,
                            error_train_l, accuracy_train_l, num_samp_train_l,
                            error_valid_l, accuracy_valid_l, num_samp_valid_l,
                            error_test_l, accuracy_test_l, num_samp_test_l)

        # Console output for fast interpretation.
        print("Train error = {}".format(error_train_l))
        print("Mean = {}".format(np.array(error_train_l).mean(axis=0)))
        print("Std = {}".format(np.array(error_train_l).std(axis=0)))
        print("Average sample size = {}".format(np.mean(num_samp_train_l)))
        print("Accuracy = {}\n".format(accuracy_train_l))

        print("Validation error = {}".format(error_valid_l))
        print("Mean = {}".format(np.array(error_valid_l).mean(axis=0)))
        print("Std = {}".format(np.array(error_valid_l).std(axis=0)))
        print("Average sample size = {}".format(np.mean(num_samp_valid_l)))
        print("Accuracy = {}\n".format(accuracy_valid_l))

        print("Test error = {}".format(error_test_l))
        print("Mean = {}".format(np.array(error_test_l).mean(axis=0)))
        print("Std = {}".format(np.array(error_test_l).std(axis=0)))
        print("Average sample size = {}".format(np.mean(num_samp_test_l)))
        print("Accuracy = {}\n".format(accuracy_test_l))

    # Train the model
    else:
        # Load the data
        print("Loading training data.")
        D, D_C, D_valid, D_C_valid, D_test, D_C_test = load_link_data(args, perc_validation_data=args.perc_validation_data,
                                                                      perc_test_data=args.perc_test_data)
        best_val_error = train_model(args, sent_emb_dim, flic, prod_to_sent, D, D_C, D_valid, D_C_valid)

        print("Best validation error = {}".format(best_val_error))
Example #5
def online_transfer(neg_lines, write_file='neg.txt', style=0):
    with open(write_file, 'w') as f:
        for line in neg_lines:
            y = style
            batch = get_batch([line], [y], vocab.word2id)
            ori, tsf = decoder.rewrite(batch)
            # f.write('input: {}\n'.format(' '.join(line)))
            # f.write('original: {}\n'.format(' '.join(w for w in ori[0])))
            # f.write('transfer: {}\n'.format(' '.join(w for w in tsf[0])))
            f.write('{}\n'.format(' '.join(line)))
            f.write('{}\n'.format(' '.join(w for w in ori[0])))
            f.write('{}\n'.format(' '.join(w for w in tsf[0])))


if __name__ == '__main__':
    args = load_arguments()

    if not os.path.exists(args.model):
        os.makedirs(args.model)

    #####   data preparation   #####
    if args.train or args.latent_train:
        chosen = args.train if len(args.train) > len(args.latent_train) else \
          args.latent_train
        # train0 = load_sent(chosen + '.0', args.max_train_size)
        # train1 = load_sent(chosen + '.1', args.max_train_size)

        train0 = load_sent(chosen + 'formal', args.max_train_size)
        train1 = load_sent(chosen + 'informal', args.max_train_size)

        print('#sents of training file 0:', len(train0))
Example #6
            #            "  prec2={:.4f} generator time={:.4f} encoder time={:.4f} total test time={:.4f}\n").format(
            #        r_mse,
            #        r_p1,
            #        r_prec1,
            #        r_prec2,
            #        gen_time,
            #        enc_time,
            #        time.time() - start_rational_time
            #))

            data = "\t".join([
                '%.5f' % r_mse,
                '%4.2f' % r_p1,
                '%4.4f' % r_prec1,
                '%4.4f' % r_prec2,
                '%4.2f' % gen_time,
                '%4.2f' % enc_time,
                '%4.2f' % prec_cal_time,
                '%4.2f' % (time.time() - start_rational_time),
                str(args.sparsity),
                str(args.coherent),
                str(args.max_epochs),
                str(args.cur_epoch),
            ])

            with open(args.graph_data_path, 'a') as g_f:
                print 'writing to file: ', data
                g_f.write(data + "\n")


if __name__ == "__main__":
    args = options.load_arguments()
    main()
Example #7
        dev_x = [ embedding_layer.map_to_ids(x)[:max_len] for x in dev_x ]

    if args.test:
        test_x, test_y = myio.read_annotations(args.test)
        test_x = [ embedding_layer.map_to_ids(x)[:max_len] for x in test_x ]

    if args.train:
        model = Model(
                    args = args,
                    embedding_layer = embedding_layer,
                    nclasses = len(train_y[0])
                )
        model.ready()

        #debug_func2 = theano.function(
        #        inputs = [ model.x, model.z ],
        #        outputs = model.generator.logpz
        #    )
        #theano.printing.debugprint(debug_func2)
        #return

        model.train(
                (train_x, train_y),
                (dev_x, dev_y) if args.dev else None,
                (test_x, test_y) if args.test else None
            )

if __name__=="__main__":
    args = options.load_arguments()
    main()
Example #8
def main():

    print("ARGUMENTS: ")
    args = load_arguments()
    print("Arguments loaded. ")

    if args.num_clusters_end < args.num_clusters_start:
        args.num_clusters_end = args.num_clusters_start

    if args.plot_extensions.lower() == 'y':
        extensions.plot_extensions(args.dataset_path, args.num_extensions)

    if args.convert.lower() == 'y':
        converting_utilities.convert(args.dataset_path, args.num_extensions,
                                     args.num_processes)

    if args.cluster_struct.lower() == 'y':
        start = args.num_clusters_start
        end = args.num_clusters_end
        num_clusters = start
        max_struct_score = 0
        optimal_num_clusters = start
        print("Clustering structured files for all k: " + str(start) +
              "<=k<=" + str(end) + "...\n")
        while num_clusters <= end:
            scores = schema_clustering.runflow(args.dataset_path, num_clusters,
                                               args.overwrite_distmat_struct,
                                               args.overwrite_plot_struct,
                                               args.fill_threshold)
            struct_score = scores[0]
            print("Schema clustering with k=" + str(num_clusters) +
                  " yields freqdrop score of " + str(struct_score))
            if struct_score > max_struct_score:
                max_struct_score = struct_score
                optimal_num_clusters = num_clusters
            num_clusters += 1
        print("k with highest freqdrop score:", str(optimal_num_clusters))

    if args.cluster_text.lower() == 'y':
        start = args.num_clusters_start
        end = args.num_clusters_end
        num_clusters = start
        max_text_score = 0
        optimal_num_clusters = start
        allscores = []
        retokenize = args.overwrite_tokens_text
        print("Clustering text files for all k: " + str(start) + "<=k<=" +
              str(end) + "...\n")
        while num_clusters <= end:
            if num_clusters > start:
                retokenize = "n"
            sil, frqdrop, text_score = document_clustering.runflow(
                num_clusters, args.overwrite_tokens_text,
                args.overwrite_clusters_text, args.dataset_path,
                args.minibatch_kmeans, args.num_processes)
            print("Text clustering with k=" + str(num_clusters) +
                  " yields freqdrop score of " + str(frqdrop))
            allscores.append(frqdrop)
            if frqdrop > max_text_score:
                max_text_score = frqdrop
                optimal_num_clusters = num_clusters
            num_clusters += 1
        for x in range(len(allscores)):
            print("k=" + str(start + x) + " cleanliness=" + str(allscores[x]))
        print("k with highest cleanliness score:", str(optimal_num_clusters))
    batches = get_lm_batches(x, vocab.word2id, args.batch_size)
    tot_loss, n_words = 0, 0

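    # Sum the model's total loss and the word count (from the batch weights) over all
    # batches; the value returned below, exp(average loss per word), is the perplexity.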
    for batch in batches:
        tot_loss += sess.run(model.tot_loss,
            feed_dict={model.batch_size: batch['size'],
                       model.inputs: batch['inputs'],
                       model.targets: batch['targets'],
                       model.weights: batch['weights'],
                       model.dropout: 1})
        n_words += np.sum(batch['weights'])

    return np.exp(tot_loss / n_words)

if __name__ == '__main__':
    args = load_arguments()

    if args.train:
        train = load_sent(args.train)

        if not os.path.isfile(args.vocab):
            build_vocab(train, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print 'vocabulary size', vocab.size

    if args.dev:
        dev = load_sent(args.dev)

    if args.test:
        test = load_sent(args.test)
Example #10
                               str(test_prediction_scores[k]) + '\n')
        summary_file.write('\nParameters:\n')
        for key in options.IMPORTANT_PARAMS:
            summary_file.write(str(key) + ": " + str(vars(args)[key]) + "\n")
    print(f'Load Time: {asMinutes(TRAIN_START_TIME - LOAD_START_TIME)}')
    print(f'Train Time: {asMinutes(EVAL_START_TIME - TRAIN_START_TIME)}')
    print(f'Eval Time: {asMinutes(time() - EVAL_START_TIME)}')
    print(f'Total Time: {asMinutes(time() - LOAD_START_TIME)}')


def get_user_yesno_answer(question):
    answer = input(question + '(y/n)')
    if answer == 'y':
        return True
    if answer == 'n':
        return False
    print("Please answer 'y' or 'n'")
    return get_user_yesno_answer(question)


if __name__ == "__main__":
    ARGS = options.load_arguments()

    import torch
    torch.manual_seed(ARGS.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    os.environ['CUDA_VISIBLE_DEVICES'] = ARGS.cuda_visible_devices
    import numpy as np
    np.random.seed(ARGS.seed)
    main(ARGS)