Code example #1
    data_files = [path + 'data/train.txt', path + 'data/test.txt']

    idx_files = [
        path + 'data/words_vocab.pkl', path + 'data/characters_vocab.pkl'
    ]

    tensor_files = [[
        path + 'data/train_word_tensor.npy',
        path + 'data/valid_word_tensor.npy'
    ],
                    [
                        path + 'data/train_character_tensor.npy',
                        path + 'data/valid_character_tensor.npy'
                    ]]
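    # tensor_files above: [word-level tensors, character-level tensors], each given as [train, valid] .npy files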

    batch_loader = BatchLoader(data_files, idx_files, tensor_files, path)
    parameters = Parameters(batch_loader.max_word_len,
                            batch_loader.max_seq_len,
                            batch_loader.words_vocab_size,
                            batch_loader.chars_vocab_size)
    ''' ============================= BatchLoader loading ===============================================
    '''

    data_files = [
        path + 'data/super/train_2.txt', path + 'data/super/test_2.txt'
    ]

    idx_files = [
        path + 'data/super/words_vocab_2.pkl',
        path + 'data/super/characters_vocab_2.pkl'
    ]
Code example #2
def main():
    parser = argparse.ArgumentParser(description='Paraphraser')
    parser.add_argument('--use-cuda',
                        type=bool,
                        default=False,
                        metavar='CUDA',
                        help='use cuda (default: False)')
    parser.add_argument('--seq-len',
                        type=int,
                        default=30,
                        metavar='SL',
                        help='max length of sequence (default: 30)')
    parser.add_argument('--ml',
                        type=bool,
                        default=True,
                        metavar='ML',
                        help='sample by maximum likelihood')

    args = parser.parse_args()

    # Read data
    if not os.path.exists('datasets/human_test.csv'):
        source_file = 'datasets/test.csv'
        source_data = pd.read_csv(source_file)[['question1', 'question2']]
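        # bucket question pairs by the word count of the cleaned question1: <6, 6-10, 11-15, 16-20, >20 words (20 pairs sampled per bucket below)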
        sentence_categories = [[] for _ in range(5)]
        for i in range(len(source_data)):

            sent = clean_str(source_data['question1'][i])
            sent_len = len(sent.split())
            if sent_len < 6:
                j = 0
            elif sent_len < 11:
                j = 1
            elif sent_len < 16:
                j = 2
            elif sent_len < 21:
                j = 3
            else:
                j = 4
            sentence_categories[j].append(
                [source_data['question1'][i], source_data['question2'][i]])

        sample_data = []
        for category in sentence_categories:
            sample_data += random.sample(category, 20)
        source_data = pd.DataFrame(sample_data,
                                   columns=['question1', 'question2'])
        source_data.to_csv('datasets/human_test.csv')
    else:
        source_data = pd.read_csv('datasets/human_test_1.csv')[[
            'question1', 'question2'
        ]]

    # Sample from Guptas original model
    batch_loader = BatchLoader()
    from model.parameters import Parameters
    parameters = Parameters(batch_loader.max_seq_len, batch_loader.vocab_size)
    paraphraser = Paraphraser(parameters)
    paraphraser.load_state_dict(
        t.load('saved_models/trained_paraphraser_ori_32',
               map_location=t.device('cpu')))

    samples_ori, target, source_ori = sample_with_input(
        batch_loader,
        paraphraser,
        args,
        decoder_only=True,
        file_name='datasets/human_test.csv')

    ref_items = generate_items(source_ori, target, 'ref')
    ori_items = generate_items(source_ori, samples_ori[0], 'ori')

    # Sample from Guptas model with two-path-loss
    batch_loader = BatchLoader()
    parameters = Parameters(batch_loader.max_seq_len,
                            batch_loader.vocab_size,
                            use_two_path_loss=True)
    paraphraser = Paraphraser(parameters)
    paraphraser.load_state_dict(
        t.load('saved_models/trained_paraphraser_tpl_16_32',
               map_location=t.device('cpu')))

    samples_tpl, target, source_tpl = sample_with_input(
        batch_loader,
        paraphraser,
        args,
        decoder_only=False,
        file_name='datasets/human_test.csv')
    tpl_items = generate_items(source_tpl, samples_tpl[0], 'tpl')

    # Sample from GAN model
    batch_loader = BatchLoader()
    from model.parametersGAN import Parameters
    parameters = Parameters(batch_loader.max_seq_len, batch_loader.vocab_size)
    paraphraser = Generator(parameters)
    paraphraser.load_state_dict(
        t.load('saved_models/trained_generator_gan_140k',
               map_location=t.device('cpu')))
    samples_gan, target, source_gan = sample_with_input(
        batch_loader,
        paraphraser,
        args,
        decoder_only=False,
        file_name='datasets/human_test.csv')
    gan_items = generate_items(source_gan, samples_gan[0], 'gan')

    # Sample from synonym model
    paraphraser = SynonymParaphraser()
    samples_synonym = paraphraser.generate_paraphrases(
        'datasets/human_test.csv')
    base_items = generate_items(source_data['question1'], samples_synonym,
                                'base')

    all_items = ref_items + ori_items + tpl_items + gan_items + base_items

    eval_results = {
        'name': 'Paraphrase Survey Full Ordered',
        'items': all_items
    }
    res = json.dumps(eval_results, ensure_ascii=False)
    with open('datasets/human_test_ordered.json', 'w') as f:
        f.write(res)

    random.shuffle(all_items)

    eval_results = {
        'name': 'Paraphrase Survey Full Shuffled',
        'items': all_items
    }
    res = json.dumps(eval_results, ensure_ascii=False)
    with open('datasets/human_test_shuffled.json', 'w') as f:
        f.write(res)

    for i in range(10):
        eval_results = {
            'name': f'Paraphrase Survey Part {i+1}/{10}',
            'items': all_items[i * 50:(i + 1) * 50]
        }
        res = json.dumps(eval_results, ensure_ascii=False)
        with open(f'datasets/human_test_p_{i}_{10}.json', 'w') as f:
            f.write(res)
Code example #3
    idx_files = [
        path + f"data/words_vocab_{args.embeddings_name}.pkl",
        path + f"data/characters_vocab_{args.embeddings_name}.pkl",
    ]

    tensor_files = [
        [path + f"data/test_word_tensor_{args.embeddings_name}.npy"],
        [path + f"data/test_character_tensor_{args.embeddings_name}.npy"],
    ]

    preprocessor = PreProcessor(idx_files)
    preprocessor.preprocess_data(data_files, idx_files, tensor_files,
                                 args.use_file, str)

    batch_loader = BatchLoader(data_files, idx_files, tensor_files)
    parameters = Parameters(
        batch_loader.max_word_len,
        batch_loader.max_seq_len,
        batch_loader.words_vocab_size,
        batch_loader.chars_vocab_size,
        args.embeddings_name,
        args.res_model,
        args.hrvae,
        args.wae,
    )
    """ ============================ BatchLoader for Question-2 ===============================================
    """
    data_files = [path + f"data/super/train_{args.data_name}_2.txt"]

    idx_files = [
Code example #4
        path + f"data/words_vocab_{args.embeddings_name}.pkl",
        path + f"data/characters_vocab_{args.embeddings_name}.pkl",
    ]

    tensor_files = [
        [
            path + f"data/train_word_tensor_{args.embeddings_name}.npy",
            path + f"data/valid_word_tensor_{args.embeddings_name}.npy",
        ],
        [
            path + f"data/train_character_tensor_{args.embeddings_name}.npy",
            path + f"data/valid_character_tensor_{args.embeddings_name}.npy",
        ],
    ]

    batch_loader = BatchLoader(data_files, idx_files, tensor_files, path)
    parameters = Parameters(
        batch_loader.max_word_len,
        batch_loader.max_seq_len,
        batch_loader.words_vocab_size,
        batch_loader.chars_vocab_size,
        args.embeddings_name,
        args.res_model,
        args.hrvae,
        args.wae,
    )

    """ =================== Doing the same for encoder-2 ===============================================
    """
    data_files = [path + f"data/super/train_{args.data_name}_2.txt", path + f"data/super/test_{args.data_name}_2.txt"]
Code example #5
    parser.add_argument('--num-sample',
                        type=int,
                        default=5,
                        metavar='NS',
                        help='num sample (default: 5)')
    parser.add_argument('--use-cuda',
                        type=bool,
                        default=True,
                        metavar='CUDA',
                        help='use cuda (default: True)')
    args = parser.parse_args()

    prefix = 'poem'
    word_is_char = True

    batch_loader = BatchLoader('', prefix, word_is_char)
    params = Parameters(batch_loader.max_word_len, batch_loader.max_seq_len,
                        batch_loader.words_vocab_size,
                        batch_loader.chars_vocab_size, word_is_char)

    neg_loss = NEG_loss(params.word_vocab_size, params.word_embed_size)
    if args.use_cuda and t.cuda.is_available():
        neg_loss = neg_loss.cuda()

    # NEG_loss is defined over two embedding matrices of shape [params.word_vocab_size, params.word_embed_size]
    optimizer = SGD(neg_loss.parameters(), 0.1)

    for iteration in range(args.num_iterations):

        input_idx, target_idx = batch_loader.next_embedding_seq(
            args.batch_size)
Code example #6
from selfModules.neg import NEG_loss

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='word2vec')
    parser.add_argument('--num-iterations', type=int, default=1000000, metavar='NI',
                        help='num iterations (default: 1000000)')
    parser.add_argument('--batch-size', type=int, default=10, metavar='BS',
                        help='batch size (default: 10)')
    parser.add_argument('--num-sample', type=int, default=5, metavar='NS',
                        help='num sample (default: 5)')
    parser.add_argument('--use-cuda', type=bool, default=True, metavar='CUDA',
                        help='use cuda (default: True)')
    args = parser.parse_args()

    batch_loader = BatchLoader('')
    params = Parameters(batch_loader.max_word_len,
                        batch_loader.max_seq_len,
                        batch_loader.words_vocab_size,
                        batch_loader.chars_vocab_size)

    neg_loss = NEG_loss(params.word_vocab_size, params.word_embed_size)
    if args.use_cuda:
        neg_loss = neg_loss.cuda()

    # NEG_loss is defined over two embedding matrices of shape [params.word_vocab_size, params.word_embed_size]
    optimizer = SGD(neg_loss.parameters(), 0.1)

    for iteration in range(args.num_iterations):

        input_idx, target_idx = batch_loader.next_embedding_seq(args.batch_size)
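The loop in code example #6 is truncated here. A rough sketch of how an iteration typically continues in these word2vec scripts is shown below; it assumes torch is imported as t and numpy as np, that NEG_loss.forward takes (input, target, num_sampled), and that the module exposes an input_embeddings() accessor; none of these is shown in the snippet itself.

        # hedged sketch of the remainder of the loop body (assumptions noted above)
        input = t.from_numpy(input_idx).long()
        target = t.from_numpy(target_idx).long()
        if args.use_cuda:
            input, target = input.cuda(), target.cuda()

        # negative-sampling loss, averaged over the batch
        out = neg_loss(input, target, args.num_sample).mean()

        optimizer.zero_grad()
        out.backward()
        optimizer.step()

        if iteration % 500 == 0:
            print('iteration = {}, loss = {}'.format(iteration, out.item()))

    # the learned input embedding matrix would then be saved for later use
    word_embeddings = neg_loss.input_embeddings()
    np.save('data/word_embeddings.npy', word_embeddings)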
Code example #7
                        default=20,
                        metavar='BS',
                        help='batch size (default: 20)')
    parser.add_argument('--num-sample',
                        type=int,
                        default=14,
                        metavar='NS',
                        help='num sample (default: 14)')
    parser.add_argument('--use-cuda',
                        type=bool,
                        default=False,
                        metavar='CUDA',
                        help='use cuda (default: False)')
    args = parser.parse_args()

    batch_loader = BatchLoader('')
    params = Parameters(batch_loader.max_seq_len, batch_loader.vocab_size)

    neg_loss = NEG_loss(
        params.vocab_size,
        params.word_embed_size,
        weights=[1 - sqrt(5e-5 / i) for i in batch_loader.words_freq])
    if args.use_cuda:
        neg_loss = neg_loss.cuda()
    """NEG_loss is defined over two embedding matrixes with shape of [params.vocab_size, params.word_embed_size]"""
    optimizer = SGD(neg_loss.parameters(), 0.1)

    for iteration in range(args.num_iterations):

        input_idx, target_idx = batch_loader.next_embedding_seq(
            args.batch_size)
Code example #8
    parser.add_argument('--use-cuda', type=bool, default=False, metavar='CUDA',
                        help='use cuda (default: False)')
    parser.add_argument('--learning-rate', type=float, default=0.00005, metavar='LR',
                        help='learning rate (default: 0.00005)')
    parser.add_argument('--dropout', type=float, default=0.3, metavar='DR',
                        help='dropout (default: 0.3)')
    parser.add_argument('--use-trained', type=bool, default=False, metavar='UT',
                        help='load pretrained model (default: False)')
    parser.add_argument('--ce-result', default='', metavar='CE',
                        help="ce result path (default: '')")
    parser.add_argument('--kld-result', default='', metavar='KLD',
                        help="kld result path (default: '')")

    args = parser.parse_args()

    batch_loader = BatchLoader('')
    parameters = Parameters(batch_loader.max_word_len,
                            batch_loader.max_seq_len,
                            batch_loader.words_vocab_size,
                            batch_loader.chars_vocab_size)

    rvae = RVAE(parameters)
    if args.use_trained:
        rvae.load_state_dict(t.load('trained_RVAE'))
    if args.use_cuda:
        rvae = rvae.cuda()

    optimizer = Adam(rvae.learnable_parameters(), args.learning_rate)

    train_step = rvae.trainer(optimizer)
    # validate = rvae.validater()
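The snippet stops right after building the training closure. A minimal sketch of how the loop that usually follows could look, assuming train_step takes (iteration, batch_size, use_cuda, dropout) and returns the cross-entropy, KLD and KL coefficient, and that --num-iterations and --batch-size are defined in the truncated part of the argument parser:

    # hedged sketch, not part of the original snippet
    ce_result, kld_result = [], []
    for iteration in range(args.num_iterations):
        cross_entropy, kld, coef = train_step(iteration, args.batch_size,
                                              args.use_cuda, args.dropout)
        if iteration % 100 == 0:
            ce_result.append(cross_entropy)
            kld_result.append(kld)
            print('iteration {}: CE = {}, KLD = {}, kld-coef = {}'.format(
                iteration, cross_entropy, kld, coef))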
Code example #9
    parser = argparse.ArgumentParser(description='word2vec')
    parser.add_argument('--num-iterations', type=int, default=1000000, metavar='NI',
                        help='num iterations (default: 1000000)')
    parser.add_argument('--batch-size', type=int, default=10, metavar='BS',
                        help='batch size (default: 10)')
    parser.add_argument('--num-sample', type=int, default=5, metavar='NS',
                        help='num sample (default: 5)')
    parser.add_argument('--use-cuda', type=bool, default=True, metavar='CUDA',
                        help='use cuda (default: True)')
    parser.add_argument('--train-data', default='', metavar='TD',
                        help="load custom training dataset (default: '')")

    args = parser.parse_args()

    batch_loader = BatchLoader(path='',
                               custom_index=False,
                               train_data_name=args.train_data)

    params = Parameters(batch_loader.max_word_len,
                        batch_loader.max_seq_len,
                        batch_loader.words_vocab_size,
                        batch_loader.chars_vocab_size)

    neg_loss = NEG_loss(params.word_vocab_size, params.word_embed_size)
    if args.use_cuda:
        neg_loss = neg_loss.cuda()

    # NEG_loss is defined over two embedding matrices of shape [params.word_vocab_size, params.word_embed_size]
    optimizer = SGD(neg_loss.parameters(), 0.1)

    for iteration in range(args.num_iterations):
Code example #10
        data_files = [path + 'data/train.txt', path + 'data/test.txt']

        idx_files = [
            path + 'data/words_vocab.pkl', path + 'data/characters_vocab.pkl'
        ]

        tensor_files = [[
            path + 'data/train_word_tensor.npy',
            path + 'data/valid_word_tensor.npy'
        ],
                        [
                            path + 'data/train_character_tensor.npy',
                            path + 'data/valid_character_tensor.npy'
                        ]]

        batch_loader = BatchLoader(data_files, idx_files, tensor_files, path)
        batch_loader.load_preprocessed(data_files, idx_files, tensor_files)
        parameters = para(batch_loader.max_word_len, batch_loader.max_seq_len,
                          batch_loader.words_vocab_size,
                          batch_loader.chars_vocab_size)
        ''' =================== Doing the same for encoder-2 ===============================================
        '''
        data_files = [
            path + 'data/super/train_2.txt', path + 'data/super/test_2.txt'
        ]

        idx_files = [
            path + 'data/super/words_vocab_2.pkl',
            path + 'data/super/characters_vocab_2.pkl'
        ]
Code example #11
def sample(args):
    # Create locations to store samples
    if not os.path.isdir('logs/' + args.model_name + '/samples'):
        os.mkdir('logs/' + args.model_name + '/samples')

    batch_loader = BatchLoader()
    # Load model...
    if (('ori' in args.model_name.lower()
         and 'gan' not in args.model_name.lower())
            or 'tpl' in args.model_name.lower()):
        from model.parameters import Parameters
        parameters = Parameters(batch_loader.max_seq_len,
                                batch_loader.vocab_size,
                                use_two_path_loss=('tpl'
                                                   in args.model_name.lower()))
        paraphraser = Paraphraser(parameters)
        if args.use_cuda:
            paraphraser.load_state_dict(
                t.load('saved_models/trained_paraphraser_' + args.model_name,
                       map_location=t.device('cuda:0')))
        else:
            paraphraser.load_state_dict(
                t.load('saved_models/trained_paraphraser_' + args.model_name,
                       map_location=t.device('cpu')))
    elif 'gan' in args.model_name.lower():
        from model.parametersGAN import Parameters
        parameters = Parameters(batch_loader.max_seq_len,
                                batch_loader.vocab_size)
        paraphraser = Generator(parameters)
        if args.use_cuda:
            paraphraser.load_state_dict(
                t.load('saved_models/trained_generator_' + args.model_name,
                       map_location=t.device('cuda:0')))
        else:
            paraphraser.load_state_dict(
                t.load('saved_models/trained_generator_' + args.model_name,
                       map_location=t.device('cpu')))
    if args.beam:
        samples, target, source = sample_with_beam(
            batch_loader,
            paraphraser,
            args,
            decoder_only=('ori' in args.model_name.lower()
                          and not 'gan' in args.model_name.lower()),
            beam_size=(args.num_samples if args.num_samples != 1 else 5))
        for i in range(args.num_samples):
            np.savetxt(f'logs/{args.model_name}/samples/sampled_beam_{i}.txt',
                       np.array(samples[i]),
                       delimiter='\n',
                       fmt='%s')
        np.savetxt(f'logs/{args.model_name}/samples/target_beam.txt',
                   np.array(target),
                   delimiter='\n',
                   fmt='%s')
        np.savetxt(f'logs/{args.model_name}/samples/source_beam.txt',
                   np.array(source),
                   delimiter='\n',
                   fmt='%s')
    else:
        samples, target, source = sample_with_input(
            batch_loader,
            paraphraser,
            args,
            decoder_only=('ori' in args.model_name.lower()
                          and not 'gan' in args.model_name.lower()),
            num_samples=args.num_samples,
            ml=args.ml)
        for i in range(args.num_samples):
            np.savetxt(f'logs/{args.model_name}/samples/sampled' +
                       ('_ml' if args.ml else '_s') + f'_{i}.txt',
                       np.array(samples[i]),
                       delimiter='\n',
                       fmt='%s')
        np.savetxt(f'logs/{args.model_name}/samples/target' +
                   ('_ml' if args.ml else '_s') + '.txt',
                   np.array(target),
                   delimiter='\n',
                   fmt='%s')
        np.savetxt(f'logs/{args.model_name}/samples/source' +
                   ('_ml' if args.ml else '_s') + '.txt',
                   np.array(source),
                   delimiter='\n',
                   fmt='%s')
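For context, a hypothetical driver for sample() is sketched below. The argument set is inferred from the attributes the function reads off args, and the defaults are illustrative only. Note that argparse's type=bool treats any non-empty string as True, so boolean flags like these are best left at their defaults or set programmatically.

# hypothetical usage; argument names inferred from how sample() uses args
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Sampler')
    parser.add_argument('--model-name', default='ori_32', metavar='MN')
    parser.add_argument('--use-cuda', type=bool, default=False, metavar='CUDA')
    parser.add_argument('--num-samples', type=int, default=5, metavar='NS')
    parser.add_argument('--beam', type=bool, default=False, metavar='B')
    parser.add_argument('--ml', type=bool, default=True, metavar='ML')
    sample(parser.parse_args())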
Code example #12
    # getting validation data
    train_images, train_labels, valid_images, valid_labels = train_test_split( (train_images, train_labels), n=10000 )
    
    # flattening images
    dim = train_images.shape[1] * train_images.shape[2]
    train_images = train_images.reshape(-1, dim)
    valid_images = valid_images.reshape(-1, dim)
    test_images = test_images.reshape(-1, dim)

    # reformat labels to be a binary classifier (even and odd)
    train_labels = np.vectorize(lambda n: 1 if n % 2 == 0 else -1)(train_labels)
    valid_labels = np.vectorize(lambda n: 1 if n % 2 == 0 else -1)(valid_labels)
    test_labels = np.vectorize(lambda n: 1 if n % 2 == 0 else -1)(test_labels)

    train_loader = BatchLoader((train_images, train_labels), batch_size=1, seed=seed)
    valid_loader = BatchLoader((valid_images, valid_labels), batch_size=None, seed=seed)
    test_loader = BatchLoader((test_images, test_labels), batch_size=None, seed=seed)
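    # batch_size=1 gives online, per-example perceptron updates; batch_size=None presumably yields the whole split as a single batch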

    """
    NN = PLA(dim)
    for i, (images, labels) in enumerate(test_loader):
        print(NN.predict(images))
        break
    """

    # creating model
    NN = PLA(dim, seed=seed)
    statistics = NN.train(train_loader, valid_loader, test_loader, epochs=5, eval_freq=10000, plot=False)

    # printing statistics
Code example #13
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
from utils.batch_loader import BatchLoader

if __name__ == "__main__":
    if not os.path.exists('../data/preprocessings/word_embeddings.npy'):
        raise FileNotFoundError("word embeddings file wasn't found")

    pca = TSNE(n_components=2)
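    # note: the variable is named 'pca', but this is a t-SNE projection down to two components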
    word_embeddings = np.load('../data/preprocessings/word_embeddings.npy')
    word_embeddings_pca = pca.fit_transform(word_embeddings)

    batch_loader = BatchLoader('../')
    words = batch_loader.idx_to_word

    fig, ax = plt.subplots()
    fig.set_size_inches(150, 150)
    x = word_embeddings_pca[:, 0]
    y = word_embeddings_pca[:, 1]
    ax.scatter(x, y)

    for i, word in enumerate(words):
        ax.annotate(word, (x[i], y[i]))

    fig.savefig('word_embedding.png', dpi=100)
Code example #14
File: train.py Project: AbdollahRida/RGAN-1
                        default=False,
                        metavar='CUDA',
                        help='use cuda (default: False)')
    parser.add_argument('--learning-rate',
                        type=float,
                        default=5e-5,
                        metavar='LR',
                        help='learning rate (default: 5e-5)')
    parser.add_argument('--use-trained',
                        type=bool,
                        default=False,
                        metavar='UT',
                        help='load pretrained model (default: False)')
    args = parser.parse_args()

    batch_loader = BatchLoader('')
    parameters = Parameters(batch_loader.max_word_len,
                            batch_loader.max_seq_len, batch_loader.vocab_size)

    rgan = RGAN(parameters)
    if args.use_trained:
        rgan.load_state_dict(t.load('trained_RGAN'))
    if args.use_cuda:
        rgan = rgan.cuda()

    g_optimizer = Adam(rgan.generator.parameters(), args.learning_rate)
    d_optimizer = Adam(rgan.discriminator.parameters(), args.learning_rate)

    for iteration in range(args.num_iterations):
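        # WGAN-style schedule: five discriminator updates per generator update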
        for _ in range(5):
            '''Discriminator forward-loss-backward-update'''
Code example #15
                    help='include the snli dataset (default: True)')
    parser.add_argument('--use-coco', default=False, type=bool, metavar='coco',
                    help='include the mscoco dataset (default: False)')

    args = parser.parse_args()
    datasets = set()
    if args.use_quora is True:
        datasets.add('quora')
    if args.use_snli is True:
        datasets.add('snli')
    if args.use_coco is True:
        datasets.add('mscoco')

    print('use mean', args.use_mean)

    batch_loader = BatchLoader(datasets=datasets)
    parameters = Parameters(batch_loader.max_seq_len,
                            batch_loader.vocab_size)

    paraphraser = Paraphraser(parameters)
    paraphraser.load_state_dict(t.load('saved_models/trained_paraphraser_' + args.model_name))
    # load the model's structure information yourself
    if args.use_cuda:
        paraphraser = paraphraser.cuda()

    result, target, source = sample_with_input_file(batch_loader, paraphraser, args, args.input_file)

    if args.input_file not in ['snli_test', 'mscoco_test', 'quora_test', 'snips']:
        args.input_file = 'custom_file'

    sampled_file_dst = 'logs/sampled_out_{}_{}{}.txt'.format(args.input_file,
Code example #16
    data_files = [path + 'data/train.txt', path + 'data/test.txt']

    idx_files = [
        path + 'data/words_vocab.pkl', path + 'data/characters_vocab.pkl'
    ]

    tensor_files = [[
        path + 'data/train_word_tensor.npy',
        path + 'data/valid_word_tensor.npy'
    ],
                    [
                        path + 'data/train_character_tensor.npy',
                        path + 'data/valid_character_tensor.npy'
                    ]]

    batch_loader = BatchLoader(data_files, idx_files, tensor_files, path)
    parameters = Parameters(batch_loader.max_word_len,
                            batch_loader.max_seq_len,
                            batch_loader.words_vocab_size,
                            batch_loader.chars_vocab_size)
    ''' =================== Doing the same for encoder-2 ===============================================
    '''
    data_files = [
        path + 'data/super/train_2.txt', path + 'data/super/test_2.txt'
    ]

    idx_files = [
        path + 'data/super/words_vocab_2.pkl',
        path + 'data/super/characters_vocab_2.pkl'
    ]
Code example #17
import os

import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA

from utils.batch_loader import BatchLoader

if __name__ == "__main__":

    prefix = 'poem'
    word_is_char = True

    batch_loader = BatchLoader('', prefix, word_is_char)

    if not os.path.exists('data/' + batch_loader.prefix +
                          'word_embeddings.npy'):
        raise FileNotFoundError("word embeddings file wasn't found")

    pca = PCA(n_components=2)
    word_embeddings = np.load('data/' + batch_loader.prefix +
                              'word_embeddings.npy')
    word_embeddings_pca = pca.fit_transform(word_embeddings)

    words = batch_loader.idx_to_word

    fig, ax = plt.subplots()
    fig.set_size_inches(150, 150)
    x = word_embeddings_pca[:, 0]
    y = word_embeddings_pca[:, 1]
    ax.scatter(x, y)
Code example #18
        data_files = [args.train_file]
        data = [open(file, "r").read() for file in data_files]

    elif args.train_file.endswith('.pkl'):
        data_files = args.train_file.split(',')
        print(data_files)
        data = [pkl.load(open(file, "rb")) for file in data_files]
        sentence_array = True

    idx_files = [
        args.save_at + '/words_vocab.pkl',
        args.save_at + '/characters_vocab.pkl'
    ]

    batch_loader = BatchLoader(data,
                               idx_files,
                               train_embedding=True,
                               sentence_array=sentence_array)

    params = Parameters(batch_loader.max_word_len, batch_loader.max_seq_len,
                        batch_loader.words_vocab_size,
                        batch_loader.chars_vocab_size)

    neg_loss = NEG_loss(params.word_vocab_size, params.word_embed_size)
    if args.use_cuda:
        neg_loss = neg_loss.cuda()

    # NEG_loss is defined over two embedding matrices of shape [params.word_vocab_size, params.word_embed_size]
    optimizer = SGD(neg_loss.parameters(), 0.1)

    for iteration in range(args.num_iterations):
Code example #19
        str = input("Input: ")
    else:
        file_1 = open(args.test_file, 'r')
        data = file_1.readlines()

    data_files = [args.test_file]

    idx_files = ['data/words_vocab.pkl',
                 'data/characters_vocab.pkl']

    tensor_files = [['data/test_word_tensor.npy'],
                    ['data/test_character_tensor.npy']]

    preprocess_data(data_files, idx_files, tensor_files, args.use_file, str)

    batch_loader = BatchLoader(data_files, idx_files, tensor_files)
    parameters = Parameters(batch_loader.max_word_len,
                            batch_loader.max_seq_len,
                            batch_loader.words_vocab_size,
                            batch_loader.chars_vocab_size)

    data_files = ['data/super/train_2.txt']

    idx_files = ['data/super/words_vocab_2.pkl',
                 'data/super/characters_vocab_2.pkl']

    tensor_files = [['data/super/train_word_tensor_2.npy'],
                    ['data/super/train_character_tensor_2.npy']]
    batch_loader_2 = BatchLoader(data_files, idx_files, tensor_files)
    parameters_2 = Parameters(batch_loader_2.max_word_len,
                              batch_loader_2.max_seq_len,
Code example #20
File: prepare_corpus.py Project: xushenkun/vae
def generate_tensor_file(prefix, word_is_char, gen_tensors):
    batch_loader = BatchLoader('', prefix, word_is_char, gen_tensors)

    path = ''

    data_files = [path + 'data/train.txt',
                  path + 'data/test.txt']

    idx_files = [path + 'data/words_vocab.pkl',
                 path + 'data/characters_vocab.pkl']

    tensor_files = [[path + 'data/train_word_tensor.npy',
                     path + 'data/valid_word_tensor.npy'],
                    [path + 'data/train_character_tensor.npy',
                     path + 'data/valid_character_tensor.npy']]

    batch_loader = BatchLoader(data_files, idx_files, tensor_files, path)
    
    # batch_loader = BatchLoader('')
    params = Parameters(batch_loader.max_word_len,
                        batch_loader.max_seq_len,
                        batch_loader.words_vocab_size,
                        batch_loader.chars_vocab_size)

    neg_loss = NEG_loss(params.word_vocab_size, params.word_embed_size)
    if args.use_cuda:
        neg_loss = neg_loss.cuda()

    optimizer = SGD(neg_loss.parameters(), 0.1)

    for iteration in range(args.num_iterations):
Code example #22
    parser.add_argument('--embedding-size',
                        type=int,
                        default=100,
                        metavar='ES',
                        help='embeddings size (default: 100)')
    parser.add_argument('--interm-sampling',
                        type=bool,
                        default=False,
                        metavar='IS',
                        help='if sample while training (default: False)')

    args = parser.parse_args()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    batch_loader = BatchLoader(datapath=args.data_path,
                               use_glove=args.use_glove,
                               glove_path=args.glove_path,
                               embedding_size=args.embedding_size)
    parameters = Parameters(batch_loader.max_seq_len, batch_loader.vocab_size,
                            batch_loader.embedding_size)
    paraphraser = Paraphraser(parameters, device).to(device)
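    # the lists below accumulate the cross-entropy (reconstruction) and KLD terms for train and validation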

    cross_entropy_result_train = []
    kld_result_train = []
    cross_entropy_result_valid = []
    kld_result_valid = []
    cross_entropy_cur_train = []
    kld_cur_train = []

    if args.use_trained:
        # load the pretrained model
        paraphraser.load_state_dict(torch.load(args.pretrained_model_name))
Code example #23
                        help='num samplings (default: 10)')
    parser.add_argument('--sample-data',
                        default='',
                        metavar='TD',
                        help="load custom training dataset (default: '')")
    parser.add_argument('--model-name',
                        default='',
                        metavar='TD',
                        help="name of saved model (default: '')")

    args = parser.parse_args()

    batch_loader = BatchLoader('',
                               custom_index=True,
                               train_data_name=args.sample_data)
    parameters = Parameters(batch_loader.max_word_len,
                            batch_loader.max_seq_len,
                            batch_loader.words_vocab_size,
                            batch_loader.chars_vocab_size)

    rvae = RVAE(parameters)
    rvae.load_state_dict(t.load('./trained_model/{}'.format(args.model_name)))
    if args.use_cuda:
        rvae = rvae.cuda()

    sampler = rvae.latent_sampler(batch_loader)

    zs = {}
    for i in range(
Code example #24
                        help='num samplings (default: 10)')
    parser.add_argument('--model-name',
                        default='',
                        metavar='TD',
                        help="name of saved model (default: '')")
    parser.add_argument('--train-data',
                        default='',
                        metavar='TD',
                        help="load custom training dataset (default: '')")

    args = parser.parse_args()

    batch_loader = BatchLoader(path='',
                               custom_index=False,
                               train_data_name=args.train_data)

    parameters = Parameters(batch_loader.max_word_len,
                            batch_loader.max_seq_len,
                            batch_loader.words_vocab_size,
                            batch_loader.chars_vocab_size)

    rvae = RVAE(parameters)
    rvae.load_state_dict(t.load('./trained_model/{}'.format(args.model_name)))
    if args.use_cuda:
        rvae = rvae.cuda()

    sents = []
    seeds = {}
Code example #25
    parser = argparse.ArgumentParser(description='Sampler')
    parser.add_argument('--use-cuda',
                        type=bool,
                        default=True,
                        metavar='CUDA',
                        help='use cuda (default: True)')
    parser.add_argument('--num-sample',
                        type=int,
                        default=10,
                        metavar='NS',
                        help='num samplings (default: 10)')

    args = parser.parse_args()

    batch_loader = BatchLoader('')
    parameters = Parameters(batch_loader.max_word_len,
                            batch_loader.max_seq_len,
                            batch_loader.words_vocab_size,
                            batch_loader.chars_vocab_size)

    rvae = RVAE_dilated(parameters)
    rvae.load_state_dict(t.load('trained_RVAE'))
    if args.use_cuda:
        rvae = rvae.cuda()

    for iteration in range(args.num_sample):
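        # draw z ~ N(0, I) from the latent prior and decode a sequence of up to 50 tokens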
        seed = np.random.normal(size=[1, parameters.latent_variable_size])
        result = rvae.sample(batch_loader, 50, seed, args.use_cuda)
        print(result)
        print()
Code example #26
    assert os.path.exists('trained_RVAE'), \
        'trained model not found'

    parser = argparse.ArgumentParser(description='Sampler')
    parser.add_argument('--use-cuda',
                        type=bool,
                        default=True,
                        metavar='CUDA',
                        help='use cuda (default: True)')
    # parser.add_argument('--num-sample', type=int, default=10, metavar='NS',
    #                     help='num samplings (default: 10)')

    args = parser.parse_args()

    batch_loader = BatchLoader('')
    parameters = Parameters(batch_loader.max_word_len,
                            batch_loader.max_seq_len,
                            batch_loader.words_vocab_size,
                            batch_loader.chars_vocab_size)

    rvae = RVAE(parameters)
    rvae.load_state_dict(torch.load('trained_RVAE'))
    if args.use_cuda:
        rvae = rvae.cuda()

    seq_len = 50
    seed = np.random.normal(size=[1, parameters.latent_variable_size])

    data = [["how are you ?"], ["how are you doing"]]
    data_words = [[line.split() for line in target] for target in data]