Example #1
def main():
    foldername = '/cs/labs/dshahaf/omribloch/data/text_lord/restorant/train/note_tiny_no_noise_dim_32_ntokens_5_nconv_10_nsamples_102400_content_noise_0.001/'
    # foldername = '/cs/labs/dshahaf/omribloch/data/text_lord/restorant/train/note_EM_no_noise_dim_32_ntokens_10_nconv_4_nsamples_1024_content_noise_0.0/'
    vocab_path = os.path.join(foldername, 'vocab.pickle')
    model_ckpt_path = os.path.join(foldername, 'last_checkpoint.ckpt')

    with open(vocab_path, 'rb') as file:
        vocab = pickle.load(file)
        print('vocab was loaded')

    decoder_dictionary = vocab_to_dictionary(vocab)

    device = 'cpu'
    nsamples = 102400
    ntokens = 5
    dim = 32
    content_noise = 0.001
    dropout = 0
    nconv = 10

    model = load_checkpoint(model_ckpt_path, device, device, nsamples,
                            decoder_dictionary.pad(), ntokens, dim,
                            content_noise, dropout, decoder_dictionary, 50,
                            nconv)

    print('model loaded')

    model.eval()

    dataset, vocab = get_dataset(
        10000, '/cs/labs/dshahaf/omribloch/data/text_lord/restorant/', vocab)

    for i in range(10):
        sid = dataset[i].id
        stars = dataset[i].stars
        # stars = 1
        review_sentence = ' '.join(dataset[i].review)
        print(review_sentence)
        decoded_sentence = gready_decode_single(model, vocab, stars, sid)
        print(decoded_sentence)
        decoded_sentence = gready_decode_single(model, vocab, 1 - stars, sid)
        print(decoded_sentence)
        print('-------------')

        decoded_sentence = beam_decode_single(model,
                                              vocab,
                                              sid,
                                              stars,
                                              topk=10,
                                              beam_width=4)
        for d in decoded_sentence:
            print(d)
        print('==============================')
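
For reference, here is a minimal sketch of the greedy decoding loop that gready_decode_single presumably performs: pick the most likely next token at each step, conditioned on the stars value and the sample id. The step_fn interface, the special-token indices, and the itos attribute on the vocab are illustrative assumptions, not the project's actual API.

import torch

def greedy_decode_sketch(step_fn, vocab, stars, sid, max_len=50,
                         bos_index=2, eos_index=3):
    # step_fn(tokens, stars, sid) is a hypothetical callable that returns
    # (1, len(tokens), vocab_size) logits over the next-token distribution.
    tokens = [bos_index]
    for _ in range(max_len):
        logits = step_fn(torch.tensor([tokens]), stars, sid)
        next_token = logits[0, -1].argmax().item()  # greedy: take the most likely token
        if next_token == eos_index:
            break
        tokens.append(next_token)
    # a torchtext-style vocab exposing an itos list is assumed here
    return ' '.join(vocab.itos[t] for t in tokens[1:])
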
Example #2
def main():
    # parse command line arguments
    parser = argparse.ArgumentParser(description='Train lord-seq2seq-convnet.')
    # session
    parser.add_argument('--note', type=str, help='a comment', required=True)
    parser.add_argument('--device',
                        type=str,
                        default='cpu',
                        help='cuda device: cuda:0 / cuda:1')
    parser.add_argument('--overwrite',
                        action='store_true',
                        help='delete old ckpt with this configuration')
    parser.add_argument(
        '--resume',
        action='store_true',
        help='resume training from an old ckpt with this configuration')
    parser.add_argument('--shuffle', action='store_true', help='shuffle input')
    parser.add_argument('--ckpt_every',
                        type=int,
                        default=25,
                        help='how many epochs between checkpoints')
    parser.add_argument(
        '--dir',
        type=str,
        default='/cs/labs/dshahaf/omribloch/data/text_lord/restorant/train',
        help='the script will create a directory here, named after the parameters')
    parser.add_argument(
        '--data_dir',
        type=str,
        default='/cs/labs/dshahaf/omribloch/data/text_lord/restorant/')
    parser.add_argument('-f', action='store_true')
    # training
    parser.add_argument('--batch_size',
                        type=int,
                        help='batch size',
                        required=True)
    parser.add_argument('--epochs',
                        type=int,
                        help='number of epochs to train',
                        required=True)
    parser.add_argument('--it',
                        type=int,
                        help='number of train-eval iterations',
                        required=True)
    parser.add_argument('--content_wdecay',
                        type=float,
                        help='weight decay for the content embedding',
                        required=True)
    parser.add_argument('--drop_connect',
                        type=float,
                        help='drop connect rate',
                        default=0)
    # model
    parser.add_argument('--dim',
                        type=int,
                        help='model dimension',
                        required=True)
    parser.add_argument(
        '--content_noise',
        type=float,
        help='standard deviation for the content embedding noise',
        required=True)
    parser.add_argument('--dropout',
                        type=float,
                        default=0.1,
                        help='embedding dropout')
    parser.add_argument('--lr', type=float, default=0.01, help='learning rate')
    parser.add_argument('--nsamples',
                        type=int,
                        default=300000,
                        help='number of examples to use')
    parser.add_argument('--ntokens',
                        type=int,
                        default=5,
                        help='number of latent input vectors')
    parser.add_argument(
        '--nconv',
        type=int,
        default=20,
        help='number of conv layers (default as in the original)')
    args = parser.parse_args()

    device = torch.device(args.device)
    if 'cuda' in args.device:
        torch.cuda.set_device(device)

    # create directory for checkpoints and logs
    foldername = os.path.join(args.dir, args_to_comment(args))
    print(foldername)

    folder_setup(args.overwrite, args.resume, foldername, force=args.f)

    # configure logger
    logger = configure_logger(os.path.join(foldername, 'trainer.log'))

    vocab_path = os.path.join(foldername, 'vocab.pickle')
    vocab = None
    if args.resume:
        with open(vocab_path, 'rb') as file:
            vocab = pickle.load(file)
            print('vocab was loaded')

    # create dataset
    dataset, vocab = get_dataset(args.nsamples, args.data_dir, vocab)
    logger.info(f'dataset loaded, vocab size is {len(vocab)}')

    # serialize the vocab object
    if not args.resume:
        with open(vocab_path, "wb") as file:
            pickle.dump(vocab, file)
            logger.info(f'vocab was pickled into {vocab_path}')

    # the dictionary is used for decoder construction but is never used after that.
    decoder_dictionary = vocab_to_dictionary(vocab)

    # build model
    if not args.resume:
        model = create_model(device, args.nsamples, decoder_dictionary.pad(),
                             args.ntokens, args.dim, args.content_noise,
                             args.dropout, decoder_dictionary, 50, args.nconv)
    else:
        model = load_checkpoint(
            os.path.join(foldername, 'last_checkpoint.ckpt'), device, device,
            args.nsamples, decoder_dictionary.pad(), args.ntokens, args.dim,
            args.content_noise, args.dropout, decoder_dictionary, 50, args.nconv)

    writer = SummaryWriter(log_dir=foldername, comment=args_to_comment(args))

    global_step = 0
    global_epoch = 0
    for it in range(args.it):
        logger.info('-- iteration {} --'.format(it))
        global_step, global_epoch = train(model,
                                          dataset,
                                          device,
                                          args.epochs,
                                          args.batch_size,
                                          decoder_dictionary.pad(),
                                          logger,
                                          args.content_wdecay,
                                          writer,
                                          foldername,
                                          global_step=global_step,
                                          global_epoch=global_epoch,
                                          shuffle=args.shuffle)
        evaluate(model,
                 vocab,
                 dataset,
                 10,
                 it,
                 logger,
                 writer,
                 device=device,
                 gready=False)
    print('finished')
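
args_to_comment is defined outside this snippet; judging from the checkpoint folder name used in Example #1, it concatenates selected arguments into the run's directory name. A minimal sketch consistent with that folder name (the exact field selection and order are assumptions and may differ between scripts):

def args_to_comment_sketch(args):
    # Hypothetical reconstruction: produces a name such as
    # 'note_tiny_no_noise_dim_32_ntokens_5_nconv_10_nsamples_102400_content_noise_0.001'
    fields = ['note', 'dim', 'ntokens', 'nconv', 'nsamples', 'content_noise']
    return '_'.join('{}_{}'.format(name, getattr(args, name)) for name in fields)
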
Example #3
def main():
    # parse command line arguments
    parser = argparse.ArgumentParser(description='Train lord-seq2seq-convnet.')
    # session
    parser.add_argument('--note', type=str, help='a comment', required=True)
    parser.add_argument('--device',
                        type=str,
                        default='cpu',
                        help='cuda device: cuda:0 / cuda:1')
    parser.add_argument('--overwrite',
                        action='store_true',
                        help='delete old ckpt with this configuration')
    parser.add_argument(
        '--resume',
        action='store_true',
        help='resume training from an old ckpt with this configuration')
    parser.add_argument('--ckpt_every',
                        type=int,
                        default=25,
                        help='how many epochs between checkpoints')
    parser.add_argument(
        '--dir',
        type=str,
        default=
        '/cs/labs/dshahaf/omribloch/data/text_lord/restorant/train/lstm',
        help='the script will create a directory here, named after the parameters')
    parser.add_argument(
        '--data_dir',
        type=str,
        default='/cs/labs/dshahaf/omribloch/data/text_lord/restorant/')
    # training
    parser.add_argument('--batch_size',
                        type=int,
                        help='batch size',
                        required=True)
    parser.add_argument('--epochs',
                        type=int,
                        help='number of epochs to train',
                        required=True)
    parser.add_argument('--content_wdecay',
                        type=float,
                        help='weight decay for the content embedding',
                        required=True)
    # model
    parser.add_argument('--dim',
                        type=int,
                        help='model dimension',
                        required=True)
    parser.add_argument(
        '--content_noise',
        type=float,
        help='standard deviation for the content embedding noise',
        required=True)
    parser.add_argument('--nsamples',
                        type=int,
                        default=300000,
                        help='number of examples to use')
    parser.add_argument('--nlayers',
                        type=int,
                        default=2,
                        help='number of lstm layers')

    args = parser.parse_args()

    if args.overwrite and args.resume:
        raise Exception("can't use overwrite and resume together!!!")

    device = torch.device(args.device)
    if 'cuda' in args.device:
        torch.cuda.set_device(device)

    # create directory for checkpoints and logs
    foldername = os.path.join(args.dir, args_to_comment(args))
    print(foldername)

    vocab = None
    model = None

    if os.path.exists(foldername):
        if args.overwrite:
            if ask_user_confirmation('overwriting'):
                shutil.rmtree(foldername)
            else:
                print('okay, exiting. not removing anything.')
                exit(0)
        elif args.resume:
            if ask_user_confirmation('resuming'):
                print("resuming!")
            else:
                print('okay, exiting. not resuming.')
                exit(0)
        else:
            raise Exception(
                'this configuration has already been tried! aborting. try --overwrite or --resume.'
            )

    if not os.path.exists(foldername):
        os.makedirs(foldername)

    # configure logger
    logger = configure_logger(os.path.join(foldername, 'trainer.log'))

    vocab_path = os.path.join(foldername, 'vocab.pickle')

    if args.resume:
        with open(vocab_path, 'rb') as file:
            vocab = pickle.load(file)
            print('vocab was loaded')

    # create dataset
    dataset, vocab = get_dataset(args.nsamples, args.data_dir, vocab)
    logger.info(f'dataset loaded, vocab size is {len(vocab)}')

    # serialize the vocab object

    if not args.resume:
        with open(vocab_path, "wb") as file:
            pickle.dump(vocab, file)
            logger.info(f'vocab was pickled into {vocab_path}')

    # build model
    if not args.resume:
        model = LSTM_LORD(args.dim, args.nlayers, len(vocab), args.nsamples,
                          args.content_noise)
    else:
        model = load_checkpoint(
            os.path.join(foldername, 'last_checkpoint.ckpt'), device, args.dim,
            args.nlayers, len(vocab), args.nsamples)

    # if torch.cuda.device_count() > 1:
    #     print("Let's use", torch.cuda.device_count(), "GPUs!")
    #     # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    #     model = nn.DataParallel(model)

    model.to(device)
    model.train()

    writer = SummaryWriter(log_dir=foldername, comment=args_to_comment(args))

    logger.info('entering the training loop; '
                'there will be {} iterations per epoch'.format(
                    args.nsamples // args.batch_size))

    acc_writer = AccuracyTensorboradWriter(writer, logger)

    global_step = 0
    for epoch in range(args.epochs):

        # optimizers are created each epoch because their lr is reduced from time to time.
        model_parameters = list(model.lstm.parameters()) + \
            list(model.stars_embedding.parameters()) + \
            list(model.fc.parameters())

        content_parameters = list(model.sample_embedding.parameters())

        # optimizer = optim.Adam(model_parameters, lr=0.001)
        # content_optimizer = optim.Adam(content_parameters, lr=0.1, weight_decay=args.content_wdecay)
        optimizer = optim.Adagrad(model.parameters())

        losses = []

        train_iter = data.BucketIterator(dataset=dataset,
                                         batch_size=args.batch_size,
                                         sort_key=lambda x: len(x.review),
                                         sort=False,
                                         sort_within_batch=True,
                                         repeat=False,
                                         device=device)

        for batch in tqdm(train_iter):

            # create input
            reviews = batch.review.transpose(1, 0)

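            # LORD-style conditioning: the initial hidden state is presumably built
            # from the stars (class) embedding and the per-sample (id) content embedding.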
            state = model.create_initial_hiddens(batch.stars, batch.id)

            # run!
            model.zero_grad()
            logits, state = model(reviews, state)

            logits_flat = logits.view(-1, len(vocab))
            targets_flat = shift_left(reviews, 1, device).reshape(-1)

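            # ignore_index=1 assumes index 1 is the pad token (the torchtext default),
            # so padded positions do not contribute to the loss.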
            loss = F.cross_entropy(logits_flat, targets_flat, ignore_index=1)
            loss.backward()

            optimizer.step()
            # content_optimizer.step()

            # finished training step, now logging
            losses.append(loss.item())
            writer.add_scalar('Loss/per-step', loss.item(), global_step)

            # accuracy logging (global_step % 1 == 0 is always true, so this runs every step)
            if global_step % 1 == 0:
                acc_writer.write_step(logits_flat,
                                      targets_flat,
                                      global_step,
                                      ignore_index=1)

            global_step += 1

        logger.info('epoch {} loss {}'.format(epoch, np.average(losses)))
        writer.add_scalar('Loss/per-epoch', np.average(losses), epoch)
        acc_writer.write_epoch(epoch)

        checkpoint(model, os.path.join(foldername, 'last_checkpoint.ckpt'))
        if epoch % 100 == 0:
            checkpoint(
                model, os.path.join(foldername,
                                    f'epoch{epoch}_checkpoint.ckpt'))
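
shift_left above builds the next-token targets for the language-modelling loss. Below is a minimal sketch of the behaviour it appears to implement, shifting each sequence one position to the left and filling the last position with the pad index to match ignore_index=1; the real implementation may differ.

import torch

def shift_left_sketch(tokens, pad_index, device):
    # tokens: (batch, seq_len) tensor of token ids. The target for position t
    # is the input token at position t + 1; the last position is filled with
    # pad_index so that cross_entropy(ignore_index=pad_index) skips it.
    shifted = torch.full_like(tokens, pad_index)
    shifted[:, :-1] = tokens[:, 1:]
    return shifted.to(device)
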
Example #4
def main():
    # parse command line arguments
    parser = argparse.ArgumentParser(
        description='Evaluate lord-seq2seq-convnet.')
    # session
    parser.add_argument('--foldername', type=str)
    parser.add_argument(
        '--data_dir',
        type=str,
        default='/cs/labs/dshahaf/omribloch/data/text_lord/restorant/')
    parser.add_argument('--ckpt_name',
                        type=str,
                        default='last_checkpoint.ckpt')
    parser.add_argument('--device', type=str, default='cpu')
    parser.add_argument('--nsamples', type=int)
    parser.add_argument('--ntokens', type=int)
    parser.add_argument('--dim', type=int)
    parser.add_argument('--content_noise', type=float)
    parser.add_argument('--nconv', type=int)
    parser.add_argument('--samples_to_eval', type=int)
    parser.add_argument('--gready', action='store_true')
    parser.add_argument('--partitioned', action='store_true')

    args = parser.parse_args()

    vocab_path = os.path.join(args.foldername, 'vocab.pickle')
    model_ckpt_path = os.path.join(args.foldername, args.ckpt_name)

    with open(vocab_path, 'rb') as file:
        vocab = pickle.load(file)
        print('vocab was loaded')

    decoder_dictionary = vocab_to_dictionary(vocab)
    dropout = 0

    if not args.partitioned:
        model = load_checkpoint(model_ckpt_path, args.device,
                                args.device, args.nsamples,
                                decoder_dictionary.pad(), args.ntokens,
                                args.dim, args.content_noise, dropout,
                                decoder_dictionary, 50, args.nconv)
    else:
        model = load_checkpoint_partitioned(model_ckpt_path, args.device,
                                            args.device, args.nsamples,
                                            decoder_dictionary.pad(),
                                            args.ntokens, args.dim,
                                            args.content_noise, dropout,
                                            decoder_dictionary, 50, args.nconv)

    print('model loaded')

    model.eval()

    dataset, vocab = get_dataset(args.nsamples, args.data_dir, vocab)

    evaluator = Evaluator()
    fasttext_classifier = fasttext.FastText.load_model(
        '/cs/labs/dshahaf/omribloch/data/text_lord/restorant/fasttext_model.bin'
    )

    dataset_ppl = []

    orig_ppl = []
    orig_bleu = []

    new_ppl = []
    new_bleu = []

    orig_wbleu = []
    new_wbleu = []

    correct_counter = 0
    counter = 0

    with open('/tmp/results_final.txt', 'w') as file:
        for i in tqdm(range(args.samples_to_eval), disable=False):
            sid = dataset[i].id
            stars = dataset[i].stars
            # stars = 1
            review_sentence = ' '.join(dataset[i].review)

            ppl, bleu, classified, soriginal, sgenerated, original_ppl, bleu_weighted = evaluator.eval(
                model, vocab, review_sentence, stars, sid, gready=args.gready)
            orig_ppl.append(ppl)
            orig_bleu.append(bleu)
            orig_wbleu.append(bleu_weighted)
            dataset_ppl.append(original_ppl)

            ppl, bleu, classified, soriginal, sgenerated_new, original_ppl, bleu_weighted_new = evaluator.eval(
                model,
                vocab,
                review_sentence,
                1 - stars,
                sid,
                gready=args.gready)
            new_ppl.append(ppl)
            new_bleu.append(bleu)
            new_wbleu.append(bleu_weighted_new)

            predicted_label = fasttext_classifier.predict(sgenerated_new)[0][0]
            if labels_dictionary[predicted_label] == 1 - stars:
                correct_counter += 1
            counter += 1

            file.write('\n\n===========================\n')
            file.write('orig - {}\n'.format(soriginal))
            file.write('reco - {}\n'.format(sgenerated))
            file.write('opos - {}\n'.format(sgenerated_new))

    print('dataset ppl {}'.format(np.average(dataset_ppl)))

    print(f'orig ppl: {np.average(orig_ppl)}')
    print(f'new ppl: {np.average(new_ppl)}')

    print(f'orig bleu: {np.average(orig_bleu)}')
    print(f'new bleu: {np.average(new_bleu)}')

    print(f'orig wbleu: {np.average(orig_wbleu)}')
    print(f'new wbleu: {np.average(new_wbleu)}')

    print(f'classifier accuracy: {correct_counter / counter}')
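
labels_dictionary is defined outside this snippet; it maps fastText label strings to the binary stars values compared above. Assuming the classifier was trained with the default '__label__' prefix on 0/1 sentiment labels, it could be as simple as the following (the label strings are an assumption):

# Hypothetical mapping from fastText label strings to the 0/1 'stars' values,
# assuming the default '__label__' prefix was used when training the classifier.
labels_dictionary = {
    '__label__0': 0,
    '__label__1': 1,
}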