Code example #1
    args = parser.parse_args()

    vocab = args.vocab
    size = args.train_len
    batch_size = args.batch_size
    sample_fn = getattr(d, args.sample_fn)

    if args.path is not None:
        # Load a previously serialized dataset from disk and reuse its
        # source-side vocabulary.
        with open(args.path, 'rb+') as f:
            dataset = PairedDataset.from_disk(f)
        dataset.set_batch_size(args.batch_size)
        dataset.set_device(args.device)
        train, valid = dataset.splits(sort_by='src', dev=args.dev, test=None)
        src_dict = dataset.dicts['src']
    else:
        # Otherwise generate a synthetic set of (src, trg) string pairs and
        # fit a single vocabulary over both sides.
        str_generator = d.generate_set(size, vocab, args.min_len, args.max_len,
                                       sample_fn)
        src, trg = zip(*str_generator)
        src, trg = list(map(list, src)), list(map(list, trg))
        src_dict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS)
        src_dict.fit(src, trg)
        trg_dict = src_dict
        if args.reverse:
            # For the reversed task, give the target side its own copy of the
            # vocabulary with right-aligned (left-padded) batches.
            trg_dict = copy.deepcopy(src_dict)
            trg_dict.align_right = True
        # The call is truncated in the original snippet; the trailing
        # arguments below are restored from the parallel examples #2 and #3
        # (this variant takes `device` where the others take `gpu`).
        train, valid = PairedDataset(
            src, trg,
            {'src': src_dict, 'trg': trg_dict},
            batch_size=args.batch_size, device=args.device,
        ).splits(dev=args.dev, test=None, sort_by='src')
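
Every snippet here picks up at args = parser.parse_args(), so the parser itself is out of frame. As a point of reference, this is a minimal sketch of the flags example #1 actually reads; the flag set is inferred from the args.* accesses above, while the types and defaults are assumptions, not the original script's values.

import argparse
import string

# Hypothetical parser for example #1: one flag per args.* access in the
# snippet. Defaults are guesses for illustration only.
parser = argparse.ArgumentParser()
parser.add_argument('--path', default=None, help='serialized PairedDataset to load')
parser.add_argument('--vocab', default=list(string.ascii_letters) + [' '])
parser.add_argument('--train_len', default=10000, type=int)
parser.add_argument('--batch_size', default=64, type=int)
parser.add_argument('--sample_fn', default='reverse', help='name of a generator in module d')
parser.add_argument('--min_len', default=1, type=int)
parser.add_argument('--max_len', default=15, type=int)
parser.add_argument('--dev', default=0.1, type=float, help='validation split fraction')
parser.add_argument('--device', default='cpu')
parser.add_argument('--reverse', action='store_true')
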
Code example #2
    args = parser.parse_args()

    vocab = args.vocab
    size = args.train_len
    batch_size = args.batch_size
    sample_fn = getattr(d, args.sample_fn)

    if args.path != '':
        with open(args.path, 'rb+') as f:
            dataset = PairedDataset.from_disk(f)
        dataset.set_batch_size(args.batch_size)
        dataset.set_gpu(args.gpu)
        train, valid = dataset.splits(sort_by='src', dev=args.dev, test=None)
        src_dict = dataset.dicts['src']
    else:
        src, trg = zip(*d.generate_set(
            size, vocab, args.min_len, args.max_len, sample_fn))
        src, trg = list(map(list, src)), list(map(list, trg))
        src_dict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS)
        src_dict.fit(src, trg)
        train, valid = PairedDataset(
            src, trg, {'src': src_dict, 'trg': src_dict},
            batch_size=args.batch_size, gpu=args.gpu
        ).splits(dev=args.dev, test=None, sort_by='src')

    print(' * vocabulary size. %d' % len(src_dict))
    print(' * number of train batches. %d' % len(train))
    print(' * maximum batch size. %d' % batch_size)

    print('Building model...')

    model = EncoderDecoder(
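
The line sample_fn = getattr(d, args.sample_fn) in each snippet resolves a generator function by name from the data module d. Below is a self-contained illustration of the same dispatch pattern, with a made-up reverse task standing in for whatever d really provides; the (vocab, min_len, max_len) signature is likewise a guess, chosen to be consistent with the arguments generate_set receives.

import random
import string

class _ToyTasks:
    # Stand-in for the data module d; the real sample functions live there.
    @staticmethod
    def reverse(vocab, min_len, max_len):
        # Made-up task: the target is the source sequence reversed.
        src = [random.choice(vocab) for _ in range(random.randint(min_len, max_len))]
        return src, src[::-1]

d = _ToyTasks()
sample_fn = getattr(d, 'reverse')  # same lookup as getattr(d, args.sample_fn)
src, trg = sample_fn(list(string.ascii_lowercase), 2, 8)
print(''.join(src), '->', ''.join(trg))
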
Code example #3
    args = parser.parse_args()

    vocab = args.vocab
    size = args.train_len
    batch_size = args.batch_size
    sample_fn = getattr(d, args.sample_fn)

    if args.path != '':
        with open(args.path, 'rb+') as f:
            dataset = PairedDataset.from_disk(f)
        dataset.set_batch_size(args.batch_size)
        dataset.set_gpu(args.gpu)
        train, valid = dataset.splits(sort_by='src', dev=args.dev, test=None)
        src_dict = dataset.dicts['src']
    else:
        src, trg = zip(*d.generate_set(size, vocab, args.min_len, args.max_len,
                                       sample_fn))
        src, trg = list(map(list, src)), list(map(list, trg))
        src_dict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS)
        src_dict.fit(src, trg)
        train, valid = PairedDataset(
            src, trg,
            {'src': src_dict, 'trg': src_dict},
            batch_size=args.batch_size, gpu=args.gpu
        ).splits(dev=args.dev, test=None, sort_by='src')

    print(' * vocabulary size. %d' % len(src_dict))
    print(' * number of train batches. %d' % len(train))
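
Examples #2 and #3 are the same logic under different formatting; both lean on d.generate_set, whose definition is not shown. Here is a hedged sketch of the behaviour the call sites imply, namely lazily yielding size (src, trg) pairs drawn from sample_fn.

def generate_set(size, vocab, min_len, max_len, sample_fn):
    # Hypothetical re-implementation inferred from the call sites; the real
    # d.generate_set may differ. Each pair is delegated to sample_fn, which
    # is why zip(*...) at the call site unpacks cleanly into src and trg.
    for _ in range(size):
        yield sample_fn(vocab, min_len, max_len)
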
Code example #4
    parser.add_argument('--vocab', default=list(string.ascii_letters) + [' '])
    parser.add_argument('--checkpoint', default=100, type=int)
    parser.add_argument('--hooks_per_epoch', default=5, type=int)
    parser.add_argument('--optim', default='Adam', type=str)
    parser.add_argument('--learning_rate', default=0.01, type=float)
    parser.add_argument('--learning_rate_decay', default=0.5, type=float)
    parser.add_argument('--start_decay_at', default=8, type=int)
    parser.add_argument('--max_grad_norm', default=5., type=float)
    parser.add_argument('--gpu', action='store_true')
    parser.add_argument('--beam', action='store_true')
    args = parser.parse_args()

    datasets = {}
    for target in args.targets:
        # Build one synthetic dataset per target task, with each sample
        # function wrapped so that the model autoencodes its input.
        sample_fn = wrap_autoencode(getattr(d, target))
        src, trg = zip(*d.generate_set(
            args.train_len, args.vocab, args.min_len, args.max_len, sample_fn))
        src, trg = list(map(list, src)), list(map(list, trg))
        datasets[target] = {'src': src, 'trg': trg}

    # Fit one shared vocabulary over the src and trg sides of every task.
    src_dict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS)
    src_dict.fit(*[data
                   for target in datasets
                   for data in datasets[target].values()])

    for target in datasets:
        # Wrap each task's data in a PairedDataset sharing that vocabulary
        # and split it into train/validation sets, sorted by source length.
        train, valid = PairedDataset(
            datasets[target]['src'], datasets[target]['trg'],
            {'src': src_dict, 'trg': src_dict},
            batch_size=args.batch_size, gpu=args.gpu).splits(
                dev=args.dev, test=None,
                shuffle=True, sort_key=lambda pair: len(pair[0]))
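
Example #4 differs from the rest: it builds one dataset per task in args.targets and wraps each sample function with wrap_autoencode, whose definition is also out of frame. Judging by the name and by how the wrapped function is fed to generate_set, a plausible sketch pairs every sample with itself so each task becomes input reconstruction.

def wrap_autoencode(sample_fn):
    # Hypothetical sketch based on the name alone: drop the original target
    # and return (src, src), turning the task into autoencoding.
    def wrapped(*args, **kwargs):
        src, _ = sample_fn(*args, **kwargs)
        return src, src
    return wrapped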