args = parser.parse_args()

vocab = args.vocab
size = args.train_len
batch_size = args.batch_size
sample_fn = getattr(d, args.sample_fn)

if args.path is not None:
    # load a previously serialized dataset from disk
    with open(args.path, 'rb+') as f:
        dataset = PairedDataset.from_disk(f)
    dataset.set_batch_size(args.batch_size)
    dataset.set_device(args.device)
    train, valid = dataset.splits(sort_by='src', dev=args.dev, test=None)
    src_dict = dataset.dicts['src']
else:
    # generate a synthetic dataset of (src, trg) string pairs
    str_generator = d.generate_set(
        size, vocab, args.min_len, args.max_len, sample_fn)
    src, trg = zip(*str_generator)
    src, trg = list(map(list, src)), list(map(list, trg))
    src_dict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS)
    src_dict.fit(src, trg)
    trg_dict = src_dict
    if args.reverse:
        # for the reversed task, pad target sequences from the left
        trg_dict = copy.deepcopy(src_dict)
        trg_dict.align_right = True
    train, valid = PairedDataset(
        src, trg, {'src': src_dict, 'trg': trg_dict},
        batch_size=args.batch_size, device=args.device
    ).splits(dev=args.dev, test=None, sort_by='src')
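# `sample_fn` is looked up by name on module `d` and handed to
# `d.generate_set`, so it is expected to map one randomly drawn string to a
# (src, trg) example pair. A minimal sketch of such a function; the name
# `reversed_pair` and the exact contract are assumptions, not taken from `d`:
def reversed_pair(string):
    # source is the sampled string, target is the same string reversed
    return string, string[::-1]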
args = parser.parse_args()

vocab = args.vocab
size = args.train_len
batch_size = args.batch_size
sample_fn = getattr(d, args.sample_fn)

if args.path != '':
    with open(args.path, 'rb+') as f:
        dataset = PairedDataset.from_disk(f)
    dataset.set_batch_size(args.batch_size)
    dataset.set_gpu(args.gpu)
    train, valid = dataset.splits(sort_by='src', dev=args.dev, test=None)
    src_dict = dataset.dicts['src']
else:
    src, trg = zip(*d.generate_set(
        size, vocab, args.min_len, args.max_len, sample_fn))
    src, trg = list(map(list, src)), list(map(list, trg))
    src_dict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS)
    src_dict.fit(src, trg)
    train, valid = PairedDataset(
        src, trg, {'src': src_dict, 'trg': src_dict},
        batch_size=args.batch_size, gpu=args.gpu
    ).splits(dev=args.dev, test=None, sort_by='src')

print(' * vocabulary size. %d' % len(src_dict))
print(' * number of train batches. %d' % len(train))
print(' * maximum batch size. %d' % batch_size)

print('Building model...')
model = EncoderDecoder(
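    # A minimal sketch of how this call could continue, assuming
    # hyperparameter flags (args.layers, args.emb_dim, args.hid_dim,
    # args.att_dim, args.cell) that are not part of this excerpt; these
    # argument names are hypothetical, not the script's actual signature:
    args.layers, args.emb_dim, args.hid_dim, args.att_dim,
    src_dict, cell=args.cell)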
parser.add_argument('--vocab', default=list(string.ascii_letters) + [' '])
parser.add_argument('--checkpoint', default=100, type=int)
parser.add_argument('--hooks_per_epoch', default=5, type=int)
parser.add_argument('--optim', default='Adam', type=str)
parser.add_argument('--learning_rate', default=0.01, type=float)
parser.add_argument('--learning_rate_decay', default=0.5, type=float)
parser.add_argument('--start_decay_at', default=8, type=int)
parser.add_argument('--max_grad_norm', default=5., type=float)
parser.add_argument('--gpu', action='store_true')
parser.add_argument('--beam', action='store_true')
args = parser.parse_args()

# generate one synthetic (src, trg) dataset per target task
datasets = {}
for target in args.targets:
    sample_fn = wrap_autoencode(getattr(d, target))
    src, trg = zip(*d.generate_set(
        args.train_len, args.vocab, args.min_len, args.max_len, sample_fn))
    src, trg = list(map(list, src)), list(map(list, trg))
    datasets[target] = {'src': src, 'trg': trg}

# fit a single shared vocabulary over the src and trg sides of all tasks
src_dict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS)
src_dict.fit(*[data for target in datasets
               for data in datasets[target].values()])

for target in datasets:
    train, valid = PairedDataset(
        datasets[target]['src'], datasets[target]['trg'],
        {'src': src_dict, 'trg': src_dict},
        batch_size=args.batch_size, gpu=args.gpu
    ).splits(dev=args.dev, test=None, shuffle=True,
             sort_key=lambda pair: len(pair[0]))
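# `wrap_autoencode` is defined elsewhere in the script; a minimal sketch of
# what such a wrapper could look like, assuming (based only on its name and
# usage above) that it turns any sampling function into an autoencoding task
# by copying the source string to the target:
def wrap_autoencode(sample_fn):
    def wrapped(*args, **kwargs):
        src, _ = sample_fn(*args, **kwargs)
        # ignore the original target and reconstruct the source instead
        return src, src
    return wrapped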