def load_lines(path, max_len=None, processor=text_processor()):
    """Auxiliary function for sentence-per-line data"""
    lines = []
    with open(os.path.expanduser(path)) as f:
        for line in f:
            line = line.strip()
            if processor is not None:
                line = processor(line)
            if not line or (max_len is not None and len(line) > max_len):
                continue
            lines.append(line)

    return lines
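
# Hypothetical usage sketch -- the corpus path and the processor settings below
# are assumptions for illustration, not part of the original script: keep the
# non-empty lines of at most 100 processed units, lowercased and tokenized at
# the word level.
#
#     sentences = load_lines('~/data/sentences.txt', max_len=100,
#                            processor=text_processor(lower=True, level='token'))
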
def load_lines(path, processor=text_processor()):
    """Load non-empty lines from a single file or from all files in a directory"""
    lines = []
    if os.path.isfile(path):
        input_files = [path]
    else:
        input_files = [os.path.join(path, f) for f in os.listdir(path)]

    for input_file in input_files:
        with open(input_file) as f:
            for line in f:
                line = line.strip()
                if processor is not None:
                    line = processor(line)
                if line:
                    lines.append(line)

    return lines
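
# Hypothetical usage sketch -- the directory name is an assumption: when given
# a directory instead of a file, every file inside it is read and the non-empty
# processed lines are pooled into a single list.
#
#     corpus = load_lines('data/corpus_dir', processor=text_processor(level='char'))
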
import seqmod.utils as u


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', required=True)
    parser.add_argument('--output', help='prefix for the stored dataset',
                        required=True)
    parser.add_argument('--max_size', type=int, default=100000)
    parser.add_argument('--min_freq', default=1, type=int)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--num', action='store_true')
    parser.add_argument('--level', default='char')
    args = parser.parse_args()

    processor = text_processor(lower=args.lower, num=args.num, level=args.level)
    d = Dict(max_size=args.max_size, min_freq=args.min_freq,
             eos_token=u.EOS, force_unk=True)

    trainpath = os.path.join(args.path, 'train.txt')
    testpath = os.path.join(args.path, 'test.txt')

    outputformat = (args.output + ".{}.npz").format
    if os.path.isfile(outputformat("train")):
        raise ValueError("Output train file already exists")
    if os.path.isfile(outputformat("test")):
        raise ValueError("Output test file already exists")

    print("Fitting dictionary")
    d.fit(load_lines(trainpath, processor=processor),
          load_lines(testpath, processor=processor))
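
    # Note on the output naming above: with --output mydata (an illustrative
    # value, not a default of the script), outputformat("train") evaluates to
    # "mydata.train.npz" and outputformat("test") to "mydata.test.npz"; the
    # checks above refuse to run if either file already exists.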
parser.add_argument('--min_freq', default=5, type=int)
parser.add_argument('--max_size', default=50000, type=int)
parser.add_argument('--level', default='token')
parser.add_argument('--concat', action='store_true')
parser.add_argument('--cache_data', action='store_true')
args = parser.parse_args()

print("Loading data...")
prefix = '{source}.{level}.{min_len}.{min_freq}.{concat}.{max_size}' \
    .format(**vars(args))

# preprocess
if not args.cache_data or not os.path.isfile('data/{}_train.pt'.format(prefix)):
    processor = text_processor(lower=False, level=args.level)
    if args.source == 'twisty':
        src, trg = load_twisty(min_len=args.min_len, concat=args.concat,
                               processor=processor)
        train, test, valid = load_twisty_dataset(
            src, trg, args.batch_size, min_freq=args.min_freq,
            max_size=args.max_size, device=args.device,
            dev=args.dev, test=args.test,
            max_tweets=None if args.max_tweets == 0 else args.max_tweets)
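
        # With made-up argument values (not the script defaults), e.g.
        # --source twisty --level token --min_len 3 --min_freq 5 --concat
        # --max_size 50000, the cache prefix above resolves to
        # 'twisty.token.3.5.True.50000', so the cached train split would be
        # looked up at 'data/twisty.token.3.5.True.50000_train.pt'.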
outputfile = None
if checkpoint is not None:
    outputfile = checkpoint.checkpoint_path()
trainer.add_loggers(StdLogger(outputfile=outputfile))

# hook
scheduler = None
if args.lr_schedule_factor < 1:
    scheduler = optim.lr_scheduler.StepLR(
        optimizer, args.lr_schedule_epochs, args.lr_schedule_factor)
num_checkpoints = args.save_freq // args.checkpoint
trainer.add_hook(make_skipthought_hook(checkpoint, scheduler),
                 num_checkpoints=num_checkpoints)

# dataset
paths = glob.glob(os.path.expanduser(args.paths))
includes = ('prev' in args.mode, 'same' in args.mode, 'post' in args.mode)
processor = text_processor(max_len=args.max_len, min_len=args.min_len)
data = SkipthoughtIter(embeddings.d, *paths, always_reverse=args.clone,
                       includes=includes, processor=processor,
                       device=args.device, verbose=True)
generator = data.batch_generator(args.batch_size, buffer_size=int(5e+6))

print()
print("Training model...")
trainer.train_generator(args.epochs, generator, args.checkpoint)