Example 1
import os

# `text_processor` is assumed to be importable from seqmod's preprocessing utilities.
def load_lines(path, max_len=None, processor=text_processor()):
    """Auxiliary function for sentence-per-line data."""
    lines = []

    with open(os.path.expanduser(path)) as f:
        for line in f:
            line = line.strip()
            if processor is not None:
                line = processor(line)
            # skip empty lines and, when max_len is given, overly long ones
            if not line or (max_len is not None and len(line) > max_len):
                continue
            lines.append(line)

    return lines
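
A minimal usage sketch of the helper above (the corpus path and the max_len value are made up; text_processor is assumed to come from seqmod's preprocessing utilities):

# hypothetical sentence-per-line corpus; drop processed lines longer than 100 units
lines = load_lines('~/corpora/wiki/train.txt', max_len=100)
print(len(lines), lines[:2])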
Example 2
import os

# Variant that accepts either a single file or a directory of text files.
def load_lines(path, processor=text_processor()):
    """Load (and optionally process) lines from a file or from every file in a directory."""
    lines = []
    if os.path.isfile(path):
        input_files = [path]
    else:
        input_files = [os.path.join(path, f) for f in os.listdir(path)]
    for fpath in input_files:
        with open(fpath) as f:
            for line in f:
                line = line.strip()
                if processor is not None:
                    line = processor(line)
                if line:
                    lines.append(line)
    return lines
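
Since this variant accepts either a single file or a directory, both calls below are valid (the paths are hypothetical):

lines_from_file = load_lines('corpus/train.txt')
lines_from_dir = load_lines('corpus/')  # concatenates the lines of every file in the directory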
Example 3
import os

import seqmod.utils as u
# `Dict`, `text_processor` and `load_lines` are assumed to be defined or imported elsewhere in this script.


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', required=True)
    parser.add_argument('--output', help='prefix for the stored dataset', required=True)
    parser.add_argument('--max_size', type=int, default=100000)
    parser.add_argument('--min_freq', default=1, type=int)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--num', action='store_true')
    parser.add_argument('--level', default='char')
    args = parser.parse_args()

    processor = text_processor(
        lower=args.lower, num=args.num, level=args.level)
    d = Dict(max_size=args.max_size, min_freq=args.min_freq,
             eos_token=u.EOS, force_unk=True)

    trainpath = os.path.join(args.path, 'train.txt')
    testpath = os.path.join(args.path, 'test.txt')
    # bound str.format: outputformat("train") -> "<output>.train.npz"
    outputformat = (args.output + ".{}.npz").format

    if os.path.isfile(outputformat("train")):
        raise ValueError("Output train file already exists")
    if os.path.isfile(outputformat("test")):
        raise ValueError("Output test file already exists")

    print("Fitting dictionary")
    d.fit(load_lines(trainpath, processor=processor),
          load_lines(testpath, processor=processor))
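
For reference, a hypothetical invocation of this script (the script name and the paths are made up):

python build_dict_dataset.py --path ~/corpora/wiki --output data/wiki --level char --lower

This would read ~/corpora/wiki/train.txt and ~/corpora/wiki/test.txt, fit the dictionary on both, and refuse to run if data/wiki.train.npz or data/wiki.test.npz already exists.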
Example 4
    parser.add_argument('--min_freq', default=5, type=int)
    parser.add_argument('--max_size', default=50000, type=int)
    parser.add_argument('--level', default='token')
    parser.add_argument('--concat', action='store_true')
    parser.add_argument('--cache_data', action='store_true')
    args = parser.parse_args()

    print("Loading data...")
    prefix = '{source}.{level}.{min_len}.{min_freq}.{concat}.{max_size}' \
             .format(**vars(args))

    # preprocess from scratch unless caching is enabled and a cached train file already exists
    if not args.cache_data or not os.path.isfile(
            'data/{}_train.pt'.format(prefix)):

        processor = text_processor(lower=False, level=args.level)

        if args.source == 'twisty':
            src, trg = load_twisty(min_len=args.min_len,
                                   concat=args.concat,
                                   processor=processor)
            train, test, valid = load_twisty_dataset(
                src,
                trg,
                args.batch_size,
                min_freq=args.min_freq,
                max_size=args.max_size,
                device=args.device,
                dev=args.dev,
                test=args.test,
                max_tweets=None if args.max_tweets == 0 else args.max_tweets)
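
To make the caching scheme concrete, a small self-contained sketch of how the cache prefix is built (the argument values here are hypothetical):

# stand-in for vars(args); only the fields used by the prefix template
opts = {'source': 'twisty', 'level': 'token', 'min_len': 3,
        'min_freq': 5, 'concat': False, 'max_size': 50000}
prefix = '{source}.{level}.{min_len}.{min_freq}.{concat}.{max_size}'.format(**opts)
print(prefix)                              # twisty.token.3.5.False.50000
print('data/{}_train.pt'.format(prefix))   # data/twisty.token.3.5.False.50000_train.pt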
Example 5
    outputfile = None
    if checkpoint is not None:
        outputfile = checkpoint.checkpoint_path()
    trainer.add_loggers(StdLogger(outputfile=outputfile))
    # optional learning-rate schedule hook: decay the learning rate by
    # `lr_schedule_factor` every `lr_schedule_epochs` epochs
    scheduler = None
    if args.lr_schedule_factor < 1:
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              args.lr_schedule_epochs,
                                              args.lr_schedule_factor)
    num_checkpoints = args.save_freq // args.checkpoint
    trainer.add_hook(make_skipthought_hook(checkpoint, scheduler),
                     num_checkpoints=num_checkpoints)

    # dataset
    paths = glob.glob(os.path.expanduser(args.paths))
    includes = ('prev' in args.mode, 'same' in args.mode, 'post' in args.mode)
    processor = text_processor(max_len=args.max_len, min_len=args.min_len)
    data = SkipthoughtIter(embeddings.d,
                           *paths,
                           always_reverse=args.clone,
                           includes=includes,
                           processor=processor,
                           device=args.device,
                           verbose=True)
    generator = data.batch_generator(args.batch_size, buffer_size=int(5e+6))

    print()
    print("Training model...")
    trainer.train_generator(args.epochs, generator, args.checkpoint)
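
As a side note, a minimal, self-contained illustration of the StepLR schedule wired in above (the optimizer, step size and decay factor are hypothetical):

import torch

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=1.0)
# step_size plays the role of lr_schedule_epochs, gamma of lr_schedule_factor
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)

for epoch in range(6):
    print(epoch, optimizer.param_groups[0]['lr'])
    optimizer.step()
    scheduler.step()
# prints lr 1.0 for epochs 0-1, 0.5 for epochs 2-3, 0.25 for epochs 4-5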