Example #1
def load_data(path, exts, text_processor=text_processor()):
    src_data, trg_data = [], []
    path = os.path.expanduser(path)
    with open(path + exts[0]) as src, open(path + exts[1]) as trg:
        for src_line, trg_line in zip(src, trg):
            src_line, trg_line = src_line.strip(), trg_line.strip()
            if text_processor is not None:
                src_line = text_processor(src_line)
                trg_line = text_processor(trg_line)
            if src_line and trg_line:
                src_data.append(src_line)
                trg_data.append(trg_line)
    return src_data, trg_data
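A minimal usage sketch (the corpus prefix and extensions are hypothetical; str.lower stands in for a real processor, since load_data only needs a callable that takes and returns a line, or None to skip processing):

# hypothetical parallel corpus: ~/corpora/train.en and ~/corpora/train.fr
src, trg = load_data('~/corpora/train', ('.en', '.fr'), text_processor=str.lower)
assert len(src) == len(trg)  # pairs stay aligned; pairs with an empty side are dropped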
Example #2
def load_lines(path, processor=text_processor()):
    lines = []
    with open(os.path.expanduser(path)) as f:
        for line in f:
            line = line.strip()
            if processor is not None:
                line = processor(line)
            if line:
                lines.append(line)
    return lines
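Every loader above defaults to processor=text_processor(), a factory from the surrounding project whose implementation is not shown in these snippets. As a rough, hypothetical stand-in (the lower/num/level keywords are taken from the later examples; the exact behavior, including the <num> placeholder and token-level splitting, is an assumption):

import re

def text_processor(lower=True, num=True, level='token'):
    # hypothetical sketch: returns a callable applied line by line (or could return None)
    def process(line):
        if lower:
            line = line.lower()
        if num:
            line = re.sub(r'\d+', '<num>', line)  # mask digit runs with a placeholder
        return line.split() if level == 'token' else line
    return process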
Example #3
def load_lines(path, processor=text_processor()):
    lines = []
    path = os.path.expanduser(path)
    if os.path.isfile(path):
        input_files = [path]
    else:
        input_files = [os.path.join(path, f) for f in os.listdir(path)]
    for input_file in input_files:
        with open(input_file) as f:
            for line in f:
                line = line.strip()
                if processor is not None:
                    line = processor(line)
                if line:
                    lines.append(line)
    return lines
Example #4
def load_lines(path, processor=text_processor()):
    """Auxiliary function for sentence-per-line data"""
    path = os.path.expanduser(path)
    if os.path.isdir(path):
        input_files = [os.path.join(path, f) for f in os.listdir(path)]
    elif os.path.isfile(path):
        input_files = [path]
    else:
        return

    for input_file in input_files:
        with open(input_file) as f:
            for line in f:
                line = line.strip()
                if processor is not None:
                    line = processor(line)
                if not line:
                    continue
                yield line
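Unlike the list-returning variants above, this version is a generator: it streams lines lazily and simply yields nothing if the path is neither a file nor a directory. A minimal usage sketch (the ~/corpora/wiki directory is hypothetical):

# stream a sentence-per-line corpus without holding it in memory
for line in load_lines('~/corpora/wiki', processor=None):
    pass  # feed `line` to whatever consumes the corpus

# or materialize it when downstream code expects a list
lines = list(load_lines('~/corpora/wiki', processor=None))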
Example #5
    parser.add_argument('--level', default='token')
    parser.add_argument('--concat', action='store_true')
    parser.add_argument('--cache_data', action='store_true')
    args = parser.parse_args()

    prefix = '{source}.{level}.{min_len}.{min_freq}.{concat}.{max_size}' \
             .format(**vars(args))

    print("Loading data...")
    # preprocess
    if not args.cache_data or not os.path.isfile('data/%s_train.pt' % prefix):
        if args.source == 'twisty':
            src, trg = load_twisty(min_len=args.min_len,
                                   level=args.level,
                                   concat=args.concat,
                                   processor=text_processor(lower=False))
            train, test, valid = load_dataset(src,
                                              trg,
                                              args.batch_size,
                                              min_freq=args.min_freq,
                                              max_size=args.max_size,
                                              gpu=args.gpu,
                                              dev=args.dev,
                                              test=args.test)
        elif args.source == 'penn':
            train, test, valid = load_penn("~/corpora/penn",
                                           args.batch_size,
                                           min_freq=args.min_freq,
                                           max_size=args.max_size,
                                           gpu=args.gpu)
        else:
Example #6
    parser.add_argument('--min_len', default=0, type=int)
    parser.add_argument('--min_freq', default=5, type=int)
    parser.add_argument('--max_size', default=50000, type=int)
    parser.add_argument('--level', default='token')
    parser.add_argument('--concat', action='store_true')
    parser.add_argument('--cache_data', action='store_true')
    args = parser.parse_args()

    print("Loading data...")
    prefix = '{source}.{level}.{min_len}.{min_freq}.{concat}.{max_size}' \
             .format(**vars(args))
    if not args.cache_data or not os.path.isfile('data/%s_train.pt' % prefix):
        if args.source == 'twisty':
            src, trg = load_twisty(
                min_len=args.min_len, level=args.level, concat=args.concat,
                processor=text_processor(lower=False))
            train, test, valid = load_dataset(
                src, trg, args.batch_size,
                min_freq=args.min_freq, max_size=args.max_size,
                gpu=args.gpu, dev=args.dev, test=args.test)
        elif args.source == 'penn':
            train, test, valid = load_penn(
                "~/corpora/penn", args.batch_size,
                min_freq=args.min_freq, max_size=args.max_size, gpu=args.gpu)
        else:
            train, test, valid = load_from_lines(
                args.source_path, args.batch_size,
                min_freq=args.min_freq, max_size=args.max_size,
                gpu=args.gpu, dev=args.dev, test=args.test)
        if args.cache_data:
            train.to_disk('data/%s_train.pt' % prefix)
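For reference, the cache file name is just these arguments interpolated into the prefix. A worked sketch using the defaults above with --source twisty and --concat left unset:

args = {'source': 'twisty', 'level': 'token', 'min_len': 0,
        'min_freq': 5, 'concat': False, 'max_size': 50000}
prefix = '{source}.{level}.{min_len}.{min_freq}.{concat}.{max_size}'.format(**args)
print('data/%s_train.pt' % prefix)  # data/twisty.token.0.5.False.50000_train.pt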
Example #7
    parser.add_argument('--visdom_server', default='localhost')
    parser.add_argument('--save', action='store_true')
    parser.add_argument('--prefix', default='model', type=str)
    args = parser.parse_args()

    if args.processed:
        print("Loading preprocessed datasets...")
        assert args.dict_path, "Processed data requires DICT_PATH"
        data, d = load_from_file(args.path), u.load_model(args.dict_path)
        train, test, valid = BlockDataset(
            data, d, args.batch_size, args.bptt, gpu=args.gpu, fitted=True
        ).splits(test=0.1, dev=0.1)
        del data
    else:
        print("Processing datasets...")
        proc = text_processor(
            lower=args.lower, num=args.num, level=args.level)
        train_data = load_lines(args.path + 'train.txt', processor=proc)
        valid_data = load_lines(args.path + 'valid.txt', processor=proc)
        test_data = load_lines(args.path + 'test.txt', processor=proc)
        d = Dict(max_size=args.max_size, min_freq=args.min_freq, eos_token=u.EOS)
        d.fit(train_data, valid_data)
        train = BlockDataset(
            train_data, d, args.batch_size, args.bptt, gpu=args.gpu)
        valid = BlockDataset(
            valid_data, d, args.batch_size, args.bptt, gpu=args.gpu,
            evaluation=True)
        test = BlockDataset(
            test_data, d, args.batch_size, args.bptt, gpu=args.gpu,
            evaluation=True)
        del train_data, valid_data, test_data
Example #8
    parser.add_argument('--visdom_server', default='localhost')
    parser.add_argument('--save', action='store_true')
    parser.add_argument('--prefix', default='model', type=str)
    args = parser.parse_args()

    if args.processed:
        print("Loading preprocessed datasets...")
        assert args.dict_path, "Processed data requires DICT_PATH"
        data, d = load_from_file(args.path), u.load_model(args.dict_path)
        train, test, valid = BlockDataset(
            data, d, args.batch_size, args.bptt, gpu=args.gpu, fitted=True
        ).splits(test=0.1, dev=0.1)
        del data
    else:
        print("Processing datasets...")
        proc = text_processor(
            lower=args.lower, num=args.num, level=args.level)
        train_data = load_lines(args.path + 'train.txt', processor=proc)
        valid_data = load_lines(args.path + 'valid.txt', processor=proc)
        test_data = load_lines(args.path + 'test.txt', processor=proc)
        d = Dict(max_size=args.max_size, min_freq=args.min_freq,
                 eos_token=u.EOS, bos_token=u.BOS)
        d.fit(train_data, valid_data)
        train = BlockDataset(
            train_data, d, args.batch_size, args.bptt, gpu=args.gpu)
        valid = BlockDataset(
            valid_data, d, args.batch_size, args.bptt, gpu=args.gpu,
            evaluation=True)
        test = BlockDataset(
            test_data, d, args.batch_size, args.bptt, gpu=args.gpu,
            evaluation=True)
        del train_data, valid_data, test_data
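Since the file names in the non-processed branch are built by plain string concatenation (args.path + 'train.txt'), the path argument is assumed to end in a separator. A hypothetical layout and the corresponding calls (the kwarg values passed to text_processor are placeholders):

# assumed layout: ~/corpora/penn/train.txt, valid.txt, test.txt
# invoked with --path ~/corpora/penn/  (note the trailing slash)
proc = text_processor(lower=True, num=True, level='token')
train_data = load_lines('~/corpora/penn/train.txt', processor=proc)
valid_data = load_lines('~/corpora/penn/valid.txt', processor=proc)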