Example #1
def prepare_data(args, field, logger):

    if field is None: 
        logger.info('Constructing field')
        FIELD = torchtext.data.ReversibleField(batch_first=True, init_token='<init>', eos_token='<eos>', lower=args.lower, include_lengths=True)
    else:
        FIELD = field

    train_sets, val_sets, vocab_sets = [], [], []
    for task in args.train_tasks:
        logger.info(f'Loading {task}')
        kwargs = {'test': None}
        kwargs['subsample'] = args.subsample
        kwargs['validation'] = None
        logger.info(f'Adding {task} to training datasets')
        split = get_splits(args, task, FIELD, **kwargs)[0]
        logger.info(f'{task} has {len(split)} training examples')
        train_sets.append(split)
        if args.vocab_tasks is not None and task in args.vocab_tasks:
            vocab_sets.extend(split)

    for task in args.val_tasks:
        logger.info(f'Loading {task}')
        kwargs = {'test': None}
        kwargs['subsample'] = args.subsample
        kwargs['train'] = None
        logger.info(f'Adding {task} to validation datasets')
        split = get_splits(args, task, FIELD, **kwargs)[0]
        logger.info(f'{task} has {len(split)} validation examples')
        val_sets.append(split)
        if args.vocab_tasks is not None and task in args.vocab_tasks:
            vocab_sets.extend(split) 

    if args.load is None:
        logger.info('Getting pretrained word vectors')
        char_vectors = torchtext.vocab.CharNGram(cache=args.embeddings)
        glove_vectors = torchtext.vocab.GloVe(cache=args.embeddings)
        vectors = [char_vectors, glove_vectors]
        vocab_sets = (train_sets + val_sets) if len(vocab_sets) == 0 else vocab_sets
        logger.info('Building vocabulary')
        FIELD.build_vocab(*vocab_sets, max_size=args.max_effective_vocab, vectors=vectors)

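    # Derive the smaller decoder (generative) vocabulary from the first
    # max_generative_vocab entries of the full vocabulary, and build index
    # maps between decoder indices and full-vocab indices in both directions.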
    FIELD.decoder_itos = FIELD.vocab.itos[:args.max_generative_vocab]
    FIELD.decoder_stoi = {word: idx for idx, word in enumerate(FIELD.decoder_itos)} 
    FIELD.decoder_to_vocab = {idx: FIELD.vocab.stoi[word] for idx, word in enumerate(FIELD.decoder_itos)}
    FIELD.vocab_to_decoder = {idx: FIELD.decoder_stoi[word] for idx, word in enumerate(FIELD.vocab.itos) if word in FIELD.decoder_stoi}

    logger.info(f'Vocabulary has {len(FIELD.vocab)} tokens')
    logger.info('The first 500 tokens:')
    print(FIELD.vocab.itos[:500])

    logger.info('Preprocessing training data')
    preprocess_examples(args, args.train_tasks, train_sets, FIELD, logger, train=True) 
    logger.info('Preprocessing validation data')
    preprocess_examples(args, args.val_tasks, val_sets, FIELD, logger, train=args.val_filter)

    return FIELD, train_sets, val_sets
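
For orientation, here is a minimal sketch of how this prepare_data might be driven; the Namespace values and the logger setup are illustrative assumptions, not part of the original example:

import logging
from argparse import Namespace

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('prepare_data')

# Every attribute below is read by prepare_data; the values are assumptions.
args = Namespace(
    train_tasks=['squad'],
    val_tasks=['squad'],
    vocab_tasks=None,
    subsample=20000000,
    lower=True,
    load=None,                  # None triggers vocabulary construction
    embeddings='.embeddings',   # cache directory for GloVe/CharNGram vectors
    max_effective_vocab=400000,
    max_generative_vocab=50000,
    val_filter=False,
)

FIELD, train_sets, val_sets = prepare_data(args, None, logger)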
Example #2
def prepare_data(args, field, logger):

    if field is None: 
        logger.info('Constructing field')
        FIELD = torchtext.data.ReversibleField(batch_first=True, init_token='<init>', eos_token='<eos>', lower=args.lower, include_lengths=True)
    else:
        FIELD = field

    train_sets, val_sets, vocab_sets = [], [], []
    for task in args.train_tasks:
        logger.info(f'Loading {task}')
        kwargs = {'test': None}
        kwargs['subsample'] = args.subsample
        kwargs['validation'] = None
        logger.info(f'Adding {task} to training datasets')
        split = get_splits(args, task, FIELD, **kwargs)[0]
        logger.info(f'{task} has {len(split)} training examples')
        train_sets.append(split)
        if args.vocab_tasks is not None and task in args.vocab_tasks:
            vocab_sets.extend(split)

    for task in args.val_tasks:
        logger.info(f'Loading {task}')
        kwargs = {'test': None}
        kwargs['subsample'] = args.subsample
        kwargs['train'] = None
        logger.info(f'Adding {task} to validation datasets')
        split = get_splits(args, task, FIELD, **kwargs)[0]
        logger.info(f'{task} has {len(split)} validation examples')
        val_sets.append(split)
        if args.vocab_tasks is not None and task in args.vocab_tasks:
            vocab_sets.extend(split) 

    if args.load is None:
        logger.info('Building vocabulary')
        char_vectors = torchtext.vocab.CharNGram(cache=args.embeddings)
        glove_vectors = torchtext.vocab.GloVe(cache=args.embeddings)
        vectors = [char_vectors, glove_vectors]
        vocab_sets = (train_sets + val_sets) if len(vocab_sets) == 0 else vocab_sets
        FIELD.build_vocab(*vocab_sets, max_size=args.max_effective_vocab, vectors=vectors)

    FIELD.decoder_itos = FIELD.vocab.itos[:args.max_generative_vocab]
    FIELD.decoder_stoi = {word: idx for idx, word in enumerate(FIELD.decoder_itos)} 
    FIELD.decoder_to_vocab = {idx: FIELD.vocab.stoi[word] for idx, word in enumerate(FIELD.decoder_itos)}
    FIELD.vocab_to_decoder = {idx: FIELD.decoder_stoi[word] for idx, word in enumerate(FIELD.vocab.itos) if word in FIELD.decoder_stoi}

    logger.info(f'Vocabulary has {len(FIELD.vocab)} tokens')
    logger.info('The first 500 tokens:')
    print(FIELD.vocab.itos[:500])

    logger.info('Preprocessing training data')
    preprocess_examples(args, args.train_tasks, train_sets, FIELD, logger, train=True) 
    logger.info('Preprocessing validation data')
    preprocess_examples(args, args.val_tasks, val_sets, FIELD, logger, train=args.val_filter)

    return FIELD, train_sets, val_sets
Example #3
def get_all_splits(args, new_vocab):
    splits = []
    for task in args.tasks:
        print(f'Loading {task}')
        kwargs = {}
        if 'valid' not in args.evaluate:
            kwargs['validation'] = None
        # if 'test' not in args.evaluate:
        #     kwargs['test'] = None
        s = get_splits(args, task, new_vocab, **kwargs)[0]
        preprocess_examples(args, [task], [s], new_vocab, train=False)
        splits.append(s)
    return splits
Example #4
def get_all_splits(args, new_vocab):
    splits = []
    for task in args.tasks:
        print(f'Loading {task}')
        kwargs = {}
        if 'train' not in args.evaluate:
            kwargs['train'] = None
        if 'valid' not in args.evaluate:
            kwargs['validation'] = None
        if 'test' not in args.evaluate:
            kwargs['test'] = None
        s = get_splits(args, task, new_vocab, **kwargs)[0]
        preprocess_examples(args, [task], [s], new_vocab, train=False)
        splits.append(s)
    return splits
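
A short usage sketch, assuming args.evaluate names the one split under evaluation (get_splits receives None for every split whose name is not contained in it) and new_vocab is a field constructed as in the examples above:

# Sketch: evaluate only the validation split; args.tasks/args.evaluate are assumed.
args.tasks = ['squad']
args.evaluate = 'valid'   # 'train' and 'test' do not occur in 'valid', so both become None
splits = get_all_splits(args, new_vocab)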
Example #5
def prepare_data(args, field, logger):
    if field is None:
        logger.info('Constructing field')
        FIELD = torchtext.data.ReversibleField(batch_first=True,
                                               init_token='<init>',
                                               eos_token='<eos>',
                                               lower=args.lower,
                                               include_lengths=True)
    else:
        FIELD = field

    train_sets, val_sets, vocab_sets = [], [], []
    for task in args.train_tasks:
        logger.info(f'Loading {task}')
        kwargs = {'test': None}
        kwargs['subsample'] = args.subsample
        kwargs['validation'] = None
        logger.info(f'Adding {task} to training datasets')
        split = get_splits(args, task, FIELD, **kwargs)[0]
        logger.info(f'{task} has {len(split)} training examples')
        train_sets.append(split)
        if args.vocab_tasks is not None and task in args.vocab_tasks:
            vocab_sets.extend(split)

    for task in args.val_tasks:
        logger.info(f'Loading {task}')
        kwargs = {'test': None}
        kwargs['subsample'] = args.subsample
        kwargs['train'] = None
        logger.info(f'Adding {task} to validation datasets')
        split = get_splits(args, task, FIELD, **kwargs)[0]
        logger.info(f'{task} has {len(split)} validation examples')
        val_sets.append(split)
        if args.vocab_tasks is not None and task in args.vocab_tasks:
            vocab_sets.extend(split)
    return train_sets, val_sets
Example #6
def prepare_data(args, field, logger):

    if field is None:
        logger.info('Constructing field')
        FIELD = torchtext.data.ReversibleField(batch_first=True,
                                               init_token='<init>',
                                               eos_token='<eos>',
                                               lower=args.lower,
                                               include_lengths=True)
    else:
        FIELD = field

    logger.debug(FIELD)

    train_sets, val_sets, vocab_sets = [], [], []
    # train sets, validation sets
    for task in args.train_tasks:
        logger.info(f'Loading {task}')
        # kwargs = {'test': None}
        # kwargs['subsample'] = args.subsample
        # kwargs['validation'] = None
        kwargs = {
            'test': None,
            'subsample': args.subsample,
            # 'subsample': 20000000
            'validation': None
        }
        logger.info(f'Adding {task} to training datasets')
        split = get_splits(args, task, FIELD, **kwargs)[0]
        # take the first element of the tuple: keep only train_data, drop the validation data
        # split = torchtext.datasets.generic.SQuAD.splits(fields=FIELD,
        # root=args.data, **kwargs)
        logger.info(f'{task} has {len(split)} training examples')
        logger.debug(type(split))
        train_sets.append(split)

        logger.debug(args.vocab_tasks)

        if args.vocab_tasks is not None and task in args.vocab_tasks:
            vocab_sets.extend(split)

    logger.debug(train_sets)

    # validation sets (mirrors the training loop above)
    for task in args.val_tasks:
        logger.info(f'Loading {task}')
        kwargs = {
            'test': None,
            'subsample': args.subsample,
            'train': None
        }
        logger.info(f'Adding {task} to validation datasets')
        split = get_splits(args, task, FIELD, **kwargs)[0]
        logger.info(f'{task} has {len(split)} validation examples')
        val_sets.append(split)
        if args.vocab_tasks is not None and task in args.vocab_tasks:
            vocab_sets.extend(split)

    return FIELD, train_sets, val_sets

def get_all_splits(args, new_vocab):
    splits = []
    for task in args.tasks:
        print(f'Loading {task}')
        kwargs = {}
        if 'train' not in args.evaluate:
            kwargs['train'] = None
        if 'valid' not in args.evaluate:
            kwargs['validation'] = None
        # if 'test' not in args.evaluate:
        #     kwargs['test'] = None
        s = get_splits(args, task, new_vocab, **kwargs)[0]
        preprocess_examples(args, [task], [s], new_vocab, train=False)
        splits.append(s)
    return splits

# new update on prepare function

def prepare_data(args, FIELD):
    new_vocab = torchtext.data.ReversibleField(batch_first=True, init_token='<init>', eos_token='<eos>', lower=args.lower, include_lengths=True)
    splits = get_all_splits(args, new_vocab)
    new_vocab.build_vocab(*splits)
    print(f'Vocabulary has {len(FIELD.vocab)} tokens from training')
    args.max_generative_vocab = min(len(FIELD.vocab), args.max_generative_vocab)
    FIELD.append_vocab(new_vocab)
    print(f'Vocabulary has expanded to {len(FIELD.vocab)} tokens')
# new update
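
Read together with get_all_splits above, this updated prepare_data builds a fresh vocabulary over the evaluation splits and merges it into the existing field in place; append_vocab is assumed to be a method of the custom field subclass used in this codebase (stock torchtext fields do not provide it). A minimal sketch of the intended call, assuming FIELD was restored from a training checkpoint with its vocabulary already built:

# Sketch: FIELD comes from training and already has FIELD.vocab; prepare_data
# mutates it in place rather than returning a new field.
prepare_data(args, FIELD)
print(f'FIELD now has {len(FIELD.vocab)} tokens')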