def _load_task_splits(args, tasks, FIELD, logger, extra_kwargs, label, dest_sets, vocab_sets):
    """Load the first split of each task in *tasks* and append it to *dest_sets*.

    extra_kwargs suppresses the splits that are not wanted for this pass
    (e.g. {'validation': None} when loading training data). Tasks listed in
    args.vocab_tasks additionally contribute their examples to *vocab_sets*
    (mutated in place). *label* is only used in log messages.
    """
    for task in tasks:
        logger.info(f'Loading {task}')
        # Always drop the test split; subsample per args; drop the unused split.
        kwargs = {'test': None, 'subsample': args.subsample}
        kwargs.update(extra_kwargs)
        logger.info(f'Adding {task} to {label} datasets')
        # get_splits returns a tuple of splits; only the first is requested here.
        split = get_splits(args, task, FIELD, **kwargs)[0]
        logger.info(f'{task} has {len(split)} {label} examples')
        dest_sets.append(split)
        if args.vocab_tasks is not None and task in args.vocab_tasks:
            vocab_sets.extend(split)


def prepare_data(args, field, logger):
    """Load train/validation splits, build the vocabulary, and preprocess.

    Constructs a fresh ReversibleField when *field* is None, otherwise reuses
    the caller's field (e.g. when resuming from a checkpoint). Returns
    (FIELD, train_sets, val_sets).
    """
    if field is None:
        logger.info(f'Constructing field')
        FIELD = torchtext.data.ReversibleField(batch_first=True, init_token='<init>',
                                               eos_token='<eos>', lower=args.lower,
                                               include_lengths=True)
    else:
        FIELD = field

    train_sets, val_sets, vocab_sets = [], [], []
    # The two loading passes differ only in which split they suppress.
    _load_task_splits(args, args.train_tasks, FIELD, logger,
                      {'validation': None}, 'training', train_sets, vocab_sets)
    _load_task_splits(args, args.val_tasks, FIELD, logger,
                      {'train': None}, 'validation', val_sets, vocab_sets)

    # Build the vocabulary only when starting fresh; a loaded model's field
    # already carries its vocabulary.
    if args.load is None:
        logger.info(f'Getting pretrained word vectors')
        char_vectors = torchtext.vocab.CharNGram(cache=args.embeddings)
        glove_vectors = torchtext.vocab.GloVe(cache=args.embeddings)
        vectors = [char_vectors, glove_vectors]
        # Fall back to all loaded splits when no dedicated vocab tasks were given.
        vocab_sets = (train_sets + val_sets) if len(vocab_sets) == 0 else vocab_sets
        logger.info(f'Building vocabulary')
        FIELD.build_vocab(*vocab_sets, max_size=args.max_effective_vocab, vectors=vectors)

    # Restrict the generative decoder to the most frequent tokens and build
    # index mappings between the decoder vocabulary and the full vocabulary.
    FIELD.decoder_itos = FIELD.vocab.itos[:args.max_generative_vocab]
    FIELD.decoder_stoi = {word: idx for idx, word in enumerate(FIELD.decoder_itos)}
    FIELD.decoder_to_vocab = {idx: FIELD.vocab.stoi[word] for idx, word in enumerate(FIELD.decoder_itos)}
    FIELD.vocab_to_decoder = {idx: FIELD.decoder_stoi[word] for idx, word in enumerate(FIELD.vocab.itos) if word in FIELD.decoder_stoi}
    logger.info(f'Vocabulary has {len(FIELD.vocab)} tokens')
    logger.info(f'The first 500 tokens:')
    print(FIELD.vocab.itos[:500])

    logger.info('Preprocessing training data')
    preprocess_examples(args, args.train_tasks, train_sets, FIELD, logger, train=True)
    logger.info('Preprocessing validation data')
    preprocess_examples(args, args.val_tasks, val_sets, FIELD, logger, train=args.val_filter)
    return FIELD, train_sets, val_sets
def prepare_data(args, field, logger):
    """Load train/validation splits, build the vocabulary, and preprocess.

    When *field* is None a fresh ReversibleField is constructed; otherwise the
    caller's field is reused (presumably when resuming — confirm with caller).
    Returns (FIELD, train_sets, val_sets).
    """
    if field is None:
        logger.info(f'Constructing field')
        FIELD = torchtext.data.ReversibleField(batch_first=True, init_token='<init>', eos_token='<eos>', lower=args.lower, include_lengths=True)
    else:
        FIELD = field
    train_sets, val_sets, vocab_sets = [], [], []
    for task in args.train_tasks:
        logger.info(f'Loading {task}')
        # Suppress the test and validation splits; only training data is kept here.
        kwargs = {'test': None}
        kwargs['subsample'] = args.subsample
        kwargs['validation'] = None
        logger.info(f'Adding {task} to training datasets')
        # get_splits returns a tuple of splits; [0] keeps the single requested one.
        split = get_splits(args, task, FIELD, **kwargs)[0]
        logger.info(f'{task} has {len(split)} training examples')
        train_sets.append(split)
        # Only tasks listed in args.vocab_tasks contribute examples to the vocab.
        if args.vocab_tasks is not None and task in args.vocab_tasks:
            vocab_sets.extend(split)
    for task in args.val_tasks:
        logger.info(f'Loading {task}')
        # Suppress the test and train splits; only validation data is kept here.
        kwargs = {'test': None}
        kwargs['subsample'] = args.subsample
        kwargs['train'] = None
        logger.info(f'Adding {task} to validation datasets')
        split = get_splits(args, task, FIELD, **kwargs)[0]
        logger.info(f'{task} has {len(split)} validation examples')
        val_sets.append(split)
        if args.vocab_tasks is not None and task in args.vocab_tasks:
            vocab_sets.extend(split)
    # Build the vocabulary only when starting fresh; a loaded model's field
    # already carries its vocabulary.
    if args.load is None:
        logger.info(f'Building vocabulary')
        char_vectors = torchtext.vocab.CharNGram(cache=args.embeddings)
        glove_vectors = torchtext.vocab.GloVe(cache=args.embeddings)
        vectors = [char_vectors, glove_vectors]
        # Fall back to all loaded splits when no dedicated vocab tasks were given.
        vocab_sets = (train_sets + val_sets) if len(vocab_sets) == 0 else vocab_sets
        FIELD.build_vocab(*vocab_sets, max_size=args.max_effective_vocab, vectors=vectors)
    # Restrict the generative decoder to the most frequent tokens and build
    # index mappings between the decoder vocabulary and the full vocabulary.
    FIELD.decoder_itos = FIELD.vocab.itos[:args.max_generative_vocab]
    FIELD.decoder_stoi = {word: idx for idx, word in enumerate(FIELD.decoder_itos)}
    FIELD.decoder_to_vocab = {idx: FIELD.vocab.stoi[word] for idx, word in enumerate(FIELD.decoder_itos)}
    FIELD.vocab_to_decoder = {idx: FIELD.decoder_stoi[word] for idx, word in enumerate(FIELD.vocab.itos) if word in FIELD.decoder_stoi}
    logger.info(f'Vocabulary has {len(FIELD.vocab)} tokens')
    logger.info(f'The first 500 tokens:')
    print(FIELD.vocab.itos[:500])
    logger.info('Preprocessing training data')
    preprocess_examples(args, args.train_tasks, train_sets, FIELD, logger, train=True)
    logger.info('Preprocessing validation data')
    # Validation examples are filtered only when args.val_filter is truthy.
    preprocess_examples(args, args.val_tasks, val_sets, FIELD, logger, train=args.val_filter)
    return FIELD, train_sets, val_sets
def get_all_splits(args, new_vocab):
    """Load and preprocess one split per task against *new_vocab*.

    Suppresses only the validation split; train and test splits are loaded as
    get_splits defaults dictate. Returns the list of loaded splits.

    Fix: the original fell off the end without a `return`, yielding None and
    crashing callers that iterate or unpack the result.
    """
    splits = []
    for task in args.tasks:
        print(f'Loading {task}')
        kwargs = {}
        kwargs['validation'] = None
        # get_splits returns a tuple of splits; [0] keeps the first one loaded.
        s = get_splits(args, task, new_vocab, **kwargs)[0]
        preprocess_examples(args, [task], [s], new_vocab, train=False)
        splits.append(s)
    return splits
def get_all_splits(args, new_vocab):
    """Load and preprocess the requested evaluation split(s) for every task.

    Any split whose name is absent from args.evaluate is suppressed by
    passing None for its keyword. Returns the list of loaded splits.
    """
    splits = []
    # Maps the get_splits keyword to the name it carries in args.evaluate.
    split_names = (('train', 'train'), ('validation', 'valid'), ('test', 'test'))
    for task in args.tasks:
        print(f'Loading {task}')
        kwargs = {kw: None for kw, name in split_names if name not in args.evaluate}
        loaded = get_splits(args, task, new_vocab, **kwargs)[0]
        preprocess_examples(args, [task], [loaded], new_vocab, train=False)
        splits.append(loaded)
    return splits
def prepare_data(args, field, logger):
    """Build (or reuse) the shared field and load train/validation splits.

    Returns (train_sets, val_sets); note this variant does not return the
    field itself, and the collected vocab_sets are not used further here.
    """
    if field is None:
        logger.info(f'Constructing field')
        FIELD = torchtext.data.ReversibleField(batch_first=True, init_token='<init>',
                                               eos_token='<eos>', lower=args.lower,
                                               include_lengths=True)
    else:
        FIELD = field

    train_sets, val_sets, vocab_sets = [], [], []

    def load_one(task, suppressed_split, label, destination):
        # Load the first split of `task`, suppressing the split not needed
        # for this pass, and record vocab examples for vocab tasks.
        logger.info(f'Loading {task}')
        kwargs = {'test': None, 'subsample': args.subsample, suppressed_split: None}
        logger.info(f'Adding {task} to {label} datasets')
        split = get_splits(args, task, FIELD, **kwargs)[0]
        logger.info(f'{task} has {len(split)} {label} examples')
        destination.append(split)
        if args.vocab_tasks is not None and task in args.vocab_tasks:
            vocab_sets.extend(split)

    for task in args.train_tasks:
        load_one(task, 'validation', 'training', train_sets)
    for task in args.val_tasks:
        load_one(task, 'train', 'validation', val_sets)

    return train_sets, val_sets
def prepare_data(args, field, logger):
    """Draft variant of prepare_data with extra debug logging.

    NOTE(review): this definition appears truncated — it loads only the
    training tasks, never fills val_sets, and has no return statement, so it
    returns None. Presumably an unfinished work-in-progress copy; confirm
    before relying on it.
    """
    if field is None:
        logger.info(f'Constructing field')
        FIELD = torchtext.data.ReversibleField(batch_first=True, init_token='<init>', eos_token='<eos>', lower=args.lower, include_lengths=True)
    else:
        FIELD = field
    logger.debug(FIELD)
    train_sets, val_sets, vocab_sets = [], [], []  # train sets, validation sets
    for task in args.train_tasks:
        logger.info(f'Loading {task}')
        # kwargs = {'test': None}
        # kwargs['subsample'] = args.subsample
        # kwargs['validation'] = None
        kwargs = {
            'test': None,
            'subsample': args.subsample,  # 'subsample': 20000000
            'validation': None
        }
        logger.info(f'Adding {task} to training datasets')
        # Take the first element of the tuple: keep only the train data,
        # discard the validation data.
        split = get_splits(args, task, FIELD, **kwargs)[0]
        # split = torchtext.datasets.generic.SQuAD.splits(fields=FIELD,
        #                                                 root=args.data, **kwargs)
        logger.info(f'{task} has {len(split)} training examples')
        logger.debug(type(split))
        train_sets.append(split)
        logger.debug(args.vocab_tasks)
        # Only tasks listed in args.vocab_tasks contribute examples to the vocab.
        if args.vocab_tasks is not None and task in args.vocab_tasks:
            vocab_sets.extend(split)
    logger.debug(train_sets)
# NOTE(review): this region previously held orphaned mid-function fragments of
# two `get_all_splits` variants, including module-level `return` statements
# (a SyntaxError) and references to undefined names (`task`, `kwargs`,
# `new_vocab`). They were dead, duplicated residue of the complete
# `get_all_splits` defined above and have been removed so the module can be
# imported.

# new update on prepare function
def prepare_data(args, FIELD):
    """Expand FIELD's vocabulary with tokens from the evaluation splits.

    Builds a throwaway ReversibleField, loads every task's requested split
    with it via get_all_splits, builds a vocabulary over those splits, and
    appends that vocabulary onto FIELD in place.

    Side effects: mutates FIELD (via append_vocab) and caps
    args.max_generative_vocab at the current training-vocab size.
    Returns None.
    """
    new_vocab = torchtext.data.ReversibleField(batch_first=True, init_token='<init>',
                                               eos_token='<eos>', lower=args.lower,
                                               include_lengths=True)
    splits = get_all_splits(args, new_vocab)
    new_vocab.build_vocab(*splits)
    print(f'Vocabulary has {len(FIELD.vocab)} tokens from training')
    # The generative vocab cannot exceed the training vocabulary's size.
    args.max_generative_vocab = min(len(FIELD.vocab), args.max_generative_vocab)
    FIELD.append_vocab(new_vocab)
    print(f'Vocabulary has expanded to {len(FIELD.vocab)} tokens')
# new update