def get_dataloader(dataset):
    """create data loader based on the dataset chunk"""
    lengths = dataset.get_field('valid_lengths')
    # A batch includes: input_id, masked_id, masked_position, masked_weight,
    # next_sentence_label, segment_id, valid_length
    batchify_fn = Tuple(Pad(), Pad(), Pad(), Pad(), Stack(), Pad(), Stack())
    if use_avg_len:
        # sharded data loader
        sampler = nlp.data.FixedBucketSampler(lengths=lengths,
                                              # batch_size per shard
                                              batch_size=batch_size,
                                              num_buckets=num_buckets,
                                              shuffle=shuffle,
                                              use_average_length=True,
                                              num_shards=num_ctxes)
        dataloader = nlp.data.ShardedDataLoader(dataset,
                                                batch_sampler=sampler,
                                                batchify_fn=batchify_fn,
                                                num_workers=num_ctxes)
    else:
        sampler = nlp.data.FixedBucketSampler(lengths,
                                              batch_size=batch_size * num_ctxes,
                                              num_buckets=num_buckets,
                                              ratio=0,
                                              shuffle=shuffle)
        dataloader = DataLoader(dataset=dataset,
                                batch_sampler=sampler,
                                batchify_fn=batchify_fn,
                                num_workers=1)
    logging.debug('Sampler created for a new dataset:\n%s', sampler.stats())
    return dataloader
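# A hedged sketch (assuming gluonnlp>=0.8) contrasting the two sampler modes
# above: with use_average_length=True, `batch_size` is a per-shard token
# budget; otherwise it counts samples per batch. The toy lengths are
# hypothetical.
import gluonnlp as nlp

toy_lengths = [10, 20, 30, 40] * 25
token_budget_sampler = nlp.data.FixedBucketSampler(
    toy_lengths, batch_size=512,  # ~512 tokens per batch per shard
    num_buckets=4, use_average_length=True, num_shards=2)
sample_count_sampler = nlp.data.FixedBucketSampler(
    toy_lengths, batch_size=32, num_buckets=4)  # 32 samples per batch
print(token_budget_sampler.stats())
print(sample_count_sampler.stats())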
def __init__(self, num_ctxes, vocab):
    self._num_ctxes = num_ctxes
    pad_val = vocab[vocab.padding_token]
    self._batchify_fn = Tuple(
        Pad(pad_val=pad_val, round_to=8),  # input_id
        Pad(pad_val=pad_val),              # masked_id
        Pad(pad_val=0),                    # masked_position
        Pad(pad_val=0),                    # masked_weight
        Stack(),                           # next_sentence_label
        Pad(pad_val=0, round_to=8),        # segment_id
        Stack())                           # valid_length
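# A minimal sketch of how the Tuple/Pad/Stack batchify above composes
# (gluonnlp.data.batchify assumed): each field of a sample tuple is
# collated by the matching function, so Pad handles variable-length
# sequences while Stack handles fixed-size fields.
from gluonnlp.data.batchify import Pad, Stack, Tuple

samples = [([1, 2, 3], 0), ([4, 5], 1)]  # (token_ids, label) pairs
toy_batchify_fn = Tuple(Pad(pad_val=0), Stack())
token_ids, labels = toy_batchify_fn(samples)
print(token_ids.shape)  # (2, 3): shorter sequence right-padded with 0
print(labels.shape)     # (2,)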
def test_dataloader():
    batch_size = 128
    dataset = NLPCCDataset('train', data_root_dir)
    transformer = DataTransformer(['train'])
    word_vocab = transformer._word_vocab
    transformed_dataset = dataset.transform(transformer, lazy=False)
    batchify_fn = Tuple(
        Stack(),
        Pad(axis=0, pad_val=word_vocab[word_vocab.padding_token], ret_length=True),
        Stack())
    sampler = FixedBucketSampler(
        lengths=[len(item[1]) for item in transformed_dataset],
        batch_size=batch_size,
        shuffle=True,
        num_buckets=30)
    data_loader = DataLoader(transformed_dataset,
                             batchify_fn=batchify_fn,
                             batch_sampler=sampler)
    for i, (rec_id, (data, original_length), label) in enumerate(data_loader):
        print(data.shape)
        assert data.shape[0] <= batch_size
def __call__(self, dataset, sampler):
    # A batch includes: input_id, masked_id, masked_position, masked_weight,
    # next_sentence_label, segment_id, valid_length
    batchify_fn = Tuple(Pad(), Pad(), Pad(), Pad(), Stack(), Pad(), Stack())
    if self._use_avg_len:
        # sharded data loader
        dataloader = nlp.data.ShardedDataLoader(dataset,
                                                batch_sampler=sampler,
                                                batchify_fn=batchify_fn,
                                                num_workers=self._num_ctxes)
    else:
        dataloader = DataLoader(dataset=dataset,
                                batch_sampler=sampler,
                                batchify_fn=batchify_fn,
                                num_workers=self._num_ctxes)
    return dataloader
def __call__(self, dataset):
    """create data loader based on the dataset chunk"""
    if isinstance(dataset, nlp.data.NumpyDataset):
        lengths = dataset.get_field('valid_lengths')
    elif isinstance(dataset, BERTPretrainDataset):
        lengths = dataset.transform(
            lambda input_ids, segment_ids, masked_lm_positions, masked_lm_ids,
                   masked_lm_weights, next_sentence_labels, valid_lengths:
                   valid_lengths,
            lazy=False)
    else:
        raise ValueError('unexpected dataset type: %s' % str(dataset))
    # A batch includes: input_id, masked_id, masked_position, masked_weight,
    # next_sentence_label, segment_id, valid_length
    batchify_fn = Tuple(Pad(), Pad(), Pad(), Pad(), Stack(), Pad(), Stack())
    if self._use_avg_len:
        # sharded data loader
        sampler = nlp.data.FixedBucketSampler(lengths=lengths,
                                              # batch_size per shard
                                              batch_size=self._batch_size,
                                              num_buckets=self._num_buckets,
                                              shuffle=self._shuffle,
                                              use_average_length=True,
                                              num_shards=self._num_ctxes)
        dataloader = nlp.data.ShardedDataLoader(dataset,
                                                batch_sampler=sampler,
                                                batchify_fn=batchify_fn,
                                                num_workers=self._num_ctxes)
    else:
        sampler = nlp.data.FixedBucketSampler(lengths,
                                              batch_size=self._batch_size * self._num_ctxes,
                                              num_buckets=self._num_buckets,
                                              ratio=0,
                                              shuffle=self._shuffle)
        dataloader = DataLoader(dataset=dataset,
                                batch_sampler=sampler,
                                batchify_fn=batchify_fn,
                                num_workers=1)
    logging.debug('Sampler created for a new dataset:\n%s', sampler.stats())
    return dataloader
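# Sketch of the transform(..., lazy=False) pattern above: MXNet's
# Dataset.transform unpacks tuple samples into the function's arguments,
# so a lambda can project out a single field eagerly. The toy dataset is
# hypothetical.
from mxnet.gluon.data import SimpleDataset

toy = SimpleDataset([([1, 2, 3], 3), ([4, 5], 2)])  # (input_ids, valid_length)
toy_lengths = toy.transform(lambda input_ids, valid_length: valid_length,
                            lazy=False)
print(list(toy_lengths))  # [3, 2]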
def __init__(self, vocab_provider, question_max_length, context_max_length):
    self._word_vocab = vocab_provider.get_word_level_vocab()
    self._char_vocab = vocab_provider.get_char_level_vocab()
    self._question_max_length = question_max_length
    self._context_max_length = context_max_length
    self._padder = Pad()
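# Hedged sketch of the Vocab lookups these transformers rely on
# (gluonnlp.Vocab assumed): indexing a Vocab by token returns its id, and
# padding_token is '<pad>', typically id 1 with the default reserved tokens.
import gluonnlp as nlp

vocab = nlp.Vocab(nlp.data.count_tokens(['hello', 'world', 'hello']))
print(vocab.padding_token, vocab[vocab.padding_token])  # <pad> 1
print(vocab['hello'])  # id of a known token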
def get_predictions(net, true_intent, intent_map, slots_map, context, batch_size):
    """Get predictions for every item of the given intent.

    Returns a list indexed the same as the validation items. Each record
    has the format:
    (predicted_intent, [(list of text pieces, slot), ...], text, tokens)
    """
    result = []
    idx_to_slot = {v: k for k, v in slots_map.items()}
    idx_to_intent = {v: k for k, v in intent_map.items()}
    intent_dev_dataset = NLUBenchmarkDataset(SacreMosesTokenizer(), 'val',
                                             intent_map, slots_map,
                                             intent_to_load=true_intent)
    transformer = DataTransformer(ELMoCharVocab())
    transformed_dev_dataset = intent_dev_dataset.transform(transformer, lazy=False)
    batchify_fn = Tuple(Pad(), Stack(), Pad(), Stack())
    dev_dataloader = DataLoader(transformed_dev_dataset,
                                batch_size=batch_size,
                                num_workers=multiprocessing.cpu_count() - 3,
                                batchify_fn=batchify_fn)
    for i, (data, valid_lengths, entities, intent) in enumerate(dev_dataloader):
        items_per_iteration = data.shape[0]
        length = data.shape[1]
        data = data.as_in_context(context)
        hidden_state = net.elmo_container[0].begin_state(mx.nd.zeros,
                                                         batch_size=items_per_iteration,
                                                         ctx=context)
        mask = get_data_mask(length, valid_lengths, items_per_iteration, context)
        intents, slots = net(data, hidden_state, mask)
        score, slots_seq = net.crf(slots.transpose(axes=(1, 0, 2)))
        intents_prediction = intents.argmax(axis=1).asnumpy()
        slots_prediction = slots_seq.asnumpy()
        for rec_id, pred_intent in enumerate(intents_prediction):
            # index into the untransformed dataset, offset by batches seen so far
            dataset_idx = i * batch_size + rec_id
            text = intent_dev_dataset[dataset_idx][0]
            tokens = intent_dev_dataset[dataset_idx][1]
            slot_prediction = slots_prediction[rec_id]
            prediction_item = get_prediction_item(idx_to_slot, slot_prediction, tokens)
            result.append((idx_to_intent[pred_intent], prediction_item, text, tokens))
    return result
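# A hedged reimplementation sketch of the `get_data_mask` helper called
# above (the real helper lives in the surrounding script; this version is
# an assumption): 1.0 for real token positions, 0.0 for padding.
import mxnet as mx

def get_data_mask_sketch(length, valid_lengths, batch_size, ctx):
    # batch_size is implied by valid_lengths; kept to mirror the call site
    positions = mx.nd.arange(length, ctx=ctx).expand_dims(0)              # (1, T)
    valid = valid_lengths.astype('float32').as_in_context(ctx).expand_dims(1)  # (B, 1)
    return positions < valid  # broadcast comparison -> (B, T) 0/1 mask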
def get_dataloader(dataset):
    """create data loader based on the dataset chunk"""
    t0 = time.time()
    lengths = dataset.get_field('valid_lengths')
    logging.debug('Num samples = %d', len(lengths))
    # A batch includes: input_id, masked_id, masked_position, masked_weight,
    # next_sentence_label, segment_id, valid_length
    batchify_fn = Tuple(Pad(), Pad(), Pad(), Pad(), Stack(), Pad(), Stack())
    if args.by_token:
        # sharded data loader
        sampler = nlp.data.FixedBucketSampler(lengths=lengths,
                                              # batch_size per shard
                                              batch_size=batch_size,
                                              num_buckets=args.num_buckets,
                                              shuffle=is_train,
                                              use_average_length=True,
                                              num_shards=num_ctxes)
        dataloader = nlp.data.ShardedDataLoader(dataset,
                                                batch_sampler=sampler,
                                                batchify_fn=batchify_fn,
                                                num_workers=num_ctxes)
    else:
        sampler = FixedBucketSampler(lengths,
                                     batch_size=batch_size * num_ctxes,
                                     num_buckets=args.num_buckets,
                                     ratio=0,
                                     shuffle=is_train)
        dataloader = DataLoader(dataset=dataset,
                                batch_sampler=sampler,
                                batchify_fn=batchify_fn,
                                num_workers=1)
    logging.debug('Batch Sampler:\n%s', sampler.stats())
    t1 = time.time()
    logging.debug('Dataloader creation cost = %.2f s', t1 - t0)
    return dataloader
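# Sketch of the `ratio` knob used above (gluonnlp FixedBucketSampler,
# assuming >=0.8): ratio > 0 scales up the batch size of buckets holding
# shorter sequences; ratio=0, as in this script, keeps the batch size
# fixed across buckets. The toy lengths are hypothetical.
import gluonnlp as nlp

toy_lengths = list(range(5, 105))
fixed = nlp.data.FixedBucketSampler(toy_lengths, batch_size=8,
                                    num_buckets=5, ratio=0)
scaled = nlp.data.FixedBucketSampler(toy_lengths, batch_size=8,
                                     num_buckets=5, ratio=0.5)
print(fixed.stats())   # same batch size in every bucket
print(scaled.stats())  # larger batches for the short-sequence buckets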
def transform_segment(transformer, segment, options):
    dataset = NLPCCDataset(segment, './data/')
    transformed_dataset = dataset.transform(transformer, lazy=False)
    word_vocab = transformer.get_word_vocab()
    batchify_fn = Tuple(
        Stack(),
        Pad(axis=0, pad_val=word_vocab[word_vocab.padding_token], ret_length=True),
        Stack())
    sampler = FixedBucketSampler(
        lengths=[len(item[1]) for item in transformed_dataset],
        batch_size=options.batch_size,
        shuffle=True,
        num_buckets=options.num_buckets)
    return DataLoader(transformed_dataset,
                      batchify_fn=batchify_fn,
                      batch_sampler=sampler)
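# A minimal sketch of the bucketing behind the samplers above (toy lengths
# are hypothetical): sequences of similar length are grouped so each batch
# needs little padding, and stats() summarizes the resulting buckets.
import random
from gluonnlp.data import FixedBucketSampler

toy_lengths = [random.randint(5, 100) for _ in range(1000)]
sampler = FixedBucketSampler(toy_lengths, batch_size=32,
                             num_buckets=10, shuffle=True)
print(sampler.stats())         # bucket boundaries, sizes, batch sizes
for batch_indices in sampler:  # each element is a list of dataset indices
    assert len(batch_indices) <= 32
    break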
if __name__ == '__main__':
    logging = get_logger(__name__)
    logging.info('numpy version:{} MXNet version:{}'.format(np.__version__,
                                                            mx.__version__))
    options = parse_args()
    ctx = mx.gpu() if options.num_gpus > 0 else mx.cpu()

    train_df = read_data(options.training_dir, options.num_datasets)
    train_datasets = [PredictiveMaintenanceDataset(df, is_train=True,
                                                   is_many_to_one=options.is_many_to_one)
                      for df in train_df]
    batchify = Tuple(Pad(ret_length=True),
                     Stack() if options.is_many_to_one else Pad())
    dataset_index = 0
    train_data = gluon.data.DataLoader(train_datasets[dataset_index],
                                       shuffle=True,
                                       batch_size=options.batch_size,
                                       num_workers=8,
                                       batchify_fn=batchify)
    logging.info('We have {} training timeseries'.format(len(train_datasets[dataset_index])))

    net = TimeSeriesNet(options.num_layers, options.num_units, options.dropout)
    net.hybridize(static_alloc=True)
    net.initialize(mx.init.Normal(), ctx=ctx)
    logging.info('Model created and initialized')

    optimizer_params = {'learning_rate': options.learning_rate,
                        'wd': options.wd,
                        'clip_gradient': options.clip_gradient}
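# Sketch of Pad(ret_length=True), used in `batchify` above: the padded
# field comes back together with the pre-padding lengths.
from gluonnlp.data.batchify import Pad

padded, original_lengths = Pad(pad_val=0, ret_length=True)([[1, 2, 3], [4, 5]])
print(padded.shape)                # (2, 3)
print(original_lengths.asnumpy())  # [3 2]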
def evaluate(args):
    ctx_l = [mx.cpu()] if args.gpus is None or args.gpus == '' \
        else [mx.gpu(int(x)) for x in args.gpus.split(',')]
    src_normalizer = get_normalizer(args.src_normalizer, args.src_lang)
    tgt_normalizer = get_normalizer(args.src_normalizer, args.tgt_lang)
    base_src_tokenizer = get_base_tokenizer(args.src_base_tokenizer, args.src_lang)
    base_tgt_tokenizer = get_base_tokenizer(args.tgt_base_tokenizer, args.tgt_lang)
    src_tokenizer = create_tokenizer(args.src_tokenizer,
                                     args.src_subword_model_path,
                                     args.src_vocab_path)
    tgt_tokenizer = create_tokenizer(args.tgt_tokenizer,
                                     args.tgt_subword_model_path,
                                     args.tgt_vocab_path)
    src_vocab = src_tokenizer.vocab
    tgt_vocab = tgt_tokenizer.vocab
    if args.cfg.endswith('.yml'):
        cfg = TransformerModel.get_cfg().clone_merge(args.cfg)
    else:
        cfg = TransformerModel.get_cfg(args.cfg)
    cfg.defrost()
    cfg.MODEL.src_vocab_size = len(src_vocab)
    cfg.MODEL.tgt_vocab_size = len(tgt_vocab)
    if args.fp16:
        cfg.MODEL.dtype = 'float16'
    cfg.freeze()
    model = TransformerModel.from_cfg(cfg)
    if args.fp16:
        model.cast('float16')
    model.hybridize()
    model.load_parameters(args.param_path, ctx=ctx_l, cast_dtype=True)
    inference_model = TransformerInference(model=model)
    inference_model.hybridize()
    # Construct the BeamSearchSampler
    if args.stochastic:
        scorer = BeamSearchScorer(alpha=0.0, K=0.0,
                                  temperature=args.temperature,
                                  from_logits=False)
    else:
        scorer = BeamSearchScorer(alpha=args.lp_alpha, K=args.lp_k,
                                  from_logits=False)
    beam_search_sampler = BeamSearchSampler(beam_size=args.beam_size,
                                            decoder=inference_model,
                                            vocab_size=len(tgt_vocab),
                                            eos_id=tgt_vocab.eos_id,
                                            scorer=scorer,
                                            stochastic=args.stochastic,
                                            max_length_a=args.max_length_a,
                                            max_length_b=args.max_length_b)
    logging.info(beam_search_sampler)
    all_src_token_ids, all_src_lines = process_corpus(
        args.src_corpus,
        sentence_normalizer=src_normalizer,
        base_tokenizer=base_src_tokenizer,
        bpe_tokenizer=src_tokenizer,
        add_bos=False,
        add_eos=True)
    if args.tgt_corpus is not None:
        all_tgt_token_ids, all_tgt_lines = process_corpus(
            args.tgt_corpus,
            sentence_normalizer=tgt_normalizer,
            base_tokenizer=base_tgt_tokenizer,
            bpe_tokenizer=tgt_tokenizer,
            add_bos=True,
            add_eos=True)
    else:
        # when applying inference, populate placeholder tgt tokens
        all_tgt_token_ids = all_tgt_lines = [[] for _ in range(len(all_src_token_ids))]
    test_dataloader = gluon.data.DataLoader(
        list(zip(all_src_token_ids,
                 [len(ele) for ele in all_src_token_ids],
                 all_tgt_token_ids,
                 [len(ele) for ele in all_tgt_token_ids])),
        batch_size=32,
        batchify_fn=Tuple(Pad(), Stack(), Pad(), Stack()),
        shuffle=False)
    ctx = ctx_l[0]
    pred_sentences = []
    start_eval_time = time.time()
    # evaluate
    if not args.inference:
        avg_nll_loss = 0
        ntokens = 0
        for i, (src_token_ids, src_valid_length, tgt_token_ids, tgt_valid_length) \
                in enumerate(test_dataloader):
            src_token_ids = mx.np.array(src_token_ids, ctx=ctx, dtype=np.int32)
            src_valid_length = mx.np.array(src_valid_length, ctx=ctx, dtype=np.int32)
            tgt_token_ids = mx.np.array(tgt_token_ids, ctx=ctx, dtype=np.int32)
            tgt_valid_length = mx.np.array(tgt_valid_length, ctx=ctx, dtype=np.int32)
            if model.layout == 'NT':
                tgt_pred = model(src_token_ids, src_valid_length,
                                 tgt_token_ids[:, :-1], tgt_valid_length - 1)
                pred_logits = mx.npx.log_softmax(tgt_pred, axis=-1)
                nll = - mx.npx.pick(pred_logits, tgt_token_ids[:, 1:])
                avg_nll_loss += mx.npx.sequence_mask(nll,
                                                     sequence_length=tgt_valid_length - 1,
                                                     use_sequence_length=True,
                                                     axis=1).sum().asnumpy()
            elif model.layout == 'TN':
                tgt_pred = model(src_token_ids.T, src_valid_length,
                                 tgt_token_ids.T[:-1, :], tgt_valid_length - 1)
                pred_logits = mx.npx.log_softmax(tgt_pred, axis=-1)
                nll = - mx.npx.pick(pred_logits, tgt_token_ids.T[1:, :])
                avg_nll_loss += mx.npx.sequence_mask(nll,
                                                     sequence_length=tgt_valid_length - 1,
                                                     use_sequence_length=True,
                                                     axis=0).sum().asnumpy()
            else:
                raise NotImplementedError
            ntokens += int((tgt_valid_length - 1).sum().asnumpy())
            init_input = mx.np.array([tgt_vocab.bos_id
                                      for _ in range(src_token_ids.shape[0])],
                                     ctx=ctx)
            if model.layout == 'NT':
                states = inference_model.init_states(src_token_ids, src_valid_length)
            elif model.layout == 'TN':
                states = inference_model.init_states(src_token_ids.T, src_valid_length)
            else:
                raise NotImplementedError
            samples, scores, valid_length = beam_search_sampler(init_input, states,
                                                                src_valid_length)
            for j in range(samples.shape[0]):
                pred_tok_ids = samples[j, 0, :valid_length[j, 0].asnumpy()]\
                    .asnumpy().tolist()
                bpe_decode_line = tgt_tokenizer.decode(pred_tok_ids[1:-1])
                pred_sentence = base_tgt_tokenizer.decode(bpe_decode_line.split(' '))
                pred_sentences.append(pred_sentence)
                print(pred_sentence)
            print('Processed {}/{}'.format(len(pred_sentences), len(all_tgt_lines)))
        end_eval_time = time.time()
        avg_nll_loss = avg_nll_loss / ntokens
        with open(os.path.join(args.save_dir, 'gt_sentences.txt'), 'w',
                  encoding='utf-8') as of:
            of.write('\n'.join(all_tgt_lines))
            of.write('\n')
        with open(os.path.join(args.save_dir, 'pred_sentences.txt'), 'w',
                  encoding='utf-8') as of:
            of.write('\n'.join(pred_sentences))
            of.write('\n')
        sacrebleu_out = sacrebleu.corpus_bleu(sys_stream=pred_sentences,
                                              ref_streams=[all_tgt_lines])
        logging.info('Time Spent: {}, #Sent={}, SacreBLEU={} '
                     '({:2.1f} {:2.1f} {:2.1f} {:2.1f}) '
                     '(BP={:.3f}, ratio={:.3f}, syslen={}, reflen={}), '
                     'Avg NLL={}, Perplexity={}'
                     .format(end_eval_time - start_eval_time, len(all_tgt_lines),
                             sacrebleu_out.score, *sacrebleu_out.precisions,
                             sacrebleu_out.bp,
                             sacrebleu_out.sys_len / sacrebleu_out.ref_len,
                             sacrebleu_out.sys_len, sacrebleu_out.ref_len,
                             avg_nll_loss, np.exp(avg_nll_loss)))
        results = {'sacrebleu': sacrebleu_out.score, 'nll': avg_nll_loss}
        with open(os.path.join(args.save_dir, 'results.json'), 'w') as of:
            json.dump(results, of)
    # inference only
    else:
        with open(os.path.join(args.save_dir, 'pred_sentences.txt'), 'w',
                  encoding='utf-8') as of:
            processed_sentences = 0
            for src_token_ids, src_valid_length, _, _ in tqdm(test_dataloader):
                src_token_ids = mx.np.array(src_token_ids, ctx=ctx, dtype=np.int32)
                src_valid_length = mx.np.array(src_valid_length, ctx=ctx,
                                               dtype=np.int32)
                init_input = mx.np.array([tgt_vocab.bos_id
                                          for _ in range(src_token_ids.shape[0])],
                                         ctx=ctx)
                if model.layout == 'NT':
                    states = inference_model.init_states(src_token_ids,
                                                         src_valid_length)
                elif model.layout == 'TN':
                    states = inference_model.init_states(src_token_ids.T,
                                                         src_valid_length)
                else:
                    raise NotImplementedError
                samples, scores, valid_length = beam_search_sampler(init_input,
                                                                    states,
                                                                    src_valid_length)
                for j in range(samples.shape[0]):
                    pred_tok_ids = samples[j, 0, :valid_length[j, 0].asnumpy()]\
                        .asnumpy().tolist()
                    bpe_decode_line = tgt_tokenizer.decode(pred_tok_ids[1:-1])
                    pred_sentence = base_tgt_tokenizer.decode(bpe_decode_line.split(' '))
                    pred_sentences.append(pred_sentence)
                of.write('\n'.join(pred_sentences))
                of.write('\n')
                processed_sentences += len(pred_sentences)
                pred_sentences = []
        end_eval_time = time.time()
        logging.info('Time Spent: {}, Inferred sentences: {}'
                     .format(end_eval_time - start_eval_time, processed_sentences))
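# Minimal numeric check of the perplexity computation above: perplexity is
# the exponential of the token-averaged negative log-likelihood (the totals
# below are hypothetical).
import numpy as np

nll_sum, ntokens = 230.26, 100
avg_nll = nll_sum / ntokens  # ~2.30 nats per token
print(np.exp(avg_nll))       # ~10.0 perplexity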
args = parse_args()
context = mx.cpu(0) if args.gpu is None else mx.gpu(args.gpu)

train_dataset = NLUBenchmarkDataset(SacreMosesTokenizer(), 'train_full')
print(train_dataset.get_intent_map())
print(train_dataset.get_slots_map())
dev_dataset = NLUBenchmarkDataset(SacreMosesTokenizer(), 'val',
                                  train_dataset.get_intent_map(),
                                  train_dataset.get_slots_map())

transformer = DataTransformer(ELMoCharVocab())
transformed_train_dataset = train_dataset.transform(transformer, lazy=False)
transformed_dev_dataset = dev_dataset.transform(transformer, lazy=False)

batchify_fn = Tuple(Pad(), Stack(), Pad(), Stack())
train_dataloader = DataLoader(transformed_train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=multiprocessing.cpu_count() - 3,
                              batchify_fn=batchify_fn)
dev_dataloader = DataLoader(transformed_dev_dataset,
                            batch_size=args.batch_size,
                            shuffle=True,
                            num_workers=multiprocessing.cpu_count() - 3,
                            batchify_fn=batchify_fn)

slots_count = len(train_dataset.get_slots_map())
intents_count = len(train_dataset.get_intent_map())
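# The `cpu_count() - 3` worker count used above goes negative on machines
# with fewer than four cores; a defensive variant (an assumption, not in
# the original scripts) clamps it at zero, which falls back to
# single-process data loading.
import multiprocessing

num_workers = max(0, multiprocessing.cpu_count() - 3)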