def test_dataloader():
    batch_size = 128
    dataset = NLPCCDataset('train', data_root_dir)
    transformer = DataTransformer(['train'])

    word_vocab = transformer._word_vocab
    transformed_dataset = dataset.transform(transformer, lazy=False)

    batchify_fn = Tuple(
        Stack(),
        Pad(axis=0,
            pad_val=word_vocab[word_vocab.padding_token],
            ret_length=True), Stack())
    sampler = FixedBucketSampler(
        lengths=[len(item[1]) for item in transformed_dataset],
        batch_size=batch_size,
        shuffle=True,
        num_buckets=30)

    data_loader = DataLoader(transformed_dataset,
                             batchify_fn=batchify_fn,
                             batch_sampler=sampler)

    for i, (rec_id, (data, original_length), label) in enumerate(data_loader):
        print(data.shape)
        assert data.shape[0] <= batch_size
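
A minimal standalone sketch of what the batchify function above produces, using toy samples in place of the transformed NLPCC records (the shapes and pad value 0 are assumptions):

import numpy as np
from gluonnlp.data.batchify import Tuple, Pad, Stack

batchify = Tuple(Stack(), Pad(axis=0, pad_val=0, ret_length=True), Stack())
samples = [(0, np.array([1, 2, 3]), 1),
           (1, np.array([4, 5]), 0)]
rec_id, (data, original_length), label = batchify(samples)
print(data.shape)                 # (2, 3): shorter sequence padded to length 3
print(original_length.asnumpy())  # [3. 2.]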
Example #2
def get_dataloader(dataset):
    """Create a data loader based on the dataset chunk."""
    lengths = dataset.get_field('valid_lengths')
    # A batch includes: input_id, masked_id, masked_position, masked_weight,
    #                   next_sentence_label, segment_id, valid_length
    batchify_fn = Tuple(Pad(), Pad(), Pad(), Pad(), Stack(), Pad(), Stack())
    if use_avg_len:
        # sharded data loader
        sampler = nlp.data.FixedBucketSampler(
            lengths=lengths,
            # batch_size per shard
            batch_size=batch_size,
            num_buckets=num_buckets,
            shuffle=shuffle,
            use_average_length=True,
            num_shards=num_ctxes)
        dataloader = nlp.data.ShardedDataLoader(dataset,
                                                batch_sampler=sampler,
                                                batchify_fn=batchify_fn,
                                                num_workers=num_ctxes)
    else:
        sampler = nlp.data.FixedBucketSampler(lengths,
                                              batch_size=batch_size * num_ctxes,
                                              num_buckets=num_buckets,
                                              ratio=0,
                                              shuffle=shuffle)
        dataloader = DataLoader(dataset=dataset,
                                batch_sampler=sampler,
                                batchify_fn=batchify_fn,
                                num_workers=1)
    logging.debug('Sampler created for a new dataset:\n%s', sampler.stats())
    return dataloader
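
With use_average_length=True, batch_size above acts as an average token budget per batch rather than a sequence count. A standalone sketch on toy lengths (the numbers are assumptions):

import gluonnlp as nlp

lengths = [5, 8, 12, 30, 31, 33, 60, 64]
sampler = nlp.data.FixedBucketSampler(lengths=lengths, batch_size=64,
                                      num_buckets=2, shuffle=False,
                                      use_average_length=True)
print(sampler.stats())    # bucket boundaries and per-bucket batch sizes
for batch_indices in sampler:
    print(batch_indices)  # indices grouped by similar length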
Example #3
    def __init__(self, num_ctxes, vocab):
        self._num_ctxes = num_ctxes
        pad_val = vocab[vocab.padding_token]
        self._batchify_fn = Tuple(
            Pad(pad_val=pad_val, round_to=8),  # input_id
            Pad(pad_val=pad_val),  # masked_id
            Pad(pad_val=0),  # masked_position
            Pad(pad_val=0),  # masked_weight
            Stack(),  # next_sentence_label
            Pad(pad_val=0, round_to=8),  # segment_id
            Stack())  # valid_length
    def __call__(self, dataset, sampler):
        # A batch includes: input_id, masked_id, masked_position, masked_weight,
        #                   next_sentence_label, segment_id, valid_length
        batchify_fn = Tuple(Pad(), Pad(), Pad(), Pad(), Stack(), Pad(), Stack())

        if self._use_avg_len:
            # sharded data loader
            dataloader = nlp.data.ShardedDataLoader(dataset,
                                                    batch_sampler=sampler,
                                                    batchify_fn=batchify_fn,
                                                    num_workers=self._num_ctxes)
        else:
            dataloader = DataLoader(dataset=dataset,
                                    batch_sampler=sampler,
                                    batchify_fn=batchify_fn,
                                    num_workers=self._num_ctxes)
        return dataloader
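
The __call__ above reads self._use_avg_len, which the __init__ shown never sets, so these methods presumably belong to a larger class. A hedged wiring sketch (the class name DataLoaderFactory and the use_avg_len argument are assumptions, not the original code):

class DataLoaderFactory:  # hypothetical name
    def __init__(self, num_ctxes, vocab, use_avg_len=False):
        self._num_ctxes = num_ctxes
        self._use_avg_len = use_avg_len  # consumed by __call__ above
        # ... batchify function built as in the __init__ shown above ...

# usage, with dataset and sampler built as in the earlier snippets:
# factory = DataLoaderFactory(num_ctxes=2, vocab=my_vocab, use_avg_len=True)
# dataloader = factory(dataset, sampler)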
Example #5
    def __call__(self, dataset):
        """create data loader based on the dataset chunk"""
        if isinstance(dataset, nlp.data.NumpyDataset):
            lengths = dataset.get_field('valid_lengths')
        elif isinstance(dataset, BERTPretrainDataset):
            lengths = dataset.transform(lambda input_ids, segment_ids, masked_lm_positions, \
                                               masked_lm_ids, masked_lm_weights, \
                                               next_sentence_labels, valid_lengths: \
                                               valid_lengths, lazy=False)
        else:
            raise ValueError('unexpected dataset type: %s' % str(dataset))

        # A batch includes: input_id, masked_id, masked_position, masked_weight,
        #                   next_sentence_label, segment_id, valid_length
        batchify_fn = Tuple(Pad(), Pad(), Pad(), Pad(), Stack(), Pad(),
                            Stack())
        if self._use_avg_len:
            # sharded data loader
            sampler = nlp.data.FixedBucketSampler(
                lengths=lengths,
                # batch_size per shard
                batch_size=self._batch_size,
                num_buckets=self._num_buckets,
                shuffle=self._shuffle,
                use_average_length=True,
                num_shards=self._num_ctxes)
            dataloader = nlp.data.ShardedDataLoader(
                dataset,
                batch_sampler=sampler,
                batchify_fn=batchify_fn,
                num_workers=self._num_ctxes)
        else:
            sampler = nlp.data.FixedBucketSampler(
                lengths,
                batch_size=self._batch_size * self._num_ctxes,
                num_buckets=self._num_buckets,
                ratio=0,
                shuffle=self._shuffle)
            dataloader = DataLoader(dataset=dataset,
                                    batch_sampler=sampler,
                                    batchify_fn=batchify_fn,
                                    num_workers=1)
        logging.debug('Sampler created for a new dataset:\n%s',
                      sampler.stats())
        return dataloader
def get_predictions(net, true_intent, intent_map, slots_map, context, batch_size):
    """Get predictions for every item of the intent.

    Returns a list indexed the same way as the validation items. Each record has
    the format: Tuple(predicted_intent, List[(List[text tokens], slot)]).
    """
    result = []
    idx_to_slot = {v: k for k, v in slots_map.items()}
    idx_to_intent = {v: k for k, v in intent_map.items()}

    intent_dev_dataset = NLUBenchmarkDataset(SacreMosesTokenizer(), 'val', intent_map,
                                             slots_map, intent_to_load=true_intent)
    transformer = DataTransformer(ELMoCharVocab())
    transformed_dev_dataset = intent_dev_dataset.transform(transformer, lazy=False)
    batchify_fn = Tuple(Pad(), Stack(), Pad(), Stack())
    dev_dataloader = DataLoader(transformed_dev_dataset, batch_size=batch_size,
                                num_workers=max(multiprocessing.cpu_count() - 3, 0),
                                batchify_fn=batchify_fn)

    for i, (data, valid_lengths, entities, intent) in enumerate(dev_dataloader):
        items_per_iteration = data.shape[0]
        length = data.shape[1]

        data = data.as_in_context(context)

        hidden_state = net.elmo_container[0].begin_state(mx.nd.zeros,
                                                         batch_size=items_per_iteration,
                                                         ctx=context)
        mask = get_data_mask(length, valid_lengths, items_per_iteration, context)

        intents, slots = net(data, hidden_state, mask)
        score, slots_seq = net.crf(slots.transpose(axes=(1, 0, 2)))

        intents_prediction = intents.argmax(axis=1).asnumpy()
        slots_prediction = slots_seq.asnumpy()

        for rec_id, pred_intent in enumerate(intents_prediction):
            # index into the full dataset, not just the current batch
            global_id = i * batch_size + rec_id
            text = intent_dev_dataset[global_id][0]
            tokens = intent_dev_dataset[global_id][1]
            slot_prediction = slots_prediction[rec_id]

            prediction_item = get_prediction_item(idx_to_slot, slot_prediction, tokens)
            result.append((idx_to_intent[int(pred_intent)], prediction_item, text, tokens))

    return result
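
get_data_mask is not defined in this snippet. One plausible implementation (an assumption, not the original helper) compares each position against the sample's valid length to zero out padding:

import mxnet as mx

def get_data_mask(length, valid_lengths, items_per_iteration, context):
    # hypothetical implementation: 1.0 for real tokens, 0.0 for padding
    positions = mx.nd.arange(length, ctx=context).reshape((1, -1))
    valid = valid_lengths.astype('float32').as_in_context(context).reshape((-1, 1))
    return mx.nd.broadcast_lesser(positions, valid)  # (items_per_iteration, length)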
def transform_segment(transformer, segment, options):
    dataset = NLPCCDataset(segment, './data/')
    transformed_dataset = dataset.transform(transformer, lazy=False)

    word_vocab = transformer.get_word_vocab()

    batchify_fn = Tuple(
        Stack(),
        Pad(axis=0,
            pad_val=word_vocab[word_vocab.padding_token],
            ret_length=True), Stack())

    sampler = FixedBucketSampler(
        lengths=[len(item[1]) for item in transformed_dataset],
        batch_size=options.batch_size,
        shuffle=True,
        num_buckets=options.num_buckets)
    return DataLoader(transformed_dataset,
                      batchify_fn=batchify_fn,
                      batch_sampler=sampler)
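
A hedged usage sketch for transform_segment; the options object with batch_size and num_buckets attributes is assumed to come from argparse, and the segment names are illustrative:

train_loader = transform_segment(transformer, 'train', options)
dev_loader = transform_segment(transformer, 'dev', options)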
def get_dataloader(dataset):
    """Create a data loader based on the dataset chunk."""
    t0 = time.time()
    lengths = dataset.get_field('valid_lengths')
    logging.debug('Num samples = %d', len(lengths))
    # A batch includes: input_id, masked_id, masked_position, masked_weight,
    #                   next_sentence_label, segment_id, valid_length
    batchify_fn = Tuple(Pad(), Pad(), Pad(), Pad(), Stack(), Pad(), Stack())
    if args.by_token:
        # sharded data loader
        sampler = nlp.data.FixedBucketSampler(
            lengths=lengths,
            # batch_size per shard
            batch_size=batch_size,
            num_buckets=args.num_buckets,
            shuffle=is_train,
            use_average_length=True,
            num_shards=num_ctxes)
        dataloader = nlp.data.ShardedDataLoader(dataset,
                                                batch_sampler=sampler,
                                                batchify_fn=batchify_fn,
                                                num_workers=num_ctxes)
    else:
        sampler = nlp.data.FixedBucketSampler(lengths,
                                              batch_size=batch_size * num_ctxes,
                                              num_buckets=args.num_buckets,
                                              ratio=0,
                                              shuffle=is_train)
        dataloader = DataLoader(dataset=dataset,
                                batch_sampler=sampler,
                                batchify_fn=batchify_fn,
                                num_workers=1)
    logging.debug('Batch Sampler:\n%s', sampler.stats())
    t1 = time.time()
    logging.debug('Dataloader creation cost = %.2f s', t1 - t0)
    return dataloader


if __name__ == '__main__':
    logging = get_logger(__name__)
    logging.info('numpy version: {} MXNet version: {}'.format(np.__version__, mx.__version__))
    options = parse_args()

    ctx = mx.gpu() if options.num_gpus > 0 else mx.cpu()

    train_df = read_data(options.training_dir, options.num_datasets)

    train_datasets = [PredictiveMaintenanceDataset(df, is_train=True,
                                                   is_many_to_one=options.is_many_to_one)
                      for df in train_df]

    batchify = Tuple(Pad(ret_length=True), Stack() if options.is_many_to_one else Pad())

    dataset_index = 0
    train_data = gluon.data.DataLoader(train_datasets[dataset_index], shuffle=True, batch_size=options.batch_size,
                                       num_workers=8,
                                       batchify_fn=batchify)

    logging.info("We have {} training timeseries".format(len(train_datasets[dataset_index])))

    net = TimeSeriesNet(options.num_layers, options.num_units, options.dropout)
    net.hybridize(static_alloc=True)
    net.initialize(mx.init.Normal(), ctx=ctx)
    logging.info('Model created and initialized')

    optimizer_params = {'learning_rate': options.learning_rate, 'wd': options.wd,
                        'clip_gradient': options.clip_gradient}
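
The fragment stops after assembling optimizer_params; a plausible continuation under the standard Gluon pattern (an assumption, not the original script) creates the trainer from it:

from mxnet import gluon

# assumed continuation: 'adam' is illustrative; wd and clip_gradient are
# generic Optimizer options, so they pass through optimizer_params
trainer = gluon.Trainer(net.collect_params(), 'adam', optimizer_params)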
Example #10
def evaluate(args):
    if args.gpus is None or args.gpus == '':
        ctx_l = [mx.cpu()]
    else:
        ctx_l = [mx.gpu(int(x)) for x in args.gpus.split(',')]
    src_normalizer = get_normalizer(args.src_normalizer, args.src_lang)
    tgt_normalizer = get_normalizer(args.tgt_normalizer, args.tgt_lang)
    base_src_tokenizer = get_base_tokenizer(args.src_base_tokenizer, args.src_lang)
    base_tgt_tokenizer = get_base_tokenizer(args.tgt_base_tokenizer, args.tgt_lang)

    src_tokenizer = create_tokenizer(args.src_tokenizer,
                                     args.src_subword_model_path,
                                     args.src_vocab_path)
    tgt_tokenizer = create_tokenizer(args.tgt_tokenizer,
                                     args.tgt_subword_model_path,
                                     args.tgt_vocab_path)
    src_vocab = src_tokenizer.vocab
    tgt_vocab = tgt_tokenizer.vocab
    if args.cfg.endswith('.yml'):
        cfg = TransformerModel.get_cfg().clone_merge(args.cfg)
    else:
        cfg = TransformerModel.get_cfg(args.cfg)
    cfg.defrost()
    cfg.MODEL.src_vocab_size = len(src_vocab)
    cfg.MODEL.tgt_vocab_size = len(tgt_vocab)
    if args.fp16:
        cfg.MODEL.dtype = 'float16'
    cfg.freeze()
    model = TransformerModel.from_cfg(cfg)
    if args.fp16:
        model.cast('float16')
    model.hybridize()
    model.load_parameters(args.param_path, ctx=ctx_l, cast_dtype=True)
    inference_model = TransformerInference(model=model)
    inference_model.hybridize()
    # Construct the BeamSearchSampler
    if args.stochastic:
        scorer = BeamSearchScorer(alpha=0.0,
                                  K=0.0,
                                  temperature=args.temperature,
                                  from_logits=False)
    else:
        scorer = BeamSearchScorer(alpha=args.lp_alpha,
                                  K=args.lp_k,
                                  from_logits=False)
    beam_search_sampler = BeamSearchSampler(beam_size=args.beam_size,
                                            decoder=inference_model,
                                            vocab_size=len(tgt_vocab),
                                            eos_id=tgt_vocab.eos_id,
                                            scorer=scorer,
                                            stochastic=args.stochastic,
                                            max_length_a=args.max_length_a,
                                            max_length_b=args.max_length_b)   

    logging.info(beam_search_sampler)
    all_src_token_ids, all_src_lines = process_corpus(
        args.src_corpus,
        sentence_normalizer=src_normalizer,
        base_tokenizer=base_src_tokenizer,
        bpe_tokenizer=src_tokenizer,
        add_bos=False,
        add_eos=True
    )
    if args.tgt_corpus is not None:
        all_tgt_token_ids, all_tgt_lines = process_corpus(
            args.tgt_corpus,
            sentence_normalizer=tgt_normalizer,
            base_tokenizer=base_tgt_tokenizer,
            bpe_tokenizer=tgt_tokenizer,
            add_bos=True,
            add_eos=True
        )
    else:
        # when applying inference, populate the fake tgt tokens
        all_tgt_token_ids = all_tgt_lines = [[] for i in range(len(all_src_token_ids))]
    test_dataloader = gluon.data.DataLoader(
        list(zip(all_src_token_ids,
                 [len(ele) for ele in all_src_token_ids],
                 all_tgt_token_ids,
                 [len(ele) for ele in all_tgt_token_ids])),
        batch_size=32,
        batchify_fn=Tuple(Pad(), Stack(), Pad(), Stack()),
        shuffle=False)

    ctx = ctx_l[0]
    pred_sentences = []
    start_eval_time = time.time()
    # evaluate
    if not args.inference:
        avg_nll_loss = 0
        ntokens = 0
        for i, (src_token_ids, src_valid_length, tgt_token_ids, tgt_valid_length)\
                in enumerate(test_dataloader):
            src_token_ids = mx.np.array(src_token_ids, ctx=ctx, dtype=np.int32)
            src_valid_length = mx.np.array(src_valid_length, ctx=ctx, dtype=np.int32)
            tgt_token_ids = mx.np.array(tgt_token_ids, ctx=ctx, dtype=np.int32)
            tgt_valid_length = mx.np.array(tgt_valid_length, ctx=ctx, dtype=np.int32)
            if model.layout == 'NT':
                tgt_pred = model(src_token_ids, src_valid_length, tgt_token_ids[:, :-1],
                                 tgt_valid_length - 1)
                pred_logits = mx.npx.log_softmax(tgt_pred, axis=-1)
                nll = - mx.npx.pick(pred_logits, tgt_token_ids[:, 1:])
                avg_nll_loss += mx.npx.sequence_mask(nll,
                                                     sequence_length=tgt_valid_length - 1,
                                                     use_sequence_length=True,
                                                     axis=1).sum().asnumpy()
            elif model.layout == 'TN':
                tgt_pred = model(src_token_ids.T, src_valid_length, tgt_token_ids.T[:-1, :],
                                 tgt_valid_length - 1)
                pred_logits = mx.npx.log_softmax(tgt_pred, axis=-1)
                nll = - mx.npx.pick(pred_logits, tgt_token_ids.T[1:, :])
                avg_nll_loss += mx.npx.sequence_mask(nll,
                                                     sequence_length=tgt_valid_length - 1,
                                                     use_sequence_length=True,
                                                     axis=0).sum().asnumpy()
            else:
                raise NotImplementedError
            ntokens += int((tgt_valid_length - 1).sum().asnumpy())
            init_input = mx.np.array([tgt_vocab.bos_id for _ in range(src_token_ids.shape[0])], ctx=ctx)
            if model.layout == 'NT':
                states = inference_model.init_states(src_token_ids, src_valid_length)
            elif model.layout == 'TN':
                states = inference_model.init_states(src_token_ids.T, src_valid_length)
            else:
                raise NotImplementedError
            samples, scores, valid_length = beam_search_sampler(init_input, states, src_valid_length)
            for j in range(samples.shape[0]):
                pred_tok_ids = samples[j, 0, :valid_length[j, 0].asnumpy()].asnumpy().tolist()
                bpe_decode_line = tgt_tokenizer.decode(pred_tok_ids[1:-1])
                pred_sentence = base_tgt_tokenizer.decode(bpe_decode_line.split(' '))
                pred_sentences.append(pred_sentence)
                print(pred_sentence)
            print('Processed {}/{}'.format(len(pred_sentences), len(all_tgt_lines)))
        end_eval_time = time.time()
        avg_nll_loss = avg_nll_loss / ntokens

        with open(os.path.join(args.save_dir, 'gt_sentences.txt'), 'w', encoding='utf-8') as of:
            of.write('\n'.join(all_tgt_lines))
            of.write('\n')
        with open(os.path.join(args.save_dir, 'pred_sentences.txt'), 'w', encoding='utf-8') as of:
            of.write('\n'.join(pred_sentences))
            of.write('\n')

        sacrebleu_out = sacrebleu.corpus_bleu(sys_stream=pred_sentences,
                                              ref_streams=[all_tgt_lines])
        logging.info('Time Spent: {}, #Sent={}, SacreBLEU={} '
                     '({:2.1f} {:2.1f} {:2.1f} {:2.1f}) '
                     '(BP={:.3f}, ratio={:.3f}, syslen={}, reflen={}), '
                     'Avg NLL={}, Perplexity={}'
                     .format(end_eval_time - start_eval_time, len(all_tgt_lines),
                             sacrebleu_out.score,
                             *sacrebleu_out.precisions,
                             sacrebleu_out.bp, sacrebleu_out.sys_len / sacrebleu_out.ref_len,
                             sacrebleu_out.sys_len, sacrebleu_out.ref_len,
                             avg_nll_loss, np.exp(avg_nll_loss)))
        results = {'sacrebleu': sacrebleu_out.score,
                   'nll': avg_nll_loss}
        with open(os.path.join(args.save_dir, 'results.json'), 'w') as of:
            json.dump(results, of)
    # inference only
    else:
        with open(os.path.join(args.save_dir, 'pred_sentences.txt'), 'w', encoding='utf-8') as of:
            processed_sentences = 0
            for src_token_ids, src_valid_length, _, _ in tqdm(test_dataloader):
                src_token_ids = mx.np.array(src_token_ids, ctx=ctx, dtype=np.int32)
                src_valid_length = mx.np.array(src_valid_length, ctx=ctx, dtype=np.int32)
                init_input = mx.np.array([tgt_vocab.bos_id for _ in range(src_token_ids.shape[0])], ctx=ctx)
                if model.layout == 'NT':
                    states = inference_model.init_states(src_token_ids, src_valid_length)
                elif model.layout == 'TN':
                    states = inference_model.init_states(src_token_ids.T, src_valid_length)
                else:
                    raise NotImplementedError
                samples, scores, valid_length = beam_search_sampler(init_input, states, src_valid_length)
                for j in range(samples.shape[0]):
                    pred_tok_ids = samples[j, 0, :valid_length[j, 0].asnumpy()].asnumpy().tolist()
                    bpe_decode_line = tgt_tokenizer.decode(pred_tok_ids[1:-1])
                    pred_sentence = base_tgt_tokenizer.decode(bpe_decode_line.split(' '))
                    pred_sentences.append(pred_sentence)
                of.write('\n'.join(pred_sentences))
                of.write('\n')
                processed_sentences += len(pred_sentences)
                pred_sentences = []
        end_eval_time = time.time()
        logging.info('Time Spent: {}, Inferred sentences: {}'
                     .format(end_eval_time - start_eval_time, processed_sentences))
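
A plausible entry point for this script (an assumption; parse_args appears in the surrounding examples but is not shown here):

if __name__ == '__main__':
    evaluate(parse_args())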
Example #11
if __name__ == '__main__':
    args = parse_args()
    context = mx.cpu(0) if args.gpu is None else mx.gpu(args.gpu)

    train_dataset = NLUBenchmarkDataset(SacreMosesTokenizer(), 'train_full')
    print(train_dataset.get_intent_map())
    print(train_dataset.get_slots_map())
    dev_dataset = NLUBenchmarkDataset(SacreMosesTokenizer(), 'val',
                                      train_dataset.get_intent_map(),
                                      train_dataset.get_slots_map())

    transformer = DataTransformer(ELMoCharVocab())
    transformed_train_dataset = train_dataset.transform(transformer,
                                                        lazy=False)
    transformed_dev_dataset = dev_dataset.transform(transformer, lazy=False)

    batchify_fn = Tuple(Pad(), Stack(), Pad(), Stack())

    train_dataloader = DataLoader(transformed_train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=max(multiprocessing.cpu_count() - 3, 0),
                                  batchify_fn=batchify_fn)
    dev_dataloader = DataLoader(transformed_dev_dataset,
                                batch_size=args.batch_size,
                                shuffle=True,
                                num_workers=max(multiprocessing.cpu_count() - 3, 0),
                                batchify_fn=batchify_fn)

    slots_count = len(train_dataset.get_slots_map())
    intents_count = len(train_dataset.get_intent_map())
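
The snippet truncates after computing slots_count and intents_count; judging by get_predictions above, the next step presumably builds the joint intent/slot network. A hedged sketch, kept as comments because the class name and signature are assumptions:

    # net = JointIntentSlotNet(intents_count=intents_count,  # hypothetical class
    #                          slots_count=slots_count)
    # net.initialize(mx.init.Xavier(), ctx=context)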