def predict(task: TaggingTask,
            params: cfg.DataConfig,
            model: tf.keras.Model) -> List[Tuple[int, int, List[int]]]:
    """Runs tagging inference over the dataset described by `params`.

    Args:
      task: A `TaggingTask` object.
      params: A `cfg.DataConfig` object.
      model: A keras.Model.

    Returns:
      A list of `(sentence_id, sub_sentence_id, predicted_ids)` tuples, sorted
      by `(sentence_id, sub_sentence_id)`. `predicted_ids` only keeps the
      positions whose label is >= 0.
    """

    def predict_step(inputs):
        """Replicated prediction calculation."""
        features, labels = inputs
        # The ids are bookkeeping only; take them out of the feature dict
        # before it is handed to `task.inference_step`.
        sentence_ids = features.pop('sentence_id')
        sub_sentence_ids = features.pop('sub_sentence_id')
        model_outputs = task.inference_step(features, model)
        return {
            'predict_ids': model_outputs['predict_ids'],
            # Positions with a negative label are dropped in `aggregate_fn`.
            'label_mask': tf.greater_equal(labels, 0),
            'sentence_ids': sentence_ids,
            'sub_sentence_ids': sub_sentence_ids,
        }

    def aggregate_fn(state, outputs):
        """Concatenates model's outputs."""
        results = [] if state is None else state
        replica_batches = zip(outputs['predict_ids'], outputs['label_mask'],
                              outputs['sentence_ids'],
                              outputs['sub_sentence_ids'])
        for ids_batch, mask_batch, sent_batch, sub_sent_batch in replica_batches:
            examples = zip(ids_batch.numpy(), mask_batch.numpy(),
                           sent_batch.numpy(), sub_sent_batch.numpy())
            for token_ids, token_mask, sentence_id, sub_sentence_id in examples:
                assert len(token_ids) == len(token_mask)
                # Skip the padding label.
                kept_ids = [
                    token_id
                    for token_id, keep in zip(token_ids, token_mask)
                    if keep
                ]
                results.append((sentence_id, sub_sentence_id, kept_ids))
        return results

    strategy = tf.distribute.get_strategy()
    dataset = orbit.utils.make_distributed_dataset(strategy, task.build_inputs,
                                                   params)
    predictions = utils.predict(predict_step, aggregate_fn, dataset)
    return sorted(predictions, key=lambda item: (item[0], item[1]))
def predict(task: TaggingTask,
            params: cfg.DataConfig,
            model: tf.keras.Model) -> Tuple[List[List[int]], List[int]]:
    """Runs tagging inference over the dataset described by `params`.

    Args:
      task: A `TaggingTask` object.
      params: A `cfg.DataConfig` object.
      model: A keras.Model.

    Returns:
      A `(predict_ids, sentence_ids)` tuple of two parallel lists with one
      entry per example. Each entry of `predict_ids` is the sequence of
      predicted label ids at positions whose label is >= 0; the matching entry
      of `sentence_ids` is that example's sentence id.
    """

    def predict_step(inputs):
        """Replicated prediction calculation."""
        features, labels = inputs
        # The sentence id is bookkeeping only; take it out of the feature dict
        # before it is handed to `task.inference_step`.
        sentence_ids = features.pop('sentence_id')
        model_outputs = task.inference_step(features, model)
        return {
            'predict_ids': model_outputs['predict_ids'],
            # Positions with a negative label are dropped in `aggregate_fn`.
            'label_mask': tf.greater_equal(labels, 0),
            'sentence_ids': sentence_ids,
        }

    def aggregate_fn(state, outputs):
        """Concatenates model's outputs."""
        if state is None:
            state = {'predict_ids': [], 'sentence_ids': []}
        all_predict_ids = state['predict_ids']
        all_sentence_ids = state['sentence_ids']
        replica_batches = zip(outputs['predict_ids'], outputs['label_mask'],
                              outputs['sentence_ids'])
        for ids_batch, mask_batch, sent_batch in replica_batches:
            examples = zip(ids_batch.numpy(), mask_batch.numpy(),
                           sent_batch.numpy())
            for token_ids, token_mask, sentence_id in examples:
                all_sentence_ids.append(sentence_id)
                kept_ids = []
                all_predict_ids.append(kept_ids)
                assert len(token_ids) == len(token_mask)
                # Skip the padding label.
                kept_ids.extend(
                    token_id
                    for token_id, keep in zip(token_ids, token_mask)
                    if keep)
        return state

    strategy = tf.distribute.get_strategy()
    dataset = orbit.utils.make_distributed_dataset(strategy, task.build_inputs,
                                                   params)
    aggregated = utils.predict(predict_step, aggregate_fn, dataset)
    return aggregated['predict_ids'], aggregated['sentence_ids']
def predict(task: SentencePredictionTask,
            params: cfg.DataConfig,
            model: tf.keras.Model) -> List[Union[int, float]]:
    """Runs sentence-level inference over the dataset described by `params`.

    Args:
      task: A `SentencePredictionTask` object.
      params: A `cfg.DataConfig` object.
      model: A keras.Model.

    Returns:
      A list of predictions with length of `num_examples`, ordered by
      `example_id`. For regression task, each element in the list is the
      predicted score; for classification task, each element is the predicted
      class id.
    """
    # A single output class is treated as a regression head.
    is_regression = task.task_config.model.num_classes == 1

    def predict_step(inputs):
        """Replicated prediction calculation."""
        features, _ = inputs
        example_id = features.pop('example_id')
        model_outputs = task.inference_step(features, model)
        if is_regression:
            predictions = model_outputs
        else:
            predictions = tf.argmax(model_outputs, axis=-1)
        return {'example_id': example_id, 'predictions': predictions}

    def aggregate_fn(state, outputs):
        """Concatenates model's outputs."""
        collected = [] if state is None else state
        replica_batches = zip(outputs['example_id'], outputs['predictions'])
        for replica_ids, replica_predictions in replica_batches:
            # Pair every prediction with its example id so the results can be
            # put back in order after aggregation.
            collected.extend(zip(replica_ids, replica_predictions))
        return collected

    strategy = tf.distribute.get_strategy()
    dataset = orbit.utils.make_distributed_dataset(strategy, task.build_inputs,
                                                   params)
    id_prediction_pairs = utils.predict(predict_step, aggregate_fn, dataset)
    # When running on TPU POD, the order of output cannot be maintained,
    # so we need to sort by example_id.
    ordered_pairs = sorted(id_prediction_pairs, key=lambda pair: pair[0])
    return [pair[1] for pair in ordered_pairs]
def predict(task: QuestionAnsweringTask, params: cfg.DataConfig,
            model: tf.keras.Model):
    """Runs question-answering inference and post-processes the results.

    Args:
      task: A `QuestionAnsweringTask` object.
      params: A `cfg.DataConfig` object.
      model: A keras.Model.

    Returns:
      A tuple of `all_predictions`, `all_nbest` and `scores_diff`, which are
      dict and can be written to json files including prediction json file,
      nbest json file and null_odds json file.
    """
    preprocessed = task._preprocess_eval_data(params)  # pylint: disable=protected-access
    tf_record_input_path, eval_examples, eval_features = preprocessed

    # `tf_record_input_path` will overwrite `params.input_path`,
    # when `task.build_inputs()` is called.
    task.set_preprocessed_eval_input_path(tf_record_input_path)

    def predict_step(inputs):
        """Replicated prediction calculation."""
        return task.validation_step(inputs, model)

    strategy = tf.distribute.get_strategy()
    dataset = orbit.utils.make_distributed_dataset(strategy, task.build_inputs,
                                                   params)
    aggregated_outputs = utils.predict(predict_step, task.aggregate_logs,
                                       dataset)

    postprocessed = task.squad_lib.postprocess_output(
        eval_examples,
        eval_features,
        aggregated_outputs,
        task.task_config.n_best_size,
        task.task_config.max_answer_length,
        task.task_config.validation_data.do_lower_case,
        version_2_with_negative=params.version_2_with_negative,
        null_score_diff_threshold=task.task_config.null_score_diff_threshold,
        xlnet_format=task.task_config.validation_data.xlnet_format,
        verbose=False)
    all_predictions, all_nbest, scores_diff = postprocessed
    return all_predictions, all_nbest, scores_diff
def predict(task: SentencePredictionTask,
            params: cfg.DataConfig,
            model: tf.keras.Model) -> List[Union[int, float]]:
    """Runs sentence-level inference over the dataset described by `params`.

    Args:
      task: A `SentencePredictionTask` object.
      params: A `cfg.DataConfig` object.
      model: A keras.Model.

    Returns:
      A list of predictions with length of `num_examples`, in the order the
      aggregation received them. For regression task, each element in the list
      is the predicted score; for classification task, each element is the
      predicted class id.
    """
    # A single output class is treated as a regression head.
    is_regression = task.task_config.model.num_classes == 1

    def predict_step(inputs):
        """Replicated prediction calculation."""
        features, _ = inputs
        model_outputs = task.inference_step(features, model)
        if not is_regression:
            return tf.argmax(model_outputs, axis=-1)
        return model_outputs

    def aggregate_fn(state, outputs):
        """Concatenates model's outputs."""
        collected = {'predictions': []} if state is None else state
        predictions = collected['predictions']
        for replica_predictions in outputs:
            predictions.extend(replica_predictions)
        return collected

    strategy = tf.distribute.get_strategy()
    dataset = orbit.utils.make_distributed_dataset(strategy, task.build_inputs,
                                                   params)
    aggregated = utils.predict(predict_step, aggregate_fn, dataset)
    return aggregated['predictions']
outputs['predict_ids'], outputs['label_mask'], outputs['sentence_ids']): for tmp_predict_ids, tmp_label_mask, tmp_sentence_id in zip( batch_predict_ids.numpy(), batch_label_mask.numpy(), batch_sentence_ids.numpy()): cur_sentence_ids.append(tmp_sentence_id) cur_predict_ids.append([]) assert len(tmp_predict_ids) == len(tmp_label_mask) for i in range(len(tmp_predict_ids)): # Skip the padding label. if tmp_label_mask[i]: cur_predict_ids[-1].append(tmp_predict_ids[i]) <<<<<<< HEAD return cur_predict_ids, cur_sentence_ids loop_fn = orbit.utils.create_loop_fn(predict_step) dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(), task.build_inputs, params) # Set `num_steps` to -1 to exhaust the dataset. predict_ids, sentence_ids = loop_fn( iter(dataset), num_steps=-1, state=([], []), reduce_fn=reduce_fn) return predict_ids, sentence_ids ======= return state dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(), task.build_inputs, params) outputs = utils.predict(predict_step, aggregate_fn, dataset) return outputs['predict_ids'], outputs['sentence_ids'] >>>>>>> a811a3b7e640722318ad868c99feddf3f3063e36
def predict(task: SentencePredictionTask,
            params: cfg.DataConfig,
            model: tf.keras.Model,
            params_aug: Optional[cfg.DataConfig] = None,
            test_time_aug_wgt: float = 0.3) -> List[Union[int, float]]:
    """Runs sentence-level inference, optionally with test-time augmentation.

    Args:
      task: A `SentencePredictionTask` object.
      params: A `cfg.DataConfig` object.
      model: A keras.Model.
      params_aug: A `cfg.DataConfig` object for augmented data. When `None`, no
        augmentation is applied.
      test_time_aug_wgt: Test time augmentation weight. The prediction score
        will use (1. - test_time_aug_wgt) original prediction plus
        test_time_aug_wgt augmented prediction.

    Returns:
      A list of predictions with length of `num_examples`, ordered by
      `example_id`. For regression task, each element in the list is the
      predicted score; for classification task, each element is the predicted
      class id.
    """

    def predict_step(inputs):
        """Replicated prediction calculation."""
        features, _ = inputs
        example_id = features.pop('example_id')
        model_outputs = task.inference_step(features, model)
        return {'example_id': example_id, 'predictions': model_outputs}

    def aggregate_fn(state, outputs):
        """Concatenates model's outputs."""
        collected = [] if state is None else state
        replica_batches = zip(outputs['example_id'], outputs['predictions'])
        for replica_ids, replica_predictions in replica_batches:
            collected.extend(zip(replica_ids, replica_predictions))
        return collected

    def sorted_predictions(data_params):
        """Predicts on `data_params` and returns pairs sorted by example id."""
        dataset = orbit.utils.make_distributed_dataset(
            tf.distribute.get_strategy(), task.build_inputs, data_params)
        pairs = utils.predict(predict_step, aggregate_fn, dataset)
        # When running on TPU POD, the order of output cannot be maintained,
        # so we need to sort by example_id.
        return sorted(pairs, key=lambda pair: pair[0])

    outputs = sorted_predictions(params)
    # A single output class is treated as a regression head.
    is_regression = task.task_config.model.num_classes == 1

    if params_aug is None:
        scores = [pair[1] for pair in outputs]
    else:
        outputs_aug = sorted_predictions(params_aug)
        # Both lists are sorted by example id and combined position by
        # position.
        scores = [
            (1. - test_time_aug_wgt) * plain[1] + test_time_aug_wgt * aug[1]
            for plain, aug in zip(outputs, outputs_aug)
        ]

    if is_regression:
        return scores
    return [tf.argmax(score, axis=-1) for score in scores]
def _predict(task, params, model):
    """Runs tagging inference and returns per-position softmax outputs.

    Similar to official.nlp.tasks.tagging.predict, but instead of the final
    label ids it returns the softmax of the model's logits.

    Args:
      task: A `TaggingTask` object.
      params: A `cfg.DataConfig` object.
      model: A keras.Model.

    Returns:
      A list of tuple, sorted by `(sentence_id, sub_sentence_id)`. Each tuple
      contains `sentence_id`, `sub_sentence_id` and a list with the softmax
      output of every position whose label is >= 0.
    """

    def _predict_step(inputs):
        """Replicated prediction calculation."""
        features, labels = inputs
        # The ids are bookkeeping only; take them out of the feature dict
        # before it is handed to `task.inference_step`.
        sentence_ids = features.pop("sentence_id")
        sub_sentence_ids = features.pop("sub_sentence_id")
        model_outputs = task.inference_step(features, model)
        return {
            "logits": model_outputs["logits"],
            # Positions with a negative label are dropped in `_aggregate_fn`.
            "label_mask": tf.greater_equal(labels, 0),
            "sentence_ids": sentence_ids,
            "sub_sentence_ids": sub_sentence_ids,
        }

    def _aggregate_fn(state, outputs):
        """Concatenates model's outputs."""
        results = [] if state is None else state
        replica_batches = zip(outputs["logits"], outputs["label_mask"],
                              outputs["sentence_ids"],
                              outputs["sub_sentence_ids"])
        for logits_batch, mask_batch, sent_batch, sub_sent_batch in replica_batches:
            probs_batch = tf.keras.activations.softmax(logits_batch)
            examples = zip(probs_batch.numpy(), mask_batch.numpy(),
                           sent_batch.numpy(), sub_sent_batch.numpy())
            for probs, token_mask, sentence_id, sub_sentence_id in examples:
                assert len(probs) == len(token_mask)
                _assert_same_length([probs, token_mask], sentence_id)
                # Skip the padding label.
                kept_probs = [
                    prob for prob, keep in zip(probs, token_mask) if keep
                ]
                results.append((sentence_id, sub_sentence_id, kept_probs))
        return results

    strategy = tf.distribute.get_strategy()
    dataset = orbit.utils.make_distributed_dataset(strategy, task.build_inputs,
                                                   params)
    predictions = utils.predict(_predict_step, _aggregate_fn, dataset)
    return sorted(predictions, key=lambda item: (item[0], item[1]))