import collections

import numpy as np


def make_predictions(all_examples, all_features, all_results, n_best_size,
                     max_answer_length, do_lower_case, verbose_logging,
                     validate_flag=True):
    """Variant with yes/no answer handling; predictions are keyed by
    (paragraph_id, turn_id), as in CoQA-style conversational QA.

    Note: despite the name, the `logprob` scores are sums of raw logits,
    not normalized log-probabilities.
    """
    assert len(all_examples) == len(all_features)
    example_index_to_results = collections.defaultdict(list)
    for result in all_results:
        example_index_to_results[result.example_index].append(result)

    _PrelimPrediction = collections.namedtuple(
        "PrelimPrediction",
        ["result_index", "start_index", "end_index", "text", "logprob"])

    validate_predictions = dict()
    all_predictions = []
    all_nbest_json = []
    for (example_index, feature) in enumerate(all_features):
        example = all_examples[example_index]
        results = example_index_to_results[example_index]

        prelim_predictions = []
        for result_index, result in enumerate(results):
            # Yes/no question: answer with "yes"/"no" instead of a span.
            if np.argmax(result.yes_no_flag_logits) == 1:
                if np.argmax(result.yes_no_ans_logits) == 1:
                    text = "yes"
                    logprob = (result.stop_logits[1] +
                               result.yes_no_flag_logits[1] +
                               result.yes_no_ans_logits[1])
                else:
                    text = "no"
                    logprob = (result.stop_logits[1] +
                               result.yes_no_flag_logits[1] +
                               result.yes_no_ans_logits[0])
                prelim_predictions.append(
                    _PrelimPrediction(result_index=result_index,
                                      start_index=-1,
                                      end_index=-1,
                                      text=text,
                                      logprob=logprob))
                continue

            # Extractive question: enumerate candidate answer spans.
            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if start_index not in result.id_to_tok_map:
                        continue
                    if end_index not in result.id_to_tok_map:
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue
                    logprob = (result.stop_logits[1] +
                               result.yes_no_flag_logits[0] +
                               result.start_logits[start_index] +
                               result.end_logits[end_index])
                    prelim_predictions.append(
                        _PrelimPrediction(result_index=result_index,
                                          start_index=start_index,
                                          end_index=end_index,
                                          text=None,
                                          logprob=logprob))

        prelim_predictions = sorted(prelim_predictions,
                                    key=lambda x: x.logprob,
                                    reverse=True)

        _NbestPrediction = collections.namedtuple("NbestPrediction",
                                                  ["text", "logprob"])
        seen_predictions = {}
        nbest = []
        for pred in prelim_predictions:
            if len(nbest) >= n_best_size:
                break
            result = results[pred.result_index]
            if pred.start_index == -1 or pred.end_index == -1:
                # Yes/no prediction: the text was fixed above.
                final_text = pred.text
            else:
                # answer_tokens: tokenized answer
                doc_start = result.id_to_tok_map[pred.start_index]
                doc_end = result.id_to_tok_map[pred.end_index]
                answer_tokens = feature.doc_tokens[doc_start:doc_end + 1]
                answer_text = " ".join(answer_tokens)

                # De-tokenize WordPieces that have been split off.
                answer_text = answer_text.replace(" ##", "")
                answer_text = answer_text.replace("##", "")

                # Clean whitespace.
                answer_text = answer_text.strip()
                answer_text = " ".join(answer_text.split())

                # orig_answer_tokens: answer in the original (untokenized) text
                orig_doc_start = feature.tok_to_orig_map[doc_start]
                orig_doc_end = feature.tok_to_orig_map[doc_end]
                orig_answer_tokens = example.doc_tokens[
                    orig_doc_start:orig_doc_end + 1]
                orig_answer_text = " ".join(orig_answer_tokens)

                # Combine the tokenized answer text and the original text.
                final_text = get_final_text(answer_text, orig_answer_text,
                                            do_lower_case, verbose_logging)

            if final_text in seen_predictions:
                continue
            seen_predictions[final_text] = True
            nbest.append(_NbestPrediction(text=final_text,
                                          logprob=pred.logprob))
            # For validation, only the single best prediction is needed.
            if validate_flag:
                break

        # In very rare edge cases there may be no valid prediction; add a
        # nonce prediction to avoid failure downstream.
        if not nbest:
            nbest.append(_NbestPrediction(text="empty", logprob=0.0))
        assert len(nbest) >= 1

        if validate_flag:
            validate_predictions[(example.paragraph_id,
                                  example.turn_id)] = nbest[0].text
        else:
            nbest_json = []
            for entry in nbest:
                output = collections.OrderedDict()
                output["text"] = entry.text
                output["logprob"] = entry.logprob
                nbest_json.append(output)
            assert len(nbest_json) >= 1

            cur_prediction = collections.OrderedDict()
            cur_prediction["id"] = example.paragraph_id
            cur_prediction["turn_id"] = example.turn_id
            cur_prediction["answer"] = nbest_json[0]["text"]
            all_predictions.append(cur_prediction)

            cur_nbest_json = collections.OrderedDict()
            cur_nbest_json["id"] = example.paragraph_id
            cur_nbest_json["turn_id"] = example.turn_id
            cur_nbest_json["answers"] = nbest_json
            all_nbest_json.append(cur_nbest_json)

    if validate_flag:
        return validate_predictions
    return all_predictions, all_nbest_json
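# The helpers `_get_best_indexes`, `_compute_softmax`, and `get_final_text`
# used by the functions in this file are assumed to be the usual BERT
# run_squad.py utilities; they are not defined in this excerpt. A minimal
# sketch of `_get_best_indexes`, consistent with how it is called above
# (top-n positions of a logit list, highest score first):
def _get_best_indexes(logits, n_best_size):
    """Return the indexes of the n_best_size largest logits."""
    index_and_score = sorted(enumerate(logits), key=lambda x: x[1],
                             reverse=True)
    return [index for index, _ in index_and_score[:n_best_size]]
# (`get_final_text` aligns the de-tokenized WordPiece text back onto the
# original text with a character-level heuristic and is not sketched here.)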
def make_predictions(all_examples, all_features, all_results, n_best_size,
                     max_answer_length, do_lower_case, verbose_logging,
                     validate_flag=True):
    """Variant for QuAC-style dialogs: unanswerable turns fall back to the
    module-level UNK string, and validation predictions are grouped by
    dialog id (the part of the qid before "_q#")."""
    assert len(all_examples) == len(all_features)
    example_index_to_results = collections.defaultdict(list)
    for result in all_results:
        example_index_to_results[result.example_index].append(result)

    _PrelimPrediction = collections.namedtuple(
        "PrelimPrediction",
        ["result_index", "start_index", "end_index", "text", "logit"])

    validate_predictions = collections.defaultdict(dict)
    all_predictions = []
    all_nbest_json = []
    for (example_index, feature) in enumerate(all_features):
        example = all_examples[example_index]
        results = example_index_to_results[example_index]

        prelim_predictions = []
        for result_index, result in enumerate(results):
            # Enumerate candidate answer spans. (Earlier revisions also
            # folded yes/no and follow-up logits into the score; only the
            # stop and span logits are used now.)
            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if start_index not in result.id_to_tok_map:
                        continue
                    if end_index not in result.id_to_tok_map:
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue
                    logit = (result.stop_logits[1] +
                             result.start_logits[start_index] +
                             result.end_logits[end_index])
                    prelim_predictions.append(
                        _PrelimPrediction(result_index=result_index,
                                          start_index=start_index,
                                          end_index=end_index,
                                          text=None,
                                          logit=logit))

        prelim_predictions = sorted(prelim_predictions,
                                    key=lambda x: x.logit,
                                    reverse=True)

        _NbestPrediction = collections.namedtuple("NbestPrediction",
                                                  ["text", "logit"])
        seen_predictions = {}
        nbest = []
        for pred in prelim_predictions:
            if len(nbest) >= n_best_size:
                break
            result = results[pred.result_index]
            if pred.start_index < 0 or pred.end_index < 0:
                # UNK is a module-level constant for unanswerable turns
                # (defined elsewhere in the file).
                final_text = UNK
            else:
                # answer_tokens: tokenized answer
                doc_start = result.id_to_tok_map[pred.start_index]
                doc_end = result.id_to_tok_map[pred.end_index]
                answer_tokens = feature.doc_tokens[doc_start:doc_end + 1]
                answer_text = " ".join(answer_tokens)

                # De-tokenize WordPieces that have been split off.
                answer_text = answer_text.replace(" ##", "")
                answer_text = answer_text.replace("##", "")

                # Clean whitespace.
                answer_text = answer_text.strip()
                answer_text = " ".join(answer_text.split())

                # orig_answer_tokens: answer in the original (untokenized) text
                orig_doc_start = feature.tok_to_orig_map[doc_start]
                orig_doc_end = feature.tok_to_orig_map[doc_end]
                orig_answer_tokens = example.doc_tokens[
                    orig_doc_start:orig_doc_end + 1]
                orig_answer_text = " ".join(orig_answer_tokens)

                # Combine the tokenized answer text and the original text.
                final_text = get_final_text(answer_text, orig_answer_text,
                                            do_lower_case, verbose_logging)

            if final_text in seen_predictions:
                continue
            seen_predictions[final_text] = True
            nbest.append(_NbestPrediction(text=final_text, logit=pred.logit))
            # For validation, only the single best prediction is needed.
            if validate_flag:
                break

        # In very rare edge cases there may be no valid prediction; fall
        # back to UNK to avoid failure downstream.
        if not nbest:
            nbest.append(_NbestPrediction(text=UNK, logit=0.0))
        assert len(nbest) >= 1

        if validate_flag:
            qid = example.example_id
            dia_id = qid.split("_q#")[0]
            validate_predictions[dia_id][qid] = nbest[0].text
        else:
            total_scores = [entry.logit for entry in nbest]
            probs = _compute_softmax(total_scores)

            nbest_json = []
            for (i, entry) in enumerate(nbest):
                output = collections.OrderedDict()
                output["text"] = entry.text
                output["probability"] = probs[i]
                output["logit"] = entry.logit
                nbest_json.append(output)
            assert len(nbest_json) >= 1

            cur_prediction = collections.OrderedDict()
            cur_prediction["example_id"] = example.example_id
            cur_prediction["answer"] = nbest_json[0]["text"]
            all_predictions.append(cur_prediction)

            cur_nbest_json = collections.OrderedDict()
            cur_nbest_json["example_id"] = example.example_id
            cur_nbest_json["answers"] = nbest_json
            all_nbest_json.append(cur_nbest_json)

    if validate_flag:
        return validate_predictions
    return all_predictions, all_nbest_json
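# `_compute_softmax`, used above to turn the n-best logit sums into the
# "probability" field of the JSON output, is likewise assumed from
# run_squad.py. A minimal, numerically stable sketch (shift by the max
# score before exponentiating to avoid overflow):
import math

def _compute_softmax(scores):
    """Compute softmax probabilities over a list of raw scores."""
    if not scores:
        return []
    max_score = max(scores)
    exp_scores = [math.exp(score - max_score) for score in scores]
    total = sum(exp_scores)
    return [x / total for x in exp_scores]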
def make_predictions(all_examples, all_features, all_results, n_best_size,
                     max_answer_length, do_lower_case, verbose_logging,
                     validate_flag=True):
    """Variant for sliding-window features: one example may span several
    features (doc stride), so results are looked up by feature.unique_id
    and candidate spans are restricted to max-context positions."""
    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result

    _PrelimPrediction = collections.namedtuple(
        "PrelimPrediction",
        ["feature_index", "start_index", "end_index", "text", "logit"])

    validate_predictions = collections.defaultdict(dict)
    all_predictions = []
    all_nbest_json = []
    for (example_index, example) in enumerate(all_examples):
        features = example_index_to_features[example_index]

        prelim_predictions = []
        for (feature_index, feature) in enumerate(features):
            result = unique_id_to_result[feature.unique_id]
            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # We could hypothetically create invalid predictions,
                    # e.g. predict that the start of the span is in the
                    # question. We throw out all invalid predictions.
                    if start_index >= len(feature.tokens):
                        continue
                    if end_index >= len(feature.tokens):
                        continue
                    if start_index not in feature.token_to_orig_map:
                        continue
                    if end_index not in feature.token_to_orig_map:
                        continue
                    if not feature.token_is_max_context.get(start_index, False):
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue
                    prelim_predictions.append(
                        _PrelimPrediction(
                            feature_index=feature_index,
                            start_index=start_index,
                            end_index=end_index,
                            text=None,
                            logit=result.start_logits[start_index] +
                            result.end_logits[end_index]))

        prelim_predictions = sorted(prelim_predictions,
                                    key=lambda x: x.logit,
                                    reverse=True)

        _NbestPrediction = collections.namedtuple("NbestPrediction",
                                                  ["text", "logit"])
        seen_predictions = {}
        nbest = []
        for pred in prelim_predictions:
            if len(nbest) >= n_best_size:
                break
            feature = features[pred.feature_index]
            if pred.start_index == -1 or pred.end_index == -1:
                final_text = pred.text
            else:
                tok_tokens = feature.tokens[pred.start_index:pred.end_index + 1]
                orig_doc_start = feature.token_to_orig_map[pred.start_index]
                orig_doc_end = feature.token_to_orig_map[pred.end_index]
                orig_tokens = example.doc_tokens[orig_doc_start:orig_doc_end + 1]
                tok_text = " ".join(tok_tokens)

                # De-tokenize WordPieces that have been split off.
                tok_text = tok_text.replace(" ##", "")
                tok_text = tok_text.replace("##", "")

                # Clean whitespace.
                tok_text = tok_text.strip()
                tok_text = " ".join(tok_text.split())

                orig_text = " ".join(orig_tokens)
                final_text = get_final_text(tok_text, orig_text, do_lower_case,
                                            verbose_logging)

            if final_text in seen_predictions:
                continue
            seen_predictions[final_text] = True
            nbest.append(_NbestPrediction(text=final_text, logit=pred.logit))
            # For validation, only the single best prediction is needed.
            if validate_flag:
                break

        # In very rare edge cases we could have no valid predictions, so we
        # just create a nonce prediction in this case to avoid failure.
        if not nbest:
            nbest.append(_NbestPrediction(text="empty", logit=0.0))
        assert len(nbest) >= 1

        if validate_flag:
            qid = example.example_id
            dia_id = qid.split("_q#")[0]
            validate_predictions[dia_id][qid] = nbest[0].text
        else:
            total_scores = [entry.logit for entry in nbest]
            probs = _compute_softmax(total_scores)

            nbest_json = []
            for (i, entry) in enumerate(nbest):
                output = collections.OrderedDict()
                output["text"] = entry.text
                output["probability"] = probs[i]
                output["logit"] = entry.logit
                nbest_json.append(output)
            assert len(nbest_json) >= 1

            cur_prediction = collections.OrderedDict()
            cur_prediction["example_id"] = example.example_id
            cur_prediction["answer"] = nbest_json[0]["text"]
            all_predictions.append(cur_prediction)

            cur_nbest_json = collections.OrderedDict()
            cur_nbest_json["example_id"] = example.example_id
            cur_nbest_json["answers"] = nbest_json
            all_nbest_json.append(cur_nbest_json)

    if validate_flag:
        return validate_predictions
    return all_predictions, all_nbest_json
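# A hypothetical driver for the sliding-window variant above, showing how the
# two return values are typically serialized. The RawResult field names mirror
# the attributes make_predictions reads; the function name, output paths, and
# hyperparameter values are illustrative, not part of the original code.
import json

RawResult = collections.namedtuple(
    "RawResult", ["unique_id", "start_logits", "end_logits"])

def write_predictions(all_examples, all_features, all_results,
                      output_prediction_file, output_nbest_file):
    """Run n-best decoding and dump predictions to JSON files."""
    all_predictions, all_nbest_json = make_predictions(
        all_examples, all_features, all_results,
        n_best_size=20, max_answer_length=30, do_lower_case=True,
        verbose_logging=False, validate_flag=False)
    with open(output_prediction_file, "w") as writer:
        writer.write(json.dumps(all_predictions, indent=4) + "\n")
    with open(output_nbest_file, "w") as writer:
        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")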