def main(
    prediction_report_tsv: str,
    datum_ids_jsonl: Optional[str],
    scores_json: str,
) -> None:
    """Evaluates a prediction report TSV and writes the scores as JSON.

    If `datum_ids_jsonl` is given, only the turns listed in that file are scored.
    """
    # NA parsing is disabled so empty cells survive as empty strings.
    report_df = pd.read_csv(
        prediction_report_tsv,
        sep="\t",
        encoding="utf-8",
        quoting=csv.QUOTE_ALL,
        na_values=None,
        keep_default_na=False,
    )
    # With default-NA handling off, nothing should have been parsed as NaN.
    assert not report_df.isnull().any().any()

    if datum_ids_jsonl:
        # Restrict the report to the subset of turns named in the JSONL file.
        wanted_turns = set(
            load_jsonl_file(data_jsonl=datum_ids_jsonl, cls=TurnId, verbose=False)
        )
        keep_mask = [
            TurnId(dialogue_id=row.get("dialogueId"), turn_index=row.get("turnIndex"))
            in wanted_turns
            for _, row in report_df.iterrows()
        ]
        report_df = report_df.loc[keep_mask]

    scores = evaluate_dataset(report_df)
    with open(scores_json, "w") as fp:
        fp.write(jsons.dumps(scores, jdkwargs={"indent": 2}))
        fp.write("\n")
def build_dialogue_report(
    dataflow_dialogues: List[Dialogue],
) -> Tuple[pd.DataFrame, List[TurnId], List[TurnId]]:
    """Builds a per-dialogue turn-count report.

    Returns the report DataFrame plus the IDs of all (non-skipped) refer turns
    and revise turns across the dialogues.
    """
    refer_turn_ids: List[TurnId] = []
    revise_turn_ids: List[TurnId] = []
    rows = []
    for dialogue in dataflow_dialogues:
        kept = 0
        skipped = 0
        refers = 0
        revises = 0
        for turn in dialogue.turns:
            # Skipped turns are excluded from all per-category counts.
            if turn.skip:
                skipped += 1
                continue
            kept += 1
            if is_refer_turn(turn):
                refers += 1
                refer_turn_ids.append(
                    TurnId(dialogue_id=dialogue.dialogue_id, turn_index=turn.turn_index)
                )
            # A turn may be both a refer and a revise turn; the checks are independent.
            if is_revise_turn(turn):
                revises += 1
                revise_turn_ids.append(
                    TurnId(dialogue_id=dialogue.dialogue_id, turn_index=turn.turn_index)
                )
        rows.append(
            {
                "dialogueId": dialogue.dialogue_id,
                "numTurns": len(dialogue.turns),
                "numKeptTurns": kept,
                "numSkippedTurns": skipped,
                "numReferTurns": refers,
                "numReviseTurns": revises,
            }
        )
    return pd.DataFrame(rows), refer_turn_ids, revise_turn_ids
def main(
    dataflow_dialogues_jsonl: str,
    dialogue_id_prefix: str,
    contextualized_turns_file: str,
    turn_answers_file: str,
) -> None:
    """Splits dialogues into per-turn data with anonymized dialogue IDs.

    Each kept turn becomes one `UtteranceWithContext` (shuffled before saving)
    paired with one `TurnAnswer`, both keyed by a fresh random dialogue ID.
    """
    # Pre-generate a pool of random IDs; dedup via set may shrink the pool
    # slightly below 500k, which is presumably still ample — TODO confirm.
    fresh_ids = list({get_random_string(16) for _ in range(500000)})
    next_id = 0

    contextualized_turns: List[UtteranceWithContext] = []
    turn_answers: List[TurnAnswer] = []
    for line in tqdm(open(dataflow_dialogues_jsonl), unit=" dialogues"):
        dialogue: Dialogue = jsons.loads(line.strip(), Dialogue)
        for turn_index, turn in enumerate(dialogue.turns):
            if turn.skip:
                continue
            # Every kept turn gets its own anonymized dialogue ID.
            full_dialogue_id = f"{dialogue_id_prefix}-{fresh_ids[next_id]}"
            next_id += 1
            datum_id = TurnId(full_dialogue_id, turn.turn_index)
            contextualized_turns.append(
                UtteranceWithContext(
                    datum_id=datum_id,
                    user_utterance=turn.user_utterance,
                    # Context is the dialogue truncated to the turns before this one.
                    context=Dialogue(
                        dialogue_id=full_dialogue_id,
                        turns=dialogue.turns[:turn_index],
                    ),
                )
            )
            turn_answers.append(
                TurnAnswer(
                    datum_id=datum_id,
                    user_utterance=turn.user_utterance.original_text,
                    lispress=" ".join(turn.tokenized_lispress()),
                    program_execution_oracle=turn.program_execution_oracle,
                )
            )

    random.shuffle(contextualized_turns)
    save_jsonl_file(contextualized_turns, contextualized_turns_file)
    save_jsonl_file(turn_answers, turn_answers_file)
def create_onmt_text_datum_for_turn(
    dialogue_id: str,
    curr_turn: Turn,
    context_turns: List[Turn],
    include_program: bool,
    include_agent_utterance: bool,
    include_described_entities: bool,
) -> OnmtTextDatum:
    """Creates the OpenNMT text datum for a turn.

    Produces both the raw and the tokenized source string plus the tokenized
    target lispress, and wraps them with a JSON-serialized datum ID.
    """
    # Both source renderings share every option except utterance tokenization.
    shared_kwargs = dict(
        curr_turn=curr_turn,
        context_turns=context_turns,
        include_program=include_program,
        include_agent_utterance=include_agent_utterance,
        include_described_entities=include_described_entities,
    )
    src_str = create_source_str(tokenize_utterance=False, **shared_kwargs)
    src_tok_str = create_source_str(tokenize_utterance=True, **shared_kwargs)
    tgt_str = " ".join(curr_turn.tokenized_lispress())

    # Tokenized sequences must not contain consecutive whitespace characters.
    for tokenized in (src_tok_str, tgt_str):
        assert re.search(r"\s{2,}", tokenized) is None

    return OnmtTextDatum(
        datum_id_str=jsons.dumps(TurnId(dialogue_id, curr_turn.turn_index)),
        src_str=src_str,
        src_tok_str=src_tok_str,
        tgt_str=tgt_str,
    )
def main(
    exp0_prediction_report_tsv: str,
    exp1_prediction_report_tsv: str,
    datum_ids_jsonl: Optional[str],
    scores_json: str,
) -> None:
    """Loads the two prediction report files and calculates statistical significance.

    For the turn-level and dialogue-level accuracy, we use the McNemar test.
    For the dialogue-level prefix length (i.e., the number of turns before the first error),
    we use the two-sample permutation test.

    If `datum_ids_jsonl` is given, we only use the subset of turns specified in the file.
    In this case, only turn-level metrics are used since it doesn't make sense to compute
    dialogue-level metrics with only a subset of turns.
    """
    exp0_prediction_report_df = _load_prediction_report(exp0_prediction_report_tsv)
    exp1_prediction_report_df = _load_prediction_report(exp1_prediction_report_tsv)

    turn_report_df, dialogue_report_df = get_report_dataframes(
        exp0_prediction_report_df=exp0_prediction_report_df,
        exp1_prediction_report_df=exp1_prediction_report_df,
    )

    if not datum_ids_jsonl:
        turn_statistic, turn_pvalue = run_mcnemar_test(turn_report_df)
        dialogue_statistic, dialogue_pvalue = run_mcnemar_test(dialogue_report_df)
        prefix_pvalue = run_paired_permutation_test(
            xs=dialogue_report_df.loc[:, "prefix_0"].tolist(),
            ys=dialogue_report_df.loc[:, "prefix_1"].tolist(),
        )
        scores = {
            "turn": {"statistic": turn_statistic, "pvalue": turn_pvalue},
            "dialogue": {
                "statistic": dialogue_statistic,
                "pvalue": dialogue_pvalue,
            },
            "prefix": {"pvalue": prefix_pvalue},
        }
    else:
        datum_ids = set(
            load_jsonl_file(data_jsonl=datum_ids_jsonl, cls=TurnId, verbose=False)
        )
        # BUG FIX: a report freshly loaded by read_csv has a plain RangeIndex,
        # so the previous code that unpacked `(dialogue_id, turn_index)` from
        # the iterrows() index raised a TypeError. Build the TurnId from the
        # row's "dialogueId"/"turnIndex" columns instead, the same way the
        # single-report evaluation script filters by datum IDs.
        # NOTE(review): this assumes turn_report_df rows align 1:1 with
        # exp1_prediction_report_df rows (as the original masking assumed) —
        # confirm against get_report_dataframes.
        mask_datum_id = [
            TurnId(dialogue_id=row.get("dialogueId"), turn_index=row.get("turnIndex"))
            in datum_ids
            for _, row in exp1_prediction_report_df.iterrows()
        ]
        turn_report_df = turn_report_df.loc[mask_datum_id]
        # NOTE: We only compute turn-level statistics since it doesn't make sense to compute
        # dialogue-level metrics with only a subset of turns.
        turn_statistic, turn_pvalue = run_mcnemar_test(turn_report_df)
        scores = {"turn": {"statistic": turn_statistic, "pvalue": turn_pvalue}}

    with open(scores_json, "w") as fp:
        fp.write(json.dumps(scores, indent=2))
        fp.write("\n")


def _load_prediction_report(path: str) -> pd.DataFrame:
    """Reads a prediction report TSV with NA parsing disabled, so empty cells
    stay empty strings; asserts that nothing was parsed as NaN."""
    df = pd.read_csv(
        path,
        sep="\t",
        encoding="utf-8",
        quoting=csv.QUOTE_ALL,
        na_values=None,
        keep_default_na=False,
    )
    assert not df.isnull().any().any()
    return df