def generate_final_annotation_files() -> NoReturn:
    """
    Generating the final gold annotations -
     1. taking the .arbit file and adding the predicates with isVerbal==false,false
        from generation (that haven't been sent to consolidation).
     2. Anonymize worker-id
     3. Adjust CSV columns

    Only arbitration CSV files that have a same-named generation file (with the
    first dot-separated name component replaced by ``annot``) are processed.
    Each result is saved under the "final" directory as ``annot.final.<rest>``,
    where ``<rest>`` is the arbitration file name without its ``arbit.`` prefix.
    """
    gen_dir_path = "files/annotations/gold_set/generation/corrected_filtered"
    arb_dir_path = "files/annotations/gold_set/arbitration"
    dest_path = "files/annotations/gold_set/final"
    arb_prefix = "arbit."

    def arb_name_to_gen_name(name: str) -> str:
        # swap the first dot-separated component of the name for 'annot'
        return '.'.join(['annot'] + name.split('.')[1:])

    arb_names = os.listdir(arb_dir_path)
    # list the generation directory once (instead of once per candidate file)
    gen_names = set(os.listdir(gen_dir_path))
    ann_files = [
        (os.path.join(arb_dir_path, fn),
         os.path.join(gen_dir_path, arb_name_to_gen_name(fn)))
        for fn in arb_names
        if fn.endswith(".csv") and arb_name_to_gen_name(fn) in gen_names
    ]

    # prepare worker anonymization (dataset-wide)
    anonymization: Dict[str, str] = get_anonymization(all_worker_ids)

    for arb_fn, gen_fn in ann_files:
        arb_df = read_annot_csv(arb_fn)
        gen_df = read_annot_csv(gen_fn)
        # combine arb with (false,false) predicates from gen
        combined_df = combine_to_final_annot(arb_df=arb_df, gen_df=gen_df)
        # make internal aesthetic modifications in the DataFrame
        final_df = convert_to_final_annot(combined_df, anonymization)

        # save: remove the "arbit." prefix and put the new one.
        # NOTE: str.lstrip("arbit.") would strip any leading run of the
        # characters {a, r, b, i, t, .} and so eat into the rest of the name
        # (e.g. "arbit.tqa.csv" -> "qa.csv"); remove the exact prefix instead.
        fn = os.path.basename(arb_fn)
        if fn.startswith(arb_prefix):
            fn = fn[len(arb_prefix):]
        fn = 'annot.final.' + fn
        dest_fn = os.path.join(dest_path, fn)
        save_annot_csv(final_df, dest_fn)
def generate_pruned_dupl_annot() -> NoReturn:
    """
    Build the final annotation file for the "dupl" wikinews.dev.5 batch.

    Reads the generation and arbitration "dupl" CSV files, passes both to
    `prune_duplicated_annot`, and saves the returned DataFrame as the final
    wikinews.dev.5 annotation file.
    """
    source_gen = "files/annotations/gold_set/generation/corrected_filtered/annot.dupl.wikinews.dev.5.csv"
    source_arb = "files/annotations/gold_set/arbitration/arbit.dupl.wikinews.dev.5.csv"
    target = "files/annotations/gold_set/final/annot.final.wikinews.dev.5.csv"

    # arguments are evaluated left to right: generation is read before arbitration
    pruned = prune_duplicated_annot(read_annot_csv(source_gen),
                                    read_annot_csv(source_arb))
    save_annot_csv(pruned, target)
def main(proposed_path: str, reference_path: str, sentences_path: str):
    """
    Print the evaluation of a proposed (system) annotation file against a
    reference annotation file.

    :param proposed_path: path of the system annotation CSV
    :param reference_path: path of the reference annotation CSV
    :param sentences_path: optional path of a sentences CSV (with `qasrl_id`
        and `tokens` columns); when given, it is loaded into a
        qasrl_id -> token-list mapping
    """
    sent_map = None
    if sentences_path:
        sentences = read_csv(sentences_path)
        ids = sentences.qasrl_id
        token_lists = sentences.tokens.apply(str.split)
        # NOTE(review): sent_map is built here but is not used by the
        # evaluation call below - confirm whether it should be passed on.
        sent_map = dict(zip(ids, token_lists))

    print_system_evaluation(read_annot_csv(proposed_path),
                            read_annot_csv(reference_path))
def postprocess_annotation_files(
        orig_dir: str,
        dest_dir: str,
        process_annot_func: Callable[[pd.DataFrame, ], pd.DataFrame],
        file_name_modification_func: Callable[[str, ], str] = lambda s: s) -> NoReturn:
    """
    Apply a processing function to every annotation CSV in a directory and
    export the results to another directory.

    :param orig_dir: Directory from which to take the annotation files to process (input)
    :param dest_dir: Directory to which the processed annotation files are to be exported
    :param process_annot_func: a function that gets an annot_df and returns a processed
        (i.e. corrected or changed, to some aspect) annot_df
    :param file_name_modification_func: how to change an annotation file-name
        from source-dir to dest-dir (identity by default)
    :return:
    """
    # collect the inputs up-front, before anything is written
    source_paths = []
    for entry in os.listdir(orig_dir):
        if entry.endswith(".csv"):
            source_paths.append(os.path.join(orig_dir, entry))

    for source_path in source_paths:
        processed_df = process_annot_func(read_annot_csv(source_path))
        # export under the (possibly modified) original file name, in the destination folder
        target_name = file_name_modification_func(os.path.basename(source_path))
        dest_fn = os.path.join(dest_dir, target_name)
        save_annot_csv(processed_df, dest_fn)
        print(f"exported annotations to {dest_fn}")
def fix_annot_with_corrected(
        orig_annot_fn: str,
        corrected_annot_fn: str,
        dest_dir: str = "files/annotations/production/corrected") -> NoReturn:
    """
    Apply re-annotation corrections to an annotation file and export the result.

    The original annotations are passed, together with the corrected ones, to
    `replace_some_annotations`; the result then goes through
    `find_invalid_prompts`, and rows flagged in its `invalid_prompt` column are
    discarded. The helper columns "corrected_verb_form" and "invalid_prompt"
    are dropped before saving.

    :param orig_annot_fn: path of the original annotation CSV
    :param corrected_annot_fn: path of the CSV holding the corrected annotations
    :param dest_dir: directory in which the output is saved, under the same
        file name as the original
    """
    original = read_annot_csv(orig_annot_fn)
    corrections = read_annot_csv(corrected_annot_fn)

    # re-annotation correction, followed by marking the currently invalid prompts
    marked = find_invalid_prompts(replace_some_annotations(original, corrections))
    valid_only = marked[~marked.invalid_prompt]
    helper_columns = ["corrected_verb_form", "invalid_prompt"]
    output_df = valid_only.drop(helper_columns, axis=1)

    # same file name as the original, but inside the destination folder
    target = os.path.join(dest_dir, os.path.basename(orig_annot_fn))
    save_annot_csv(output_df, target)
def fix_annot_with_nmr_blacklist(orig_annot_fn: str, dest_dir: str) -> NoReturn:
    """
    Run `remove_NMR_cases_from_annotations` on an annotation file and export
    the result.

    :param orig_annot_fn: path of the annotation CSV to filter
    :param dest_dir: directory in which the filtered file is saved, under the
        same file name as the original
    """
    cleaned = remove_NMR_cases_from_annotations(read_annot_csv(orig_annot_fn))
    # keep the original file name; only the folder changes
    file_name = os.path.basename(orig_annot_fn)
    save_annot_csv(cleaned, os.path.join(dest_dir, file_name))
def qanom_csv_file_to_jsonl(qanom_csv_fn: str, dest_dir: str) -> NoReturn:
    """
    Convert an annotation CSV file into a JSON-lines file.

    The annotations are grouped by their `qasrl_id` column; each group is
    converted with `sentence_df_to_sentence_jsonl_dict`, and the resulting
    objects are dumped with `jsonl.dump`.

    :param qanom_csv_fn: path of the annotation CSV to convert
    :param dest_dir: directory in which the output is saved; the output keeps
        the input's file name, with its last extension replaced by ".jsonl"
    """
    annot_df = read_annot_csv(qanom_csv_fn)
    # lazy generator - it is consumed by jsonl.dump below, while the file is open
    sentences_dicts = (
        sentence_df_to_sentence_jsonl_dict(sentence_df)
        for _qasrl_id, sentence_df in annot_df.groupby('qasrl_id'))

    # save jsonl in destination; splitext only removes the last extension, and
    # leaves a name that has no extension intact (rather than emptying it)
    stem, _ext = os.path.splitext(os.path.basename(qanom_csv_fn))
    dest_fn = os.path.join(dest_dir, stem + ".jsonl")
    # the context manager guarantees the output is flushed and closed,
    # also when the conversion raises midway
    with open(dest_fn, "w") as fout:
        jsonl.dump(sentences_dicts, fout)
def generate_final_train_annotations() -> NoReturn:
    """
    Generating the final train-set annotations -
     1. Anonymize worker-id
     2. Adjust CSV columns

    Every CSV file in the "filtered" train-set directory is converted and
    saved under the same file name in the "final" train-set directory.
    """
    orig_train_dir_path = "files/annotations/train_set/filtered"
    dest_path = "files/annotations/train_set/final"

    csv_names = [name for name in os.listdir(orig_train_dir_path)
                 if name.endswith('.csv')]

    # worker anonymization is dataset-wide, so it is prepared once for all files
    anonymization: Dict[str, str] = get_anonymization(all_worker_ids)

    for name in csv_names:
        source_df = read_annot_csv(os.path.join(orig_train_dir_path, name))
        # internal aesthetic modifications in the DataFrame
        final_df = convert_to_final_annot(source_df, anonymization)
        save_annot_csv(final_df, os.path.join(dest_path, name))
def get_worker_statistics_from_file(anot_fn):
    """
    Read the annotation CSV at `anot_fn` and return the result of
    `get_worker_statistics` on it.
    """
    annotations = read_annot_csv(anot_fn)
    statistics = get_worker_statistics(annotations)
    return statistics
def main_iaa_per_worker(annotation_path: str):
    """
    Read an annotation CSV, decode it with `decode_qasrl`, print the number of
    rows per `worker_id`, and run `evaluate_per_worker_iaa` on it.

    :param annotation_path: path of the annotation CSV to evaluate
    """
    decoded_df = decode_qasrl(read_annot_csv(annotation_path))
    rows_per_worker = decoded_df.worker_id.value_counts()
    print(rows_per_worker)
    evaluate_per_worker_iaa(decoded_df)
def main(annotation_path: str):
    """
    Read an annotation CSV, decode it with `decode_qasrl`, print the number of
    rows per `worker_id`, and run `evaluate_inter_generator_agreement` on it
    in verbose mode.

    :param annotation_path: path of the annotation CSV (original annotations,
        multiple generation tasks per predicate)
    """
    decoded_df = decode_qasrl(read_annot_csv(annotation_path))
    rows_per_worker = decoded_df.worker_id.value_counts()
    print(rows_per_worker)
    evaluate_inter_generator_agreement(decoded_df, verbose=True)