import os

import numpy as np
import pandas as pd


def preprocess(input_path, output_path=None):
    """For now I'm keeping the logic of the Telegraph and Times preprocessing
    separate, even though it is almost the same and could easily be made
    identical."""
    df = load_scraped_df('the-times', input_path=input_path)
    num_entries_before_preprocessing = len(df)
    print(f'num entries in {input_path} -- {num_entries_before_preprocessing}')
    df = standardize_html(df)
    df = standardize_enumeration(df)
    df = validate_enumeration(df)
    df = standardize_orientation(df)
    df = validate_orientation(df)
    df = standardize_answers(df)
    df = validate_answers(df)
    df = standardize_clues(df)
    df = validate_clues(df)
    df = assign_quickness(df)
    df = assign_publisher(df)
    df = standardize_dates(df)
    print(f'num entries after preprocessing -- {len(df)}')
    print(
        f'percentage of entries remaining: {len(df) / num_entries_before_preprocessing * 100:.2f}'
    )
    if output_path is not None:
        print(f'saving in {output_path}...')
        save_df_as_jsonl(df, output_path)
        print('successfully saved!')
    return df
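# Illustrative call of the Times preprocessing entry point; the paths below
# are assumptions for the example, not taken from the repo:
#   preprocess('data/the-times_scraped.jsonl',
#              output_path='data/the-times_preprocessed.jsonl')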
def save_split(name, train_df, val_df, test_df, version=''):
    if version != '':
        version = f'{version}_'
    for part, part_df in (('train', train_df), ('val', val_df), ('test', test_df)):
        save_df_as_jsonl(part_df, f'data/{name}_{version}{part}.jsonl')
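# `save_df_as_jsonl` is used throughout but defined elsewhere; a minimal
# sketch, assuming it writes the JSON-lines layout that
# pd.read_json(..., lines=True) reads back:
def save_df_as_jsonl(df, output_path):
    # one JSON object per line
    df.to_json(output_path, orient='records', lines=True)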
def preprocess(per_publisher_preprocessed_data_dir='../data/',
               output_path='../data/dataset.jsonl'):
    df = merge_publisher_specific_preprocessed_data(
        per_publisher_preprocessed_data_dir,
        output_path='../data/merged_per_publisher_preprocessing.jsonl')
    df = dedup_same_clue_same_answer(df)
    df = dedup_same_clue_different_answer(df)
    save_df_as_jsonl(df, output_path)
    print(f'successfully saved as {output_path}')
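# Illustrative call with the defaults; this variant assumes the per-publisher
# outputs already sit in the data directory as *_preprocessed.jsonl files:
#   preprocess(per_publisher_preprocessed_data_dir='../data/',
#              output_path='../data/dataset.jsonl')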
def save_ablation_split(ablation_name, split_name, train, val, test, version=''):
    if version != '':
        version = f'{version}_'
    for part, part_df in (('train', train), ('val', val), ('test', test)):
        save_df_as_jsonl(
            part_df, f'data/{split_name}_{version}{ablation_name}_{part}.jsonl')
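# Worked example of the resulting paths (argument values are hypothetical):
#   save_ablation_split('no-enum', 'answer_split', train, val, test, version='A')
# writes data/answer_split_A_no-enum_train.jsonl, and likewise for val and test.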
def preprocess(publisher_dfs, output_path, add_enumeration=True):
    df = pd.concat(publisher_dfs, axis='index', ignore_index=True)
    df = dedup_same_clue_same_answer(df)
    df = dedup_same_clue_different_answer(df)
    # add enumeration
    if add_enumeration:
        df['clue'] = df.apply(lambda row: add_enumeration_to_clue(row), axis=1)
    save_df_as_jsonl(df, output_path)
    print(f'successfully created {output_path}')
    return df
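# `add_enumeration_to_clue` is defined elsewhere; a minimal sketch, assuming it
# appends the answer's letter counts to the clue in the usual crossword style:
def add_enumeration_to_clue(row):
    # e.g. answer 'WILD WEST' -> clue suffix ' (4,4)'
    enumeration = ','.join(str(len(word)) for word in row['answer'].split())
    return f"{row['clue']} ({enumeration})"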
def merge_publisher_specific_preprocessed_data(per_publisher_preprocessed_data_dir,
                                               output_path=None):
    preprocessed_paths = [
        entry.path for entry in os.scandir(per_publisher_preprocessed_data_dir)
        if entry.name.endswith('_preprocessed.jsonl')
    ]
    dfs_to_merge = [
        pd.read_json(p, orient='columns', dtype=default_dtypes, lines=True)
        for p in preprocessed_paths
    ]
    df = pd.concat(dfs_to_merge, axis='index', ignore_index=True)
    if output_path is not None:
        save_df_as_jsonl(df, output_path)
    return df
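# `default_dtypes` is referenced but not shown; presumably a column -> dtype
# mapping passed to pd.read_json so the merged frames agree. A hypothetical
# example of its shape:
#   default_dtypes = {'clue': str, 'answer': str, 'enumeration': str}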
def save_split(name, train_df, val_df, test_df, output_dir,
               enumeration_modifier, version):
    # the answer split is the default split, so its files carry no name prefix
    if name != 'answer_split':
        name = f'{name}_'
    else:
        name = ''
    if version != '':
        version = f'{version}_'
    if enumeration_modifier != '':
        enumeration_modifier = f'{enumeration_modifier}_'
    for part, part_df in (('train', train_df), ('val', val_df), ('test', test_df)):
        filename = f'{name}{enumeration_modifier}{version}{part}.jsonl'
        output_path = output_dir.joinpath(filename)
        save_df_as_jsonl(part_df, output_path)
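# Worked example (argument values are hypothetical): with name='answer_split',
# enumeration_modifier='' and version='A' the prefix logic yields
# output_dir/A_train.jsonl, while name='word_init_split' yields
# output_dir/word_init_split_A_train.jsonl.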
def create_to_quick_splits(dataset_path,
                           n_val_versions=3,
                           val_ratio=0.05,
                           test_ratio=0.05,
                           seed=1407):
    df = pd.read_json(dataset_path, orient='columns', lines=True)
    quick_df = df[df['quick']]
    non_quick_df = _get_non_quick_df(df)
    non_quick_answer_counts = non_quick_df['answer'].value_counts()
    all_non_quick_answers = non_quick_df['answer'].unique()
    np.random.seed(seed)
    for i in range(n_val_versions):
        quick_to_quick_train, x_to_quick_val, x_to_quick_test = split_by_answer(
            quick_df, val_ratio, test_ratio, seed + i)
        version = chr(ord('@') + i + 1)  # 'A', 'B', 'C', ...
        save_df_as_jsonl(quick_to_quick_train,
                         f'data/quick_to_quick_split_{version}_train.jsonl')
        # the val and test sets are shared between quick_to_quick and
        # non-quick_to_quick
        for part, part_df in (('val', x_to_quick_val), ('test', x_to_quick_test)):
            save_df_as_jsonl(part_df,
                             f'data/x_to_quick_split_{version}_{part}.jsonl')
        # create a non-quick training set whose size equals the quick training
        # set and whose answers do not overlap the quick val and test sets
        quick_val_answers = x_to_quick_val['answer'].unique()
        quick_test_answers = x_to_quick_test['answer'].unique()
        quick_val_and_test_answers = set(quick_val_answers) | set(
            quick_test_answers)
        non_quick_answer_pool = set(
            all_non_quick_answers) - quick_val_and_test_answers
        non_quick_answer_pool = np.array(list(non_quick_answer_pool),
                                         dtype=object)
        chosen_non_quick_train_answers = set()
        quick_train_size = len(quick_to_quick_train)
        non_quick_train_size = 0
        # draw answers until the accumulated clue count reaches the quick
        # train-set size
        while non_quick_train_size < quick_train_size:
            non_quick_train_answer = np.random.choice(non_quick_answer_pool)
            if non_quick_train_answer not in chosen_non_quick_train_answers:
                chosen_non_quick_train_answers.add(non_quick_train_answer)
                non_quick_train_size += non_quick_answer_counts[
                    non_quick_train_answer]
        non_quick_to_quick_train_df = non_quick_df[non_quick_df['answer'].isin(
            chosen_non_quick_train_answers)]
        save_df_as_jsonl(
            non_quick_to_quick_train_df,
            f'data/non-quick_to_quick_split_{version}_train.jsonl')
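# `split_by_answer` is defined elsewhere; a minimal sketch of an answer-disjoint
# split, assuming it partitions the unique answers so that no answer appears in
# more than one part:
def split_by_answer(df, val_ratio, test_ratio, seed):
    rng = np.random.default_rng(seed)
    answers = df['answer'].unique()
    rng.shuffle(answers)
    n_val = int(len(answers) * val_ratio)
    n_test = int(len(answers) * test_ratio)
    val_answers = set(answers[:n_val])
    test_answers = set(answers[n_val:n_val + n_test])
    val_df = df[df['answer'].isin(val_answers)]
    test_df = df[df['answer'].isin(test_answers)]
    train_df = df[~df['answer'].isin(val_answers | test_answers)]
    return train_df, val_df, test_df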
def create_to_non_quick_splits(dataset_path,
                               n_val_versions=3,
                               val_ratio=0.05,
                               test_ratio=0.05,
                               seed=1407):
    df = pd.read_json(dataset_path, orient='columns', lines=True)
    non_quick_df = _get_non_quick_df(df)
    quick_df = df[df['quick']]
    np.random.seed(seed)
    for i in range(n_val_versions):
        version = chr(ord('@') + i + 1)  # 'A', 'B', 'C', ...
        non_quick_to_non_quick_train, x_to_non_quick_val, x_to_non_quick_test = split_by_answer(
            non_quick_df, val_ratio, test_ratio, seed + i)
        # Do not save the non-quick to non-quick train set yet. There are fewer
        # quick clues than non-quick clues, and since we are comparing learning
        # non-quick clues from non-quick clues against learning non-quick clues
        # from quick clues, the train sets must be the same size. Below we
        # first create a quick to non-quick train set from the quick clues,
        # then sample the non-quick train set down to match its size.
        # the val and test sets are shared between non-quick to non-quick and
        # quick to non-quick
        for part, part_df in (('val', x_to_non_quick_val),
                              ('test', x_to_non_quick_test)):
            save_df_as_jsonl(
                part_df, f'data/x_to_non-quick_split_{version}_{part}.jsonl')
        # create a quick train set with no answer overlap with the non-quick
        # val and test sets
        non_quick_val_answers = x_to_non_quick_val['answer'].unique()
        non_quick_test_answers = x_to_non_quick_test['answer'].unique()
        non_quick_val_and_test_answers = set(non_quick_val_answers) | set(
            non_quick_test_answers)
        quick_to_non_quick_train_df = quick_df[
            ~quick_df['answer'].isin(non_quick_val_and_test_answers)]
        save_df_as_jsonl(
            quick_to_non_quick_train_df,
            f'data/quick_to_non-quick_split_{version}_train.jsonl')
        # sample the non-quick train set down to match the size of the quick
        # train set
        non_quick_to_non_quick_train = non_quick_to_non_quick_train.sample(
            n=len(quick_to_non_quick_train_df), random_state=seed + i)
        # finally, save the non-quick to non-quick train set
        save_df_as_jsonl(
            non_quick_to_non_quick_train,
            f'data/non-quick_to_non-quick_split_{version}_train.jsonl')
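# `_get_non_quick_df` is defined elsewhere; given the boolean 'quick' column
# used above, the straightforward reading is the complement filter below. This
# is an assumption-level sketch, not the repo's implementation:
def _get_non_quick_df(df):
    # everything that is not flagged as a quick clue
    return df[~df['quick']]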