Example 1
def preprocess(input_path, output_path=None):
    """For now I'm keeping the logic of the Telegraph and Times preprocess seperate, even thought it
    is almost the same, and can be easily made identical"""

    df = load_scraped_df('the-times', input_path=input_path)
    num_entries_before_preprocessing = len(df)
    print(f'num entries in {input_path} -- {num_entries_before_preprocessing}')

    df = standardize_html(df)
    df = standardize_enumeration(df)
    df = validate_enumeration(df)
    df = standardize_orientation(df)
    df = validate_orientation(df)
    df = standardize_answers(df)
    df = validate_answers(df)
    df = standardize_clues(df)
    df = validate_clues(df)
    df = assign_quickness(df)
    df = assign_publisher(df)
    df = standardize_dates(df)

    print(f'num entries after preprocessing -- {len(df)}')
    print(
        f'percentage of entries remaining: {len(df) / num_entries_before_preprocessing * 100:.2f}'
    )

    if output_path is not None:
        print(f'saving in {output_path}...')
        save_df_as_jsonl(df, output_path)
        print('successfully saved!')

    return df
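
save_df_as_jsonl is called throughout these examples but never defined in them. A minimal sketch, assuming it simply writes the DataFrame as line-delimited JSON, i.e. the same format the pd.read_json(..., lines=True) calls in the later examples read back:

import pandas as pd

def save_df_as_jsonl(df, output_path):
    # Assumed behaviour: one JSON object per row, matching the
    # pd.read_json(..., lines=True) loaders used elsewhere.
    df.to_json(output_path, orient='records', lines=True, force_ascii=False)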
Example 2
def save_split(name, train_df, val_df, test_df, version=""):

    if version != '':
        version = f"{version}_"

    for part, part_df in (('train', train_df), ('val', val_df),
                          ('test', test_df)):
        save_df_as_jsonl(part_df, f'data/{name}_{version}{part}.jsonl')
Example 3
def preprocess(per_publisher_preprocessed_data_dir='../data/', output_path='../data/dataset.jsonl'):

    df = merge_publisher_specific_preprocessed_data(
        per_publisher_preprocessed_data_dir,
        output_path='../data/merged_per_publisher_preprocessing.jsonl')

    df = dedup_same_clue_same_answer(df)
    df = dedup_same_clue_different_answer(df)

    save_df_as_jsonl(df, output_path)
    print(f'successfully saved as {output_path}')
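
The two dedup helpers are also external to these examples. A rough sketch of one plausible reading, assuming dedup_same_clue_same_answer drops exact (clue, answer) duplicates and dedup_same_clue_different_answer keeps a single row per remaining clue text; the real helpers may resolve conflicts differently:

def dedup_same_clue_same_answer(df):
    # Keep one row per identical (clue, answer) pair.
    return df.drop_duplicates(subset=['clue', 'answer'], ignore_index=True)

def dedup_same_clue_different_answer(df):
    # Keep only the first answer seen for each remaining clue text.
    # The column names 'clue' and 'answer' are taken from the other examples.
    return df.drop_duplicates(subset=['clue'], keep='first', ignore_index=True)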
Example 4
def save_ablation_split(ablation_name,
                        split_name,
                        train,
                        val,
                        test,
                        version=''):

    if version != "":
        version = f"{version}_"

    for part, part_df in (('train', train), ('val', val), ('test', test)):
        save_df_as_jsonl(
            part_df,
            f'data/{split_name}_{version}{ablation_name}_{part}.jsonl')
Example 5
def preprocess(publisher_dfs, output_path, add_enumeration=True):

    df = pd.concat(publisher_dfs, axis='index', ignore_index=True)

    df = dedup_same_clue_same_answer(df)
    df = dedup_same_clue_different_answer(df)

    # add enumeration
    if add_enumeration:
        df['clue'] = df.apply(add_enumeration_to_clue, axis=1)

    save_df_as_jsonl(df, output_path)
    print(f'successfully created {output_path}')

    return df
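
add_enumeration_to_clue is not shown either. A minimal sketch, assuming each row carries an 'enumeration' column (e.g. '5,3') that gets appended to the clue in the usual bracketed form; both field names are assumptions based on the per-publisher preprocessing steps:

def add_enumeration_to_clue(row):
    # Hypothetical: append the letter-count enumeration to the clue text.
    return f"{row['clue']} ({row['enumeration']})"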
Example 6
def merge_publisher_specific_preprocessed_data(per_publisher_preprocessed_data_dir,
                                               output_path=None):

    preprocessed_paths = [entry.path for entry in os.scandir(per_publisher_preprocessed_data_dir)
                          if entry.name.endswith('_preprocessed.jsonl')]

    dfs_to_merge = [
        pd.read_json(p, orient='columns', dtype=default_dtypes, lines=True)
        for p in preprocessed_paths
    ]

    df = pd.concat(dfs_to_merge, axis='index', ignore_index=True)

    if output_path is not None:
        save_df_as_jsonl(df, output_path)

    return df
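
default_dtypes is referenced but not defined here. It is presumably a column-to-dtype mapping passed to pd.read_json so that string-like columns are not coerced; a guess based on the columns visible in the other examples:

default_dtypes = {
    # Assumed columns; only 'clue', 'answer' and 'quick' actually appear
    # in these examples.
    'clue': str,
    'answer': str,
    'quick': bool,
}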
Example 7
def save_split(name, train_df, val_df, test_df, output_dir,
               enumeration_modifier, version):

    if name != 'answer_split':
        name = f"{name}_"
    else:
        name = ''

    if version != '':
        version = f"{version}_"

    if enumeration_modifier != '':
        enumeration_modifier = f"{enumeration_modifier}_"

    for part, part_df in (('train', train_df), ('val', val_df),
                          ('test', test_df)):
        filename = f'{name}{enumeration_modifier}{version}{part}.jsonl'
        output_path = output_dir.joinpath(filename)
        save_df_as_jsonl(part_df, output_path)
Example 8
def create_to_quick_splits(dataset_path,
                           n_val_versions=3,
                           val_ratio=0.05,
                           test_ratio=0.05,
                           seed=1407):

    df = pd.read_json(dataset_path, orient='columns', lines=True)

    quick_df = df[df['quick']]
    non_quick_df = _get_non_quick_df(df)
    non_quick_answer_counts = non_quick_df['answer'].value_counts()
    all_non_quick_answers = non_quick_df['answer'].unique()

    np.random.seed(seed)

    for i in range(n_val_versions):

        quick_to_quick_train, x_to_quick_val, x_to_quick_test = split_by_answer(
            quick_df, val_ratio, test_ratio, seed + i)
        version = chr(ord('@') + i + 1)

        save_df_as_jsonl(quick_to_quick_train,
                         f'data/quick_to_quick_split_{version}_train.jsonl')

        # the val and test sets are shared between quick_to_quick and non-quick_to_quick
        for part, part_df in (('val', x_to_quick_val),
                              ('test', x_to_quick_test)):
            save_df_as_jsonl(part_df,
                             f'data/x_to_quick_split_{version}_{part}.jsonl')

        # create a non-quick training set whose size matches the quick training set and which has
        # no answer overlap with the quick val and test sets
        quick_val_answers = x_to_quick_val['answer'].unique()
        quick_test_answers = x_to_quick_test['answer'].unique()
        quick_val_and_test_answers = set(quick_val_answers) | set(
            quick_test_answers)

        non_quick_answer_pool = set(
            all_non_quick_answers) - quick_val_and_test_answers
        non_quick_answer_pool = np.array(list(non_quick_answer_pool),
                                         dtype=object)
        chosen_non_quick_train_answers = set()

        quick_train_size = len(quick_to_quick_train)
        non_quick_train_size = 0

        while non_quick_train_size < quick_train_size:
            non_quick_train_answer = np.random.choice(non_quick_answer_pool)
            if non_quick_train_answer not in chosen_non_quick_train_answers:
                chosen_non_quick_train_answers.add(non_quick_train_answer)
                non_quick_train_size += non_quick_answer_counts[
                    non_quick_train_answer]

        non_quick_to_quick_train_df = non_quick_df[non_quick_df['answer'].isin(
            chosen_non_quick_train_answers)]

        save_df_as_jsonl(
            non_quick_to_quick_train_df,
            f'data/non-quick_to_quick_split_{version}_train.jsonl')
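
split_by_answer does the actual partitioning in both split builders but is not shown. A minimal sketch of a compatible implementation, assuming it splits on unique answers so that no answer is shared between train, val and test (which is what the answer-overlap handling above relies on):

import numpy as np

def split_by_answer(df, val_ratio, test_ratio, seed):
    # Hypothetical sketch: partition unique answers (not rows), so an
    # answer never appears in more than one part of the split.
    rng = np.random.default_rng(seed)
    answers = df['answer'].unique()
    rng.shuffle(answers)

    n_val = int(len(answers) * val_ratio)
    n_test = int(len(answers) * test_ratio)
    val_answers = set(answers[:n_val])
    test_answers = set(answers[n_val:n_val + n_test])

    val_df = df[df['answer'].isin(val_answers)]
    test_df = df[df['answer'].isin(test_answers)]
    train_df = df[~df['answer'].isin(val_answers | test_answers)]
    return train_df, val_df, test_df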
Example 9
def create_to_non_quick_splits(dataset_path,
                               n_val_versions=3,
                               val_ratio=0.05,
                               test_ratio=0.05,
                               seed=1407):

    df = pd.read_json(dataset_path, orient='columns', lines=True)

    non_quick_df = _get_non_quick_df(df)
    quick_df = df[df['quick']]

    np.random.seed(seed)

    for i in range(n_val_versions):
        version = chr(ord('@') + i + 1)

        non_quick_to_non_quick_train, x_to_non_quick_val, x_to_non_quick_test = split_by_answer(
            non_quick_df, val_ratio, test_ratio, seed + i)

        # Do not save the non-quick to non-quick train set yet.
        # There are fewer quick clues than non-quick clues, and since we are comparing learning
        # non-quick clues from non-quick clues against learning non-quick clues from quick clues,
        # the two train sets need to be the same size. Below we first build the quick to
        # non-quick train set from the quick clues, and then sample the non-quick train set down
        # to match its size.

        # the val and test sets are shared between non-quick to non-quick and quick to non-quick
        for part, part_df in (('val', x_to_non_quick_val),
                              ('test', x_to_non_quick_test)):
            save_df_as_jsonl(
                part_df, f'data/x_to_non-quick_split_{version}_{part}.jsonl')

        # create a quick train set with no answer overlap between it and the non-quick val and test sets
        non_quick_val_answers = x_to_non_quick_val['answer'].unique()
        non_quick_test_answers = x_to_non_quick_test['answer'].unique()
        non_quick_val_and_test_answers = set(non_quick_val_answers) | set(
            non_quick_test_answers)

        quick_to_non_quick_train_df = quick_df[
            ~quick_df['answer'].isin(non_quick_val_and_test_answers)]
        save_df_as_jsonl(
            quick_to_non_quick_train_df,
            f'data/quick_to_non-quick_split_{version}_train.jsonl')

        # sample down the non-quick train set to match the size of the quick train set
        non_quick_to_non_quick_train = non_quick_to_non_quick_train.sample(
            n=len(quick_to_non_quick_train_df), random_state=seed + i)
        # finally, save the non-quick to non-quick train set
        save_df_as_jsonl(
            non_quick_to_non_quick_train,
            f'data/non-quick_to_non-quick_split_{version}_train.jsonl')
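
_get_non_quick_df is the last undefined helper. Given how it is used alongside df[df['quick']], a minimal sketch is simply the complement of the quick mask, though the real helper may apply additional filtering:

def _get_non_quick_df(df):
    # Assumed to be the complement of the quick clues selected above.
    return df[~df['quick']]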