def test_random_undersample():
    """Check which rows survive ``categories.random_undersample``.

    Builds a 30-row test frame with ``c1_percentage=0.8``, standardizes its
    columns to ``'id'`` / ``'category'`` and undersamples it. The ids of the
    result are compared against a fixed list.

    NOTE(review): the fixed expectation only holds if ``random_undersample``
    is deterministic (e.g. seeded internally) -- confirm against its
    implementation.
    """
    df = create_test_df(length=30, c1_percentage=0.8)
    standardized_df = categories.standardize_columns(
        df, 'id_column_name', 'category_column_name')
    rus_df = categories.random_undersample(standardized_df)
    print(rus_df)

    expected_ids = [13, 18, 3, 14, 20, 17, 24, 25, 26, 27, 28, 29]
    # Compare the materialized id list with the expectation. Wrapping the
    # element-wise comparison itself in list() would produce a non-empty
    # list of booleans, which is always truthy and therefore never fails.
    assert list(rus_df['id']) == expected_ids
def test_create_sets():
    """Verify the sizes of the validation and testing sets from ``create_sets``.

    A 100-row test frame is standardized and passed to
    ``categories.create_sets`` together with the two integers ``10, 10``
    (presumably the requested validation and testing sizes -- verify against
    ``create_sets``). The validation and testing results must each contain
    exactly 10 rows; the training set is only printed.
    """
    source_df = create_test_df(length=100)
    std_df = categories.standardize_columns(
        source_df, 'id_column_name', 'category_column_name')

    train_part, val_part, test_part = categories.create_sets(std_df, 10, 10)

    # Dump all three sets, in the same order as they are returned.
    for part in (train_part, val_part, test_part):
        print(part)

    assert len(val_part) == 10
    assert len(test_part) == 10
def test_StratifiedShuffleSplit():
    """Check that a stratified 80/20 split keeps the categories balanced.

    The default test frame is standardized and split with
    ``train_test_split`` (presumably scikit-learn's -- the import is not
    visible here) using ``train_size=0.8``, ``random_state=0`` and
    stratification on the ``'category'`` column. The per-category row counts
    must be ``[12, 12]`` in the first split and ``[3, 3]`` in the second.
    """
    df = create_test_df()
    standardized_df = categories.standardize_columns(
        df, 'id_column_name', 'category_column_name')
    splits = train_test_split(standardized_df,
                              train_size=0.8,
                              random_state=0,
                              stratify=standardized_df['category'])

    # Print before asserting so the diagnostic output is available when an
    # assertion fails. A plain loop is used because the prints are pure side
    # effects; a comprehension would only build a throwaway list of None.
    for split in splits:
        print(split, split.groupby('category').size())

    assert list(splits[1].groupby('category').size()) == [3, 3]
    assert list(splits[0].groupby('category').size()) == [12, 12]
isbi2016test_metadata_df = load_isbi2016_test_df(isbi2016test_metadata_path)
filtered_isic_metadata_df = filter_metadata(isic_metadata_df)

# Rows to download: the union of the index labels of the filtered, MClass
# and ISBI 2016 test metadata.
# Index.union is used instead of the "|" operator: using "|" as a set
# operation on an Index is deprecated, and newer pandas versions treat it as
# an element-wise logical OR instead.
download_df = isic_metadata_df.loc[
    filtered_isic_metadata_df.index
    .union(mclass_metadata_df.index)
    .union(isbi2016test_metadata_df.index)
]
df = download(download_df, images_base_path)

# Standardize the image DataFrame, i.e. select only the relevant columns and
# rename them to 'id' and 'category'.
std_df = standardize_columns(df,
                             config_dict['id_column'],
                             config_dict['category_column'])

# In the ISBI 2016 test set there are two images without a clear category:
# - 'ISIC_0009959' with category 'indeterminate'
# - 'ISIC_0010454' with category 'indeterminate/malignant'
# In the official ground truth both are counted as malignant.
# The isin() mask is True only for matching cells of the 'category' column,
# so only those cells are overwritten.
std_df[std_df.isin({'category': ['indeterminate',
                                 'indeterminate/malignant']})] = 'malignant'

# 'ISIC_0011319' has no category. In the official ground truth it is treated
# as benign.
std_df.loc[std_df['id'] == 'ISIC_0011319.jpg', 'category'] = 'benign'

# Select the images which are in the filtered set but in neither the MClass
# nor the ISBI 2016 test set (a set difference on the index labels).
filtered_std_df = std_df.loc[filtered_isic_metadata_df.index.difference(
    mclass_metadata_df.index.union(isbi2016test_metadata_df.index))]
def test_standardize_columns():
    """Verify that ``standardize_columns`` yields exactly ``id`` and ``category``.

    A two-row test frame is standardized using the source column names
    ``'id_column_name'`` and ``'category_column_name'``; the resulting frame
    must expose the columns ``['id', 'category']`` in that order.
    """
    raw_df = create_test_df(length=2)
    result_df = categories.standardize_columns(
        raw_df, 'id_column_name', 'category_column_name')

    column_names = list(result_df.columns)
    assert column_names == ['id', 'category']

    print(result_df)