def remove_samples(db_flat_file, dataset_file, meta_file, dest, label, verbose=True):
    ''' Load the behaviour database, dataset and meta data from file, then
    remove the samples where the sample pot was not searched.

    Whether a pot was searched is recorded in the dog behaviour database.
    In the Excel file it was marked 'NS'. In the csv file, the predicted
    class (i.e. the class indicated by the dog's behaviour) is set to 2.

    The removal itself is delegated to remove_samples_from_df; this
    function only loads the three inputs.

    Parameters
    ----------
    db_flat_file: str
        The flattened dog behaviour database csv file
    dataset_file: str
        An array of pressure sensor data. One row per sample
    meta_file: str
        Meta data corresponding to the dataset. One row per sample
    dest: str
        File directory for saving the new dataset and meta data
    label: str
        Label for naming the output dataset and meta data files
    verbose: bool
        Set to True to print out info
    '''
    # Arguments are evaluated left to right, so the files are still loaded
    # in the order: behaviour database, dataset, meta data.
    remove_samples_from_df(
        manager.load_dog_behaviour_flat_db(db_flat_file),
        manager.load_dataset(dataset_file),
        manager.load_meta(meta_file),
        dest,
        label,
        verbose)
def test_drop_no_event():
    ''' Create a dataset by dropping any samples with no event detected.

    Loads the 'thirty' test dataset, builds the windowed dataset with
    drop=True and checks that exactly the expected (filename,
    sensor_number) rows are missing from the windowed meta data.
    '''
    dataset_file = 'data/test_data/thirty/thirty.txt'
    meta_file = 'data/test_data/thirty/thirty_meta.txt'

    dataset = manager.load_dataset(dataset_file)
    meta = manager.load_meta(meta_file)
    dataset_win, meta_win = event_detection.create_window_dataset(
        dataset, meta, detection_window=50, window=1000, threshold=0.1,
        drop=True)

    # Manual switch: set to True to write the windowed data out for inspection
    do_save = False
    if do_save:
        dataset_file = 'data/test_data/thirty/events/events.txt'
        meta_file = 'data/test_data/thirty/events/events_meta.txt'
        manager.save_dataset(dataset_file, dataset_win)
        manager.save_meta(meta_file, meta_win)

    # Expect certain rows to have been dropped
    expected = pd.DataFrame(
        [
            ['2017_08_14-12_12_Samson_12_2_B.csv', 2],
            ['2017_08_28-14_42_Samson2_14_1_T2.csv', 0],
            ['2017_09_04-11_47_Samson_3_1_T3.csv', 0],
            ['2017_10_23-14_57-Samson_15_1_T1.csv', 1],
        ],
        columns=['filename', 'sensor_number'])

    # Rows flagged 'left_only' are in the original meta data but not in the
    # windowed meta data, i.e. they were dropped.
    meta_merge = meta.merge(
        meta_win.drop_duplicates(),
        on=['filename', 'sensor_number'],
        how='left',
        indicator=True)
    # Renumber from 0 so the index lines up with `expected`. A new frame is
    # built rather than resetting the filtered selection in place.
    dropped = meta_merge[meta_merge['_merge'] == 'left_only'].reset_index(drop=True)

    assert expected.equals(dropped[['filename', 'sensor_number']])
def test_window():
    ''' Find the event window in a data sample.

    Runs event detection over the random test dataset and checks the
    windowed data against the stored expected output, and the breakpoint
    columns of the returned meta data for the first sample.
    '''
    dataset = manager.load_dataset('data/test_data/datasets/random_dataset.txt')

    # Placeholder meta data: a single column of zeros, one row per sample
    meta = pd.DataFrame(np.zeros((dataset.shape[0], 1)))

    # Positional arguments: detection_window=50, window=200, threshold=0.1
    window_dataset, meta = event_detection.create_window_dataset(
        dataset, meta, 50, 200, 0.1)

    expected = manager.load_dataset(
        'data/test_data/datasets/random_window_dataset.txt')
    assert np.allclose(window_dataset, expected)

    print(meta)
    # The placeholder column plus the two breakpoint columns checked below
    assert meta.shape[1] == 3
    first_sample = meta.iloc[0]
    assert_that(first_sample['breakpoint_0'], equal_to(54))
    assert_that(first_sample['breakpoint_1'], equal_to(254))
def mini_dataset(dataset_file, meta_file,
                 num_samples, test_split, class_balance=0.5,
                 dog=None, events_only=False, event_detection_window=50,
                 event_window=1000, event_threshold=0.1,
                 dest=None, label=None):
    ''' Create a mini, balanced dataset, with meta data, optionally for a
    single dog and optionally reduced to the event window. The result is
    split into training and test sets by split_arrays, which is given dest
    and label for naming the output.

    Parameters
    ----------
    dataset_file: str
        Dataset file to load. One row per sample
    meta_file: str
        Meta data file corresponding to the dataset. One row per sample
    num_samples:
        Number of samples passed to create_balanced_dataset
    test_split:
        Test split passed to split_arrays
    class_balance:
        Class balance passed to create_balanced_dataset
    dog:
        If set, only data for this dog is used
    events_only: bool
        If True, reduce each sample to its event window
    event_detection_window, event_window, event_threshold:
        Passed to event_detection.create_window_dataset as the detection
        window, window and threshold
    dest:
        Destination passed to split_arrays
    label:
        Label passed to split_arrays
    '''
    samples = manager.load_dataset(dataset_file)
    samples_meta = manager.load_meta(meta_file)
    # Dataset and meta data must describe the same number of samples
    assert (samples.shape[0] == samples_meta.shape[0])

    if dog:
        # Use data for only one dog
        samples, samples_meta = dataset_for_dog(samples, samples_meta, dog)

    # Create a smaller, balanced dataset
    samples, samples_meta = create_balanced_dataset(
        samples, samples_meta, num_samples, class_balance, shuffle=False)

    if events_only:
        # Reduce samples to the event window
        samples, samples_meta = event_detection.create_window_dataset(
            samples, samples_meta,
            event_detection_window, event_window, event_threshold)

    # Split in to training and test sets, stratifying on column 0 of the
    # dataset so the balance is maintained
    split_arrays(samples, samples_meta, test_split, dest, label,
                 stratify=samples[0])
def create_window_dataset():
    ''' Remove all samples where no event is detected and create sample windows.

    Command line entry point. Arguments:
        source   source directory containing private_data_all.txt and
                 private_data_all_meta.txt
        --dest   destination directory for the windowed dataset and its
                 train/test/dev splits (optional)

    Steps:
      1. Load the overall dataset and meta data from `source`.
      2. Select the event windows, dropping samples with no event.
      3. If --dest is given, save the windowed dataset under
         <dest>/private_events_all.
      4. Plot the dataset and check that windowed rows can be compared back
         to the original data.
      5. If --dest is given, split the windowed dataset into training and
         test sets, then split the training set again into a smaller
         training set and a dev set under <dest>/private_events_dev.

    Without --dest nothing is saved, so step 5 is skipped. Previously the
    split stage still ran and built its output paths from an empty
    destination, i.e. directly under the filesystem root.
    '''
    parser = argparse.ArgumentParser(
        description='Remove all samples where no event is detected and create sample windows')
    parser.add_argument(
        'source', help='source directory containing the overall dataset')
    parser.add_argument(
        '--dest',
        help='destination file path to save the dataset and meta data',
        default='')
    args = parser.parse_args()

    # Load original dataset
    label = 'private_events_all'
    dataset_file = args.source+'/private_data_all.txt'
    meta_file = args.source+'/private_data_all_meta.txt'
    dataset = manager.load_dataset(dataset_file)
    meta = manager.load_meta(meta_file)

    # Select event windows, remove samples with no event
    dataset_win, meta_win = event_detection.create_window_dataset(
        dataset, meta, detection_window=50, window=1000, threshold=0.1,
        drop=True)

    events_dir = args.dest+'/'+label
    if args.dest:
        pathlib.Path(events_dir).mkdir(parents=True, exist_ok=True)
        # From here on the file names refer to the saved windowed dataset
        dataset_file = events_dir+'/'+label+'.txt'
        meta_file = events_dir+'/'+label+'_meta.txt'
        manager.save_dataset(dataset_file, dataset_win, verbose=True)
        manager.save_meta(meta_file, meta_win, verbose=True)

    # plot data
    plotting.plot_dataset(dataset_file)

    # Test that window data can be compared back to original data, at rows
    # a third and two thirds of the way through and at the last row
    n = dataset_win.shape[0]
    for row in (round(n/3), round(2*n/3), n-1):
        test_events.compare_data(dataset, meta, dataset_win, meta_win, i=row)
    print('Test successful')

    if not args.dest:
        # Nothing was saved, so there is no windowed file to split
        print('No --dest given: skipping the training/test split')
        return

    # Split the windowed dataset into a training set and a test set.
    split_data.split(dataset_file, meta_file, test_split=0.2,
                     dest=events_dir, label=label)

    # plot data
    train_file = events_dir+'/private_events_all_TRAIN.txt'
    plotting.plot_dataset(train_file)
    plotting.plot_dataset(events_dir+'/private_events_all_TEST.txt')

    # Further split the training dataset into a smaller training set
    # and a dev (test) set that will be used during development
    train_meta_file = events_dir+'/private_events_all_TRAIN_meta.txt'
    dev_label = 'private_events_dev'
    dev_dir = args.dest+'/'+dev_label
    split_data.split(train_file, train_meta_file, test_split=0.25,
                     dest=dev_dir, label=dev_label, shuffle=False)

    # plot data
    plotting.plot_dataset(dev_dir+'/private_events_dev_TRAIN.txt')
    plotting.plot_dataset(dev_dir+'/private_events_dev_TEST.txt')
def test_mini_dataset_window():
    ''' Create a dog-specific, balanced, windowed, dataset and compare a row
    of the resulting training set back to the original data '''
    dataset_file = 'data/test_data/thirty/thirty.txt'
    meta_file = 'data/test_data/thirty/thirty_meta.txt'
    dest = 'data/test_data/thirty'
    label = 'thirty'

    # 12 samples, a third held out for test, balanced 50/50, Samson only,
    # reduced to the event window
    split_data.mini_dataset(
        dataset_file, meta_file, 12, 0.3333, 0.5,
        dog='Samson', events_only=True,
        event_detection_window=50, event_window=1000, event_threshold=0.1,
        dest=dest, label=label)

    # Load the original data and the generated training set, then test
    original = manager.load_dataset(dataset_file)
    original_meta = manager.load_meta(meta_file)
    train_prefix = dest+'/'+label+'_TRAIN'
    train_win = manager.load_dataset(train_prefix+'.txt')
    train_win_meta = manager.load_meta(train_prefix+'_meta.txt')

    compare_data(original, original_meta, train_win, train_win_meta, i=7)
def test_create_balanced_dataset():
    ''' Using samson data, test that the balanced dataset is created
    correctly: the requested number of samples per class, and every row
    still matching its raw data file '''
    raw_data_path = Path('data/test_data/samson/raw_data')
    num = 8
    class_balance = 0.25

    samples = manager.load_dataset('data/test_data/samson/samson_dataset.txt')
    samples_meta = manager.load_meta(
        'data/test_data/samson/samson_dataset_meta.txt')

    df, meta_bal = split_data.create_balanced_dataset(
        samples, samples_meta, num, class_balance)

    # Check output: column 0 holds the class, and class_balance is the
    # expected share of class 0
    class_0_rows = df[df[0] == 0]
    class_1_rows = df[df[0] == 1]
    assert_that(class_0_rows.count()[0], equal_to(num * class_balance))
    assert_that(class_1_rows.count()[0], equal_to(num * (1 - class_balance)))

    compare_data(raw_data_path, df, meta_bal, i='all')
def compare_data_files(raw_data_path, dataset_file, meta_file, i='random'):
    ''' Load a dataset and its meta data from file, then compare a row from
    the dataset against the corresponding raw data file.

    Parameters
    ----------
    raw_data_path: str
        Path to a directory of raw pressure sensor data csv files.
    dataset_file: str
        An array of pressure sensor data in a txt file. One row per sample
    meta_file: str
        Meta data corresponding to the dataset in a txt file. One row per
        sample
    i: str
        index of the dataset row to test. Or 'all' or 'random'.
    '''
    # Arguments are evaluated left to right: dataset first, then meta data
    compare_data(
        raw_data_path,
        manager.load_dataset(dataset_file),
        manager.load_meta(meta_file),
        i)
def test_split_data_user():
    ''' Test the training and test sets against the raw data in the files
    specified by the user config '''
    config = configparser.ConfigParser()
    # Keep option names exactly as written instead of lower-casing them
    config.optionxform = str
    config.read([
        'src/public_config.ini',
        'src/private_config.ini',
        'src/user_config.ini',
    ])

    # Load test data and compare against original
    raw_data_path = Path(config.get('files', 'raw_data_dir'))
    test_set_file = Path(config.get('files', 'dataset_test'))
    test_meta_file = Path(config.get('files', 'meta_test'))
    dataset = manager.load_dataset(test_set_file)
    meta = manager.load_meta(test_meta_file)

    # Check the first row, the last row, and then whichever row(s)
    # compare_data selects by default
    n = meta.shape[0]
    for row in (0, n - 1):
        compare_data(raw_data_path, dataset, meta, i=row)
    compare_data(raw_data_path, dataset, meta)
def test_dataset_random_user():
    ''' Test rows of the user-configured test set against the corresponding
    raw data files.

    Directories and files are read from the config files, so that a private
    dataset can be tested if a user config is provided. The first row, the
    last row and the default selection of compare_data are checked.

    NOTE(review): the name suggests the full dataset ('dataset'/'meta'
    options) was once the target. Those options used to be read and then
    immediately overwritten by 'dataset_test'/'meta_test', so only the test
    set has ever been checked here. The dead reads have been removed;
    confirm whether the full dataset should be tested instead.
    '''
    # Get directories and files from config in order to test private dataset
    config = configparser.ConfigParser()
    # Keep option names exactly as written instead of lower-casing them
    config.optionxform = str
    config_files = [
        'src/public_config.ini',
        'src/private_config.ini',
        'src/user_config.ini',
    ]
    config.read(config_files)

    raw_data_path = Path(config.get('files', 'raw_data_dir'))

    # Check the test set
    dataset_file = Path(config.get('files', 'dataset_test'))
    meta_file = Path(config.get('files', 'meta_test'))
    dataset = manager.load_dataset(dataset_file)
    meta = manager.load_meta(meta_file)

    n = meta.shape[0]
    compare_data(raw_data_path, dataset, meta, i=0)
    compare_data(raw_data_path, dataset, meta, i=n - 1)
    compare_data(raw_data_path, dataset, meta)