Example 1
def remove_samples(db_flat_file, dataset_file, meta_file, dest, label, verbose=True):
    ''' Remove samples where the sample pot was not searched. This
    information is given in the dog behaviour database: in the Excel
    file such samples were marked 'NS'; in the csv file, the predicted
    class (i.e. the class indicated by the dog's behaviour) is set to 2.

    Parameters
    ----------
    db_flat_file: str
        The flattened dog behaviour database csv file
    dataset_file: str
        An array of pressure sensor data. One row per sample
    meta_file: str
        Meta data corresponding to the dataset. One row per sample
    dest: str
        File directory for saving the new dataset and meta data
    label: str
        Label for naming the output dataset and meta data files
    verbose: bool
        Set to True to print out info
    '''
    db_flat = manager.load_dog_behaviour_flat_db(db_flat_file)
    dataset_df = manager.load_dataset(dataset_file)
    meta_df = manager.load_meta(meta_file)
    remove_samples_from_df(db_flat, dataset_df, meta_df, dest, label, verbose)
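A minimal usage sketch; the dataset path follows the naming seen in Example 4, while the remaining paths and the label are hypothetical and should be adjusted to the local data layout:

# Hypothetical paths, for illustration only
remove_samples(
    db_flat_file='data/dog_behaviour_database_flat.csv',
    dataset_file='data/private_data_all.txt',
    meta_file='data/private_data_all_meta.txt',
    dest='data/searched_only',
    label='searched_only')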
Example 2
def test_drop_no_event():
    ''' Create a dataset by dropping any samples with no event detected'''
    dataset_file = 'data/test_data/thirty/thirty.txt'
    meta_file = 'data/test_data/thirty/thirty_meta.txt'
    dest = 'data/test_data/event_only'
    label = 'event_only'
    dataset = manager.load_dataset(dataset_file)
    meta = manager.load_meta(meta_file) 
    dataset_win, meta_win = event_detection.create_window_dataset(
        dataset, meta, detection_window=50, window=1000, threshold=0.1, drop=True)

    # Set do_save = True to write the windowed dataset out for inspection
    do_save = False
    if do_save:
        dataset_file = 'data/test_data/thirty/events/events.txt'
        meta_file = 'data/test_data/thirty/events/events_meta.txt'
        manager.save_dataset(dataset_file, dataset_win)
        manager.save_meta(meta_file, meta_win)

    # Expect certain rows to have been dropped
    expected = pd.DataFrame([
        ['2017_08_14-12_12_Samson_12_2_B.csv', 2],
        ['2017_08_28-14_42_Samson2_14_1_T2.csv', 0],
        ['2017_09_04-11_47_Samson_3_1_T3.csv', 0],
        ['2017_10_23-14_57-Samson_15_1_T1.csv', 1]],
        columns=['filename', 'sensor_number'])

    # Anti-join: rows in the original meta data with no match in the windowed meta data
    meta_merge = meta.merge(meta_win.drop_duplicates(),
                            on=['filename', 'sensor_number'],
                            how='left', indicator=True)
    dropped = meta_merge[meta_merge['_merge'] == 'left_only']
    dropped = dropped.reset_index(drop=True)
    assert expected.equals(dropped[['filename', 'sensor_number']])
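The dropped-row check above uses the standard pandas anti-join idiom: a left merge with indicator=True, keeping only the 'left_only' rows. A self-contained sketch of the same pattern, independent of this project's data:

import pandas as pd

# Rows of `left` with no match in `right` on the key columns
left = pd.DataFrame({'filename': ['a.csv', 'b.csv', 'c.csv'],
                     'sensor_number': [0, 1, 0]})
right = pd.DataFrame({'filename': ['a.csv', 'c.csv'],
                      'sensor_number': [0, 0]})
merged = left.merge(right.drop_duplicates(),
                    on=['filename', 'sensor_number'],
                    how='left', indicator=True)
print(merged[merged['_merge'] == 'left_only'])  # only the b.csv row survives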
Example 3
def mini_dataset(dataset_file, meta_file,
        num_samples, test_split, class_balance=0.5,
        dog=None, events_only=False,
        event_detection_window=50, event_window=1000, event_threshold=0.1,
        dest=None, label=None):
    ''' Create a mini, balanced dataset, with meta data, optionally
    restricted to a single dog. Save it in dest, using label to name the files. '''

    dataset = manager.load_dataset(dataset_file)
    meta = manager.load_meta(meta_file)
    assert (dataset.shape[0] == meta.shape[0])

    # Use data for only one dog
    if dog:
        dataset, meta = dataset_for_dog(dataset, meta, dog)

    # Create a smaller, balanced dataset
    dataset, meta = create_balanced_dataset(dataset,
                                            meta,
                                            num_samples,
                                            class_balance,
                                            shuffle=False)

    # Reduce samples to the event window
    if events_only:
        dataset, meta = event_detection.create_window_dataset(
            dataset, meta, event_detection_window, event_window, event_threshold)

    # Split into training and test sets, maintaining the class balance
    split_arrays(dataset, meta, test_split, dest, label, stratify=dataset[0])
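A usage sketch mirroring the call made in test_mini_dataset_window (Example 5 below), using the test fixtures that appear throughout these examples:

mini_dataset('data/test_data/thirty/thirty.txt',
             'data/test_data/thirty/thirty_meta.txt',
             num_samples=12, test_split=0.3333, class_balance=0.5,
             dog='Samson', events_only=True,
             dest='data/test_data/thirty', label='thirty')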
Example 4
def create_window_dataset():
    ''' Remove all samples where no event is detected and create sample windows '''
    parser = argparse.ArgumentParser(description='Remove all samples where no event is detected and create sample windows')
    parser.add_argument('source', help='source directory containing the overall dataset')
    parser.add_argument('--dest', help='destination file path to save the dataset and meta data', default='')
    args = parser.parse_args()

    # Load original dataset
    dataset_file = args.source+'/private_data_all.txt'
    meta_file = args.source+'/private_data_all_meta.txt'
    label = 'private_events_all'
    dataset = manager.load_dataset(dataset_file)
    meta = manager.load_meta(meta_file) 

    # Select event windows, remove samples with no event
    detection_window = 50
    window = 1000
    threshold = 0.1
    dataset_win, meta_win = event_detection.create_window_dataset(
        dataset, meta, detection_window=detection_window, 
        window=window, threshold=threshold, drop=True)
    if args.dest:
        dest_dir = args.dest+'/'+label
        pathlib.Path(dest_dir).mkdir(parents=True, exist_ok=True)
        dataset_file = dest_dir+'/'+label+'.txt'
        meta_file = dest_dir+'/'+label+'_meta.txt'
        manager.save_dataset(dataset_file, dataset_win, verbose=True)
        manager.save_meta(meta_file, meta_win, verbose=True)
        # Plot the saved, windowed dataset
        plotting.plot_dataset(dataset_file)

    # Test that window data can be compared back to original data
    n = dataset_win.shape[0]
    test_events.compare_data(dataset, meta, dataset_win, meta_win, i=round(n/3))
    test_events.compare_data(dataset, meta, dataset_win, meta_win, i=round(2*n/3))
    test_events.compare_data(dataset, meta, dataset_win, meta_win, i=n-1)
    print('Test successful')

    # Split the windowed dataset into a training set and a test set
    # (these remaining steps assume --dest was provided)
    split = 0.2
    split_data.split(dataset_file, meta_file, test_split=split,
        dest=args.dest+'/'+label, label=label)
    # plot data
    plotting.plot_dataset(args.dest+'/private_events_all/private_events_all_TRAIN.txt')
    plotting.plot_dataset(args.dest+'/private_events_all/private_events_all_TEST.txt')

    # Further split the training dataset into a smaller training set
    # and a dev (test) set that will be used during development
    split = 0.25
    dataset = args.dest+'/private_events_all/private_events_all_TRAIN.txt'
    meta = args.dest+'/private_events_all/private_events_all_TRAIN_meta.txt'
    label = 'private_events_dev'
    dest = args.dest+'/'+label
    split_data.split(dataset, meta, test_split=split,
        dest=dest, label=label, shuffle=False)
    # plot data
    plotting.plot_dataset(dest+'/private_events_dev_TRAIN.txt')
    plotting.plot_dataset(dest+'/private_events_dev_TEST.txt')
Example 5
def test_mini_dataset_window():
    ''' Create a dog-specific, balanced, windowed, dataset '''
    dataset_file = 'data/test_data/thirty/thirty.txt'
    meta_file = 'data/test_data/thirty/thirty_meta.txt'
    dest = 'data/test_data/thirty'
    label = 'thirty'
    split_data.mini_dataset(dataset_file, meta_file, 12, 0.3333, 0.5,
        dog='Samson', events_only=True,
        event_detection_window=50, event_window=1000, event_threshold=0.1,
        dest=dest, label=label)
    # Load and test
    dataset = manager.load_dataset(dataset_file)
    meta = manager.load_meta(meta_file)  
    dataset_win_file = dest+'/'+label+'_TRAIN.txt'
    meta_win_file = dest+'/'+label+'_TRAIN_meta.txt'
    dataset_win = manager.load_dataset(dataset_win_file)
    meta_win = manager.load_meta(meta_win_file)    
    compare_data(dataset, meta, dataset_win, meta_win, i=7)
Example 6
def test_create_balanced_dataset():
    ''' Using samson data, test that the balanced dataset is created correctly '''
    raw_data_path = Path('data/test_data/samson/raw_data')
    dataset_file = 'data/test_data/samson/samson_dataset.txt'
    meta_file = 'data/test_data/samson/samson_dataset_meta.txt'
    num = 8
    class_balance = 0.25
    dataset = manager.load_dataset(dataset_file)
    meta = manager.load_meta(meta_file)
    df, meta_bal = split_data.create_balanced_dataset(dataset, meta, num,
                                                      class_balance)

    # Check output: with num=8 and class_balance=0.25, expect
    # 2 class-0 rows and 6 class-1 rows
    assert_that(df[df[0] == 0].count()[0], equal_to(num * class_balance))
    assert_that(df[df[0] == 1].count()[0], equal_to(num * (1 - class_balance)))
    compare_data(raw_data_path, df, meta_bal, i='all')
Example 7
def compare_data_files(raw_data_path, dataset_file, meta_file, i='random'):
    ''' Compare a row from the dataset against the corresponding raw data file. 
    
    Parameters
    ----------
    raw_data_path: str
        Path to a directory of raw pressure sensor data csv files.
    dataset_file: str
        An array of pressure sensor data in a txt file. One row per sample
    meta_file: str
        Meta data corresponding to the dataset in a txt file. One row per sample
    i: int or str
        Index of the dataset row to test, or 'all' or 'random'.
    '''
    dataset = manager.load_dataset(dataset_file)
    meta = manager.load_meta(meta_file)
    compare_data(raw_data_path, dataset, meta, i)
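For example, to check one random row of the Samson fixture from Example 6 against its raw csv:

compare_data_files('data/test_data/samson/raw_data',
                   'data/test_data/samson/samson_dataset.txt',
                   'data/test_data/samson/samson_dataset_meta.txt',
                   i='random')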
Example 8
def test_split_data_user():
    ''' Test the training and test sets against the raw data 
    in the files specified by the user config '''
    config = configparser.ConfigParser()
    config.optionxform = str
    config_files = [
        'src/public_config.ini', 'src/private_config.ini',
        'src/user_config.ini'
    ]
    config.read(config_files)
    # Load test data and compare against original
    raw_data_path = Path(config.get('files', 'raw_data_dir'))
    dataset_file = Path(config.get('files', 'dataset_test'))
    meta_file = Path(config.get('files', 'meta_test'))
    dataset = manager.load_dataset(dataset_file)
    meta = manager.load_meta(meta_file)
    n = meta.shape[0]
    compare_data(raw_data_path, dataset, meta, i=0)
    compare_data(raw_data_path, dataset, meta, i=n - 1)
    compare_data(raw_data_path, dataset, meta)
Example 9
def test_dataset_random_user():
    ''' Test a random row of the dataset against the corresponding raw 
    data file. Use data from the files given in the user config, if one is provided.'''
    # Get directories and files from config in order to test private dataset
    config = configparser.ConfigParser()
    config.optionxform = str
    config_files = [
        'src/public_config.ini', 'src/private_config.ini',
        'src/user_config.ini'
    ]
    config.read(config_files)
    raw_data_path = Path(config.get('files', 'raw_data_dir'))
    # Check the test set
    dataset_file = Path(config.get('files', 'dataset_test'))
    meta_file = Path(config.get('files', 'meta_test'))
    dataset = manager.load_dataset(dataset_file)
    meta = manager.load_meta(meta_file)
    n = meta.shape[0]
    compare_data(raw_data_path, dataset, meta, i=0)
    compare_data(raw_data_path, dataset, meta, i=n - 1)
    compare_data(raw_data_path, dataset, meta)
Example 10
def test_import_data_save():
    ''' Create a dataset from the raw data files. Check that the created 
    dataset and corresponding meta data file contain the expected data. '''
    regenerate = False  # Regenerate data/test_data/class_info/good.pkl
    if regenerate:
        source = 'data/test_data/raw_data'
        dest = 'data/test_data/class_info'
        class_info.parse_filenames(source, dest)
    input_file = 'data/test_data/class_info/good.pkl'
    target = Path('data/test_data/datasets/test_output_dataset.txt')
    target_meta = Path('data/test_data/datasets/test_output_dataset_meta.txt')
    datapoints = 6000
    expected_good = 8
    dataset_shape = import_data.create_dataset(input_file, target, datapoints)
    # Each raw file yields 3 rows (one per sensor); +1 column for the class label
    assert_that(dataset_shape, equal_to((expected_good * 3, datapoints + 1)))
    # Test meta data
    meta = manager.load_meta(target_meta)
    i = 10
    assert_that(meta.iloc[i]['filename'],
                equal_to('2017_11_06-11_38-Rex_5_2_T3.csv'))
    expected_time = datetime.datetime(2017, 11, 6, 11, 38)
    assert_that(meta.iloc[i]['date'].date(), equal_to(expected_time.date()))
    assert_that(meta.iloc[i]['time'], equal_to(str(expected_time.time())))
    assert_that(meta.iloc[i]['dog'], equal_to('Rex'))
    assert_that(meta.iloc[i]['run'], equal_to(5))
    assert_that(meta.iloc[i]['pass'], equal_to(2))
    assert_that(meta.iloc[i]['positive_position'], equal_to('T3'))
    assert_that(meta.iloc[i]['sensor_number'], equal_to(1))
    assert_that(meta.iloc[i]['class'], equal_to(0))
    # Test the data
    loaded = manager.load_dataset_as_np(target)
    raw_filename = meta.iloc[i]['filename']
    raw_loaded = manager.load_raw_data_as_np('data/test_data/raw_data/' +
                                             raw_filename)
    cols = raw_loaded.shape[1]
    assert np.array_equal(loaded[i][1:cols + 1], raw_loaded[1])
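The final assertion relies on the dataset layout used throughout these examples: column 0 of each dataset row holds the class label, and the sensor trace follows from column 1. A minimal, self-contained sketch of that comparison with made-up numbers:

import numpy as np

sensor_trace = np.array([0.1, 0.2, 0.3, 0.4])  # one sensor's raw samples
label = 0                                      # class label stored in column 0
dataset_row = np.concatenate(([label], sensor_trace))
cols = sensor_trace.shape[0]
assert np.array_equal(dataset_row[1:cols + 1], sensor_trace)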