Example #1
def import_eeglab_sets(filepaths, target_path):
    # try load metadata-db
    metadb_file = os.path.join(target_path, 'metadata_db.pklz')
    if os.path.exists(metadb_file) and os.path.isfile(metadb_file):
        metadb = load(metadb_file)
        log.info('metadb loaded from {}'.format(metadb_file))
    else:
        metadb = {}  # empty DB
        log.info('no metadb found at {}. using empty db'.format(metadb_file))

    for filepath in filepaths:
        # load extra data
        filename = os.path.basename(filepath)
        data, metadata = load_eeglab_data(filepath)

        # save data
        savepath = generate_filepath_from_metadata(metadata)
        save(os.path.join(target_path, savepath), (data, metadata),
             mkdirs=True)

        # save metadata
        metadb[savepath] = metadata
        save(metadb_file, metadb, mkdirs=True)

        log.debug('imported as {}'.format(savepath))
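
For orientation, a brief usage sketch: import_eeglab_sets() expects a list of EEGLAB .set file paths plus a target directory. The directory names and the '*.set' pattern below are only illustrative, not taken from the examples on this page.

# hypothetical source/target directories; glob is used the same way elsewhere in these examples
set_files = glob.glob(os.path.join('/data/eeg/raw', '*.set'))
import_eeglab_sets(set_files, '/data/eeg/imported')
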
Example #2
def extract_output(experiment_root):
    train, model = load_results(experiment_root)

    # get the datasets with their names from the monitor
    for key, dataset in train.algorithm.monitoring_dataset.items():
        # process each dataset
        with log_timing(log, 'processing dataset \'{}\''.format(key)):
            y_real, y_pred, output = process_dataset(model, dataset)

            save(os.path.join(experiment_root, 'cache', key + '_output.pklz'),
                 (y_real, y_pred, output))
Example #3
def extract_output(experiment_root):
    train, model = load_results(experiment_root)

    # get the datasets with their names from the monitor
    for key, dataset in train.algorithm.monitoring_dataset.items():
        # process each dataset
        with log_timing(log, 'processing dataset \'{}\''.format(key)):
            y_real, y_pred, output = process_dataset(model, dataset)

            save(os.path.join(experiment_root, 'cache', key + '_output.pklz'),
                 (y_real, y_pred, output))
Example #4
def split_trial(path, trial_len):

    log.info('processing {}'.format(path))

    datafile = glob.glob(os.path.join(path, '*.txt'))[0]
    metafile = glob.glob(os.path.join(path, '*_Trials_Onsets.xlsx'))[0]

    log.debug('data file: {}'.format(datafile))
    log.debug('meta file: {}'.format(metafile))

    onsets = load_xlsx_meta_file(metafile)
    data = load_data_file(datafile)
    log.debug(onsets)

    onsets.append([len(data), 'end'])  # artificial last marker

    trials = {}
    for i in xrange(len(onsets) - 1):
        onset, label = onsets[i]
        next_onset = onsets[i + 1][0]

        # rounding to integers
        onset = int(math.floor(float(onset)))
        next_onset = int(math.floor(float(next_onset)))

        next_onset = min(onset + trial_len, next_onset)

        log.debug('[{}..{}) -> {}'.format(onset, next_onset, label))
        trial_data = np.vstack(data[onset:next_onset])
        log.debug('{} samples extracted'.format(trial_data.shape))

        trials[label] = trial_data

    filename = os.path.join(path, 'trials.pklz')
    with log_timing(log, 'saving to {}'.format(filename)):
        save(filename, trials)

    return trials
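
A short usage sketch for split_trial(), mirroring how the preprocess() examples further below call it: the second argument is the trial length in samples (sample rate × seconds), and the returned dict maps each onset label to the extracted block of samples. The path literal here is hypothetical.

# trial length in samples, computed as in the preprocess() examples below
SAMPLE_RATE = 400       # in Hz
TRIAL_LENGTH = 32 + 4   # presentation plus 4 s tail, in sec
TRIAL_SAMPLE_LENGTH = SAMPLE_RATE * TRIAL_LENGTH

trials = split_trial('/data/eeg/Sub001_session', TRIAL_SAMPLE_LENGTH)  # hypothetical path
for label, trial_data in trials.items():
    log.info('{}: {} samples'.format(label, trial_data.shape[0]))
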
Example #5
def split_trial(path, trial_len):

    log.info('processing {}'.format(path))

    datafile = glob.glob(os.path.join(path, '*.txt'))[0]
    metafile = glob.glob(os.path.join(path, '*_Trials_Onsets.xlsx'))[0]

    log.debug('data file: {}'.format(datafile))
    log.debug('meta file: {}'.format(metafile))

    onsets = load_xlsx_meta_file(metafile)
    data = load_data_file(datafile)
    log.debug(onsets)

    onsets.append([len(data), 'end'])  # artificial last marker

    trials = {}
    for i in xrange(len(onsets) - 1):
        onset, label = onsets[i]
        next_onset = onsets[i + 1][0]

        # rounding to integers
        onset = int(math.floor(float(onset)))
        next_onset = int(math.floor(float(next_onset)))

        next_onset = min(onset + trial_len, next_onset)

        log.debug('[{}..{}) -> {}'.format(onset, next_onset, label))
        trial_data = np.vstack(data[onset:next_onset])
        log.debug('{} samples extracted'.format(trial_data.shape))

        trials[label] = trial_data

    filename = os.path.join(path, 'trials.pklz')
    with log_timing(log, 'saving to {}'.format(filename)):
        save(filename, trials)

    return trials
Example #6
def import_eeglab_sets(filepaths, target_path):
    # try load metadata-db
    metadb_file = os.path.join(target_path, 'metadata_db.pklz')
    if os.path.exists(metadb_file) and os.path.isfile(metadb_file): 
        metadb = load(metadb_file)
        log.info('metadb loaded from {}'.format(metadb_file))
    else:
        metadb = {}   # empty DB
        log.info('no metadb found at {}. using empty db'.format(metadb_file))
        
    for filepath in filepaths:
        # load extra data
        filename = os.path.basename(filepath)
        data, metadata = load_eeglab_data(filepath)
        
        # save data
        savepath = generate_filepath_from_metadata(metadata)
        save(os.path.join(target_path, savepath), (data, metadata), mkdirs=True)
        
        # save metadata
        metadb[savepath] = metadata
        save(metadb_file, metadb, mkdirs=True)
        
        log.debug('imported as {}'.format(savepath))
Example #7
def save(filepath, data):
    return fs_util.save(filepath, data)
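
The wrapper above just forwards to fs_util.save(). Its implementation is not shown in these examples, but the .pklz extension used throughout suggests gzip-compressed pickles; a minimal stand-in along those lines (an assumption, not the library's actual code) could look like this. The mkdirs keyword mirrors the save(..., mkdirs=True) calls in the other examples.

import gzip
import os
try:
    import cPickle as pickle  # Python 2, matching the surrounding code
except ImportError:
    import pickle


def save(filepath, data, mkdirs=False):
    # stand-in for fs_util.save: gzip-compressed pickle (assumption based on the .pklz suffix)
    if mkdirs:
        dirname = os.path.dirname(filepath)
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)
    with gzip.open(filepath, 'wb') as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)


def load(filepath):
    # stand-in for fs_util.load
    with gzip.open(filepath, 'rb') as f:
        return pickle.load(f)
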
Example #8
    files = librosa.util.find_files(path, ext='wav', recurse=True)  #, case_sensitive=False, limit=None, offset=0)
    for filename in files:
        x, y = loadfile(filename,
                        auto_sample_rate=config.audio.autpsamplerate,
                        samplerate=config.audio.samplerate,
                        barsamples=config.audio.barsamples,
                        maxbars=config.audio.maxbars)
        data.append(x)
        labels.append(y)
    data = np.vstack(data)
    # transform list to a big numpy array by stacking
    labels = np.vstack(labels)

    logging.info('loaded {0} values from {1} files in total'.format(
        data.shape, len(files)))
    #     print labels;
    return (data, labels)


if __name__ == '__main__':
    #pass
    #     logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.DEBUG);
    #     global config;
    config = Config(file('deepbeat.cfg'))
    logging.basicConfig(format=config.logger.pattern, level=logging.DEBUG)

    dataset = loadall(config.audio.path)
    #split = splitdata(dataset, ptest=config.audio.ptest, pvalid=config.audio.pvalid);
    #save(config.audio.datasetpath, split);
    save(config.audio.datasetpath, dataset)
#     load();
Example #9
def import_dataset(source_path, target_path):

    #     config = load_config(default_config='../train_sda.cfg');

    # DATA_ROOT = source_path

    # DATA_ROOT = config.eeg.get('dataset_root', './')
    SAMPLE_RATE = 400  # in Hz
    TRIAL_LENGTH = 32  # in sec

    TRIAL_LENGTH += 4  # add 4s after end of presentation

    TRIAL_SAMPLE_LENGTH = SAMPLE_RATE * TRIAL_LENGTH

    log.info('using dataset at {}'.format(source_path))
    '''
    Note from Dan:
    All subjects should have channels 15, 16, 17 and 18 removed [...]
    If you want to make them truly identical, you could remove channel 19 from
    the subjects with more channels, although this should be 'good' data.
    '''
    bad_channels = {}
    bad_channels[1] = [5, 6, 15, 16, 17, 18, 20, 21]
    bad_channels[2] = [7, 8, 15, 16, 17, 18, 20, 21]
    bad_channels[3] = [5, 6, 15, 16, 17, 18, 20, 21]
    bad_channels[4] = [7, 8, 15, 16, 17, 18, 20, 21]
    bad_channels[5] = [7, 8, 15, 16, 17, 18, 20, 21]
    bad_channels[6] = [7, 8, 9, 12, 15, 16, 17, 18]
    bad_channels[7] = [5, 6, 12, 15, 16, 17, 18, 20]
    bad_channels[8] = [7, 8, 15, 16, 17, 18, 20, 21]
    bad_channels[9] = [5, 6, 12, 15, 16, 17, 18, 20]
    bad_channels[10] = [5, 6, 15, 16, 17, 18, 20, 21]
    bad_channels[11] = [5, 6, 15, 16, 17, 18, 20, 21]
    bad_channels[12] = [5, 6, 15, 16, 17, 18, 20, 21]
    bad_channels[13] = [5, 6, 12, 15, 16, 17, 18, 20]

    label_converter = LabelConverter()

    metadb_file = os.path.join(target_path, 'metadata_db.pklz')
    metadb = {}  # empty DB

    with log_timing(log, 'generating datasets'):
        for subject_id in xrange(1, 14):
            search_path = os.path.join(source_path,
                                       'Sub{0:03d}*'.format(subject_id))
            sourcefile_path = glob.glob(search_path)

            if sourcefile_path is None or len(sourcefile_path) == 0:
                log.warn('nothing found at {}'.format(search_path))
                continue
            else:
                sourcefile_path = sourcefile_path[0]

            trials = split_session(sourcefile_path, TRIAL_SAMPLE_LENGTH)

            for stimulus, trial_data in trials.iteritems():
                stimulus_id = label_converter.get_stimulus_id(stimulus)
                log.debug(
                    'processing {} with {} samples and stimulus_id {}'.format(
                        stimulus, trial_data.shape, stimulus_id))

                channels = trial_data.transpose()
                trial_data = []
                channel_ids = []
                for i, channel in enumerate(channels):
                    channel_id = i + 1
                    # filter bad channels
                    if channel_id in bad_channels[subject_id]:
                        log.debug('skipping bad channel {}'.format(channel_id))
                        continue

                    # convert to float32
                    channel = np.asfarray(channel, dtype='float32')

                    trial_data.append(channel)
                    channel_ids.append(channel_id)

                trial_data = np.vstack(
                    trial_data).transpose()  # format: (samples, channels)
                log.debug('extracted {} from channels: {}'.format(
                    trial_data.shape, channel_ids))

                label = label_converter.get_label(
                    stimulus_id, 'rhythm')  # raw label, unsorted
                label = label_converter.shuffle_classes[
                    label]  # sorted label id
                metadata = {
                    'subject': subject_id,
                    'label': label,
                    'meta_label': label_converter.get_label(stimulus_id, 'rhythm_meta'),
                    'stimulus': stimulus,
                    'stimulus_id': stimulus_id,
                    'rhythm_type': label_converter.get_label(stimulus_id, 'rhythm'),
                    'tempo': label_converter.get_label(stimulus_id, 'tempo'),
                    'audio_file': label_converter.get_label(stimulus_id, 'audio_file'),
                    'trial_no': 1,
                    'trial_type': 'perception',
                    'condition': 'n/a',
                    'channels': channel_ids,
                }

                # save data
                savepath = generate_filepath_from_metadata(metadata)
                save(os.path.join(target_path, savepath),
                     (trial_data, metadata),
                     mkdirs=True)

                # save metadata
                metadb[savepath] = metadata

                log.debug('imported {}={} as {}'.format(
                    label, metadata['meta_label'], savepath))

        save(metadb_file, metadb, mkdirs=True)
    log.info('import finished')
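
Reading the imported data back is the mirror image of the save calls above: each per-trial file holds a (trial_data, metadata) tuple, and metadata_db.pklz maps save paths to the same metadata dicts. A hedged sketch, assuming the load() helper used in the other examples:

def inspect_import(target_path):
    # hedged sketch: walk the metadata DB written by import_dataset()
    metadb = load(os.path.join(target_path, 'metadata_db.pklz'))
    for savepath, metadata in metadb.items():
        trial_data, meta = load(os.path.join(target_path, savepath))
        log.debug('subject {} label {}: {} (channels {})'.format(
            meta['subject'], meta['label'], trial_data.shape, meta['channels']))
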
Example #10
def preprocess(config):

#     config = load_config(default_config='../train_sda.cfg');

    DATA_ROOT = config.eeg.get('dataset_root', './')
    SAMPLE_RATE = 400  # in Hz
    TRIAL_LENGTH = 32  # in sec

    TRIAL_LENGTH += 4  # add 4s after end of presentation

    TRIAL_SAMPLE_LENGTH = SAMPLE_RATE * TRIAL_LENGTH

    log.info('using dataset at {}'.format(DATA_ROOT))

    '''
    Note from Dan:
    All subjects should have channels 15, 16, 17 and 18 removed [...]
    If you want to make them truly identical, you could remove channel 19 from
    the subjects with more channels, although this should be 'good' data.
    '''
    bad_channels = {}
    bad_channels[1]  = [5, 6,                   15, 16, 17, 18,  20, 21]
    bad_channels[2]  = [      7, 8,             15, 16, 17, 18,  20, 21]
    bad_channels[3]  = [5, 6,                   15, 16, 17, 18,  20, 21]
    bad_channels[4]  = [      7, 8,             15, 16, 17, 18,  20, 21]
    bad_channels[5]  = [      7, 8,             15, 16, 17, 18,  20, 21]
    bad_channels[6]  = [      7, 8, 9,  12,     15, 16, 17, 18         ]
    bad_channels[7]  = [5, 6,           12,     15, 16, 17, 18,  20    ]
    bad_channels[8]  = [      7, 8,             15, 16, 17, 18,  20, 21]
    bad_channels[9]  = [5, 6,           12,     15, 16, 17, 18,  20    ]
    bad_channels[10] = [5, 6,                   15, 16, 17, 18,  20, 21]
    bad_channels[11] = [5, 6,                   15, 16, 17, 18,  20, 21]
    bad_channels[12] = [5, 6,                   15, 16, 17, 18,  20, 21]
    bad_channels[13] = [5, 6,           12,     15, 16, 17, 18,  20    ]

    with log_timing(log, 'generating datasets'):
        for subject_id in xrange(1, 14):
            search_path = os.path.join(DATA_ROOT, 'Sub{0:03d}*'.format(subject_id))
            path = glob.glob(search_path)

            if path is None or len(path) == 0:
                log.warn('nothing found at {}'.format(search_path))
                continue
            else:
                path = path[0]

            trials_filename = os.path.join(path, 'trials.pklz')

            trials = None
            if not os.path.isfile(trials_filename):
                log.debug('{} not found. running split_trial()'.format(trials_filename))
                trials = split_trial(path, TRIAL_SAMPLE_LENGTH)
            else:
                with log_timing(log, 'loading data from {}'.format(trials_filename)):
                    trials = load(trials_filename)

            assert trials

            dataset_filename = os.path.join(path, 'dataset_13goodchannels_plus4s.pklz')
            dataset = generate_cases(subject_id, trials, bad_channels[subject_id])  # = data, labels
            with log_timing(log, 'saving dataset to {}'.format(dataset_filename)):
                save(dataset_filename, dataset)
Example #11
def preprocess(config):

    #     config = load_config(default_config='../train_sda.cfg');

    DATA_ROOT = config.eeg.get('dataset_root', './')
    SAMPLE_RATE = 400  # in Hz
    TRIAL_LENGTH = 32  # in sec

    TRIAL_LENGTH += 4  # add 4s after end of presentation

    TRIAL_SAMPLE_LENGTH = SAMPLE_RATE * TRIAL_LENGTH

    log.info('using dataset at {}'.format(DATA_ROOT))
    '''
    Note from Dan:
    All subjects should have channels 15, 16, 17 and 18 removed [...]
    If you want to make them truly identical, you could remove channel 19 from
    the subjects with more channels, although this should be 'good' data.
    '''
    bad_channels = {}
    bad_channels[1] = [5, 6, 15, 16, 17, 18, 20, 21]
    bad_channels[2] = [7, 8, 15, 16, 17, 18, 20, 21]
    bad_channels[3] = [5, 6, 15, 16, 17, 18, 20, 21]
    bad_channels[4] = [7, 8, 15, 16, 17, 18, 20, 21]
    bad_channels[5] = [7, 8, 15, 16, 17, 18, 20, 21]
    bad_channels[6] = [7, 8, 9, 12, 15, 16, 17, 18]
    bad_channels[7] = [5, 6, 12, 15, 16, 17, 18, 20]
    bad_channels[8] = [7, 8, 15, 16, 17, 18, 20, 21]
    bad_channels[9] = [5, 6, 12, 15, 16, 17, 18, 20]
    bad_channels[10] = [5, 6, 15, 16, 17, 18, 20, 21]
    bad_channels[11] = [5, 6, 15, 16, 17, 18, 20, 21]
    bad_channels[12] = [5, 6, 15, 16, 17, 18, 20, 21]
    bad_channels[13] = [5, 6, 12, 15, 16, 17, 18, 20]

    with log_timing(log, 'generating datasets'):
        for subject_id in xrange(1, 14):
            search_path = os.path.join(DATA_ROOT,
                                       'Sub{0:03d}*'.format(subject_id))
            path = glob.glob(search_path)

            if path is None or len(path) == 0:
                log.warn('nothing found at {}'.format(search_path))
                continue
            else:
                path = path[0]

            trials_filename = os.path.join(path, 'trials.pklz')

            trials = None
            if not os.path.isfile(trials_filename):
                log.debug('{} not found. running split_trial()'.format(
                    trials_filename))
                trials = split_trial(path, TRIAL_SAMPLE_LENGTH)
            else:
                with log_timing(
                        log, 'loading data from {}'.format(trials_filename)):
                    trials = load(trials_filename)

            assert trials

            dataset_filename = os.path.join(
                path, 'dataset_13goodchannels_plus4s.pklz')
            dataset = generate_cases(subject_id, trials,
                                     bad_channels[subject_id])
            # = data, labels
            with log_timing(log,
                            'saving dataset to {}'.format(dataset_filename)):
                save(dataset_filename, dataset)
Example #12
    def run(self, classifiers=(), verbose=False, debug=False):
        print 'running job #{}'.format(self.job_id)

        import deepthought.util.fs_util as fs_util
        fs_util.ensure_dir_exists(self.output_path)
        print 'output path: ', self.output_path

        # prepare result objects
        results = {k: ClassificationResult(k) for (k, _) in classifiers}

        # load full dataset with all sources only once!
        from deepthought.datasets.hdf5 import get_dataset
        self.full_hdf5, self.full_meta = get_dataset(self.hdf5name, selectors=self.base_selectors, sources=None)

        self.initialize()

        # main loop ###

        # outer cross-validation
        outer_folds = self.fold_generator.get_outer_cv_folds()
        for ofi, ofold in enumerate(outer_folds):
            print 'processing outer fold', ofold

            # phase I : pre-train features ###
            encoder_fn = self.pretrain_encoder(ofi, ofold)  # FIXME: add params

            # phase II : classify ###

            train_selectors = self.fold_generator.get_fold_selectors(outer_fold=ofold['train'])
            X_train, Y_train, meta_train = self.get_encoded_dataset(encoder_fn, train_selectors)

            test_selectors = self.fold_generator.get_fold_selectors(outer_fold=ofold['valid'])
            X_test, Y_test, _ = self.get_encoded_dataset(encoder_fn, test_selectors)

            for (classifier_name, classifier_factory) in classifiers:
                result = results[classifier_name]

                model_prefix = os.path.join(self.output_path, '{}_fold_{}'.format(classifier_name, ofi))

                # generate index folds
                idx_folds = []
                from deepthought.datasets.selection import DatasetMetaDB
                for ifold in self.fold_generator.get_inner_cv_folds(ofold):
                    train_selectors = self.fold_generator.get_fold_selectors(outer_fold=ofold['train'],
                                                                             inner_fold=ifold['train'])
                    metadb = DatasetMetaDB(meta_train, train_selectors.keys())

                    
                    if 'valid' in ifold.keys():
                        valid_selectors = self.fold_generator.get_fold_selectors(outer_fold=ofold['train'],
                                                                                 inner_fold=ifold['valid'])
                    else:
                        valid_selectors = None
                    
                    if debug:
                        print 'train_selectors:', train_selectors                                        
                        print 'valid_selectors:', valid_selectors

                    # get selected trial IDs
                    train_idx = metadb.select(train_selectors)
                    
                    if valid_selectors is not None:
                        valid_idx = metadb.select(valid_selectors)
                    else:
                        valid_idx = []

                    idx_folds.append((train_idx, valid_idx))

                if debug:
                    print idx_folds  # print the generated folds before running the classifier
                
                # train classifier
                classifier, predict_fn = classifier_factory.train(X_train, Y_train, idx_folds, self.hyper_params, model_prefix)

                # test classifier
                train_Y_pred = predict_fn(X_train)
                test_Y_pred = predict_fn(X_test)

                # append to result
                result.append_train(Y_train, train_Y_pred)
                result.append_test(Y_test, test_Y_pred)
                # result.fold_scores.append(classifier.score(X_test, Y_test))
                result.fold_scores.append(np.mean(Y_test == test_Y_pred))

                if verbose:
                    print '{} results for fold {}'.format(classifier_name, ofold)
                    print classification_report(Y_test, test_Y_pred)
                    print confusion_matrix(Y_test, test_Y_pred)
                    print 'overall test accuracy so far:', 1 - result.test_error()

        print 'all folds completed'

        for (classifier_name, _) in classifiers:
            result = results[classifier_name]
            fs_util.save(os.path.join(self.output_path, '{}_result.pklz'.format(classifier_name)), result)  # result

            print
            print 'SUMMARY for classifier', classifier_name
            print
            print 'fold scores: ', np.asarray(result.fold_scores)
            print
            print classification_report(result.test_Y_real, result.test_Y_pred)
            print confusion_matrix(result.test_Y_real, result.test_Y_pred)
            print
            print 'train accuracy:', 1 - result.train_error()
            print 'test accuracy :', 1 - result.test_error()

        return [results[classifier[0]].test_error() for classifier in classifiers]  # error for each classifier
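
The classifiers argument above is a sequence of (name, factory) pairs, and each factory's train() is called with (X_train, Y_train, idx_folds, hyper_params, model_prefix) and must return a classifier plus a predict function. A minimal sketch of a compatible factory, using scikit-learn's LogisticRegression purely as an illustrative stand-in (not the factories used by this codebase):

from sklearn.linear_model import LogisticRegression


class SimpleClassifierFactory(object):
    # minimal factory sketch matching the interface used in run() above;
    # idx_folds, hyper_params and model_prefix are accepted but ignored here
    def train(self, X_train, Y_train, idx_folds, hyper_params, model_prefix):
        clf = LogisticRegression()
        clf.fit(X_train, Y_train)
        return clf, clf.predict

# e.g. errors = job.run(classifiers=[('logreg', SimpleClassifierFactory())], verbose=True)
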
Example #13
    def pretrain_encoder(self, outer_fold_index, outer_fold):
        """
        generic template that works with any model structure
        :param outer_fold_index:
        :param outer_fold:
        :return:
        """
        import deepthought.util.fs_util as fs_util
        from deepthought.util.function_util import get_function

        fold_params_filename = os.path.join(self.output_path, 'fold_params_{}.pklz'.format(outer_fold_index))

        inner_folds = self.fold_generator.get_inner_cv_folds(outer_fold)

        if os.path.isfile(fold_params_filename):
            # load trained network parameters from existing file
            fold_param_values = fs_util.load(fold_params_filename)
            print 'loaded trained fold network parameters from', fold_params_filename
            #assert len(fold_param_values) == len(inner_folds)
        else:
            # compute trial fold models
            fold_param_values = []
            fold_errors = []
            for ifi, ifold in enumerate(inner_folds):
                log.info('processing fold {}.{}: {}'.format(outer_fold_index, ifi, ifold))

                train_selectors = self.fold_generator.get_fold_selectors(
                    outer_fold=outer_fold['train'], inner_fold=ifold['train'], base_selectors=self.base_selectors)
                
                if 'valid' in ifold.keys():
                    valid_selectors = self.fold_generator.get_fold_selectors(
                        outer_fold=outer_fold['train'], inner_fold=ifold['valid'], base_selectors=self.base_selectors)
                else:
                    valid_selectors = None

                self.pretrain_model.set_parameter_values(self.init_param_values)  # reset weights
                trained_model_param_values, best_error_valid = self.pretrain(
                    self.pretrain_model, self.hyper_params,
                    self.full_hdf5, self.full_meta,
                    train_selectors, valid_selectors)

                fold_param_values.append(trained_model_param_values)
                fold_errors.append(best_error_valid)
                
                if 'only_1_inner_fold' in self.hyper_params and self.hyper_params['only_1_inner_fold']:
                    print 'Stop after 1 inner fold requested (only_1_inner_fold=True).'
                    break

            fold_errors = np.asarray(fold_errors).squeeze()
            print 'fold errors:', fold_errors

            # store trained network parameters for later analysis
            fs_util.save(fold_params_filename, fold_param_values)
            print 'parameters saved to', fold_params_filename

        # build encoder
        encoder = self.encoder_pipeline_factory.set_pipeline_parameters(self.encoder_model, fold_param_values)

        # transform dataset (re-using data_dict and working with indices as input)
        encoder_fn = get_function(encoder, allow_input_downcast=True)

        return encoder_fn
Example #14
    files = librosa.util.find_files(path, ext='wav', recurse=True)  #, case_sensitive=False, limit=None, offset=0)
    for filename in files:
        x, y = loadfile(filename,
                        auto_sample_rate=config.audio.autpsamplerate,
                        samplerate=config.audio.samplerate,
                        barsamples=config.audio.barsamples,
                        maxbars=config.audio.maxbars)
        data.append(x)
        labels.append(y)
    data = np.vstack(data)  # transform list to a big numpy array by stacking
    labels = np.vstack(labels)

    logging.info('loaded {0} values from {1} files in total'.format(data.shape, len(files)))
#     print labels;
    return (data, labels)


if __name__ == '__main__':
    #pass
#     logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.DEBUG);
#     global config;
    config = Config(file('deepbeat.cfg'))
    logging.basicConfig(format=config.logger.pattern, level=logging.DEBUG)

    dataset = loadall(config.audio.path)
    #split = splitdata(dataset, ptest=config.audio.ptest, pvalid=config.audio.pvalid);
    #save(config.audio.datasetpath, split);
    save(config.audio.datasetpath, dataset)
#     load();
Example #15
def save(filepath, data):
    return fs_util.save(filepath, data)
Example #16
def import_dataset(source_path, target_path):

#     config = load_config(default_config='../train_sda.cfg');

    # DATA_ROOT = source_path

    # DATA_ROOT = config.eeg.get('dataset_root', './')
    SAMPLE_RATE = 400 # in Hz
    TRIAL_LENGTH = 32 # in sec

    TRIAL_LENGTH += 4 # add 4s after end of presentation

    TRIAL_SAMPLE_LENGTH = SAMPLE_RATE * TRIAL_LENGTH

    log.info('using dataset at {}'.format(source_path))

    '''
    Note from Dan:
    All subjects should have channels 15, 16, 17 and 18 removed [...]
    If you want to make them truly identical, you could remove channel 19 from
    the subjects with more channels, although this should be 'good' data.
    '''
    bad_channels = {}
    bad_channels[1]  = [5, 6,                   15, 16, 17, 18,  20, 21]
    bad_channels[2]  = [      7, 8,             15, 16, 17, 18,  20, 21]
    bad_channels[3]  = [5, 6,                   15, 16, 17, 18,  20, 21]
    bad_channels[4]  = [      7, 8,             15, 16, 17, 18,  20, 21]
    bad_channels[5]  = [      7, 8,             15, 16, 17, 18,  20, 21]
    bad_channels[6]  = [      7, 8, 9,  12,     15, 16, 17, 18         ]
    bad_channels[7]  = [5, 6,           12,     15, 16, 17, 18,  20    ]
    bad_channels[8]  = [      7, 8,             15, 16, 17, 18,  20, 21]
    bad_channels[9]  = [5, 6,           12,     15, 16, 17, 18,  20    ]
    bad_channels[10] = [5, 6,                   15, 16, 17, 18,  20, 21]
    bad_channels[11] = [5, 6,                   15, 16, 17, 18,  20, 21]
    bad_channels[12] = [5, 6,                   15, 16, 17, 18,  20, 21]
    bad_channels[13] = [5, 6,           12,     15, 16, 17, 18,  20    ]

    label_converter = LabelConverter()

    metadb_file = os.path.join(target_path, 'metadata_db.pklz')
    metadb = {}   # empty DB

    with log_timing(log, 'generating datasets'):
        for subject_id in xrange(1,14):
            search_path = os.path.join(source_path, 'Sub{0:03d}*'.format(subject_id))
            sourcefile_path = glob.glob(search_path)

            if sourcefile_path is None or len(sourcefile_path) == 0:
                log.warn('nothing found at {}'.format(search_path))
                continue
            else:
                sourcefile_path = sourcefile_path[0]

            trials = split_session(sourcefile_path, TRIAL_SAMPLE_LENGTH)

            for stimulus, trial_data in trials.iteritems():
                stimulus_id = label_converter.get_stimulus_id(stimulus)
                log.debug('processing {} with {} samples and stimulus_id {}'.
                          format(stimulus, trial_data.shape, stimulus_id))

                channels = trial_data.transpose()
                trial_data = []
                channel_ids = []
                for i, channel in enumerate(channels):
                    channel_id = i+1
                    # filter bad channels
                    if channel_id in bad_channels[subject_id]:
                        log.debug('skipping bad channel {}'.format(channel_id))
                        continue

                    # convert to float32
                    channel = np.asfarray(channel, dtype='float32')

                    trial_data.append(channel)
                    channel_ids.append(channel_id)

                trial_data = np.vstack(trial_data).transpose() # format: (samples, channels)
                log.debug('extracted {} from channels: {}'.format(trial_data.shape, channel_ids))

                label = label_converter.get_label(stimulus_id, 'rhythm') # raw label, unsorted
                label = label_converter.shuffle_classes[label]           # sorted label id
                metadata = {
                    'subject'       : subject_id,
                    'label'         : label,
                    'meta_label'    : label_converter.get_label(stimulus_id, 'rhythm_meta'),
                    'stimulus'      : stimulus,
                    'stimulus_id'   : stimulus_id,
                    'rhythm_type'   : label_converter.get_label(stimulus_id, 'rhythm'),
                    'tempo'         : label_converter.get_label(stimulus_id, 'tempo'),
                    'audio_file'    : label_converter.get_label(stimulus_id, 'audio_file'),
                    'trial_no'      : 1,
                    'trial_type'    : 'perception',
                    'condition'     : 'n/a',
                    'channels'      : channel_ids,
                }

                # save data
                savepath = generate_filepath_from_metadata(metadata)
                save(os.path.join(target_path, savepath), (trial_data, metadata), mkdirs=True)

                # save metadata
                metadb[savepath] = metadata

                log.debug('imported {}={} as {}'.format(label, metadata['meta_label'], savepath))

        save(metadb_file, metadb, mkdirs=True)
    log.info('import finished')