Example #1
    def get_encoded_dataset(self, encoder_fn, selectors):
        """
        This version is intended for use with a data dict / indices.
        :return: X (2-D array, trials x flattened features),
                 Y (1-D array of class indices),
                 meta (list of per-trial metadata dicts)
        """
        from deepthought.datasets.selection import DatasetMetaDB
        from deepthought.util.function_util import process_dataset
        import theano

        # build lookup structure
        metadb = DatasetMetaDB(self.full_meta, selectors.keys())

        # get selected trial IDs
        selected_trial_ids = metadb.select(selectors)

        X, Y = process_dataset(self.full_hdf5, encoder_fn,
                               indices=selected_trial_ids,
                               input_sources=['indices'],
                               target_source=self.hyper_params['classification_target_source'])
        meta = [self.full_meta[i] for i in selected_trial_ids]

        # flatten X to 2-D (trials x features) and collapse one-hot Y to 1-D class indices
        X = np.asarray(X, dtype=theano.config.floatX)
        X = X.reshape(X.shape[0], np.prod(X.shape[1:]))
        Y = Y.argmax(axis=1)

        return X, Y, meta
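For orientation, a call site for the method above might look like the following sketch; the job instance, encoder_fn, and the selector keys are assumptions, not part of the example:

# hypothetical call site (all names are assumptions)
selectors = {'subject': [1], 'condition': ['audio']}   # metadata key -> accepted values
X, Y, meta = job.get_encoded_dataset(encoder_fn, selectors)
# X: 2-D float array (n_trials x n_features), Y: 1-D class indices,
# meta: metadata dicts aligned with the rows of X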
Example #2
def get_dataset(hdf5name,
                selectors=None,
                sources=('features', 'targets', 'subjects')):
    if selectors is None:
        selectors = {}

    # load metadata
    import deepthought.util.fs_util as fs_util
    base_meta = fs_util.load(hdf5name + '.meta.pklz')

    # build lookup structure
    from deepthought.datasets.selection import DatasetMetaDB
    metadb = DatasetMetaDB(base_meta, selectors.keys())

    # get selected trial IDs
    selected_trial_ids = metadb.select(selectors)
    log.debug('selectors: {}'.format(selectors))
    log.debug('selected trials: {}'.format(selected_trial_ids))
    log.debug('selected sources: {}'.format(sources))

    # load data and generate metadata
    from fuel.datasets.hdf5 import H5PYDataset
    hdf5 = H5PYDataset(hdf5name,
                       which_sets=('all', ),
                       subset=selected_trial_ids,
                       load_in_memory=True,
                       sources=sources)
    meta = [base_meta[i] for i in selected_trial_ids]

    log.debug('number of examples: {}'.format(hdf5.num_examples))

    return hdf5, meta
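A minimal usage sketch for get_dataset, assuming an HDF5 file written by the same pipeline (with its companion .meta.pklz file) and a 'subject' metadata key; the file name and selector values are assumptions:

# hypothetical call (file name and selector values are assumptions)
hdf5, meta = get_dataset('eeg_trials.hdf5',
                         selectors={'subject': [1, 2]},
                         sources=('features', 'targets'))
# hdf5 is an in-memory fuel H5PYDataset restricted to the selected trials;
# meta lists the metadata dicts of those trials in the same order.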
Example #3
    def __init__(self, db, selectors):
        metadb = DatasetMetaDB(db.metadata, selectors.keys())
        selected_trial_ids = metadb.select(selectors)

        self.data = [db.data[i] for i in selected_trial_ids]
        self.metadata = [db.metadata[i] for i in selected_trial_ids]

        if hasattr(db, 'targets'):
            if db.targets is None:
                self.targets = None
            else:
                self.targets = [db.targets[i] for i in selected_trial_ids]
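The two-step selection pattern used here and in the other examples can be exercised on its own; the sketch below uses made-up metadata records and assumes, as in the calls above, that selector values are lists of accepted values:

from deepthought.datasets.selection import DatasetMetaDB

# made-up metadata; the keys 'subject' and 'condition' are assumptions
metadata = [{'subject': 1, 'condition': 'audio'},
            {'subject': 1, 'condition': 'video'},
            {'subject': 2, 'condition': 'audio'}]
selectors = {'subject': [1], 'condition': ['audio']}

metadb = DatasetMetaDB(metadata, selectors.keys())
selected_trial_ids = metadb.select(selectors)   # expected to yield [0]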
Example #4
    def __init__(self, db, selectors):
        metadb = DatasetMetaDB(db.metadata, selectors.keys())
        selected_trial_ids = metadb.select(selectors)

        self.data = [db.data[i] for i in selected_trial_ids]
        self.metadata = [db.metadata[i] for i in selected_trial_ids]

        if hasattr(db, 'targets'):
            if db.targets is None:
                self.targets = None
            else:
                self.targets = [db.targets[i] for i in selected_trial_ids]
Example #5
    def __init__(self, root_path, selectors=dict()):
        # read metadata file: dict filename -> metadata
        meta_map = load(os.path.join(root_path, 'metadata_db.pklz'))
        filenames = list(meta_map.keys())
        metadata = [meta_map[fn] for fn in filenames]

        # filter files by metadata selectors
        metadb = DatasetMetaDB(metadata, selectors.keys())
        selected_file_ids = metadb.select(selectors)
        # log.info('selected files: {}'.format(selected_file_ids))

        # load selected files
        self.data = []
        self.metadata = []
        for id in selected_file_ids:
            log.debug('loading data file #{} {}'.format(id, filenames[id]))
            f_data, f_metadata = load(os.path.join(root_path, filenames[id]))
            self.data.append(f_data)
            self.metadata.append(metadata[id])

        print len(self.data), len(self.metadata)
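For reference, the metadata_db.pklz read above maps each data file name to its metadata dict; below is a hedged sketch of writing such a file with deepthought's fs_util (the directory layout and metadata keys are assumptions):

import os
import deepthought.util.fs_util as fs_util

root_path = '/path/to/dataset'   # assumption
meta_map = {
    'trial_0001.pklz': {'subject': 1, 'condition': 'audio'},
    'trial_0002.pklz': {'subject': 2, 'condition': 'audio'},
}
fs_util.save(os.path.join(root_path, 'metadata_db.pklz'), meta_map)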
Example #6
    def __init__(self, root_path, selectors=dict()):
        # read metadata file: dict filename -> metadata
        meta_map = load(os.path.join(root_path, 'metadata_db.pklz'))
        filenames = list(meta_map.keys())
        metadata = [meta_map[fn] for fn in filenames]

        # filter files by metadata selectors
        metadb = DatasetMetaDB(metadata, selectors.keys())
        selected_file_ids = metadb.select(selectors)
        # log.info('selected files: {}'.format(selected_file_ids))

        # load selected files
        self.data = []
        self.metadata = []
        for id in selected_file_ids:
            log.debug('loading data file #{} {}'.format(id, filenames[id]))
            f_data, f_metadata = load(os.path.join(root_path, filenames[id]))
            self.data.append(f_data)
            self.metadata.append(metadata[id])

        print len(self.data), len(self.metadata)
Example #7
    def __init__(self,
                 db,                # data source
                 name = '',         # optional name

                 selectors = dict(),

                 partitioner = None,

                 meta_sources = [],     # optional sources other than 'features' and 'targets' from metadata

                 channel_filter = NoChannelFilter(),   # optional channel filter, default: keep all
                 channel_names = None,  # optional channel names (for metadata)

                 label_attribute = 'label', # metadata attribute to be used as label
                 label_map = None,      # optional conversion of labels
                 use_targets = True,    # use targets if provided, otherwise labels are used

                 remove_dc_offset = False,  # optional subtraction of channel mean, usually done already earlier
                 resample = None,       # optional down-sampling
                 normalize = True,      # normalize to max=1

                 # optional sub-sequences selection
                 start_sample = 0,
                 stop_sample  = None,   # optional for selection of sub-sequences
                 zero_padding = True,   # if True (default), trials that are too short will be zero-padded;
                                        # otherwise they will be rejected.

                 # optional signal filter to be applied before splitting the signal
                 signal_filter = None,

                 trial_processors = [],     # optional processing of the trials
                 target_processor = None,   # optional processing of the targets, e.g. zero-padding
                 transformers = [],         # optional transformations of the dataset

                 layout='tf',       # (0,1)-axes layout tf=time x features or ft=features x time

                 debug=False,
                 ):
        '''
        Constructor
        '''

        # save params
        self.params = locals().copy()
        del self.params['self']
        # print self.params

        self.name = name
        self.debug = debug

        metadb = DatasetMetaDB(db.metadata, selectors.keys())

        if partitioner is not None:
            pass # FIXME

        selected_trial_ids = metadb.select(selectors)
        log.info('selectors: {}'.format(selectors))
        log.info('selected trials: {}'.format(selected_trial_ids))

        if normalize:
            log.info('Data will be normalized to max amplitude 1 per channel (normalize=True).')

        trials = list()
        labels = list()
        targets = list()
        meta = list()

        if stop_sample == 'auto-min':
            stop_sample = np.min([db.data[trial_i].shape[-1] for trial_i in selected_trial_ids])
            log.info('Using minimum trial length. stop_sample={}'.format(stop_sample))
        elif stop_sample ==  'auto-max':
            stop_sample = np.max([db.data[trial_i].shape[-1] for trial_i in selected_trial_ids])
            log.info('Using maximum trial length. stop_sample={}'.format(stop_sample))

        for trial_i in selected_trial_ids:

            trial_meta = db.metadata[trial_i]

            if use_targets:
                if db.targets is None:
                    target = None
                else:
                    target = db.targets[trial_i]
                    assert not np.isnan(np.sum(target))

                if target_processor is not None:
                    target = target_processor.process(target, trial_meta)

                    assert not np.isnan(np.sum(target))
            else:
                # get and process label
                label = db.metadata[trial_i][label_attribute]
                if label_map is not None:
                    label = label_map[label]

            processed_trial = []

            trial = db.data[trial_i]

            if np.isnan(np.sum(trial)):
                print trial_i, trial

            assert not np.isnan(np.sum(trial))

            rejected = False # flag for trial rejection

            trial = np.atleast_2d(trial)

            # process 1 channel at a time
            for channel in xrange(trial.shape[0]):
                # filter channels
                if not channel_filter.keep_channel(channel):
                    continue

                samples = trial[channel, :]

                # subtract channel mean
                if remove_dc_offset:
                    samples -= samples.mean()

                # down-sample if requested
                if resample is not None and resample[0] != resample[1]:
                    samples = librosa.resample(samples, resample[0], resample[1], res_type='sinc_best')

                # apply optional signal filter after down-sampling -> requires lower order
                if signal_filter is not None:
                    samples = signal_filter.process(samples)

                # get sub-sequence in resampled space
                # log.info('using samples {}..{} of {}'.format(start_sample,stop_sample, samples.shape))

                if stop_sample is not None and stop_sample > len(samples):
                    if zero_padding:
                        tmp = np.zeros(stop_sample)
                        tmp[:len(samples)] = samples
                        samples = tmp
                    else:
                        rejected = True
                        break # stop processing this trial

                s = samples[start_sample:stop_sample]

                # TODO optional channel processing

                # normalize to max amplitude 1
                if normalize:
                    s = librosa.util.normalize(s)

                # add 2nd data dimension
                s = s.reshape(s.shape[0], 1)
                # print s.shape

                s = np.asfarray(s, dtype=theano.config.floatX)

                processed_trial.append(s)

                ### end of channel iteration ###

            if rejected:
                continue    # next trial

            processed_trial = np.asfarray([processed_trial], dtype=theano.config.floatX)

            # processed_trial = processed_trial.reshape((1, processed_trial.shape))
            processed_trial = np.rollaxis(processed_trial, 1, 4)

            # optional (external) trial processing, e.g. windowing
            # trials will be in b01c format with tf layout for 01-axes
            for trial_processor in trial_processors:
                processed_trial = trial_processor.process(processed_trial, trial_meta)

            trials.append(processed_trial)

            for k in range(len(processed_trial)):
                meta.append(trial_meta)

                if use_targets:
                    targets.append(target)
                else:
                    labels.append(label)

        ### end of datafile iteration ###

        # turn into numpy arrays
        self.trials = np.vstack(trials)

        assert not np.isnan(np.sum(self.trials))

        # prepare targets / labels
        if use_targets:
            self.targets = np.vstack(targets)
            assert not np.isnan(np.sum(self.targets))
        else:
            labels = np.hstack(labels)
            if label_map is None:
                one_hot_formatter = OneHotFormatter(max(labels) + 1)
            else:
                one_hot_formatter = OneHotFormatter(max(label_map.values()) + 1)
            one_hot_y = one_hot_formatter.format(labels)
            self.targets = one_hot_y

        self.metadata = meta

        if layout == 'ft': # swap axes to (batch, feature, time, channels)
            self.trials = self.trials.swapaxes(1, 2)

        # transform after finalizing the data structure
        for transformer in transformers:
            self.trials, self.targets = transformer.process(self.trials, self.targets)

        self.trials = np.asarray(self.trials, dtype=theano.config.floatX)

        log.debug('final dataset shape: {} (b,0,1,c)'.format(self.trials.shape))
        # super(EEGEpochsDataset, self).__init__(topo_view=self.trials, y=self.targets, axes=['b', 0, 1, 'c'])

        self.X = self.trials.reshape(self.trials.shape[0], np.prod(self.trials.shape[1:]))
        self.y = self.targets
        log.info('generated dataset "{}" with shape X={}={} y={} targets={} '.
                 format(self.name, self.X.shape, self.trials.shape, self.y.shape, self.targets.shape))


        # determine data specs
        features_space = Conv2DSpace(
            shape=[self.trials.shape[1], self.trials.shape[2]],
            num_channels=self.trials.shape[3]
        )
        features_source = 'features'

        targets_space = VectorSpace(dim=self.targets.shape[-1])
        targets_source = 'targets'

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]

        # additional support for meta information
        self.meta_maps = dict()
        for meta_source in meta_sources:
            self.meta_maps[meta_source] = sorted(list(set([m[meta_source] for m in self.metadata])))
            space_components.extend([VectorSpace(dim=1)])
            source_components.extend([meta_source])
            log.info('Generated meta-source "{}" with value map: {}'
                     .format(meta_source, self.meta_maps[meta_source]))

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        log.debug('data specs: {}'.format(self.data_specs))
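Judging by the commented-out super() call, this constructor belongs to an EEGEpochsDataset-style class; a hypothetical instantiation could look like this (the selector keys, label attribute, and label map are assumptions):

# hypothetical construction (keys and values are assumptions)
dataset = EEGEpochsDataset(db,
                           name='train',
                           selectors={'subject': [1, 2]},
                           label_attribute='condition',
                           label_map={'audio': 0, 'video': 1},
                           use_targets=False,
                           stop_sample='auto-min')
# dataset.X holds the flattened trials, dataset.y the one-hot targets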
Example #8
    def run(self, classifiers=(), verbose=False, debug=False):
        print 'running job #{}'.format(self.job_id)

        import deepthought.util.fs_util as fs_util
        fs_util.ensure_dir_exists(self.output_path)
        print 'output path: ', self.output_path

        # prepare result objects
        results = {k: ClassificationResult(k) for (k, _) in classifiers}

        # load full dataset with all sources only once!
        from deepthought.datasets.hdf5 import get_dataset
        self.full_hdf5, self.full_meta = get_dataset(self.hdf5name, selectors=self.base_selectors, sources=None)

        self.initialize()

        # main loop ###

        # outer cross-validation
        outer_folds = self.fold_generator.get_outer_cv_folds()
        for ofi, ofold in enumerate(outer_folds):
            print 'processing outer fold', ofold

            # phase I : pre-train features ###
            encoder_fn = self.pretrain_encoder(ofi, ofold)  # FIXME: add params

            # phase II : classify ###

            train_selectors = self.fold_generator.get_fold_selectors(outer_fold=ofold['train'])
            X_train, Y_train, meta_train = self.get_encoded_dataset(encoder_fn, train_selectors)

            test_selectors = self.fold_generator.get_fold_selectors(outer_fold=ofold['valid'])
            X_test, Y_test, _ = self.get_encoded_dataset(encoder_fn, test_selectors)

            for (classifier_name, classifier_factory) in classifiers:
                result = results[classifier_name]

                model_prefix = os.path.join(self.output_path, '{}_fold_{}'.format(classifier_name, ofi))

                # generate index folds
                idx_folds = []
                from deepthought.datasets.selection import DatasetMetaDB
                for ifold in self.fold_generator.get_inner_cv_folds(ofold):
                    train_selectors = self.fold_generator.get_fold_selectors(outer_fold=ofold['train'],
                                                                             inner_fold=ifold['train'])
                    metadb = DatasetMetaDB(meta_train, train_selectors.keys())

                    
                    if 'valid' in ifold.keys():
                        valid_selectors = self.fold_generator.get_fold_selectors(outer_fold=ofold['train'],
                                                                                 inner_fold=ifold['valid'])
                    else:
                        valid_selectors = None
                    
                    if debug:
                        print 'train_selectors:', train_selectors                                        
                        print 'valid_selectors:', valid_selectors

                    # get selected trial IDs
                    train_idx = metadb.select(train_selectors)
                    
                    if valid_selectors is not None:
                        valid_idx = metadb.select(valid_selectors)
                    else:
                        valid_idx = []

                    idx_folds.append((train_idx, valid_idx))

                if debug:
                    print idx_folds  # print the generated folds before running the classifier
                
                # train classifier
                classifier, predict_fn = classifier_factory.train(X_train, Y_train, idx_folds, self.hyper_params, model_prefix)

                # test classifier
                train_Y_pred = predict_fn(X_train)
                test_Y_pred = predict_fn(X_test)

                # append to result
                result.append_train(Y_train, train_Y_pred)
                result.append_test(Y_test, test_Y_pred)
                # result.fold_scores.append(classifier.score(X_test, Y_test))
                result.fold_scores.append(np.mean(Y_test == test_Y_pred))

                if verbose:
                    print '{} results for fold {}'.format(classifier_name, ofold)
                    print classification_report(Y_test, test_Y_pred)
                    print confusion_matrix(Y_test, test_Y_pred)
                    print 'overall test accuracy so far:', 1 - result.test_error()

        print 'all folds completed'

        for (classifier_name, _) in classifiers:
            result = results[classifier_name]
            fs_util.save(os.path.join(self.output_path, '{}_result.pklz'.format(classifier_name)), result)  # result

            print
            print 'SUMMARY for classifier', classifier_name
            print
            print 'fold scores: ', np.asarray(result.fold_scores)
            print
            print classification_report(result.test_Y_real, result.test_Y_pred)
            print confusion_matrix(result.test_Y_real, result.test_Y_pred)
            print
            print 'train accuracy:', 1 - result.train_error()
            print 'test accuracy :', 1 - result.test_error()

        return [results[classifier[0]].test_error() for classifier in classifiers]  # error for each classifier
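The classifiers argument is a sequence of (name, factory) pairs, where each factory exposes train(X, Y, idx_folds, hyper_params, model_prefix) and returns a (classifier, predict_fn) tuple. Below is a hedged sketch of such a factory wrapping a scikit-learn model; the LogisticRegression choice, the adapter class, and the job instance are assumptions:

from sklearn.linear_model import LogisticRegression

class SklearnFactory(object):
    """Hypothetical adapter matching the factory protocol used by run()."""
    def train(self, X, Y, idx_folds, hyper_params, model_prefix):
        clf = LogisticRegression()
        clf.fit(X, Y)            # idx_folds could drive model selection; ignored in this sketch
        return clf, clf.predict  # (classifier, predict_fn)

# hypothetical usage; 'job' is an instance of the class defining run()
errors = job.run(classifiers=[('logreg', SklearnFactory())], verbose=True)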
Example #9
    def __init__(
        self,
        db,  # data source
        name='',  # optional name
        selectors=dict(),
        partitioner=None,
        meta_sources=[],  # optional sources other than 'features' and 'targets' from metadata
        channel_filter=NoChannelFilter(
        ),  # optional channel filter, default: keep all
        channel_names=None,  # optional channel names (for metadata)
        label_attribute='label',  # metadata attribute to be used as label
        label_map=None,  # optional conversion of labels
        use_targets=True,  # use targets if provided, otherwise labels are used
        remove_dc_offset=False,  # optional subtraction of channel mean, usually done already earlier
        resample=None,  # optional down-sampling
        normalize=True,  # normalize to max=1

        # optional sub-sequences selection
        start_sample=0,
        stop_sample=None,  # optional for selection of sub-sequences
        zero_padding=True,  # if True (default), trials that are too short will be zero-padded;
        # otherwise they will be rejected.

        # optional signal filter to be applied before splitting the signal
        signal_filter=None,
        trial_processors=[],  # optional processing of the trials
        target_processor=None,  # optional processing of the targets, e.g. zero-padding
        transformers=[],  # optional transformations of the dataset
        layout='tf',  # (0,1)-axes layout tf=time x features or ft=features x time
        debug=False,
    ):
        '''
        Constructor
        '''

        # save params
        self.params = locals().copy()
        del self.params['self']
        # print self.params

        self.name = name
        self.debug = debug

        metadb = DatasetMetaDB(db.metadata, selectors.keys())

        if partitioner is not None:
            pass  # FIXME

        selected_trial_ids = metadb.select(selectors)
        log.info('selectors: {}'.format(selectors))
        log.info('selected trials: {}'.format(selected_trial_ids))

        if normalize:
            log.info(
                'Data will be normalized to max amplitude 1 per channel (normalize=True).'
            )

        trials = list()
        labels = list()
        targets = list()
        meta = list()

        if stop_sample == 'auto-min':
            stop_sample = np.min(
                [db.data[trial_i].shape[-1] for trial_i in selected_trial_ids])
            log.info('Using minimum trial length. stop_sample={}'.format(
                stop_sample))
        elif stop_sample == 'auto-max':
            stop_sample = np.max(
                [db.data[trial_i].shape[-1] for trial_i in selected_trial_ids])
            log.info('Using maximum trial length. stop_sample={}'.format(
                stop_sample))

        for trial_i in selected_trial_ids:

            trial_meta = db.metadata[trial_i]

            if use_targets:
                if db.targets is None:
                    target = None
                else:
                    target = db.targets[trial_i]
                    assert not np.isnan(np.sum(target))

                if target_processor is not None:
                    target = target_processor.process(target, trial_meta)

                    assert not np.isnan(np.sum(target))
            else:
                # get and process label
                label = db.metadata[trial_i][label_attribute]
                if label_map is not None:
                    label = label_map[label]

            processed_trial = []

            trial = db.data[trial_i]

            if np.isnan(np.sum(trial)):
                print trial_i, trial

            assert not np.isnan(np.sum(trial))

            rejected = False  # flag for trial rejection

            trial = np.atleast_2d(trial)

            # process 1 channel at a time
            for channel in xrange(trial.shape[0]):
                # filter channels
                if not channel_filter.keep_channel(channel):
                    continue

                samples = trial[channel, :]

                # subtract channel mean
                if remove_dc_offset:
                    samples -= samples.mean()

                # down-sample if requested
                if resample is not None and resample[0] != resample[1]:
                    samples = librosa.resample(samples,
                                               resample[0],
                                               resample[1],
                                               res_type='sinc_best')

                # apply optional signal filter after down-sampling -> requires lower order
                if signal_filter is not None:
                    samples = signal_filter.process(samples)

                # get sub-sequence in resampled space
                # log.info('using samples {}..{} of {}'.format(start_sample,stop_sample, samples.shape))

                if stop_sample is not None and stop_sample > len(samples):
                    if zero_padding:
                        tmp = np.zeros(stop_sample)
                        tmp[:len(samples)] = samples
                        samples = tmp
                    else:
                        rejected = True
                        break  # stop processing this trial

                s = samples[start_sample:stop_sample]

                # TODO optional channel processing

                # normalize to max amplitude 1
                if normalize:
                    s = librosa.util.normalize(s)

                # add 2nd data dimension
                s = s.reshape(s.shape[0], 1)
                # print s.shape

                s = np.asfarray(s, dtype=theano.config.floatX)

                processed_trial.append(s)

                ### end of channel iteration ###

            if rejected:
                continue  # next trial

            processed_trial = np.asfarray([processed_trial],
                                          dtype=theano.config.floatX)

            # processed_trial = processed_trial.reshape((1, processed_trial.shape))
            processed_trial = np.rollaxis(processed_trial, 1, 4)

            # optional (external) trial processing, e.g. windowing
            # trials will be in b01c format with tf layout for 01-axes
            for trial_processor in trial_processors:
                processed_trial = trial_processor.process(
                    processed_trial, trial_meta)

            trials.append(processed_trial)

            for k in range(len(processed_trial)):
                meta.append(trial_meta)

                if use_targets:
                    targets.append(target)
                else:
                    labels.append(label)

        ### end of datafile iteration ###

        # turn into numpy arrays
        self.trials = np.vstack(trials)

        assert not np.isnan(np.sum(self.trials))

        # prepare targets / labels
        if use_targets:
            self.targets = np.vstack(targets)
            assert not np.isnan(np.sum(self.targets))
        else:
            labels = np.hstack(labels)
            if label_map is None:
                one_hot_formatter = OneHotFormatter(max(labels) + 1)
            else:
                one_hot_formatter = OneHotFormatter(
                    max(label_map.values()) + 1)
            one_hot_y = one_hot_formatter.format(labels)
            self.targets = one_hot_y

        self.metadata = meta

        if layout == 'ft':  # swap axes to (batch, feature, time, channels)
            self.trials = self.trials.swapaxes(1, 2)

        # transform after finalizing the data structure
        for transformer in transformers:
            self.trials, self.targets = transformer.process(
                self.trials, self.targets)

        self.trials = np.asarray(self.trials, dtype=theano.config.floatX)

        log.debug('final dataset shape: {} (b,0,1,c)'.format(
            self.trials.shape))
        # super(EEGEpochsDataset, self).__init__(topo_view=self.trials, y=self.targets, axes=['b', 0, 1, 'c'])

        self.X = self.trials.reshape(self.trials.shape[0],
                                     np.prod(self.trials.shape[1:]))
        self.y = self.targets
        log.info('generated dataset "{}" with shape X={}={} y={} targets={} '.
                 format(self.name, self.X.shape, self.trials.shape,
                        self.y.shape, self.targets.shape))

        # determine data specs
        features_space = Conv2DSpace(
            shape=[self.trials.shape[1], self.trials.shape[2]],
            num_channels=self.trials.shape[3])
        features_source = 'features'

        targets_space = VectorSpace(dim=self.targets.shape[-1])
        targets_source = 'targets'

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]

        # additional support for meta information
        self.meta_maps = dict()
        for meta_source in meta_sources:
            self.meta_maps[meta_source] = sorted(
                list(set([m[meta_source] for m in self.metadata])))
            space_components.extend([VectorSpace(dim=1)])
            source_components.extend([meta_source])
            log.info('Generated meta-source "{}" with value map: {}'.format(
                meta_source, self.meta_maps[meta_source]))

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        log.debug('data specs: {}'.format(self.data_specs))
Example #10
    def __init__(self,
                 dataset,
                 dataset_metadata,
                 base_selectors=None,
                 ext_selectors=None,
                 targets_source='targets',
                 group_attribute=None,
                 allow_self_comparison=False,
                 additional_sources=None,
                 **kwargs):

        if base_selectors is None:
            base_selectors = {}

        if additional_sources is None:
            additional_sources = []

        # get selected trial IDs
        from deepthought.datasets.selection import DatasetMetaDB
        metadb = DatasetMetaDB(dataset_metadata, base_selectors.keys())
        base_trial_ids = metadb.select(base_selectors)
        log.debug('base selectors: {}'.format(base_selectors))
        log.debug('selected base trials: {}'.format(base_trial_ids))

        if ext_selectors is not None:
            split_index = len(base_trial_ids)
            metadb = DatasetMetaDB(dataset_metadata, ext_selectors.keys())
            ext_trial_ids = metadb.select(ext_selectors)
        else:
            split_index = 0
            ext_trial_ids = []

        log.debug('ext selectors: {}'.format(ext_selectors))
        log.debug('selected ext trials: {}'.format(ext_trial_ids))

        # indices = np.concatenate((base_trial_ids, ext_trial_ids))
        indices = base_trial_ids + ext_trial_ids
        metadata = [dataset_metadata[i] for i in indices]

        # load targets from dataset
        state = dataset.open()
        targets = dataset.get_data(
            state=state,
            request=indices)[dataset.sources.index(targets_source)]
        dataset.close(state)
        # print targets

        # split data into partitions according to group_attribute
        groups = dict()
        if group_attribute is not None:
            for i, meta in enumerate(metadata):
                group = meta[group_attribute]
                if group not in groups:
                    groups[group] = []
                groups[group].append(i)
        else:
            # default: all in one group
            groups['default'] = np.arange(len(metadata))
        # print groups

        from itertools import product
        pairs = []
        others = []
        # add group-wise
        for group_ids in groups.values():
            for i in range(targets.shape[-1]):
                # 1st trial candidates
                if split_index > 0:
                    trial_ids = np.where(targets[:split_index, i] == 1)[0]
                else:
                    trial_ids = np.where(targets[:, i] == 1)[0]
                # 2nd trial candidates (same class)
                trial_ids2 = np.where(targets[:, i] == 1)[0]
                # others candidates (different class)
                others_ids = np.where(targets[:, i] == 0)[0]

                # only retain ids within the group
                trial_ids = np.intersect1d(trial_ids, group_ids)
                trial_ids2 = np.intersect1d(trial_ids2, group_ids)
                others_ids = np.intersect1d(others_ids, group_ids)

                # combine with permutation
                new_pairs = [
                    tuple(pair) for pair in product(trial_ids, trial_ids2)
                ]
                if not allow_self_comparison:  # remove repetitions
                    to_remove = []
                    for pair in new_pairs:
                        if pair[0] == pair[1]:
                            to_remove.append(pair)
                    for pair in to_remove:
                        new_pairs.remove(pair)
                    # print 'removed', to_remove
                    new_pairs = sorted(new_pairs)
                    # print pairs

                # combine all pairs with all other trials
                for pair, other in product(new_pairs, others_ids):
                    # print pair, other
                    pairs.append(pair)
                    others.append([other])

        # NOTE: triplets uses internal ids
        #   (referencing into indices, which contains hdf5-specific ids)
        self.triplets = np.concatenate([pairs, others], axis=1)
        self.indices = np.asarray(indices, dtype=np.int16)
        # indices = indices.reshape((len(indices), 1)) # make 2D for VectorSpace
        log.debug('triplets.shape={} indices.shape={}'.format(
            self.triplets.shape, self.indices.shape))

        sources = ['targets', '0_indices', '1_indices', '2_indices']

        self.data_per_source = dict()
        for source in additional_sources:
            # load source data from hdf5 dataset
            # hdf5 = H5PYDataset(hdf5name, which_sets=('all',),
            #                   load_in_memory=True, sources=(source,)
            #                )
            # state = hdf5.open()
            # self.data_per_source[source] = hdf5.get_data(request=indices)[0]
            # hdf5.close(state)

            # load source data from dataset
            state = dataset.open()
            self.data_per_source[source] = dataset.get_data(
                state=state, request=indices)[dataset.sources.index(source)]
            dataset.close(state)

            for i in range(3):
                sources.append('{}_{}'.format(i, source))

        self.sources = tuple(sources)
        self.provides_sources = self.sources
        log.debug('sources: {}'.format(self.sources))
        super(TripletsIndexDataset, self).__init__(**kwargs)
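Each row of self.triplets stores internal (anchor, same-class, different-class) positions into self.indices; a short sketch of mapping one triplet back to dataset-level indices (the ds and k names are assumptions):

# hypothetical: resolve the k-th triplet to dataset indices
anchor, similar, other = ds.triplets[k]
dataset_ids = ds.indices[[anchor, similar, other]]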
Example #11
    def __init__(self,
                 dataset,
                 dataset_metadata,
                 base_selectors=None,
                 ext_selectors=None,
                 targets_source='targets',
                 group_attribute=None,
                 allow_self_comparison=False,
                 additional_sources=None,
                 **kwargs):

        if base_selectors is None:
            base_selectors = {}

        if additional_sources is None:
            additional_sources = []

        # get selected trial IDs
        from deepthought.datasets.selection import DatasetMetaDB
        metadb = DatasetMetaDB(dataset_metadata, base_selectors.keys())
        base_trial_ids = metadb.select(base_selectors)
        log.debug('base selectors: {}'.format(base_selectors))
        log.debug('selected base trials: {}'.format(base_trial_ids))

        if ext_selectors is not None:
            split_index = len(base_trial_ids)
            metadb = DatasetMetaDB(dataset_metadata, ext_selectors.keys())
            ext_trial_ids = metadb.select(ext_selectors)
        else:
            split_index = 0
            ext_trial_ids = []

        log.debug('ext selectors: {}'.format(ext_selectors))
        log.debug('selected ext trials: {}'.format(ext_trial_ids))

        # indices = np.concatenate((base_trial_ids, ext_trial_ids))
        indices = base_trial_ids + ext_trial_ids
        metadata = [dataset_metadata[i] for i in indices]

        # load targets from dataset
        state = dataset.open()
        targets = dataset.get_data(
            state=state,
            request=indices)[dataset.sources.index(targets_source)]
        dataset.close(state)
        # print targets

        # split data into partitions according to group_attribute
        groups = dict()
        if group_attribute is not None:
            for i, meta in enumerate(metadata):
                group = meta[group_attribute]
                if group not in groups:
                    groups[group] = []
                groups[group].append(i)
        else:
            # default: all in one group
            groups['default'] = np.arange(len(metadata))
        # print groups

        from itertools import product
        pairs = []
        pair_targets = []
        # add group-wise
        for group_ids in groups.values():
            for i in range(targets.shape[-1]):
                # 1st trial candidates
                if split_index > 0:
                    trial_ids = np.where(targets[:split_index, i] == 1)[0]
                else:
                    trial_ids = np.where(targets[:, i] == 1)[0]
                # similar candidates (same class)
                trial_ids2 = np.where(targets[:, i] == 1)[0]
                # dissimilar candidates (different class)
                others_ids = np.where(targets[:, i] == 0)[0]

                # only retain ids within the group
                trial_ids = np.intersect1d(trial_ids, group_ids)
                trial_ids2 = np.intersect1d(trial_ids2, group_ids)
                others_ids = np.intersect1d(others_ids, group_ids)

                for pair in product(trial_ids, trial_ids2):
                    if allow_self_comparison or pair[0] != pair[1]:
                        pairs.append(tuple(pair))
                        pair_targets.append(0)

                for pair in product(trial_ids, others_ids):
                    pairs.append(tuple(pair))
                    pair_targets.append(1)

        # NOTE: pairs uses internal ids
        #   (referencing into indices, which contains hdf5-specific ids)
        self.pairs = np.asarray(pairs)
        self.pair_targets = np.asarray(pair_targets)
        self.indices = np.asarray(indices, dtype=np.int16)

        log.debug('pairs.shape={} indices.shape={}'.format(
            self.pairs.shape, self.indices.shape))

        sources = ['targets', '0_indices', '1_indices']

        self.data_per_source = dict()
        for source in additional_sources:
            # load source data from dataset
            state = dataset.open()
            self.data_per_source[source] = dataset.get_data(
                state=state, request=indices)[dataset.sources.index(source)]
            dataset.close(state)

            for i in range(2):
                sources.append('{}_{}'.format(i, source))

        self.sources = tuple(sources)
        self.provides_sources = self.sources
        log.debug('sources: {}'.format(self.sources))
        super(PairsIndexDataset, self).__init__(**kwargs)