def import_eeglab_sets(filepaths, target_path):
    """
    Import EEGLAB data sets and register them in the metadata database.

    Every file in ``filepaths`` is read with ``load_eeglab_data``, stored as a
    ``(data, metadata)`` tuple below ``target_path`` (the relative location is
    derived from the metadata by ``generate_filepath_from_metadata``) and
    recorded in ``<target_path>/metadata_db.pklz`` under that relative path.

    :param filepaths: iterable of paths of the EEGLAB files to import
    :param target_path: root directory of the imported data and its metadata db
    :return: None
    """
    # try load metadata-db
    metadb_file = os.path.join(target_path, 'metadata_db.pklz')

    # isfile() is already False for a missing path, so no extra exists() check
    if os.path.isfile(metadb_file):
        metadb = load(metadb_file)
        log.info('metadb loaded from {}'.format(metadb_file))
    else:
        metadb = {}  # empty DB
        log.info('no metadb found at {}. using empty db'.format(metadb_file))

    for filepath in filepaths:
        data, metadata = load_eeglab_data(filepath)

        # save data
        savepath = generate_filepath_from_metadata(metadata)
        save(os.path.join(target_path, savepath), (data, metadata), mkdirs=True)

        # save metadata; the db file is re-written after every imported file
        metadb[savepath] = metadata
        save(metadb_file, metadb, mkdirs=True)

        log.debug('imported as {}'.format(savepath))
def load_datafiles_metadata(path):
    """
    Load the metadata database found in ``path`` and index its data files.

    :param path: directory that contains 'metadata_db.pklz'
    :return: tuple ``(datafiles, metadb)`` where ``metadb`` is the loaded
             mapping (data file -> metadata) and
             ``datafiles[subject][trial_type][trial_no][condition]`` is the
             list of data file keys whose metadata has these four values.
             ``datafiles`` is built from nested ``collections.defaultdict``
             objects, so looking up a missing key creates an empty entry.
    """
    def multi_dimensions(n, dtype):
        """ Creates an n-dimension dictionary where the n-th dimension is of type 'dtype' """
        if n == 0:
            return dtype()
        return collections.defaultdict(lambda: multi_dimensions(n - 1, dtype))

    # 4 key levels (subject, trial type, trial number, condition) -> list
    datafiles = multi_dimensions(4, list)

    metadb = load(os.path.join(path, 'metadata_db.pklz'))

    for datafile, metadata in metadb.items():
        subject = metadata['subject']
        trial_type = metadata['trial_type']
        trial_number = metadata['trial_no']
        condition = metadata['condition']

        # keys are stored as found in the db, i.e. not joined with `path`
        datafiles[subject][trial_type][trial_number][condition].append(datafile)

        log.debug('{} {} {} {} : {}'.format(subject, trial_type, trial_number, condition, datafile))

    return datafiles, metadb
def load_datafiles_metadata(path):
    """
    Load the metadata database found in ``path`` and index its data files.

    :param path: directory that contains 'metadata_db.pklz'
    :return: tuple ``(datafiles, metadb)`` where ``metadb`` is the loaded
             mapping (data file -> metadata) and
             ``datafiles[subject][trial_type][trial_no][condition]`` is the
             list of data file keys whose metadata has these four values.
             ``datafiles`` is built from nested ``collections.defaultdict``
             objects, so looking up a missing key creates an empty entry.
    """
    def multi_dimensions(n, dtype):
        """ Creates an n-dimension dictionary where the n-th dimension is of type 'dtype' """
        if n == 0:
            return dtype()
        return collections.defaultdict(lambda: multi_dimensions(n - 1, dtype))

    # 4 key levels (subject, trial type, trial number, condition) -> list
    datafiles = multi_dimensions(4, list)

    metadb = load(os.path.join(path, 'metadata_db.pklz'))

    for datafile, metadata in metadb.items():
        subject = metadata['subject']
        trial_type = metadata['trial_type']
        trial_number = metadata['trial_no']
        condition = metadata['condition']

        # keys are stored as found in the db, i.e. not joined with `path`
        datafiles[subject][trial_type][trial_number][condition].append(datafile)

        log.debug('{} {} {} {} : {}'.format(subject, trial_type, trial_number, condition, datafile))

    return datafiles, metadb
def get_dataset(hdf5name, selectors=None, sources=('features', 'targets', 'subjects')):
    """
    Open an HDF5 dataset restricted to the trials matching ``selectors``.

    :param hdf5name: path of the HDF5 file; its per-trial metadata is read
                     from ``hdf5name + '.meta.pklz'``
    :param selectors: dict passed to ``DatasetMetaDB.select``; ``None`` is
                      treated as an empty dict
    :param sources: names of the data sources to load
    :return: tuple ``(hdf5, meta)`` of the in-memory ``H5PYDataset`` and the
             list of metadata entries of the selected trials
    """
    selectors = {} if selectors is None else selectors

    # load metadata
    import deepthought.util.fs_util as fs_util
    meta_path = hdf5name + '.meta.pklz'
    base_meta = fs_util.load(meta_path)

    # build lookup structure over the attributes used by the selectors
    from deepthought.datasets.selection import DatasetMetaDB
    lookup = DatasetMetaDB(base_meta, selectors.keys())

    # get selected trial IDs
    trial_ids = lookup.select(selectors)

    log.debug('selectors: {}'.format(selectors))
    log.debug('selected trials: {}'.format(trial_ids))
    log.debug('selected sources: {}'.format(sources))

    # load data and generate metadata
    from fuel.datasets.hdf5 import H5PYDataset
    dataset = H5PYDataset(
        hdf5name,
        which_sets=('all', ),
        subset=trial_ids,
        load_in_memory=True,
        sources=sources,
    )

    selected_meta = []
    for trial_id in trial_ids:
        selected_meta.append(base_meta[trial_id])

    log.debug('number of examples: {}'.format(dataset.num_examples))

    return dataset, selected_meta
def __init__(self, filepath):
    """
    Load the file at ``filepath`` and unpack its content into attributes.

    The loaded object must contain either ``(data, metadata)`` or
    ``(data, metadata, targets)``; in the first case ``self.targets`` is None.

    :param filepath: path of the file to load
    :raises ValueError: if the loaded object has neither 2 nor 3 elements
    """
    self.filepath = filepath

    timing_message = 'loading data from {}'.format(filepath)
    with log_timing(log, timing_message):
        contents = load(filepath)
        n_objects = len(contents)

        if n_objects == 2:
            self.data, self.metadata = contents
            self.targets = None
            return

        if n_objects == 3:
            self.data, self.metadata, self.targets = contents
            return

        raise ValueError('got {} objects instead of 2 or 3.'.format(len(contents)))
def __init__(self, root_path, selectors=None):
    """
    Load all data files below ``root_path`` whose metadata match ``selectors``.

    :param root_path: directory that contains 'metadata_db.pklz' (a dict
                      filename -> metadata) and the referenced data files
    :param selectors: dict passed to ``DatasetMetaDB.select``; ``None``
                      (default) is treated as an empty dict
    """
    # a fresh dict per call instead of a shared mutable default argument
    if selectors is None:
        selectors = {}

    # read metadata file: dict filename -> metadata
    meta_map = load(os.path.join(root_path, 'metadata_db.pklz'))
    filenames = list(meta_map.keys())
    metadata = [meta_map[fn] for fn in filenames]

    # filter files by metadata selectors; ids index into `filenames`/`metadata`
    metadb = DatasetMetaDB(metadata, selectors.keys())
    selected_file_ids = metadb.select(selectors)
    # log.info('selected files: {}'.format(selected_file_ids))

    # load selected files
    self.data = []
    self.metadata = []
    for file_id in selected_file_ids:
        filename = filenames[file_id]
        log.debug('loading data file #{} {}'.format(file_id, filename))
        # the metadata stored inside the data file is ignored;
        # the entry of the metadata db is kept instead
        f_data, _ = load(os.path.join(root_path, filename))
        self.data.append(f_data)
        self.metadata.append(metadata[file_id])

    # same output as the former `print a, b` statement, valid on Python 2 and 3
    print('{} {}'.format(len(self.data), len(self.metadata)))
def __init__(self, filepath):
    """
    Read a stored dataset from ``filepath``.

    Accepts a stored 2-tuple ``(data, metadata)`` - ``self.targets`` is then
    set to None - or a 3-tuple ``(data, metadata, targets)``.

    :param filepath: location of the stored dataset
    :raises ValueError: for any other number of stored objects
    """
    self.filepath = filepath
    with log_timing(log, 'loading data from {}'.format(filepath)):
        loaded = load(filepath)

        has_targets = len(loaded) == 3
        if not has_targets and len(loaded) != 2:
            raise ValueError('got {} objects instead of 2 or 3.'.format(
                len(loaded)))

        if has_targets:
            self.data, self.metadata, self.targets = loaded
        else:
            self.data, self.metadata = loaded
            self.targets = None
def __init__(self, filepath, meta_classes=None):
    """
    Load the file at ``filepath`` and register the requested meta classes.

    The loaded object must contain either ``(data, metadata)`` or
    ``(data, metadata, targets)``; in the first case ``self.targets`` is None.

    :param filepath: path of the file to load
    :param meta_classes: optional dict class_name -> classes; every entry is
                         passed to ``self._create_meta_class``. ``None``
                         (default) registers nothing.
    :raises ValueError: if the loaded object has neither 2 nor 3 elements
    """
    self.filepath = filepath

    tmp = load(filepath)
    if len(tmp) == 2:
        self.data, self.metadata = tmp
        self.targets = None
    elif len(tmp) == 3:
        self.data, self.metadata, self.targets = tmp
    else:
        raise ValueError('got {} objects instead of 2 or 3.'.format(len(tmp)))

    # None replaces the former shared mutable default `dict()`;
    # items() (instead of iteritems()) works on Python 2 and 3
    if meta_classes:
        for class_name, classes in meta_classes.items():
            self._create_meta_class(class_name, classes)
def __init__(self,
             path,
             name = '',   # optional name

             # selectors
             subjects='all',        # optional selector (list) or 'all'
             trial_types='all',     # optional selector (list) or 'all'
             trial_numbers='all',   # optional selector (list) or 'all'
             conditions='all',      # optional selector (list) or 'all'

             partitioner = None,

             # NOTE(review): this default instance is created once, when the
             # function is defined, and is shared by all calls.
             channel_filter = NoChannelFilter(),   # optional channel filter, default: keep all
             channel_names = None,  # optional channel names (for metadata)

             label_map = None,      # optional conversion of labels

             remove_dc_offset = False,  # optional subtraction of channel mean, usually done already earlier
             resample = None,       # optional down-sampling

             # optional sub-sequences selection
             start_sample = 0,
             stop_sample  = None,   # optional for selection of sub-sequences

             # optional signal filter to be applied before splitting the signal
             signal_filter = None,

             # windowing parameters
             frame_size = -1,
             hop_size   = -1,       # values > 0 will lead to windowing
             hop_fraction = None,   # alternative to specifying absolute hop_size

             # optional spectrum parameters, n_fft = 0 keeps raw data
             n_fft = 0,
             n_freq_bins = None,
             spectrum_log_amplitude = False,
             spectrum_normalization_mode = None,
             include_phase = False,

             flatten_channels=False,
             layout='tf',       # (0,1)-axes layout tf=time x features or ft=features x time

             save_matrix_path = None,
             keep_metadata = False,
             ):
    '''
    Constructor

    Selects data files by their metadata, loads them, pre-processes every
    retained channel (raw waveform or spectrogram), optionally cuts the
    result into frames and hands the stacked array plus one-hot labels to
    the parent class constructor.

    :param path: directory with 'metadata_db.pklz' and the data files
    :param name: name of the dataset; also passed to the partitioner
    :param subjects, trial_types, trial_numbers, conditions:
        'all' or a collection of metadata values to keep
    :param partitioner: if not None, its get_partition(name, metadb)
        result replaces the list of selected data files
    :param channel_filter: object whose keep_channel(index) decides whether
        a channel is processed
    :param channel_names: optional sequence of channel names indexed by
        channel number (only used for metadata)
    :param label_map: optional mapping applied to metadata['label']
    :param remove_dc_offset: subtract the mean of each channel
    :param resample: indexable pair handed to librosa.resample as
        (resample[0], resample[1]); skipped if both values are equal
    :param start_sample, stop_sample: slice bounds applied after
        resampling and filtering
    :param signal_filter: optional object whose process(samples) result
        replaces the samples
    :param frame_size, hop_size: windowing is applied if both are > 0
    :param hop_fraction: used to derive hop_size from frame_size if
        hop_size is left at -1
    :param n_fft: values > 0 switch to the spectrogram branch
    :param n_freq_bins: keep only the first n_freq_bins frequency bins
    :param spectrum_log_amplitude: apply librosa.logamplitude
    :param spectrum_normalization_mode: None, 'mean0_std1', 'linear_0_1'
        or 'linear_-1_1'
    :param include_phase: stack the phase (divided by pi) below the
        magnitudes
    :param flatten_channels: store every channel as separate sequences
        instead of stacking channels along the last axis
    :param layout: 'ft' swaps axes 1 and 2 of the final array
    :param save_matrix_path: if not None, a DenseDesignMatrix of the data
        is saved there
    :param keep_metadata: collect one metadata dict per sequence block in
        self.metadata
    :raises ValueError: for an unsupported spectrum_normalization_mode
    '''

    # save params
    # (locals() holds exactly the call arguments at this point)
    self.params = locals().copy()
    del self.params['self']
    # print self.params

    # TODO: get the whole filtering into an extra class

    datafiles_metadata, metadb = load_datafiles_metadata(path)
    # print datafiles_metadata

    def apply_filters(filters, node):
        # Walk the nested dicts level by level; filters[0] belongs to the
        # current level. Non-dict nodes (the file lists) are returned as is.
        if isinstance(node, dict):
            filtered = []
            keepkeys = filters[0]
            for key, value in node.items():
                if keepkeys == 'all' or key in keepkeys:
                    filtered.extend(apply_filters(filters[1:], value))
            return filtered
        else:
            return node # [node]

    # keep only files that match the metadata filters
    self.datafiles = apply_filters([subjects,trial_types,trial_numbers,conditions], datafiles_metadata)

    # copy metadata for retained files
    self.metadb = {}
    for datafile in self.datafiles:
        self.metadb[datafile] = metadb[datafile]
    # print self.datafiles
    # print self.metadb

    self.name = name

    if partitioner is not None:
        self.datafiles = partitioner.get_partition(self.name, self.metadb)

    self.include_phase = include_phase
    self.spectrum_normalization_mode = spectrum_normalization_mode
    self.spectrum_log_amplitude = spectrum_log_amplitude

    self.sequence_partitions = [] # used to keep track of original sequences

    # metadata: [subject, trial_no, stimulus, channel, start, ]
    self.metadata = []

    sequences = []
    labels = []
    n_sequences = 0

    # NOTE(review): np.ceil returns a float, and with integer operands
    # Python 2 floors the division before the ceil is applied - confirm
    # that this is the intended hop size.
    if frame_size > 0 and hop_size == -1 and hop_fraction is not None:
        hop_size = np.ceil(frame_size / hop_fraction)

    for i in xrange(len(self.datafiles)):
        with log_timing(log, 'loading data from {}'.format(self.datafiles[i])):

            # save start of next sequence
            self.sequence_partitions.append(n_sequences)

            data, metadata = load(os.path.join(path, self.datafiles[i]))

            label = metadata['label']
            if label_map is not None:
                label = label_map[label]

            multi_channel_frames = []

            # process 1 channel at a time
            # (axis 1 of data is iterated as the channel axis)
            for channel in xrange(data.shape[1]):
                # filter channels
                if not channel_filter.keep_channel(channel):
                    continue

                samples = data[:, channel]

                # subtract channel mean
                # NOTE(review): in-place operation; presumably this also
                # changes `data` if the slice above is a view - verify.
                if remove_dc_offset:
                    samples -= samples.mean()

                # down-sample if requested
                if resample is not None and resample[0] != resample[1]:
                    samples = librosa.resample(samples, resample[0], resample[1])

                # apply optional signal filter after down-sampling -> requires lower order
                if signal_filter is not None:
                    samples = signal_filter.process(samples)

                # get sub-sequence in resampled space
                # log.info('using samples {}..{} of {}'.format(start_sample,stop_sample, samples.shape))
                samples = samples[start_sample:stop_sample]

                if n_fft is not None and n_fft > 0: # Optionally:
                    ### frequency spectrum branch ###

                    # transform to spectogram
                    hop_length = n_fft / 4;

                    '''
                    from http://theremin.ucsd.edu/~bmcfee/librosadoc/librosa.html
                    >>> # Get a power spectrogram from a waveform y
                    >>> S       = np.abs(librosa.stft(y)) ** 2
                    >>> log_S   = librosa.logamplitude(S)
                    '''

                    S = librosa.core.stft(samples, n_fft=n_fft, hop_length=hop_length)
                    # mag = np.abs(S)        # magnitude spectrum
                    mag = np.abs(S)**2       # power spectrum

                    # include phase information if requested
                    if self.include_phase:
                        # phase = np.unwrap(np.angle(S))
                        phase = np.angle(S)

                    # Optionally: cut off high bands
                    if n_freq_bins is not None:
                        mag = mag[0:n_freq_bins, :]
                        if self.include_phase:
                            phase = phase[0:n_freq_bins, :]

                    if self.spectrum_log_amplitude:
                        mag = librosa.logamplitude(mag)

                    s = mag # for normalization

                    '''
                    NOTE on normalization:
                    It depends on the structure of a neural network and (even more)
                    on the properties of data. There is no best normalization algorithm
                    because if there would be one, it would be used everywhere by default...

                    In theory, there is no requirement for the data to be normalized at all.
                    This is a purely practical thing because in practice convergence could
                    take forever if your input is spread out too much. The simplest would be
                    to just normalize it by scaling your data to (-1,1) (or (0,1) depending
                    on activation function), and in most cases it does work. If your
                    algorithm converges well, then this is your answer. If not, there are
                    too many possible problems and methods to outline here without knowing
                    the actual data.
                    '''

                    # all statistics below are taken over the whole array,
                    # not per frequency bin or per time step

                    ## normalize to mean 0, std 1
                    if self.spectrum_normalization_mode == 'mean0_std1':
                        # s = preprocessing.scale(s, axis=0);
                        mean = np.mean(s)
                        std = np.std(s)
                        s = (s - mean) / std

                    ## normalize by linear transform to [0,1]
                    # NOTE(review): only divides by the maximum, the
                    # minimum is not shifted to 0 - confirm this is intended.
                    elif self.spectrum_normalization_mode == 'linear_0_1':
                        s = s / np.max(s)

                    ## normalize by linear transform to [-1,1]
                    elif self.spectrum_normalization_mode == 'linear_-1_1':
                        s = -1 + 2 * (s - np.min(s)) / (np.max(s) - np.min(s))

                    elif self.spectrum_normalization_mode is not None:
                        raise ValueError(
                            'unsupported spectrum normalization mode {}'.format(
                                self.spectrum_normalization_mode)
                        )

                    #print s.mean(axis=0)
                    #print s.std(axis=0)

                    # include phase information if requested
                    if self.include_phase:
                        # normalize phase to [-1.1]
                        phase = phase / np.pi
                        s = np.vstack([s, phase])

                    # transpose to fit pylearn2 layout
                    s = np.transpose(s)
                    # print s.shape

                    ### end of frequency spectrum branch ###
                else:
                    ### raw waveform branch ###

                    # normalize to max amplitude 1
                    s = librosa.util.normalize(samples)

                    # add 2nd data dimension
                    s = s.reshape(s.shape[0], 1)
                    # print s.shape

                    ### end of raw waveform branch ###

                s = np.asfarray(s, dtype='float32')

                if frame_size > 0 and hop_size > 0:
                    s = s.copy() # FIXME: THIS IS NECESSARY IN MultiChannelEEGSequencesDataset - OTHERWISE, THE FOLLOWING OP DOES NOT WORK!!!!
                    frames = frame(s, frame_length=frame_size, hop_length=hop_size)
                else:
                    frames = s
                del s
                # print frames.shape

                if flatten_channels:
                    # add artificial channel dimension
                    # (requires frames to have 3 dimensions at this point)
                    frames = frames.reshape((frames.shape[0], frames.shape[1], frames.shape[2], 1))
                    # print frames.shape

                    sequences.append(frames)

                    # increment counter by new number of frames
                    n_sequences += frames.shape[0]

                    if keep_metadata:
                        # determine channel name
                        channel_name = None
                        if channel_names is not None:
                            channel_name = channel_names[channel]
                        elif 'channels' in metadata:
                            channel_name = metadata['channels'][channel]

                        # NOTE(review): 'start' is the first sequence index
                        # of the current file, not of this channel (the
                        # partition list grows once per file) - confirm.
                        self.metadata.append({
                            'subject'   : metadata['subject'],              # subject
                            'trial_type': metadata['trial_type'],           # trial_type
                            'trial_no'  : metadata['trial_no'],             # trial_no
                            'condition' : metadata['condition'],            # condition
                            'channel'   : channel,                          # channel
                            'channel_name' : channel_name,
                            'start'     : self.sequence_partitions[-1],     # start
                            'stop'      : n_sequences                       # stop
                        })

                    # one label entry per frame
                    for _ in xrange(frames.shape[0]):
                        labels.append(label)
                else:
                    multi_channel_frames.append(frames)

            ### end of channel iteration ###

            if not flatten_channels:
                # turn list into array
                multi_channel_frames = np.asfarray(multi_channel_frames, dtype='float32')
                # [channels x frames x time x freq] -> cb01
                # [channels x frames x time x 1] -> cb0.

                # move channel dimension to end
                multi_channel_frames = np.rollaxis(multi_channel_frames, 0, 4)
                # print multi_channel_frames.shape
                # log.debug(multi_channel_frames.shape)

                sequences.append(multi_channel_frames)

                # increment counter by new number of frames
                n_sequences += multi_channel_frames.shape[0]

                if keep_metadata:
                    self.metadata.append({
                        'subject'   : metadata['subject'],              # subject
                        'trial_type': metadata['trial_type'],           # trial_type
                        'trial_no'  : metadata['trial_no'],             # trial_no
                        'condition' : metadata['condition'],            # condition
                        'channel'   : 'all',                            # channel
                        'start'     : self.sequence_partitions[-1],     # start
                        'stop'      : n_sequences                       # stop
                    })

                # one label entry per frame
                for _ in xrange(multi_channel_frames.shape[0]):
                    labels.append(label)

    ### end of datafile iteration ###

    # turn into numpy arrays
    sequences = np.vstack(sequences)
    # print sequences.shape;

    labels = np.hstack(labels)

    # one_hot_y = one_hot(labels)
    # the number of classes is derived from the largest label that occurs
    one_hot_formatter = OneHotFormatter(labels.max() + 1) # FIXME!
    one_hot_y = one_hot_formatter.format(labels)

    self.labels = labels

    if layout == 'ft': # swap axes to (batch, feature, time, channels)
        sequences = sequences.swapaxes(1, 2)

    log.debug('final dataset shape: {} (b,0,1,c)'.format(sequences.shape))
    super(MultiChannelEEGDataset, self).__init__(topo_view=sequences, y=one_hot_y, axes=['b', 0, 1, 'c'])

    log.info('generated dataset "{}" with shape X={}={} y={} labels={} '.
             format(self.name, self.X.shape, sequences.shape, self.y.shape, self.labels.shape))

    # optionally store a plain DenseDesignMatrix copy of the data
    if save_matrix_path is not None:
        matrix = DenseDesignMatrix(topo_view=sequences, y=one_hot_y, axes=['b', 0, 1, 'c'])
        with log_timing(log, 'saving DenseDesignMatrix to {}'.format(save_matrix_path)):
            serial.save(save_matrix_path, matrix)
def __init__(self,
             path,
             name = '',   # optional name

             # selectors
             subjects='all',        # optional selector (list) or 'all'
             trial_types='all',     # optional selector (list) or 'all'
             trial_numbers='all',   # optional selector (list) or 'all'
             conditions='all',      # optional selector (list) or 'all'

             partitioner = None,

             # NOTE(review): this default instance is created once, when the
             # function is defined, and is shared by all calls.
             channel_filter = NoChannelFilter(),   # optional channel filter, default: keep all
             channel_names = None,  # optional channel names (for metadata)

             label_map = None,      # optional conversion of labels

             remove_dc_offset = False,  # optional subtraction of channel mean, usually done already earlier
             resample = None,       # optional down-sampling

             # optional sub-sequences selection
             start_sample = 0,
             stop_sample  = None,   # optional for selection of sub-sequences

             # optional signal filter to be applied before splitting the signal
             signal_filter = None,

             # windowing parameters
             frame_size = -1,
             hop_size   = -1,       # values > 0 will lead to windowing
             hop_fraction = None,   # alternative to specifying absolute hop_size

             # # optional spectrum parameters, n_fft = 0 keeps raw data
             # n_fft = 0,
             # n_freq_bins = None,
             # spectrum_log_amplitude = False,
             # spectrum_normalization_mode = None,
             # include_phase = False,

             flatten_channels=False,
             # layout='tf',       # (0,1)-axes layout tf=time x features or ft=features x time

             # save_matrix_path = None,
             keep_metadata = False,

             target_mode='label',
             ):
    '''
    Constructor

    Selects data files by their metadata, loads them, normalizes every
    retained channel (raw waveform only), optionally cuts it into frames
    and hands the stacked sequences plus targets to the parent class
    constructor.

    :param path: directory with 'metadata_db.pklz' and the data files
    :param name: name of the dataset; also passed to the partitioner
    :param subjects, trial_types, trial_numbers, conditions:
        'all' or a collection of metadata values to keep
    :param partitioner: if not None, its get_partition(name, metadb)
        result replaces the list of selected data files
    :param channel_filter: object whose keep_channel(index) decides whether
        a channel is processed
    :param channel_names: optional sequence of channel names indexed by
        channel number (only used for metadata)
    :param label_map: optional mapping applied to metadata['label']
    :param remove_dc_offset: subtract the mean of each channel
    :param resample: indexable pair handed to librosa.resample as
        (resample[0], resample[1]); skipped if both values are equal
    :param start_sample, stop_sample: slice bounds applied after
        resampling and filtering
    :param signal_filter: optional object whose process(samples) result
        replaces the samples
    :param frame_size, hop_size: windowing is applied if both are > 0
    :param hop_fraction: used to derive hop_size from frame_size if
        hop_size is left at -1
    :param flatten_channels: store every channel as separate sequences
        instead of stacking channels along the last axis
    :param keep_metadata: collect one metadata dict per sequence block in
        self.metadata
    :param target_mode: 'label' uses the (repeated) file label as target;
        'next' uses the last value of each frame as target of the
        preceding values.
        NOTE(review): for any other value `space` is never assigned and
        the parent constructor is not called - confirm callers only pass
        these two values.
    '''

    # save params
    # (locals() holds exactly the call arguments at this point)
    self.params = locals().copy()
    del self.params['self']
    # print self.params

    # TODO: get the whole filtering into an extra class

    datafiles_metadata, metadb = load_datafiles_metadata(path)
    # print datafiles_metadata

    def apply_filters(filters, node):
        # Walk the nested dicts level by level; filters[0] belongs to the
        # current level. Non-dict nodes (the file lists) are returned as is.
        if isinstance(node, dict):
            filtered = []
            keepkeys = filters[0]
            for key, value in node.items():
                if keepkeys == 'all' or key in keepkeys:
                    filtered.extend(apply_filters(filters[1:], value))
            return filtered
        else:
            return node # [node]

    # keep only files that match the metadata filters
    self.datafiles = apply_filters([subjects,trial_types,trial_numbers,conditions], datafiles_metadata)

    # copy metadata for retained files
    self.metadb = {}
    for datafile in self.datafiles:
        self.metadb[datafile] = metadb[datafile]
    # print self.datafiles
    # print self.metadb

    self.name = name

    if partitioner is not None:
        self.datafiles = partitioner.get_partition(self.name, self.metadb)

    # self.include_phase = include_phase
    # self.spectrum_normalization_mode = spectrum_normalization_mode
    # self.spectrum_log_amplitude = spectrum_log_amplitude

    self.sequence_partitions = [] # used to keep track of original sequences

    # metadata: [subject, trial_no, stimulus, channel, start, ]
    self.metadata = []

    sequences = []
    labels = []
    targets = []
    n_sequences = 0

    # NOTE(review): np.ceil returns a float, and with integer operands
    # Python 2 floors the division before the ceil is applied - confirm
    # that this is the intended hop size.
    print hop_size
    if frame_size > 0 and hop_size == -1 and hop_fraction is not None:
        hop_size = np.ceil(frame_size / hop_fraction)
    print hop_size

    if target_mode == 'next':
        # get 1 more value per frame as target
        frame_size += 1
    # print 'frame size: {}'.format(frame_size)

    for i in xrange(len(self.datafiles)):
        with log_timing(log, 'loading data from {}'.format(self.datafiles[i])):

            # save start of next sequence
            self.sequence_partitions.append(n_sequences)

            data, metadata = load(os.path.join(path, self.datafiles[i]))
            # data, metadata = self.generate_test_data()

            label = metadata['label']
            if label_map is not None:
                label = label_map[label]

            multi_channel_frames = []
            multi_channel_targets = []

            # process 1 channel at a time
            # (axis 1 of data is iterated as the channel axis)
            for channel in xrange(data.shape[1]):
                # filter channels
                if not channel_filter.keep_channel(channel):
                    continue

                samples = data[:, channel]
                # print samples

                # subtract channel mean
                #FIXME
                # NOTE(review): in-place operation; presumably this also
                # changes `data` if the slice above is a view - verify.
                if remove_dc_offset:
                    samples -= samples.mean()

                # down-sample if requested
                if resample is not None and resample[0] != resample[1]:
                    samples = librosa.resample(samples, resample[0], resample[1])

                # apply optional signal filter after down-sampling -> requires lower order
                if signal_filter is not None:
                    samples = signal_filter.process(samples)

                # get sub-sequence in resampled space
                # log.info('using samples {}..{} of {}'.format(start_sample,stop_sample, samples.shape))
                samples = samples[start_sample:stop_sample]
                # print start_sample, stop_sample, samples.shape

                # NOTE: the optional frequency spectrum branch (STFT,
                # cut-off of high bands, log amplitude, normalization and
                # phase stacking, controlled by the commented-out n_fft /
                # spectrum_* / include_phase parameters) is disabled in
                # this constructor; only the raw waveform branch remains.

                ### raw waveform branch ###

                # normalize to max amplitude 1
                s = librosa.util.normalize(samples)

                # add 2nd data dimension
                # s = s.reshape(s.shape[0], 1)
                # print s.shape

                ### end of raw waveform branch ###

                s = np.asfarray(s, dtype='float32')

                if frame_size > 0 and hop_size > 0:
                    # print 'frame size: {}'.format(frame_size)
                    s = s.copy() # FIXME: THIS IS NECESSARY - OTHERWISE, THE FOLLOWING OP DOES NOT WORK!!!!
                    frames = compute_frames(s, frame_length=frame_size, hop_length=hop_size)
                    # frames = librosa.util.frame(s, frame_length=frame_size, hop_length=hop_size)
                else:
                    frames = s
                del s
                # print frames.shape

                if target_mode == 'next':
                    # split every frame into its leading values (input)
                    # and its last value (target); frame_size was enlarged
                    # by 1 above for this purpose
                    frame_targets = np.empty(len(frames))
                    tmp = []
                    for f, frame in enumerate(frames):
                        tmp.append(frame[:-1])
                        frame_targets[f] = frame[-1]
                    frames = np.asarray(tmp)
                    # print frames.shape
                    # for f, frm in enumerate(frames):
                    #     print frm, frame_targets[f]
                # # FIXME: OK so far

                if flatten_channels:
                    # add artificial channel dimension
                    # NOTE(review): requires frames to have 3 dimensions,
                    # but the reshape that adds the 2nd data dimension is
                    # commented out above - confirm this path still works.
                    frames = frames.reshape((frames.shape[0], frames.shape[1], frames.shape[2], 1))
                    # print frames.shape

                    sequences.append(frames)

                    # increment counter by new number of frames
                    n_sequences += frames.shape[0]

                    if keep_metadata:
                        # determine channel name
                        channel_name = None
                        if channel_names is not None:
                            channel_name = channel_names[channel]
                        elif 'channels' in metadata:
                            channel_name = metadata['channels'][channel]

                        # NOTE(review): 'start' is the first sequence index
                        # of the current file, not of this channel (the
                        # partition list grows once per file) - confirm.
                        self.metadata.append({
                            'subject'   : metadata['subject'],              # subject
                            'trial_type': metadata['trial_type'],           # trial_type
                            'trial_no'  : metadata['trial_no'],             # trial_no
                            'condition' : metadata['condition'],            # condition
                            'channel'   : channel,                          # channel
                            'channel_name' : channel_name,
                            'start'     : self.sequence_partitions[-1],     # start
                            'stop'      : n_sequences                       # stop
                        })

                    # one label entry per frame
                    for _ in xrange(frames.shape[0]):
                        labels.append(label)

                    # NOTE(review): this appends scalar targets, while the
                    # code after the file loop reads targets.shape[1] from
                    # the concatenated result - verify this combination.
                    if target_mode == 'next':
                        for next in frame_targets:
                            targets.append(next)
                else:
                    multi_channel_frames.append(frames)
                    if target_mode == 'next':
                        multi_channel_targets.append(frame_targets)

            ### end of channel iteration ###

            # print np.asarray(multi_channel_frames, dtype=np.int)
            # # FIXME: OK so far

            if not flatten_channels:
                # turn list into array
                multi_channel_frames = np.asfarray(multi_channel_frames, dtype='float32')
                # [channels x frames x time x freq] -> cb01
                # [channels x frames x time x 1] -> cb0.

                # move channel dimension to end
                multi_channel_frames = np.rollaxis(multi_channel_frames, 0, len(multi_channel_frames.shape))
                # print multi_channel_frames.shape
                log.info(multi_channel_frames.shape)

                sequences.append(multi_channel_frames)

                # increment counter by new number of frames
                n_sequences += multi_channel_frames.shape[0]

                if keep_metadata:
                    self.metadata.append({
                        'subject'   : metadata['subject'],              # subject
                        'trial_type': metadata['trial_type'],           # trial_type
                        'trial_no'  : metadata['trial_no'],             # trial_no
                        'condition' : metadata['condition'],            # condition
                        'channel'   : 'all',                            # channel
                        'start'     : self.sequence_partitions[-1],     # start
                        'stop'      : n_sequences                       # stop
                    })

                # one label entry per frame
                for _ in xrange(multi_channel_frames.shape[0]):
                    labels.append(label)

                if target_mode == 'next':
                    # the per-channel target list is transposed so that
                    # its first axis matches the first axis of the frames
                    multi_channel_targets = np.asfarray(multi_channel_targets, dtype='float32')
                    targets.append(multi_channel_targets.T)

    ### end of datafile iteration ###
    # print sequences[0].shape
    # print np.asarray(sequences[0], dtype=np.int)
    # # FIXME: looks OK

    # turn into numpy arrays
    sequences = np.vstack(sequences)
    # sequences = np.asarray(sequences).squeeze()
    # sequences = sequences.reshape(sequences.shape[0]*sequences.shape[1], sequences.shape[2])
    print 'sequences: {}'.format(sequences.shape)

    labels = np.hstack(labels)
    self.labels = labels
    print 'labels: {}'.format(labels.shape)

    if target_mode == 'label':
        targets = labels.copy()

        ## copy targets to fit SequenceDataSpace(VectorSpace) structure (*, frame_size, 12)
        # targets = targets.reshape((targets.shape[0], 1))
        # targets = np.repeat(targets, frame_size, axis=1)
        # print targets.shape
        # one_hot_formatter = OneHotFormatter(max(targets.max() + 1, len(label_map)), dtype=np.int)
        # one_hot_y = one_hot_formatter.format(targets)
        # print one_hot_y.shape

        ## copy targets to fit SequenceDataSpace(IndexSpace) structure -> (*, frame_size, 1)
        targets = targets.reshape((targets.shape[0], 1))
        targets = np.repeat(targets, frame_size, axis=1)
        targets = targets.reshape((targets.shape[0], targets.shape[1], 1))
        print targets.shape

    elif target_mode == 'next':
        targets = np.concatenate(targets)
        # insert a singleton axis between the first and the last axis
        targets = targets.reshape((targets.shape[0], 1, targets.shape[1]))
        print 'targets: {}'.format(targets.shape)

    # the number of channels is read from axis 2 of the stacked sequences
    n_channels = sequences.shape[2]
    print 'number of channels: {}'.format(n_channels)

    # if layout == 'ft': # swap axes to (batch, feature, time, channels)
    #     sequences = sequences.swapaxes(1, 2)

    log.debug('final dataset shape: {} (b,0,1,c)'.format(sequences.shape))

    source = ('features', 'targets')
    # space = CompositeSpace([
    #     # VectorSequenceSpace(dim=64),
    #     SequenceSpace(VectorSpace(dim=64)),
    #     VectorSpace(dim=12),
    # ])

    if target_mode == 'label':
        # NOTE(review): max_labels is hard-coded to 12 - confirm it
        # matches the label set of the loaded data.
        space = CompositeSpace([
            SequenceDataSpace(VectorSpace(dim=n_channels)),
            # SequenceDataSpace(VectorSpace(dim=12)),
            SequenceDataSpace(IndexSpace(dim=1, max_labels=12)),
            # SequenceDataSpace(IndexSpace(dim=512, max_labels=12)),
        ])
    elif target_mode == 'next':
        space = CompositeSpace([
            # does not work with VectorSpacesDataset
            # SequenceSpace(VectorSpace(dim=64)),
            # SequenceSpace(VectorSpace(dim=64))
            SequenceDataSpace(VectorSpace(dim=n_channels)),
            SequenceDataSpace(VectorSpace(dim=n_channels))
            # VectorSpace(dim=n_channels)
        ])

    # source = ('features')
    # space = SequenceSpace(VectorSpace(dim=64))

    print 'sequences: {}'.format(sequences.shape)
    print 'targets: {}'.format(targets.shape)

    # for i, seq in enumerate(sequences):
    #     print np.asarray(seq, dtype=np.int)
    #     print np.asarray(targets[i], dtype=np.int)
    #     break
    # # FIXME: looks OK

    # SequenceDataSpace(IndexSpace(dim=1, max_labels=self._max_labels)),

    # both branches pass the same arguments to the parent constructor
    if target_mode == 'label':
        super(MultiChannelEEGSequencesDataset, self).__init__(
            # data=(sequences, one_hot_y),  # works with vectorspace-target
            data=(sequences, targets),      # works with indexspace-target
            # data=sequences,
            data_specs=(space, source)
        )
    elif target_mode == 'next':
        super(MultiChannelEEGSequencesDataset, self).__init__(
            # data=(sequences, one_hot_y),  # works with vectorspace-target
            data=(sequences, targets),      # works with indexspace-target
            # data=sequences,
            data_specs=(space, source)
        )
def load(filepath):
    """
    Load the object stored at ``filepath``.

    Thin convenience wrapper that delegates to ``fs_util.load``.

    :param filepath: path of the file to load
    :return: whatever ``fs_util.load`` returns for this file
    """
    loaded = fs_util.load(filepath)
    return loaded
def pretrain_encoder(self, outer_fold_index, outer_fold):
    """
    generic template that works with any model structure

    Trains one model per inner cross-validation fold of ``outer_fold`` (or
    re-loads the parameters of an earlier run from
    ``<output_path>/fold_params_<outer_fold_index>.pklz`` if that file
    exists), feeds the per-fold parameter values into the encoder pipeline
    and compiles the resulting encoder.

    :param outer_fold_index: index of the outer fold; used for logging and
                             for the name of the parameter file
    :param outer_fold: outer fold description; its 'train' entry is
                       combined with each inner fold to build the selectors
    :return: function compiled from the encoder by ``get_function``
    """
    import deepthought.util.fs_util as fs_util
    from deepthought.util.function_util import get_function

    fold_params_filename = os.path.join(
        self.output_path, 'fold_params_{}.pklz'.format(outer_fold_index))

    inner_folds = self.fold_generator.get_inner_cv_folds(outer_fold)

    # NOTE: all print() calls below use a single pre-formatted string so that
    # they produce the same output on Python 2 and Python 3.
    if os.path.isfile(fold_params_filename):
        # load trained network parameters from existing file
        fold_param_values = fs_util.load(fold_params_filename)
        print('loaded trained fold network parameters from {}'.format(fold_params_filename))
        #assert len(fold_param_values) == len(inner_folds)
    else:
        # compute trial fold models
        fold_param_values = []
        fold_errors = []
        for ifi, ifold in enumerate(inner_folds):
            log.info('processing fold {}.{}: {}'.format(outer_fold_index, ifi, ifold))

            train_selectors = self.fold_generator.get_fold_selectors(
                outer_fold=outer_fold['train'],
                inner_fold=ifold['train'],
                base_selectors=self.base_selectors)

            # the validation part of an inner fold is optional
            if 'valid' in ifold:
                valid_selectors = self.fold_generator.get_fold_selectors(
                    outer_fold=outer_fold['train'],
                    inner_fold=ifold['valid'],
                    base_selectors=self.base_selectors)
            else:
                valid_selectors = None

            # reset weights, so that every fold starts from the same state
            self.pretrain_model.set_parameter_values(self.init_param_values)

            trained_model_param_values, best_error_valid = self.pretrain(
                self.pretrain_model, self.hyper_params,
                self.full_hdf5, self.full_meta,
                train_selectors, valid_selectors)

            fold_param_values.append(trained_model_param_values)
            fold_errors.append(best_error_valid)

            if 'only_1_inner_fold' in self.hyper_params and self.hyper_params['only_1_inner_fold']:
                print('Stop after 1 inner fold requested (only_1_inner_fold=True).')
                break

        fold_errors = np.asarray(fold_errors).squeeze()
        print('fold errors: {}'.format(fold_errors))

        # store trained network parameters for later analysis
        fs_util.save(fold_params_filename, fold_param_values)
        print('parameters saved to {}'.format(fold_params_filename))

    # build encoder
    encoder = self.encoder_pipeline_factory.set_pipeline_parameters(
        self.encoder_model, fold_param_values)

    # transform dataset (re-using data_dict and working with indices as input)
    encoder_fn = get_function(encoder, allow_input_downcast=True)

    return encoder_fn
def __init__(
        self,
        path,
        name='',  # optional name

        # selectors
        subjects='all',  # optional selector (list) or 'all'
        trial_types='all',  # optional selector (list) or 'all'
        trial_numbers='all',  # optional selector (list) or 'all'
        conditions='all',  # optional selector (list) or 'all'
        partitioner=None,
        # NOTE(review): this default instance is created once, when the
        # function is defined, and is shared by every call that omits it
        channel_filter=NoChannelFilter(),  # optional channel filter, default: keep all
        channel_names=None,  # optional channel names (for metadata)
        label_map=None,  # optional conversion of labels
        remove_dc_offset=False,  # optional subtraction of channel mean, usually done already earlier
        resample=None,  # optional down-sampling

        # optional sub-sequences selection
        start_sample=0,
        stop_sample=None,  # optional for selection of sub-sequences

        # optional signal filter to be applied before splitting the signal
        signal_filter=None,

        # windowing parameters
        frame_size=-1,
        hop_size=-1,  # values > 0 will lead to windowing
        hop_fraction=None,  # alternative to specifying absolute hop_size

        # (the spectrum parameters n_fft, n_freq_bins, spectrum_log_amplitude,
        #  spectrum_normalization_mode, include_phase as well as layout and
        #  save_matrix_path are disabled in this class)
        flatten_channels=False,
        keep_metadata=False,
        target_mode='label',
):
    """
    Load the selected data files and build a (features, targets) sequence dataset.

    The files listed in the metadata DB below ``path`` are filtered by the four
    selectors.  Every retained file is processed one channel at a time
    (optional mean subtraction, resampling, signal filtering, sub-sequence
    selection, amplitude normalization, optional framing); the results are
    stacked and handed to the superclass constructor together with the targets.

    :param path: directory that is passed to ``load_datafiles_metadata`` and
        that the data file names are joined with before loading
    :param name: optional name; stored in ``self.name`` and passed to the partitioner
    :param subjects: list of subjects to keep, or 'all'
    :param trial_types: list of trial types to keep, or 'all'
    :param trial_numbers: list of trial numbers to keep, or 'all'
    :param conditions: list of conditions to keep, or 'all'
    :param partitioner: optional object whose ``get_partition(name, metadb)``
        result replaces the list of data files
    :param channel_filter: object whose ``keep_channel(channel)`` decides
        whether a channel index is processed
    :param channel_names: optional channel names, indexed by channel (for metadata)
    :param label_map: optional mapping applied to the ``'label'`` metadata entry
    :param remove_dc_offset: if true, subtract the channel mean
    :param resample: optional pair; resampling is done when its two entries differ
    :param start_sample: start of the sub-sequence slice (after resampling/filtering)
    :param stop_sample: end of the sub-sequence slice (after resampling/filtering)
    :param signal_filter: optional object whose ``process(samples)`` result
        replaces the samples
    :param frame_size: frame length; framing happens only if this and
        ``hop_size`` are both > 0
    :param hop_size: hop length between frames
    :param hop_fraction: if given while ``hop_size`` is -1 and ``frame_size`` > 0,
        ``hop_size`` is derived as ``ceil(frame_size / hop_fraction)``
    :param flatten_channels: if true, each channel is added as its own
        sequence block instead of stacking all channels of a file
    :param keep_metadata: if true, one metadata dict per added block is
        appended to ``self.metadata``
    :param target_mode: 'label' uses the file label (repeated ``frame_size``
        times) as target; 'next' enlarges ``frame_size`` by one and uses the
        last value of every frame as target
    """

    # save params (everything that is local at this point, i.e. the arguments)
    self.params = locals().copy()
    del self.params['self']
    # print self.params

    # TODO: get the whole filtering into an extra class

    datafiles_metadata, metadb = load_datafiles_metadata(path)

    # print datafiles_metadata

    def apply_filters(filters, node):
        # Recursively walk the nested dicts, one filter per nesting level, and
        # collect the contents of all leaves whose keys pass the filters.
        if isinstance(node, dict):
            filtered = []
            keepkeys = filters[0]
            for key, value in node.items():
                if keepkeys == 'all' or key in keepkeys:
                    filtered.extend(apply_filters(filters[1:], value))
            return filtered
        else:
            return node  # [node]

    # keep only files that match the metadata filters
    self.datafiles = apply_filters(
        [subjects, trial_types, trial_numbers, conditions],
        datafiles_metadata)

    # copy metadata for retained files
    self.metadb = {}
    for datafile in self.datafiles:
        self.metadb[datafile] = metadb[datafile]

    # print self.datafiles
    # print self.metadb

    self.name = name

    if partitioner is not None:
        self.datafiles = partitioner.get_partition(self.name, self.metadb)

    # self.include_phase = include_phase
    # self.spectrum_normalization_mode = spectrum_normalization_mode
    # self.spectrum_log_amplitude = spectrum_log_amplitude

    self.sequence_partitions = []  # used to keep track of original sequences

    # metadata: [subject, trial_no, stimulus, channel, start, ]
    self.metadata = []

    sequences = []
    labels = []
    targets = []
    n_sequences = 0

    print(hop_size)
    if frame_size > 0 and hop_size == -1 and hop_fraction is not None:
        # NOTE(review): np.ceil returns a float, and with integer arguments
        # the division is already floored under Python 2 -- confirm that
        # compute_frames accepts this value as hop_length
        hop_size = np.ceil(frame_size / hop_fraction)
    print(hop_size)

    if target_mode == 'next':
        # get 1 more value per frame as target
        frame_size += 1
    # print 'frame size: {}'.format(frame_size)

    for i in xrange(len(self.datafiles)):
        with log_timing(log, 'loading data from {}'.format(self.datafiles[i])):

            # save start of next sequence
            self.sequence_partitions.append(n_sequences)

            data, metadata = load(os.path.join(path, self.datafiles[i]))
            # data, metadata = self.generate_test_data()

            label = metadata['label']
            if label_map is not None:
                label = label_map[label]

            multi_channel_frames = []
            multi_channel_targets = []

            # process 1 channel at a time
            for channel in xrange(data.shape[1]):
                # filter channels
                if not channel_filter.keep_channel(channel):
                    continue

                samples = data[:, channel]
                # print samples

                # subtract channel mean
                # FIXME
                # NOTE(review): samples is a slice of data, so this in-place
                # subtraction presumably modifies data as well -- confirm
                if remove_dc_offset:
                    samples -= samples.mean()

                # down-sample if requested
                if resample is not None and resample[0] != resample[1]:
                    samples = librosa.resample(samples, resample[0], resample[1])

                # apply optional signal filter after down-sampling -> requires lower order
                if signal_filter is not None:
                    samples = signal_filter.process(samples)

                # get sub-sequence in resampled space
                # log.info('using samples {}..{} of {}'.format(start_sample,stop_sample, samples.shape))
                samples = samples[start_sample:stop_sample]
                # print start_sample, stop_sample, samples.shape

                # NOTE: the frequency spectrum branch (STFT, optional phase,
                # log amplitude and spectrum normalization, selected by
                # n_fft > 0) is disabled in this class; only the raw waveform
                # branch below is executed.

                ### raw waveform branch ###

                # normalize to max amplitude 1
                s = librosa.util.normalize(samples)

                # add 2nd data dimension
                # s = s.reshape(s.shape[0], 1)
                # print s.shape

                ### end of raw waveform branch ###

                s = np.asfarray(s, dtype='float32')

                if frame_size > 0 and hop_size > 0:
                    # print 'frame size: {}'.format(frame_size)
                    s = s.copy()  # FIXME: THIS IS NECESSARY - OTHERWISE, THE FOLLOWING OP DOES NOT WORK!!!!
                    frames = compute_frames(s, frame_length=frame_size, hop_length=hop_size)
                    # frames = librosa.util.frame(s, frame_length=frame_size, hop_length=hop_size)
                else:
                    # no windowing: the whole (sub-)sequence is used as is
                    frames = s
                del s
                # print frames.shape

                if target_mode == 'next':
                    # split every frame into input (all but the last value)
                    # and target (the last value)
                    frame_targets = np.empty(len(frames))
                    tmp = []
                    for f, frame in enumerate(frames):
                        tmp.append(frame[:-1])
                        frame_targets[f] = frame[-1]
                    frames = np.asarray(tmp)

                    # print frames.shape
                    # for f, frm in enumerate(frames):
                    #     print frm, frame_targets[f]

                # # FIXME: OK so far

                if flatten_channels:
                    # add artificial channel dimension
                    # NOTE(review): this reads frames.shape[2]; with the
                    # "add 2nd data dimension" reshape commented out above,
                    # frames may not have a third axis here -- confirm
                    frames = frames.reshape(
                        (frames.shape[0], frames.shape[1], frames.shape[2], 1))
                    # print frames.shape

                    sequences.append(frames)

                    # increment counter by new number of frames
                    n_sequences += frames.shape[0]

                    if keep_metadata:
                        # determine channel name
                        channel_name = None
                        if channel_names is not None:
                            channel_name = channel_names[channel]
                        elif 'channels' in metadata:
                            channel_name = metadata['channels'][channel]

                        self.metadata.append({
                            'subject': metadata['subject'],  # subject
                            'trial_type': metadata['trial_type'],  # trial_type
                            'trial_no': metadata['trial_no'],  # trial_no
                            'condition': metadata['condition'],  # condition
                            'channel': channel,  # channel
                            'channel_name': channel_name,
                            'start': self.sequence_partitions[-1],  # start
                            'stop': n_sequences  # stop
                        })

                    # one label per frame
                    for _ in xrange(frames.shape[0]):
                        labels.append(label)

                    if target_mode == 'next':
                        # NOTE(review): the loop variable shadows the builtin next()
                        for next in frame_targets:
                            targets.append(next)
                else:
                    # collect the channels of this file; they are stacked
                    # after the channel loop
                    multi_channel_frames.append(frames)
                    if target_mode == 'next':
                        multi_channel_targets.append(frame_targets)

            ### end of channel iteration ###

            # print np.asarray(multi_channel_frames, dtype=np.int)
            # # FIXME: OK so far

            if not flatten_channels:
                # turn list into array
                multi_channel_frames = np.asfarray(multi_channel_frames, dtype='float32')
                # [channels x frames x time x freq] -> cb01
                # [channels x frames x time x 1] -> cb0.

                # move channel dimension to end
                multi_channel_frames = np.rollaxis(
                    multi_channel_frames, 0, len(multi_channel_frames.shape))
                # print multi_channel_frames.shape
                log.info(multi_channel_frames.shape)

                sequences.append(multi_channel_frames)

                # increment counter by new number of frames
                n_sequences += multi_channel_frames.shape[0]

                if keep_metadata:
                    self.metadata.append({
                        'subject': metadata['subject'],  # subject
                        'trial_type': metadata['trial_type'],  # trial_type
                        'trial_no': metadata['trial_no'],  # trial_no
                        'condition': metadata['condition'],  # condition
                        'channel': 'all',  # channel
                        'start': self.sequence_partitions[-1],  # start
                        'stop': n_sequences  # stop
                    })

                # one label per frame
                for _ in xrange(multi_channel_frames.shape[0]):
                    labels.append(label)

                if target_mode == 'next':
                    multi_channel_targets = np.asfarray(
                        multi_channel_targets, dtype='float32')
                    # transpose so that the channel axis comes last
                    targets.append(multi_channel_targets.T)

    ### end of datafile iteration ###

    # print sequences[0].shape
    # print np.asarray(sequences[0], dtype=np.int)
    # # FIXME: looks OK

    # turn into numpy arrays
    sequences = np.vstack(sequences)
    # sequences = np.asarray(sequences).squeeze()
    # sequences = sequences.reshape(sequences.shape[0]*sequences.shape[1], sequences.shape[2])
    print('sequences: {}'.format(sequences.shape))

    labels = np.hstack(labels)
    self.labels = labels
    print('labels: {}'.format(labels.shape))

    if target_mode == 'label':
        targets = labels.copy()

        ## copy targets to fit SequenceDataSpace(VectorSpace) structure (*, frame_size, 12)
        # targets = targets.reshape((targets.shape[0], 1))
        # targets = np.repeat(targets, frame_size, axis=1)
        # print targets.shape
        # one_hot_formatter = OneHotFormatter(max(targets.max() + 1, len(label_map)), dtype=np.int)
        # one_hot_y = one_hot_formatter.format(targets)
        # print one_hot_y.shape

        ## copy targets to fit SequenceDataSpace(IndexSpace) structure -> (*, frame_size, 1)
        # NOTE(review): frame_size is used as repeat count, so this path
        # presumably requires frame_size > 0 (the default is -1) -- confirm
        targets = targets.reshape((targets.shape[0], 1))
        targets = np.repeat(targets, frame_size, axis=1)
        targets = targets.reshape((targets.shape[0], targets.shape[1], 1))
        print(targets.shape)

    elif target_mode == 'next':
        targets = np.concatenate(targets)
        targets = targets.reshape((targets.shape[0], 1, targets.shape[1]))

    print('targets: {}'.format(targets.shape))

    n_channels = sequences.shape[2]
    print('number of channels: {}'.format(n_channels))

    # if layout == 'ft': # swap axes to (batch, feature, time, channels)
    #     sequences = sequences.swapaxes(1, 2)

    log.debug('final dataset shape: {} (b,0,1,c)'.format(sequences.shape))

    source = ('features', 'targets')
    # space = CompositeSpace([
    #     # VectorSequenceSpace(dim=64),
    #     SequenceSpace(VectorSpace(dim=64)),
    #     VectorSpace(dim=12),
    # ])

    # NOTE(review): space is only assigned for target_mode 'label' and 'next'
    if target_mode == 'label':
        space = CompositeSpace([
            SequenceDataSpace(VectorSpace(dim=n_channels)),
            # SequenceDataSpace(VectorSpace(dim=12)),
            # NOTE(review): the number of labels is hard-coded to 12
            SequenceDataSpace(IndexSpace(dim=1, max_labels=12)),
            # SequenceDataSpace(IndexSpace(dim=512, max_labels=12)),
        ])
    elif target_mode == 'next':
        space = CompositeSpace([
            # does not work with VectorSpacesDataset
            # SequenceSpace(VectorSpace(dim=64)),
            # SequenceSpace(VectorSpace(dim=64))
            SequenceDataSpace(VectorSpace(dim=n_channels)),
            SequenceDataSpace(VectorSpace(dim=n_channels))
            # VectorSpace(dim=n_channels)
        ])

    # source = ('features')
    # space = SequenceSpace(VectorSpace(dim=64))

    print('sequences: {}'.format(sequences.shape))
    print('targets: {}'.format(targets.shape))

    # for i, seq in enumerate(sequences):
    #     print np.asarray(seq, dtype=np.int)
    #     print np.asarray(targets[i], dtype=np.int)
    #     break
    # # FIXME: looks OK

    # SequenceDataSpace(IndexSpace(dim=1, max_labels=self._max_labels)),

    # both branches make the same superclass call
    if target_mode == 'label':
        super(MultiChannelEEGSequencesDataset, self).__init__(
            # data=(sequences, one_hot_y),  # works with vectorspace-target
            data=(sequences, targets),  # works with indexspace-target
            # data=sequences,
            data_specs=(space, source))
    elif target_mode == 'next':
        super(MultiChannelEEGSequencesDataset, self).__init__(
            # data=(sequences, one_hot_y),  # works with vectorspace-target
            data=(sequences, targets),  # works with indexspace-target
            # data=sequences,
            data_specs=(space, source))
def load(filepath):
    """Return whatever ``fs_util.load`` reads from ``filepath``."""
    loaded = fs_util.load(filepath)
    return loaded
def __init__(
        self,
        path,
        name='',  # optional name

        # selectors
        subjects='all',  # optional selector (list) or 'all'
        trial_types='all',  # optional selector (list) or 'all'
        trial_numbers='all',  # optional selector (list) or 'all'
        conditions='all',  # optional selector (list) or 'all'
        partitioner=None,
        # NOTE(review): this default instance is created once, when the
        # function is defined, and is shared by every call that omits it
        channel_filter=NoChannelFilter(),  # optional channel filter, default: keep all
        channel_names=None,  # optional channel names (for metadata)
        label_map=None,  # optional conversion of labels
        remove_dc_offset=False,  # optional subtraction of channel mean, usually done already earlier
        resample=None,  # optional down-sampling

        # optional sub-sequences selection
        start_sample=0,
        stop_sample=None,  # optional for selection of sub-sequences

        # optional signal filter to be applied before splitting the signal
        signal_filter=None,

        # windowing parameters
        frame_size=-1,
        hop_size=-1,  # values > 0 will lead to windowing
        hop_fraction=None,  # alternative to specifying absolute hop_size

        # optional spectrum parameters, n_fft = 0 keeps raw data
        n_fft=0,
        n_freq_bins=None,
        spectrum_log_amplitude=False,
        spectrum_normalization_mode=None,
        include_phase=False,
        flatten_channels=False,
        layout='tf',  # (0,1)-axes layout tf=time x features or ft=features x time
        save_matrix_path=None,
        keep_metadata=False,
):
    """
    Load the selected data files and build the dataset with one-hot labels.

    The files listed in the metadata DB below ``path`` are filtered by the four
    selectors.  Every retained file is processed one channel at a time
    (optional mean subtraction, resampling, signal filtering, sub-sequence
    selection, then either the frequency spectrum branch or the raw waveform
    branch, then optional framing).  The stacked result and the one-hot
    encoded labels are passed to the superclass constructor.

    :param path: directory that is passed to ``load_datafiles_metadata`` and
        that the data file names are joined with before loading
    :param name: optional name; stored in ``self.name`` and passed to the partitioner
    :param subjects: list of subjects to keep, or 'all'
    :param trial_types: list of trial types to keep, or 'all'
    :param trial_numbers: list of trial numbers to keep, or 'all'
    :param conditions: list of conditions to keep, or 'all'
    :param partitioner: optional object whose ``get_partition(name, metadb)``
        result replaces the list of data files
    :param channel_filter: object whose ``keep_channel(channel)`` decides
        whether a channel index is processed
    :param channel_names: optional channel names, indexed by channel (for metadata)
    :param label_map: optional mapping applied to the ``'label'`` metadata entry
    :param remove_dc_offset: if true, subtract the channel mean
    :param resample: optional pair; resampling is done when its two entries differ
    :param start_sample: start of the sub-sequence slice (after resampling/filtering)
    :param stop_sample: end of the sub-sequence slice (after resampling/filtering)
    :param signal_filter: optional object whose ``process(samples)`` result
        replaces the samples
    :param frame_size: frame length; framing happens only if this and
        ``hop_size`` are both > 0
    :param hop_size: hop length between frames
    :param hop_fraction: if given while ``hop_size`` is -1 and ``frame_size`` > 0,
        ``hop_size`` is derived as ``ceil(frame_size / hop_fraction)``
    :param n_fft: values > 0 select the frequency spectrum branch with this
        FFT size; otherwise the raw waveform is used
    :param n_freq_bins: optional number of low frequency bins to keep
    :param spectrum_log_amplitude: if true, apply ``librosa.logamplitude`` to
        the power spectrum
    :param spectrum_normalization_mode: None, 'mean0_std1', 'linear_0_1' or
        'linear_-1_1'
    :param include_phase: if true, stack the phase (divided by pi) below the
        spectrum
    :param flatten_channels: if true, each channel is added as its own block
        instead of stacking all channels of a file
    :param layout: 'ft' swaps axes 1 and 2 of the final array
    :param save_matrix_path: if given, a ``DenseDesignMatrix`` built from the
        same data is saved to this path
    :param keep_metadata: if true, one metadata dict per added block is
        appended to ``self.metadata``
    :raises ValueError: for an unsupported ``spectrum_normalization_mode``
        (only checked in the frequency spectrum branch)
    """

    # save params (everything that is local at this point, i.e. the arguments)
    self.params = locals().copy()
    del self.params['self']
    # print self.params

    # TODO: get the whole filtering into an extra class

    datafiles_metadata, metadb = load_datafiles_metadata(path)

    # print datafiles_metadata

    def apply_filters(filters, node):
        # Recursively walk the nested dicts, one filter per nesting level, and
        # collect the contents of all leaves whose keys pass the filters.
        if isinstance(node, dict):
            filtered = []
            keepkeys = filters[0]
            for key, value in node.items():
                if keepkeys == 'all' or key in keepkeys:
                    filtered.extend(apply_filters(filters[1:], value))
            return filtered
        else:
            return node  # [node]

    # keep only files that match the metadata filters
    self.datafiles = apply_filters(
        [subjects, trial_types, trial_numbers, conditions],
        datafiles_metadata)

    # copy metadata for retained files
    self.metadb = {}
    for datafile in self.datafiles:
        self.metadb[datafile] = metadb[datafile]

    # print self.datafiles
    # print self.metadb

    self.name = name

    if partitioner is not None:
        self.datafiles = partitioner.get_partition(self.name, self.metadb)

    self.include_phase = include_phase
    self.spectrum_normalization_mode = spectrum_normalization_mode
    self.spectrum_log_amplitude = spectrum_log_amplitude

    self.sequence_partitions = []  # used to keep track of original sequences

    # metadata: [subject, trial_no, stimulus, channel, start, ]
    self.metadata = []

    sequences = []
    labels = []
    n_sequences = 0

    if frame_size > 0 and hop_size == -1 and hop_fraction is not None:
        # NOTE(review): np.ceil returns a float, and with integer arguments
        # the division is already floored under Python 2 -- confirm that
        # frame() accepts this value as hop_length
        hop_size = np.ceil(frame_size / hop_fraction)

    for i in xrange(len(self.datafiles)):
        with log_timing(log, 'loading data from {}'.format(self.datafiles[i])):

            # save start of next sequence
            self.sequence_partitions.append(n_sequences)

            data, metadata = load(os.path.join(path, self.datafiles[i]))

            label = metadata['label']
            if label_map is not None:
                label = label_map[label]

            multi_channel_frames = []

            # process 1 channel at a time
            for channel in xrange(data.shape[1]):
                # filter channels
                if not channel_filter.keep_channel(channel):
                    continue

                samples = data[:, channel]

                # subtract channel mean
                # NOTE(review): samples is a slice of data, so this in-place
                # subtraction presumably modifies data as well -- confirm
                if remove_dc_offset:
                    samples -= samples.mean()

                # down-sample if requested
                if resample is not None and resample[0] != resample[1]:
                    samples = librosa.resample(samples, resample[0], resample[1])

                # apply optional signal filter after down-sampling -> requires lower order
                if signal_filter is not None:
                    samples = signal_filter.process(samples)

                # get sub-sequence in resampled space
                # log.info('using samples {}..{} of {}'.format(start_sample,stop_sample, samples.shape))
                samples = samples[start_sample:stop_sample]

                if n_fft is not None and n_fft > 0:  # Optionally:
                    ### frequency spectrum branch ###

                    # transform to spectogram
                    # NOTE(review): `/` is integer division only under
                    # Python 2 with an int n_fft; under Python 3 this yields
                    # a float -- confirm
                    hop_length = n_fft / 4

                    '''
                    from http://theremin.ucsd.edu/~bmcfee/librosadoc/librosa.html
                    >>> # Get a power spectrogram from a waveform y
                    >>> S = np.abs(librosa.stft(y)) ** 2
                    >>> log_S = librosa.logamplitude(S)
                    '''

                    S = librosa.core.stft(samples, n_fft=n_fft, hop_length=hop_length)
                    # mag = np.abs(S)  # magnitude spectrum
                    mag = np.abs(S)**2  # power spectrum

                    # include phase information if requested
                    if self.include_phase:
                        # phase = np.unwrap(np.angle(S))
                        phase = np.angle(S)

                    # Optionally: cut off high bands
                    if n_freq_bins is not None:
                        mag = mag[0:n_freq_bins, :]
                        if self.include_phase:
                            phase = phase[0:n_freq_bins, :]

                    if self.spectrum_log_amplitude:
                        mag = librosa.logamplitude(mag)

                    s = mag  # for normalization

                    '''
                    NOTE on normalization:
                    It depends on the structure of a neural network and (even more)
                    on the properties of data. There is no best normalization algorithm
                    because if there would be one, it would be used everywhere by default...

                    In theory, there is no requirement for the data to be normalized at all.
                    This is a purely practical thing because in practice convergence could
                    take forever if your input is spread out too much. The simplest would be
                    to just normalize it by scaling your data to (-1,1) (or (0,1) depending
                    on activation function), and in most cases it does work. If your
                    algorithm converges well, then this is your answer. If not, there are
                    too many possible problems and methods to outline here without knowing
                    the actual data.
                    '''

                    ## normalize to mean 0, std 1
                    if self.spectrum_normalization_mode == 'mean0_std1':
                        # s = preprocessing.scale(s, axis=0);
                        # mean and std are taken over the whole spectrogram
                        mean = np.mean(s)
                        std = np.std(s)
                        s = (s - mean) / std

                    ## normalize by linear transform to [0,1]
                    elif self.spectrum_normalization_mode == 'linear_0_1':
                        s = s / np.max(s)

                    ## normalize by linear transform to [-1,1]
                    elif self.spectrum_normalization_mode == 'linear_-1_1':
                        s = -1 + 2 * (s - np.min(s)) / (np.max(s) - np.min(s))

                    elif self.spectrum_normalization_mode is not None:
                        raise ValueError(
                            'unsupported spectrum normalization mode {}'.format(
                                self.spectrum_normalization_mode))

                    # print s.mean(axis=0)
                    # print s.std(axis=0)

                    # include phase information if requested
                    if self.include_phase:
                        # normalize phase to [-1,1]
                        phase = phase / np.pi
                        s = np.vstack([s, phase])

                    # transpose to fit pylearn2 layout
                    s = np.transpose(s)
                    # print s.shape

                    ### end of frequency spectrum branch ###
                else:
                    ### raw waveform branch ###

                    # normalize to max amplitude 1
                    s = librosa.util.normalize(samples)

                    # add 2nd data dimension
                    s = s.reshape(s.shape[0], 1)
                    # print s.shape

                    ### end of raw waveform branch ###

                s = np.asfarray(s, dtype='float32')

                if frame_size > 0 and hop_size > 0:
                    s = s.copy()  # FIXME: THIS IS NECESSARY IN MultiChannelEEGSequencesDataset - OTHERWISE, THE FOLLOWING OP DOES NOT WORK!!!!
                    frames = frame(s, frame_length=frame_size, hop_length=hop_size)
                else:
                    # no windowing: the whole (sub-)sequence is used as is
                    frames = s
                del s
                # print frames.shape

                if flatten_channels:
                    # add artificial channel dimension
                    frames = frames.reshape(
                        (frames.shape[0], frames.shape[1], frames.shape[2], 1))
                    # print frames.shape

                    sequences.append(frames)

                    # increment counter by new number of frames
                    n_sequences += frames.shape[0]

                    if keep_metadata:
                        # determine channel name
                        channel_name = None
                        if channel_names is not None:
                            channel_name = channel_names[channel]
                        elif 'channels' in metadata:
                            channel_name = metadata['channels'][channel]

                        self.metadata.append({
                            'subject': metadata['subject'],  # subject
                            'trial_type': metadata['trial_type'],  # trial_type
                            'trial_no': metadata['trial_no'],  # trial_no
                            'condition': metadata['condition'],  # condition
                            'channel': channel,  # channel
                            'channel_name': channel_name,
                            'start': self.sequence_partitions[-1],  # start
                            'stop': n_sequences  # stop
                        })

                    # one label per frame
                    for _ in xrange(frames.shape[0]):
                        labels.append(label)
                else:
                    # collect the channels of this file; they are stacked
                    # after the channel loop
                    multi_channel_frames.append(frames)

            ### end of channel iteration ###

            if not flatten_channels:
                # turn list into array
                multi_channel_frames = np.asfarray(multi_channel_frames, dtype='float32')
                # [channels x frames x time x freq] -> cb01
                # [channels x frames x time x 1] -> cb0.

                # move channel dimension to end
                multi_channel_frames = np.rollaxis(multi_channel_frames, 0, 4)
                # print multi_channel_frames.shape
                # log.debug(multi_channel_frames.shape)

                sequences.append(multi_channel_frames)

                # increment counter by new number of frames
                n_sequences += multi_channel_frames.shape[0]

                if keep_metadata:
                    self.metadata.append({
                        'subject': metadata['subject'],  # subject
                        'trial_type': metadata['trial_type'],  # trial_type
                        'trial_no': metadata['trial_no'],  # trial_no
                        'condition': metadata['condition'],  # condition
                        'channel': 'all',  # channel
                        'start': self.sequence_partitions[-1],  # start
                        'stop': n_sequences  # stop
                    })

                # one label per frame
                for _ in xrange(multi_channel_frames.shape[0]):
                    labels.append(label)

    ### end of datafile iteration ###

    # turn into numpy arrays
    sequences = np.vstack(sequences)
    # print sequences.shape;

    labels = np.hstack(labels)

    # one_hot_y = one_hot(labels)
    # NOTE(review): the number of classes is derived from the largest label
    # that occurs in the loaded data
    one_hot_formatter = OneHotFormatter(labels.max() + 1)  # FIXME!
    one_hot_y = one_hot_formatter.format(labels)

    self.labels = labels

    if layout == 'ft':  # swap axes to (batch, feature, time, channels)
        sequences = sequences.swapaxes(1, 2)

    log.debug('final dataset shape: {} (b,0,1,c)'.format(sequences.shape))
    super(MultiChannelEEGDataset, self).__init__(topo_view=sequences, y=one_hot_y, axes=['b', 0, 1, 'c'])

    log.info(
        'generated dataset "{}" with shape X={}={} y={} labels={} '.format(
            self.name, self.X.shape, sequences.shape, self.y.shape, self.labels.shape))

    if save_matrix_path is not None:
        # store a plain DenseDesignMatrix built from the same data
        matrix = DenseDesignMatrix(topo_view=sequences, y=one_hot_y, axes=['b', 0, 1, 'c'])
        with log_timing(
                log, 'saving DenseDesignMatrix to {}'.format(save_matrix_path)):
            serial.save(save_matrix_path, matrix)
def preprocess(config):
    """Generate and save one dataset file per subject.

    For every subject id in the bad-channel table, the subject directory is
    looked up below the dataset root (glob pattern ``Sub<NNN>*``).  The trials
    are loaded from ``trials.pklz`` in that directory, or produced by
    ``split_trial`` if that file does not exist.  The result of
    ``generate_cases`` is saved as ``dataset_13goodchannels_plus4s.pklz`` in
    the same directory.  Subjects without a matching directory are skipped
    with a warning.

    :param config: configuration object; ``config.eeg.get('dataset_root', './')``
        supplies the dataset root directory
    """
    # config = load_config(default_config='../train_sda.cfg');

    DATA_ROOT = config.eeg.get('dataset_root', './')

    SAMPLE_RATE = 400  # in Hz
    TRIAL_LENGTH = 32  # in sec

    TRIAL_LENGTH += 4  # add 4s after end of presentation

    TRIAL_SAMPLE_LENGTH = SAMPLE_RATE * TRIAL_LENGTH

    log.info('using dataset at {}'.format(DATA_ROOT))

    # Note from Dan:
    # All subjects should have channels 15, 16, 17 and 18 removed [...]
    # If you want to make them truly identical, you could remove channel 19 from
    # the subjects with more channels, although this should be 'good' data.
    bad_channels = {
        1: [5, 6, 15, 16, 17, 18, 20, 21],
        2: [7, 8, 15, 16, 17, 18, 20, 21],
        3: [5, 6, 15, 16, 17, 18, 20, 21],
        4: [7, 8, 15, 16, 17, 18, 20, 21],
        5: [7, 8, 15, 16, 17, 18, 20, 21],
        6: [7, 8, 9, 12, 15, 16, 17, 18],
        7: [5, 6, 12, 15, 16, 17, 18, 20],
        8: [7, 8, 15, 16, 17, 18, 20, 21],
        9: [5, 6, 12, 15, 16, 17, 18, 20],
        10: [5, 6, 15, 16, 17, 18, 20, 21],
        11: [5, 6, 15, 16, 17, 18, 20, 21],
        12: [5, 6, 15, 16, 17, 18, 20, 21],
        13: [5, 6, 12, 15, 16, 17, 18, 20],
    }

    with log_timing(log, 'generating datasets'):
        # iterate the table keys (1..13) so that the loop cannot get out of
        # sync with the table; this also works on Python 2 and Python 3
        for subject_id in sorted(bad_channels):
            search_path = os.path.join(DATA_ROOT, 'Sub{0:03d}*'.format(subject_id))

            # glob.glob always returns a list; empty means no match
            matches = glob.glob(search_path)
            if not matches:
                log.warning('nothing found at {}'.format(search_path))
                continue
            # as before, the first match in glob order is used
            path = matches[0]

            trials_filename = os.path.join(path, 'trials.pklz')

            if os.path.isfile(trials_filename):
                with log_timing(log, 'loading data from {}'.format(trials_filename)):
                    trials = load(trials_filename)
            else:
                log.debug('{} not found. running split_trial()'.format(trials_filename))
                trials = split_trial(path, TRIAL_SAMPLE_LENGTH)

            assert trials, 'no trials available for subject {}'.format(subject_id)

            dataset_filename = os.path.join(path, 'dataset_13goodchannels_plus4s.pklz')
            dataset = generate_cases(subject_id, trials, bad_channels[subject_id])  # = data, labels
            with log_timing(log, 'saving dataset to {}'.format(dataset_filename)):
                save(dataset_filename, dataset)
def preprocess(config):
    """Generate and save one dataset file per subject.

    For every subject id in the bad-channel table, the subject directory is
    looked up below the dataset root (glob pattern ``Sub<NNN>*``).  The trials
    are loaded from ``trials.pklz`` in that directory, or produced by
    ``split_trial`` if that file does not exist.  The result of
    ``generate_cases`` is saved as ``dataset_13goodchannels_plus4s.pklz`` in
    the same directory.  Subjects without a matching directory are skipped
    with a warning.

    :param config: configuration object; ``config.eeg.get('dataset_root', './')``
        supplies the dataset root directory
    """
    # config = load_config(default_config='../train_sda.cfg');

    DATA_ROOT = config.eeg.get('dataset_root', './')

    SAMPLE_RATE = 400  # in Hz
    TRIAL_LENGTH = 32  # in sec

    TRIAL_LENGTH += 4  # add 4s after end of presentation

    TRIAL_SAMPLE_LENGTH = SAMPLE_RATE * TRIAL_LENGTH

    log.info('using dataset at {}'.format(DATA_ROOT))

    # Note from Dan:
    # All subjects should have channels 15, 16, 17 and 18 removed [...]
    # If you want to make them truly identical, you could remove channel 19 from
    # the subjects with more channels, although this should be 'good' data.
    bad_channels = {
        1: [5, 6, 15, 16, 17, 18, 20, 21],
        2: [7, 8, 15, 16, 17, 18, 20, 21],
        3: [5, 6, 15, 16, 17, 18, 20, 21],
        4: [7, 8, 15, 16, 17, 18, 20, 21],
        5: [7, 8, 15, 16, 17, 18, 20, 21],
        6: [7, 8, 9, 12, 15, 16, 17, 18],
        7: [5, 6, 12, 15, 16, 17, 18, 20],
        8: [7, 8, 15, 16, 17, 18, 20, 21],
        9: [5, 6, 12, 15, 16, 17, 18, 20],
        10: [5, 6, 15, 16, 17, 18, 20, 21],
        11: [5, 6, 15, 16, 17, 18, 20, 21],
        12: [5, 6, 15, 16, 17, 18, 20, 21],
        13: [5, 6, 12, 15, 16, 17, 18, 20],
    }

    with log_timing(log, 'generating datasets'):
        # iterate the table keys (1..13) so that the loop cannot get out of
        # sync with the table; this also works on Python 2 and Python 3
        for subject_id in sorted(bad_channels):
            search_path = os.path.join(DATA_ROOT, 'Sub{0:03d}*'.format(subject_id))

            # glob.glob always returns a list; empty means no match
            matches = glob.glob(search_path)
            if not matches:
                log.warning('nothing found at {}'.format(search_path))
                continue
            # as before, the first match in glob order is used
            path = matches[0]

            trials_filename = os.path.join(path, 'trials.pklz')

            if os.path.isfile(trials_filename):
                with log_timing(log, 'loading data from {}'.format(trials_filename)):
                    trials = load(trials_filename)
            else:
                log.debug('{} not found. running split_trial()'.format(trials_filename))
                trials = split_trial(path, TRIAL_SAMPLE_LENGTH)

            assert trials, 'no trials available for subject {}'.format(subject_id)

            dataset_filename = os.path.join(path, 'dataset_13goodchannels_plus4s.pklz')
            dataset = generate_cases(subject_id, trials, bad_channels[subject_id])  # = data, labels
            with log_timing(log, 'saving dataset to {}'.format(dataset_filename)):
                save(dataset_filename, dataset)
def __init__(self,
             path, suffix='',  # required data file parameters
             subjects='all',  # optional selector (list) or 'all'
             start_sample=0,
             stop_sample=None,  # optional for selection of sub-sequences
             frame_size=-1,
             hop_size=-1,  # values > 0 will lead to windowing
             label_mode='tempo',
             name='',  # optional name
             n_fft=0,
             n_freq_bins=None,
             save_matrix_path=None,
             channels=None,
             resample=None,
             stimulus_id_filter=None,
             keep_metadata=False,
             spectrum_log_amplitude=False,
             spectrum_normalization_mode=None,
             include_phase=False,
             layout='tf'  # 2D axes layout tf=time x features or ft=features x time
             ):
    '''
    Build the dataset from the per-subject ``dataset<suffix>.pklz`` files.

    :param path: directory that is searched for ``Sub*`` sub-directories.
    :param suffix: inserted into the data file name
        ``'dataset' + suffix + '.pklz'``.
    :param subjects: ``'all'`` or a collection of indices into the sorted
        list of data files that were found.
    :param start_sample: start of the slice applied to every sequence.
    :param stop_sample: end of the slice applied to every sequence
        (``None`` = until the end).
    :param frame_size: window length; windowing through
        ``self._split_sequence`` is only applied if both ``frame_size``
        and ``hop_size`` are > 0.
    :param hop_size: window hop, see ``frame_size``.
    :param label_mode: passed to ``LabelConverter.get_label``.
    :param name: dataset name (used in log output).
    :param n_fft: if > 0, each sequence is replaced by its STFT power
        spectrum computed with a hop length of ``n_fft // 4``.
    :param n_freq_bins: optional number of (lowest) frequency bins to keep.
    :param save_matrix_path: if not ``None``, a ``DenseDesignMatrix`` of
        the final data is saved there.
    :param channels: optional collection of channel ids to keep; it is
        compared against ``channel_meta[j][0]`` of each sequence.
    :param resample: optional pair passed positionally to
        ``librosa.resample`` (presumably original and target sample
        rate -- TODO confirm); ignored if both entries are equal.
    :param stimulus_id_filter: raw labels to skip (``None`` = skip none).
    :param keep_metadata: if true, a metadata dict per processed sequence
        is appended to ``self.metadata``.
    :param spectrum_log_amplitude: apply ``librosa.logamplitude`` to the
        power spectrum.
    :param spectrum_normalization_mode: ``None``, ``'mean0_std1'``,
        ``'linear_0_1'`` or ``'linear_-1_1'``.
    :param include_phase: stack the phase (divided by pi) below the
        spectrum values.
    :param layout: ``'ft'`` swaps axes 1 and 2 of the final spectrogram
        data; any other value leaves them as they are.
    :raises ValueError: for an unsupported ``spectrum_normalization_mode``.
    :raises AssertionError: if ``subjects`` is ``None`` or empty.
    '''

    # save params (must happen before any further local is defined)
    self.params = locals().copy()
    del self.params['self']

    self.name = name

    self.include_phase = include_phase
    self.spectrum_normalization_mode = spectrum_normalization_mode
    self.spectrum_log_amplitude = spectrum_log_amplitude

    # collect the data files of all subject directories
    self.datafiles = []
    subject_paths = glob.glob(os.path.join(path, 'Sub*'))
    # use a dedicated loop variable instead of shadowing the 'path' argument
    for subject_path in subject_paths:
        dataset_filename = os.path.join(
            subject_path, 'dataset' + suffix + '.pklz')
        if os.path.isfile(dataset_filename):
            log.debug('addding {}'.format(dataset_filename))
            self.datafiles.append(dataset_filename)
        else:
            log.warning('file does not exists {}'.format(dataset_filename))
    self.datafiles.sort()

    if subjects == 'all':
        subjects = np.arange(0, len(self.datafiles))
    assert subjects is not None and len(subjects) > 0

    self.label_mode = label_mode
    self.label_converter = LabelConverter()

    if stimulus_id_filter is None:
        stimulus_id_filter = []
    self.stimulus_id_filter = stimulus_id_filter

    self.subject_partitions = []  # used to keep track of original subjects
    self.sequence_partitions = []  # used to keep track of original sequences
    self.trial_partitions = []  # keeps track of original trials

    # metadata: [subject, trial_no, stimulus, channel, start, ]
    self.metadata = []

    sequences = []
    labels = []
    n_sequences = 0
    for i in xrange(len(self.datafiles)):
        if i not in subjects:
            continue

        with log_timing(log,
                        'loading data from {}'.format(self.datafiles[i])):
            # save start of next subject
            self.subject_partitions.append(n_sequences)

            subject_sequences, subject_labels, channel_meta = load(
                self.datafiles[i])

            subject_trial_no = -1
            # Reset per subject: otherwise a subject whose first raw label
            # equals the previous subject's last one would not start a new
            # trial and its first trial would get trial_no -1.
            last_raw_label = -1

            for j in xrange(len(subject_sequences)):
                l = subject_labels[j]  # get raw label

                if l in stimulus_id_filter:
                    continue

                c = channel_meta[j][0]

                # apply optional channel filter
                if channels is not None and c not in channels:
                    log.debug('skipping channel {}'.format(c))
                    continue

                # save start of next sequence
                self.sequence_partitions.append(n_sequences)

                if l != last_raw_label:  # if raw label changed...
                    # ...save start of next trial
                    self.trial_partitions.append(n_sequences)
                    subject_trial_no += 1  # increment subject_trial_no counter
                    last_raw_label = l

                # convert to label_mode view
                l = self.label_converter.get_label(l[0], self.label_mode)

                s = subject_sequences[j]
                # get sub-sequence in original space
                s = s[start_sample:stop_sample]

                # down-sample if requested
                if resample is not None and resample[0] != resample[1]:
                    s = librosa.resample(s, resample[0], resample[1])

                if n_fft is not None and n_fft > 0:
                    # Optionally: transform to spectrogram.
                    # Floor division keeps the hop length an integer.
                    hop_length = n_fft // 4

                    # from http://theremin.ucsd.edu/~bmcfee/librosadoc/librosa.html
                    # >>> # Get a power spectrogram from a waveform y
                    # >>> S = np.abs(librosa.stft(y)) ** 2
                    # >>> log_S = librosa.logamplitude(S)
                    S = librosa.core.stft(
                        s, n_fft=n_fft, hop_length=hop_length)
                    mag = np.abs(S) ** 2  # power spectrum
                    phase = np.angle(S)

                    if n_freq_bins is not None:
                        # Optionally: cut off high bands
                        mag = mag[0:n_freq_bins, :]
                        phase = phase[0:n_freq_bins, :]

                    if self.spectrum_log_amplitude:
                        mag = librosa.logamplitude(mag)

                    s = mag  # for normalization

                    # NOTE on normalization:
                    # It depends on the structure of a neural network and
                    # (even more) on the properties of data. There is no
                    # best normalization algorithm because if there would
                    # be one, it would be used everywhere by default...
                    #
                    # In theory, there is no requirement for the data to be
                    # normalized at all. This is a purely practical thing
                    # because in practice convergence could take forever if
                    # your input is spread out too much. The simplest would
                    # be to just normalize it by scaling your data to
                    # (-1,1) (or (0,1) depending on activation function),
                    # and in most cases it does work. If your algorithm
                    # converges well, then this is your answer. If not,
                    # there are too many possible problems and methods to
                    # outline here without knowing the actual data.

                    if self.spectrum_normalization_mode == 'mean0_std1':
                        # normalize to mean 0, std 1
                        mean = np.mean(s)
                        std = np.std(s)
                        s = (s - mean) / std
                    elif self.spectrum_normalization_mode == 'linear_0_1':
                        # normalize by linear transform to [0,1]
                        # NOTE(review): this only divides by the maximum and
                        # does not shift by the minimum -- confirm that the
                        # input is non-negative when this mode is used.
                        s = s / np.max(s)
                    elif self.spectrum_normalization_mode == 'linear_-1_1':
                        # normalize by linear transform to [-1,1]
                        s = -1 + 2 * (s - np.min(s)) / (np.max(s) - np.min(s))
                    elif self.spectrum_normalization_mode is not None:
                        raise ValueError(
                            'unsupported spectrum normalization mode {}'.format(
                                self.spectrum_normalization_mode)
                        )

                    # include phase information if requested
                    if self.include_phase:
                        # normalize phase to [-1,1]
                        phase = phase / np.pi
                        s = np.vstack([s, phase])

                    # transpose to fit pylearn2 layout
                    s = np.transpose(s)
                else:
                    # normalize to max amplitude 1
                    s = librosa.util.normalize(s)

                # np.asarray with an explicit float dtype is equivalent to
                # np.asfarray, which newer NumPy releases no longer provide
                s = np.asarray(s, dtype='float32')

                if frame_size > 0 and hop_size > 0:
                    s, l = self._split_sequence(s, l, frame_size, hop_size)

                n_sequences += len(s)

                sequences.append(s)
                # NOTE(review): extend() needs an iterable of labels; this
                # presumably relies on _split_sequence having been applied
                # -- confirm for the non-windowed case.
                labels.extend(l)

                if keep_metadata:
                    self.metadata.append({
                        'subject': i,  # subject
                        'trial_no': subject_trial_no,  # trial_no
                        'stimulus': last_raw_label[0],  # stimulus
                        'channel': c,  # channel
                        'start': self.sequence_partitions[-1],  # start
                        'stop': n_sequences  # stop
                    })

    # turn into numpy arrays
    sequences = np.vstack(sequences)
    print(sequences.shape)

    labels = np.hstack(labels)

    one_hot_formatter = OneHotFormatter(labels.max() + 1)
    one_hot_y = one_hot_formatter.format(labels)

    self.labels = labels  # save for later

    # same guard as for the spectrogram computation above
    if n_fft is not None and n_fft > 0:
        sequences = np.array([sequences])

        # re-arrange dimensions
        sequences = sequences.swapaxes(0, 1).swapaxes(1, 2).swapaxes(2, 3)

        if layout == 'ft':
            sequences = sequences.swapaxes(1, 2)

        log.debug('final dataset shape: {} (b,0,1,c)'.format(sequences.shape))
        print('final dataset shape: {} (b,0,1,c)'.format(sequences.shape))
        super(EEGDataset, self).__init__(
            topo_view=sequences, y=one_hot_y, axes=['b', 0, 1, 'c'])
    else:
        super(EEGDataset, self).__init__(
            X=sequences, y=one_hot_y, axes=['b', 0, 1, 'c'])

    log.debug('generated dataset "{}" with shape X={} y={} labels={} '.format(
        self.name, self.X.shape, self.y.shape, self.labels.shape))

    if save_matrix_path is not None:
        matrix = DenseDesignMatrix(X=sequences, y=one_hot_y)
        with log_timing(
                log,
                'saving DenseDesignMatrix to {}'.format(save_matrix_path)):
            serial.save(save_matrix_path, matrix)