def read(filename):
    data = h5py.File(filename, 'r')
    obj = {}
    for key in data.keys():
        value = data[key]
        if key == METADATA_TAG:
            for metakey in value.attrs.keys():
                obj[metakey] = value.attrs[metakey]
        elif not key.startswith('__list'):
            obj[key] = value[:]

    list_keys = [key for key in data.keys() if key.startswith('__list')]
    if len(list_keys) > 0:
        list_keys.sort()
        for key in list_keys:
            match = list_regex.match(key)
            assert match is not None
            list_key = match.group(1)
            list_index = int(match.group(2))
            out_list = obj.setdefault(list_key, [])
            assert len(out_list) == list_index
            out_list.append(data[key][:])

    data.close()
    return jsdict(obj)
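# jsdict is used throughout this module but is not defined in this section. Below is
# a minimal sketch of what its usage here implies (a dict whose entries are also
# readable as attributes, e.g. data.X_train as well as data['X_train']); the
# project's real implementation may differ.
class jsdict(dict):
    def __init__(self, data=None):
        super(jsdict, self).__init__(data or {})
        self.__dict__ = self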
def memoize(fn, paths):
    cwd = os.getcwd()

    def change_to_target_dir():
        for dir in paths[:-1]:
            try:
                os.mkdir(dir)
            except OSError:
                pass
            os.chdir(dir)

    change_to_target_dir()
    filename = paths[-1]
    if os.path.exists(filename):
        data = hdf5.read(filename)
        os.chdir(cwd)
        return data

    os.chdir(cwd)
    data = fn()

    change_to_target_dir()
    tmp = '%s.pid.%d.tmp' % (filename, os.getpid())
    hdf5.write(tmp, data)
    os.rename(tmp, filename)
    os.chdir(cwd)

    return jsdict(data)
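# Illustrative use of memoize (the directory and file names below are hypothetical,
# and compute_scores stands in for any expensive function). The leading path
# components are directories, created on demand and chdir'd into; the last component
# is the cache filename. The first call runs fn() and writes the result via
# hdf5.write; later calls find the file and read it back instead of recomputing.
#
#   def compute_scores():
#       return {'scores': np.arange(10.0)}
#
#   scores = memoize(compute_scores, ['data-cache', 'cv-scores', 'Dog_1_scores.hdf5'])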
def process():
    data = load_training_data(settings, target, pipeline, strategy=strategy,
                              cv_fold_number=fold, check_only=False, quiet=quiet)

    if feature_mask is not None:
        s = [slice(None)] * data.X_train.ndim
        s[-1] = np.where(np.array(feature_mask) == True)[0]
        data['X_train'] = data.X_train[s]
        data['X_cv'] = data.X_cv[s]
        if not quiet:
            print ' feature mask', 'X_train', data.X_train.shape, 'y_train', data.y_train.shape, 'X_cv', data.X_cv.shape, 'y_cv', data.y_cv.shape

    train(classifier, data, quiet=quiet)

    if not quiet:
        print "Making predictions...",
    timer = time.Timer()
    mean_predictions, median_predictions, raw_predictions = make_predictions(
        classifier, data.X_cv, data.num_cv_segments)
    if not quiet:
        print timer.pretty_str()

    mean_score = roc_auc_score(data.y_cv, mean_predictions)
    median_score = roc_auc_score(data.y_cv, median_predictions)

    return jsdict({
        'mean_score': mean_score,
        'median_score': median_score,
        'mean_predictions': mean_predictions,
        'median_predictions': median_predictions,
        'y_cv': data.y_cv
    })
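# Sketch of the mean/median aggregation implied by make_predictions above (the
# helper below is hypothetical, not the project's implementation). Per-window
# classifier probabilities are grouped into num_segments equal chunks and reduced
# to one score per segment, which is what roc_auc_score is applied to. Assumes the
# same number of windows per segment.
import numpy as np

def aggregate_predictions(window_probs, num_segments):
    per_segment = np.split(np.asarray(window_probs, dtype=float), num_segments)
    mean_predictions = np.array([chunk.mean() for chunk in per_segment])
    median_predictions = np.array([np.median(chunk) for chunk in per_segment])
    return mean_predictions, median_predictions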
def make_fold(preictal_X_train, preictal_X_cv, interictal_X_train, interictal_X_cv):
    num_train_segments = preictal_X_train.shape[0] + interictal_X_train.shape[0]
    num_cv_segments = preictal_X_cv.shape[0] + interictal_X_cv.shape[0]
    assert (num_train_segments + num_cv_segments) == total_segments

    flattened_preictal_X_train = flatten(preictal_X_train)
    flattened_interictal_X_train = flatten(interictal_X_train)
    flattened_preictal_X_cv = flatten(preictal_X_cv) if cv else np.empty((0,))
    flattened_interictal_X_cv = flatten(interictal_X_cv) if cv else np.empty((0,))

    X_train = np.concatenate((flattened_preictal_X_train, flattened_interictal_X_train), axis=0)
    X_cv = np.concatenate((flattened_preictal_X_cv, flattened_interictal_X_cv), axis=0)

    preictal_y_train = np.ones((flattened_preictal_X_train.shape[0],))
    preictal_y_cv = np.ones((preictal_X_cv.shape[0],))
    interictal_y_train = np.zeros((flattened_interictal_X_train.shape[0],))
    interictal_y_cv = np.zeros((interictal_X_cv.shape[0],))

    y_train = np.concatenate((preictal_y_train, interictal_y_train), axis=0)
    y_cv = np.concatenate((preictal_y_cv, interictal_y_cv), axis=0)

    X_train, y_train = sklearn.utils.shuffle(X_train, y_train, random_state=0)

    return jsdict({
        'X_train': X_train,
        'y_train': y_train,
        'X_cv': X_cv,
        'y_cv': y_cv,
        'num_train_segments': num_train_segments,
        'num_cv_segments': num_cv_segments
    })
def load_test_data(settings, target, pipeline, quiet=False):
    test, meta = load_pipeline_data(settings, target, 'test', pipeline, check_only=False, quiet=quiet)
    X_test = flatten(test)

    if not quiet:
        print 'X_test', test.shape, 'num_segments', meta.num_segments

    return jsdict({
        'X_test': X_test,
        'num_segments': meta.num_segments
    })
def process_data_sub_job(filename_in, filename_out_fmt, id, num_jobs, process_data_fn):
    if not os.path.exists(filename_in):
        return 0

    pid = os.getpid()
    num_processed = 0
    for i in xrange(id, sys.maxint, num_jobs):
        filename_out = filename_out_fmt % i if filename_out_fmt is not None else None

        # Write to a temp filename, then rename the completed file to its proper name.
        # This is more or less an atomic update: cancelling the program never leaves
        # the output in a half-written state, only the tempfile can be half-written.
        # The pid embedded in the temp name shows whether the owning process is still
        # alive and still processing the data; an inactive pid means the tempfile is
        # trash and can be deleted.
        filename_out_temp = '%s.pid.%d.tmp' % (filename_out, pid) if filename_out is not None else None

        if filename_out is not None and os.path.exists(filename_out):
            num_processed += 1
            continue

        with h5py.File(filename_in, 'r') as f:
            segment = read_hdf5_segment(f, 'X', start=i, end=i + 1)
            if segment is None:
                break

            X, meta = segment
            data_obj = {}
            for k, v in meta.iteritems():
                data_obj[k] = v

            # save disk space
            if X.dtype != np.float32:
                X = X.astype(np.float32)

            X = process_data_fn(X, jsdict(data_obj))

        if filename_out is not None:
            with h5py.File(filename_out_temp, 'w', libver='latest') as f:
                if X.dtype != np.float32:
                    X = X.astype(np.float32)
                write_hdf5_segment(f, 'X', X)
            os.rename(filename_out_temp, filename_out)

        num_processed += 1

    return num_processed
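# The write-to-temp-then-rename pattern used above, shown standalone (a sketch;
# atomic_write and writer are hypothetical names, not part of this project).
# os.rename is atomic on POSIX filesystems when source and destination live on the
# same filesystem, so other processes only ever see a complete output file.
import os

def atomic_write(filename, payload, writer):
    tmp = '%s.pid.%d.tmp' % (filename, os.getpid())
    writer(tmp, payload)       # write everything to the temp file first
    os.rename(tmp, filename)   # then publish it with a single atomic rename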
def process_data(segment):
    data_key = [key for key in segment.keys() if not key.startswith('_')][0]
    data = segment[data_key][0][0]
    X = data[0]
    data_length_sec = int(data[1][0][0])
    sampling_frequency = float(data[2][0][0])
    channels = [ch[0] for ch in data[3][0]]
    sequence = int(data[4][0][0]) if len(data) >= 5 else None

    min_freq = 195.0

    def find_q():
        q = 2
        while True:
            f = sampling_frequency / q
            if f < min_freq:
                return q - 1
            q += 1

    if sampling_frequency > min_freq:
        q = find_q()
        if q > 1:
            # if X.dtype != np.float64:
            #     X = X.astype(np.float64)
            # X -= X.mean(axis=0)
            X = scipy.signal.decimate(X, q, ftype='fir', axis=X.ndim - 1)
            X = np.round(X).astype(np.int16)
            # if X.dtype != np.float32:
            #     X = X.astype(np.float32)
            sampling_frequency /= q

    channels = np.array(channels, dtype=str(channels[0].dtype).replace('U', 'S'))

    out = {
        'X': X,
        'data_length_sec': data_length_sec,
        'sampling_frequency': sampling_frequency,
        'num_channels': X.shape[0],
        'channels': channels,
        'target': target,
        'data_type': data_type,
    }
    if sequence is not None:
        out['sequence'] = sequence

    return jsdict(out)
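# Worked example of the decimation-factor search above (a standalone sketch;
# largest_decimation_factor is a hypothetical name mirroring find_q). It picks the
# largest q such that sampling_frequency / q stays at or above min_freq, e.g. a
# 5000 Hz signal is decimated by 25 and a 400 Hz signal by 2, both ending up near
# 200 Hz.
def largest_decimation_factor(sampling_frequency, min_freq=195.0):
    q = 2
    while sampling_frequency / q >= min_freq:
        q += 1
    return q - 1

assert largest_decimation_factor(5000.0) == 25
assert largest_decimation_factor(400.0) == 2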
def cross_validation_score(settings, target, pipeline, classifier, classifier_name,
                           strategy=None, pool=None, progress_str=None, feature_mask=None,
                           return_data=True, quiet=False):
    if strategy is None:
        strategy = KFoldStrategy()

    if feature_mask is not None and np.count_nonzero(feature_mask) == len(feature_mask):
        feature_mask = None

    _, preictal_meta = load_pipeline_data(settings, target, 'preictal', pipeline,
                                          check_only=False, quiet=quiet, meta_only=True)

    cv_folds = strategy.get_folds(preictal_meta)

    if pool is not None:
        results = [pool.apply_async(cross_val_score_for_one_fold,
                                    [settings, target, pipeline, classifier, classifier_name, fold],
                                    {'strategy': strategy, 'feature_mask': feature_mask,
                                     'progress_str': progress_str, 'quiet': quiet})
                   for fold in cv_folds]
        if return_data:
            out = [r.get() for r in results]
    else:
        out = [cross_val_score_for_one_fold(settings, target, pipeline, classifier, classifier_name,
                                            strategy=strategy, fold=fold, feature_mask=feature_mask,
                                            progress_str=progress_str, quiet=quiet)
               for fold in cv_folds]

    if return_data:
        mean_scores = [d.mean_score for d in out]
        median_scores = [d.median_score for d in out]
        mean_predictions = [d.mean_predictions for d in out]
        median_predictions = [d.median_predictions for d in out]
        y_cvs = [d.y_cv for d in out]

        return jsdict({
            'mean_score': np.mean(mean_scores),
            'median_score': np.mean(median_scores),
            'mean_scores': np.array(mean_scores),
            'median_scores': np.array(median_scores),
            'mean_predictions': mean_predictions,
            'median_predictions': median_predictions,
            'y_cvs': y_cvs
        })
# Fast process-if-not-yet-processed method for training data
def check_training_data_loaded(settings, target, pipeline, quiet=False):
    if not load_pipeline_data(settings, target, 'preictal', pipeline, check_only=True, quiet=quiet):
        load_pipeline_data(settings, target, 'preictal', pipeline, check_only=False, quiet=quiet)
    if not load_pipeline_data(settings, target, 'interictal', pipeline, check_only=True, quiet=quiet):
        load_pipeline_data(settings, target, 'interictal', pipeline, check_only=False, quiet=quiet)


# Fast process-if-not-yet-processed method for test data
def check_test_data_loaded(settings, target, pipeline, quiet=False):
    if not load_pipeline_data(settings, target, 'test', pipeline, check_only=True, quiet=quiet):
        load_pipeline_data(settings, target, 'test', pipeline, check_only=False, quiet=quiet)
def accumulate_data(settings, target, data_type, tag, output_to_original_data_dir=False, quiet=False, meta_only=False):
    output_dir = settings.data_dir if output_to_original_data_dir else settings.cache_dir
    filename_out = single_filename_builder(target, data_type, output_dir, tag)
    orig_filename_in = single_filename_builder(target, data_type, settings.data_dir)

    def collect_meta(filename):
        meta = {}
        with h5py.File(filename, 'r') as f:
            meta['num_segments'] = f['X'].shape[0]
            if 'sequence' in f.keys():
                meta['sequence'] = f['sequence'][:]
            for k, v in f['X'].attrs.iteritems():
                meta[k] = v
        return meta

    # load already processed output file
    if os.path.exists(filename_out):
        # pull meta off original data
        meta = collect_meta(orig_filename_in)

        # pull X data off processed data
        with h5py.File(filename_out, 'r') as f:
            meta['X_shape'] = f['X'].shape
            X = f['X'][:] if not meta_only else None

        if not quiet:
            print 'from cache ...',
        return X, jsdict(meta)
    else:
        # get ready to process all segments into 1 file, starting with getting the meta-data ready
        if not quiet:
            print 'processing ...',

        pid = os.getpid()
        filename_in_fmt = segment_filename_builder(target, data_type, output_dir, tag)
        orig_filename_in = single_filename_builder(target, data_type, settings.data_dir)

        # meta-data is collected differently when doing the first data conversion from mat to hdf5
        if output_to_original_data_dir:
            print 'Collecting metadata...'
            # Creating original files... pull metadata off the first one, and also collect sequences
            meta = None
            sequence = []
            num_segments = 0
            for i in xrange(0, sys.maxint, 1):
                filename = filename_in_fmt % i
                if not os.path.exists(filename):
                    if num_segments == 0:
                        print 'Could not find file ', filename
                        sys.exit(1)
                    break

                with h5py.File(filename, 'r') as f_in:
                    meta_attrs = f_in['__metadata'].attrs
                    if 'sequence' in meta_attrs:
                        sequence.append(meta_attrs['sequence'])
                    if meta is None:
                        meta = {}
                        meta['channels'] = f_in['channels'][:]
                        for key in meta_attrs.keys():
                            if key != 'sequence':
                                meta[key] = meta_attrs[key]

                num_segments += 1

            if len(sequence) > 0:
                meta['sequence'] = sequence
            meta['num_segments'] = num_segments
            print 'Accumulating segments...'
        else:
            # pull metadata off the original data files
            meta = collect_meta(orig_filename_in)

        # now accumulate X data to a single file
        num_segments = meta['num_segments']
        filename_out_temp = '%s.pid.%d.tmp' % (filename_out, pid) if filename_out is not None else None
        with h5py.File(filename_out_temp, 'w-', libver='latest') as f_out:
            X_out = None
            for i in xrange(num_segments):
                with h5py.File(filename_in_fmt % i, 'r') as f_in:
                    X_in = f_in['X']

                    # init X_out
                    if X_out is None:
                        X_out = f_out.create_dataset('X', shape=[num_segments] + list(X_in.shape), dtype=X_in.dtype)
                        meta['X_shape'] = X_out.shape
                        for k, v in meta.iteritems():
                            X_out.attrs[k] = v

                    X_out[i] = X_in[:]

            X = X_out[:]

        # finalize
        os.rename(filename_out_temp, filename_out)

        # clean up
        for i in xrange(num_segments):
            try:
                os.remove(filename_in_fmt % i)
            except:
                pass

        return X, jsdict(meta)