# Process a target's raw .mat segments and merge them into HDF5 under settings.data_dir.
def process_mat_into_hdf5(settings, target, data_type, N_jobs):
    assert data_type in ('preictal', 'interictal', 'test')
    print('Loading data ...')
    timer = Timer()

    out_dir = os.path.join(settings.data_dir)
    metadata = Metadata()
    segments_processed = process_and_merge_segments(target, data_type, out_dir, metadata, N_jobs)

    print('Processed %d segments in %s' % (segments_processed, timer.pretty_str()))
    print(data_type, 'Metadata', metadata)
# Load (or compute and cache) pipeline-transformed data for one target/data_type,
# fanning the per-segment work out across a multiprocessing pool.
def load_data_mp(settings, target, data_type, pipeline, check_only=False, quiet=False, meta_only=False):
    filename_out = single_filename_builder(target, data_type, settings.cache_dir, pipeline.get_name())
    filename_out_exists = os.path.exists(filename_out)
    if check_only:
        return filename_out_exists

    # If this pipeline consumes the output of another pipeline, make sure that
    # input source has been prepared first.
    input_source = pipeline.get_input_source()
    input_source_pipeline = input_source.get_pipeline()
    if input_source_pipeline is not None:
        if not load_data_mp(settings, target, data_type, input_source_pipeline, check_only=True,
                            quiet=quiet, meta_only=meta_only):
            if not quiet:
                print('Preparing input source', input_source_pipeline.get_name())
            load_data_mp(settings, target, data_type, input_source_pipeline, check_only=False,
                         quiet=quiet, meta_only=meta_only)
            if not quiet:
                print('Input source ready')

    if not quiet:
        print('Loading %s data ...' % data_type, end=' ')
    timer = Timer()

    # TODO(mike): re-implement tmpfile cleanup that isn't really slow in the face of the genetic
    # algorithm spamming the disk with cross-validation score files.
    #
    # clear cache of tmp files
    # regex = re.compile(r""".*\.pid\.(\d+)""")
    # for file in glob.glob(os.path.join(settings.cache_dir, '*.tmp')):
    #     match = regex.match(file)
    #     assert match is not None
    #     pid = int(match.group(1))
    #     try:
    #         os.getpgid(pid)
    #     except:
    #         print('Removing', file)
    #         os.remove(file)

    if not filename_out_exists:
        # DEBUG: run a single job in-process instead of using the worker pool
        debug = False
        # debug = True
        if debug:
            print('DEBUG')
            process_data_job(settings, target, data_type, 0, 1, pipeline)
            print('Done')
        else:
            # Split the segments across settings.N_jobs worker processes.
            pool = Pool(settings.N_jobs)
            for i in range(settings.N_jobs):
                pool.apply_async(process_data_job, [settings, target, data_type, i, settings.N_jobs, pipeline])
            pool.close()
            pool.join()

    accum, accum_meta = accumulate_data(settings, target, data_type, pipeline.get_name(),
                                        quiet=quiet, meta_only=meta_only)
    if not quiet:
        print('prepared %d segments in %s %s %s' % (accum_meta.num_segments, timer.pretty_str(),
                                                    accum_meta.X_shape, pipeline.get_name()))

    return accum, accum_meta
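# Usage sketch (assumptions: `settings` and `pipeline` are constructed elsewhere in the
# project; 'Dog_1' is a hypothetical example target, 'preictal' is one of the data_type
# values accepted above):
#
#   # check whether the cached output already exists
#   cached = load_data_mp(settings, 'Dog_1', 'preictal', pipeline, check_only=True)
#
#   # compute (or load from cache) the transformed data and its metadata
#   X, meta = load_data_mp(settings, 'Dog_1', 'preictal', pipeline)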