import os

# Timer, Metadata and process_and_merge_segments are project helpers assumed to be in scope.
def process_mat_into_hdf5(settings, target, data_type, N_jobs):
    assert data_type in ('preictal', 'interictal', 'test')

    print('Loading data ...')
    timer = Timer()

    out_dir = os.path.join(settings.data_dir)
    metadata = Metadata()
    segments_processed = process_and_merge_segments(target, data_type, out_dir, metadata, N_jobs)

    print('Processed %d segments in %s' % (segments_processed, timer.pretty_str()))
    print(data_type, 'Metadata', metadata)
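
As a rough usage sketch (not taken from the project), the conversion step above might be driven once per segment type; the settings object and target name below are illustrative placeholders only.

# Hypothetical driver; assumes a settings object exposing data_dir and N_jobs.
for data_type in ('preictal', 'interictal', 'test'):
    process_mat_into_hdf5(settings, 'Dog_1', data_type, settings.N_jobs)  # 'Dog_1' is a placeholder target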
Example #2
File: data.py  Project: Keesiu/meta-kaggle
import os
from multiprocessing import Pool

# single_filename_builder, accumulate_data, process_data_job and Timer are
# helpers from the Keesiu/meta-kaggle project, assumed to be importable here.
def load_data_mp(settings, target, data_type, pipeline, check_only=False, quiet=False, meta_only=False):
    filename_out = single_filename_builder(target, data_type, settings.cache_dir, pipeline.get_name())
    filename_out_exists = os.path.exists(filename_out)
    if check_only:
        return filename_out_exists

    input_source = pipeline.get_input_source()
    input_source_pipeline = input_source.get_pipeline()
    if input_source_pipeline is not None:
        if not load_data_mp(settings, target, data_type, input_source_pipeline, check_only=True, quiet=quiet, meta_only=meta_only):
            if not quiet: print('Preparing input source', input_source_pipeline.get_name())
            load_data_mp(settings, target, data_type, input_source_pipeline, check_only=False, quiet=quiet, meta_only=meta_only)
            if not quiet: print('Input source ready')


    if not quiet: print('Loading %s data ...' % data_type, end=' ')
    timer = Timer()

    # TODO(mike): re-implement tmpfile cleanup that isn't really slow in the face of the genetic algorithm
    # spamming the disk with cross-validation score files.

    # clear cache of tmp files
    # regex = re.compile(r""".*\.pid\.(\d+)""")
    # for file in glob.glob(os.path.join(settings.cache_dir, '*.tmp')):
    #     match = regex.match(file)
    #     assert match is not None
    #     pid = int(match.group(1))
    #     try:
    #         os.getpgid(pid)
    #     except:
    #         print 'Removing', file
    #         os.remove(file)

    if not filename_out_exists:
        # DEBUG
        debug = False
        # debug = True
        if debug:
            print('DEBUG')
            process_data_job(settings, target, data_type, 0, 1, pipeline)
            print('Done')
        else:
            # Fan out the work: each worker receives its job index i and the total
            # job count so it can process its own share of the segments.
            pool = Pool(settings.N_jobs)
            for i in range(settings.N_jobs):
                pool.apply_async(process_data_job, (settings, target, data_type, i, settings.N_jobs, pipeline))
            pool.close()
            pool.join()

    accum, accum_meta = accumulate_data(settings, target, data_type, pipeline.get_name(), quiet=quiet, meta_only=meta_only)

    if not quiet: print('prepared %d segments in %s %s %s' % (accum_meta.num_segments, timer.pretty_str(), accum_meta.X_shape, pipeline.get_name()))

    return accum, accum_meta
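
A hedged usage sketch (not part of the project): a caller would typically pair the preictal and interictal loads for one subject before training. The settings, pipeline and target name below are placeholders standing in for the project's real configuration objects.

# Hypothetical caller; assumes settings.cache_dir and settings.N_jobs are set and that
# pipeline exposes get_name() and get_input_source(), as used above.
preictal_X, preictal_meta = load_data_mp(settings, 'Dog_1', 'preictal', pipeline)
interictal_X, interictal_meta = load_data_mp(settings, 'Dog_1', 'interictal', pipeline)
test_meta_only = load_data_mp(settings, 'Dog_1', 'test', pipeline, meta_only=True)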
Example #3
def process_mat_into_hdf5(settings, target, data_type, N_jobs):
    assert data_type in ('preictal', 'interictal', 'test')

    print('Loading data ...')
    timer = Timer()

    out_dir = os.path.join(settings.data_dir)
    metadata = Metadata()
    segments_processed = process_and_merge_segments(target, data_type, out_dir,
                                                    metadata, N_jobs)

    print('Processed %d segments in %s' %
          (segments_processed, timer.pretty_str()))
    print(data_type, 'Metadata', metadata)
Example #4
def load_data_mp(settings, target, data_type, pipeline, check_only=False, quiet=False, meta_only=False):
    filename_out = single_filename_builder(target, data_type, settings.cache_dir, pipeline.get_name())
    filename_out_exists = os.path.exists(filename_out)
    if check_only:
        return filename_out_exists

    input_source = pipeline.get_input_source()
    input_source_pipeline = input_source.get_pipeline()
    if input_source_pipeline is not None:
        if not load_data_mp(settings, target, data_type, input_source_pipeline, check_only=True, quiet=quiet, meta_only=meta_only):
            if not quiet: print('Preparing input source', input_source_pipeline.get_name())
            load_data_mp(settings, target, data_type, input_source_pipeline, check_only=False, quiet=quiet, meta_only=meta_only)
            if not quiet: print('Input source ready')


    if not quiet: print('Loading %s data ...' % data_type, end=' ')
    timer = Timer()

    # TODO(mike): re-implement tmpfile cleanup that isn't really slow in the face of the genetic algorithm
    # spamming the disk with cross-validation score files.

    # clear cache of tmp files
    # regex = re.compile(r""".*\.pid\.(\d+)""")
    # for file in glob.glob(os.path.join(settings.cache_dir, '*.tmp')):
    #     match = regex.match(file)
    #     assert match is not None
    #     pid = int(match.group(1))
    #     try:
    #         os.getpgid(pid)
    #     except:
    #         print 'Removing', file
    #         os.remove(file)

    if not filename_out_exists:
        # DEBUG
        debug = False
        # debug = True
        if debug:
            print('DEBUG')
            process_data_job(settings, target, data_type, 0, 1, pipeline)
            print('Done')
        else:
            pool = Pool(settings.N_jobs)
            for i in range(settings.N_jobs):
                pool.apply_async(process_data_job, (settings, target, data_type, i, settings.N_jobs, pipeline))
            pool.close()
            pool.join()

    accum, accum_meta = accumulate_data(settings, target, data_type, pipeline.get_name(), quiet=quiet, meta_only=meta_only)

    if not quiet: print('prepared %d segments in %s %s %s' % (accum_meta.num_segments, timer.pretty_str(), accum_meta.X_shape, pipeline.get_name()))

    return accum, accum_meta