Example #1
def read(filename):
    data = h5py.File(filename, 'r')
    obj = {}
    for key in data.keys():
        value = data[key]
        if key == METADATA_TAG:
            for metakey in value.attrs.keys():
                obj[metakey] = value.attrs[metakey]
        elif not key.startswith('__list'):
            obj[key] = value[:]

    list_keys = [key for key in data.keys() if key.startswith('__list')]
    if list_keys:
        # Sort by list name and numeric index: a plain lexicographic sort
        # would put '__list_x_10' before '__list_x_2' and trip the index
        # assert below.
        list_keys.sort(key=lambda k: (list_regex.match(k).group(1),
                                      int(list_regex.match(k).group(2))))
        for key in list_keys:
            match = list_regex.match(key)
            assert match is not None
            list_key = match.group(1)
            list_index = int(match.group(2))
            out_list = obj.setdefault(list_key, [])
            assert len(out_list) == list_index
            out_list.append(data[key][:])

    data.close()

    return jsdict(obj)
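
Every example in this listing returns a jsdict, and the read function above also assumes a module-level METADATA_TAG and list_regex; none of these are shown in the scraped snippets. A minimal sketch of plausible definitions (the project's real ones may differ):

import re

# Hypothetical: matches chunked list keys like '__list_<name>_<index>'.
list_regex = re.compile(r'^__list_(.+)_(\d+)$')

# Grounded guess: accumulate_data below reads a '__metadata' group.
METADATA_TAG = '__metadata'

class jsdict(dict):
    # Hypothetical: a dict whose items are also reachable as attributes,
    # so data['X_train'] and data.X_train are interchangeable.
    def __init__(self, data=None):
        super(jsdict, self).__init__(data or {})
        self.__dict__ = self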
Example #2
def memoize(fn, paths):
    cwd = os.getcwd()

    def change_to_target_dir():
        for d in paths[:-1]:
            try:
                os.mkdir(d)
            except OSError:
                pass  # directory already exists
            os.chdir(d)

    change_to_target_dir()
    filename = paths[-1]
    if os.path.exists(filename):
        data = hdf5.read(filename)
        os.chdir(cwd)
        return data

    os.chdir(cwd)
    data = fn()
    change_to_target_dir()
    tmp = '%s.pid.%d.tmp' % (filename, os.getpid())
    hdf5.write(tmp, data)
    os.rename(tmp, filename)
    os.chdir(cwd)

    return jsdict(data)
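
A hedged usage sketch: memoize runs fn once, persists the result via hdf5.write under a nested directory path, and on later calls serves the cached file instead. The path components and payload below are illustrative only:

import numpy as np

def compute_features():
    return {'X': np.random.rand(100, 16)}  # stand-in for expensive work

# First call computes and writes data-cache/fft/features.hdf5;
# later calls read the cached file back.
data = memoize(compute_features, ['data-cache', 'fft', 'features.hdf5'])
print(data.X.shape)  # attribute access courtesy of jsdict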
Example #3
    def process():

        data = load_training_data(settings, target, pipeline, strategy=strategy, cv_fold_number=fold, check_only=False, quiet=quiet)

        if feature_mask is not None:
            # Keep only the masked feature columns along the last axis.
            s = [slice(None)] * data.X_train.ndim
            s[-1] = np.where(np.asarray(feature_mask))[0]
            data['X_train'] = data.X_train[tuple(s)]
            data['X_cv'] = data.X_cv[tuple(s)]
            if not quiet: print(' feature mask', 'X_train', data.X_train.shape, 'y_train', data.y_train.shape, 'X_cv', data.X_cv.shape, 'y_cv', data.y_cv.shape)

        train(classifier, data, quiet=quiet)
        if not quiet: print "Making predictions...",
        timer = time.Timer()  # project-local timer helper, not the stdlib time module
        mean_predictions, median_predictions, raw_predictions = make_predictions(classifier, data.X_cv, data.num_cv_segments)
        if not quiet: print(timer.pretty_str())

        mean_score = roc_auc_score(data.y_cv, mean_predictions)
        median_score = roc_auc_score(data.y_cv, median_predictions)

        return jsdict({
            'mean_score': mean_score,
            'median_score': median_score,
            'mean_predictions': mean_predictions,
            'median_predictions': median_predictions,
            'y_cv': data.y_cv
        })
Example #4
    def make_fold(preictal_X_train, preictal_X_cv, interictal_X_train, interictal_X_cv):
        num_train_segments = preictal_X_train.shape[0] + interictal_X_train.shape[0]
        num_cv_segments = preictal_X_cv.shape[0] + interictal_X_cv.shape[0]
        assert (num_train_segments + num_cv_segments) == total_segments

        flattened_preictal_X_train = flatten(preictal_X_train)
        flattened_interictal_X_train = flatten(interictal_X_train)
        flattened_preictal_X_cv = flatten(preictal_X_cv) if cv else np.empty((0,))
        flattened_interictal_X_cv = flatten(interictal_X_cv) if cv else np.empty((0,))

        X_train = np.concatenate((flattened_preictal_X_train, flattened_interictal_X_train), axis=0)
        X_cv = np.concatenate((flattened_preictal_X_cv, flattened_interictal_X_cv), axis=0)

        preictal_y_train = np.ones((flattened_preictal_X_train.shape[0],))
        preictal_y_cv = np.ones((preictal_X_cv.shape[0],))
        interictal_y_train = np.zeros((flattened_interictal_X_train.shape[0],))
        interictal_y_cv = np.zeros((interictal_X_cv.shape[0],))

        y_train = np.concatenate((preictal_y_train, interictal_y_train), axis=0)
        y_cv = np.concatenate((preictal_y_cv, interictal_y_cv), axis=0)

        X_train, y_train = sklearn.utils.shuffle(X_train, y_train, random_state=0)

        return jsdict({
            'X_train': X_train,
            'y_train': y_train,
            'X_cv': X_cv,
            'y_cv': y_cv,
            'num_train_segments': num_train_segments,
            'num_cv_segments': num_cv_segments
        })
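
make_fold assumes a flatten helper that is not shown in this listing. A plausible sketch, consistent with make_predictions above aggregating several window-level rows back into per-segment mean and median predictions (the exact semantics are an assumption):

def flatten(X):
    # Collapse leading axes into rows of feature vectors, e.g.
    # (num_segments, num_windows, num_features) ->
    # (num_segments * num_windows, num_features). Assumption only.
    return X.reshape(-1, X.shape[-1])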
Example #5
def load_test_data(settings, target, pipeline, quiet=False):
    test, meta = load_pipeline_data(settings, target, 'test', pipeline, check_only=False, quiet=quiet)
    X_test = flatten(test)
    if not quiet: print('X_test', test.shape, 'num_segments', meta.num_segments)
    return jsdict({
        'X_test': X_test,
        'num_segments': meta.num_segments
    })
Example #6
def load_test_data(settings, target, pipeline, quiet=False):
    test, meta = load_pipeline_data(settings,
                                    target,
                                    'test',
                                    pipeline,
                                    check_only=False,
                                    quiet=quiet)
    X_test = flatten(test)
    if not quiet: print('X_test', test.shape, 'num_segments', meta.num_segments)
    return jsdict({'X_test': X_test, 'num_segments': meta.num_segments})
Example #7
def process_data_sub_job(filename_in, filename_out_fmt, id, num_jobs,
                         process_data_fn):
    if not os.path.exists(filename_in):
        return 0

    pid = os.getpid()

    num_processed = 0
    for i in range(id, sys.maxsize, num_jobs):

        filename_out = filename_out_fmt % i if filename_out_fmt is not None else None
        # Write to a temp filename, then rename the completed file to its
        # proper name. The rename is effectively atomic, so cancelling the
        # program never leaves data in a half-written state: only the
        # tempfile can be half-written, and the pid in its name tells us
        # whether the owning process is still alive and processing. A
        # tempfile with an inactive pid is trash and can be deleted.
        filename_out_temp = '%s.pid.%d.tmp' % (
            filename_out, pid) if filename_out is not None else None

        if filename_out is not None and os.path.exists(filename_out):
            num_processed += 1
            continue

        with h5py.File(filename_in, 'r') as f:
            segment = read_hdf5_segment(f, 'X', start=i, end=i + 1)
            if segment is None:
                break
            X, meta = segment

        data_obj = {}
        for k, v in meta.items():
            data_obj[k] = v

        # save disk space
        if X.dtype != np.float32:
            X = X.astype(np.float32)

        X = process_data_fn(X, jsdict(data_obj))

        if filename_out is not None:
            with h5py.File(filename_out_temp, 'w', libver='latest') as f:
                if X.dtype != np.float32:
                    X = X.astype(np.float32)
                write_hdf5_segment(f, 'X', X)

            os.rename(filename_out_temp, filename_out)

        num_processed += 1

    return num_processed
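
The stride in the loop above deals segment indices out round-robin: job id processes segments id, id + num_jobs, id + 2 * num_jobs, ..., and stops at the first index whose segment is missing from the input file. A quick illustration with hypothetical values:

num_jobs = 4
for job_id in range(num_jobs):
    print(job_id, [i for i in range(job_id, 12, num_jobs)])
# 0 [0, 4, 8]
# 1 [1, 5, 9]
# 2 [2, 6, 10]
# 3 [3, 7, 11]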
Example #8
def process_data_sub_job(filename_in, filename_out_fmt, id, num_jobs, process_data_fn):
    if not os.path.exists(filename_in):
        return 0

    pid = os.getpid()

    num_processed = 0
    for i in range(id, sys.maxsize, num_jobs):

        filename_out = filename_out_fmt % i if filename_out_fmt is not None else None
        # Write to a temp filename, then rename the completed file to its
        # proper name. The rename is effectively atomic, so cancelling the
        # program never leaves data in a half-written state: only the
        # tempfile can be half-written, and the pid in its name tells us
        # whether the owning process is still alive and processing. A
        # tempfile with an inactive pid is trash and can be deleted.
        filename_out_temp = '%s.pid.%d.tmp' % (filename_out, pid) if filename_out is not None else None

        if filename_out is not None and os.path.exists(filename_out):
            num_processed += 1
            continue

        with h5py.File(filename_in, 'r') as f:
            segment = read_hdf5_segment(f, 'X', start=i, end=i+1)
            if segment is None:
                break
            X, meta = segment

        data_obj = {}
        for k, v in meta.items():
            data_obj[k] = v

        # save disk space
        if X.dtype != np.float32:
            X = X.astype(np.float32)

        X = process_data_fn(X, jsdict(data_obj))

        if filename_out is not None:
            with h5py.File(filename_out_temp, 'w', libver='latest') as f:
                if X.dtype != np.float32:
                    X = X.astype(np.float32)
                write_hdf5_segment(f, 'X', X)

            os.rename(filename_out_temp, filename_out)

        num_processed += 1

    return num_processed
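
The write-to-tempfile-then-rename idiom used here (and in memoize above) relies on os.rename being atomic on POSIX filesystems when source and destination live on the same filesystem: other processes only ever observe a missing file or a complete one, never a partial write. The idiom in isolation, as a generic sketch:

import os

def atomic_write_bytes(path, payload):
    # Stage the data in a pid-tagged tempfile next to the target...
    tmp = '%s.pid.%d.tmp' % (path, os.getpid())
    with open(tmp, 'wb') as f:
        f.write(payload)
    # ...then atomically move it into place.
    os.rename(tmp, path)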
Example #9
def process_data(segment):
    data_key = [key for key in segment.keys() if not key.startswith('_')][0]
    data = segment[data_key][0][0]

    X = data[0]
    data_length_sec = int(data[1][0][0])
    sampling_frequency = float(data[2][0][0])
    channels = [ch[0] for ch in data[3][0]]
    sequence = int(data[4][0][0]) if len(data) >= 5 else None

    min_freq = 195.0

    def find_q():
        q = 2
        while True:
            f = sampling_frequency / q
            if f < min_freq:
                return q - 1
            q += 1

    if sampling_frequency > min_freq:
        q = find_q()
        if q > 1:
            # if X.dtype != np.float64:
            #     X = X.astype(np.float64)
            # X -= X.mean(axis=0)
            X = scipy.signal.decimate(X, q, ftype='fir', axis=X.ndim - 1)
            X = np.round(X).astype(np.int16)
            # if X.dtype != np.float32:
            #     X = X.astype(np.float32)
            sampling_frequency /= q

    channels = np.array(channels,
                        dtype=str(channels[0].dtype).replace('U', 'S'))
    out = {
        'X': X,
        'data_length_sec': data_length_sec,
        'sampling_frequency': sampling_frequency,
        'num_channels': X.shape[0],
        'channels': channels,
        'target': target,          # from the enclosing scope
        'data_type': data_type,    # from the enclosing scope
    }
    if sequence is not None:
        out['sequence'] = sequence

    return jsdict(out)
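
Since find_q returns the largest integer q with sampling_frequency / q still at or above min_freq, the decimated rate always lands just above 195 Hz. A worked check (the 5000 Hz and 400 Hz rates are illustrative, typical of intracranial EEG recordings):

def largest_q(sampling_frequency, min_freq=195.0):
    # Equivalent reformulation of find_q above, for testing in isolation.
    q = 2
    while sampling_frequency / q >= min_freq:
        q += 1
    return q - 1

assert largest_q(5000.0) == 25   # 5000 / 25 = 200.0 >= 195 > 5000 / 26
assert largest_q(400.0) == 2     # 400 / 2 = 200.0 >= 195 > 400 / 3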
Example #10
def process_data(segment):
    data_key = [key for key in segment.keys() if not key.startswith('_')][0]
    data = segment[data_key][0][0]

    X = data[0]
    data_length_sec = int(data[1][0][0])
    sampling_frequency = float(data[2][0][0])
    channels = [ch[0] for ch in data[3][0]]
    sequence = int(data[4][0][0]) if len(data) >= 5 else None

    min_freq = 195.0
    def find_q():
        q = 2
        while True:
            f = sampling_frequency / q
            if f < min_freq:
                return q - 1
            q += 1

    if sampling_frequency > min_freq:
        q = find_q()
        if q > 1:
            # if X.dtype != np.float64:
            #     X = X.astype(np.float64)
            # X -= X.mean(axis=0)
            X = scipy.signal.decimate(X, q, ftype='fir', axis=X.ndim-1)
            X = np.round(X).astype(np.int16)
            # if X.dtype != np.float32:
            #     X = X.astype(np.float32)
            sampling_frequency /= q

    channels = np.array(channels, dtype=str(channels[0].dtype).replace('U', 'S'))
    out = {
        'X': X,
        'data_length_sec': data_length_sec,
        'sampling_frequency': sampling_frequency,
        'num_channels': X.shape[0],
        'channels': channels,
        'target': target,          # from the enclosing scope
        'data_type': data_type,    # from the enclosing scope
    }
    if sequence is not None:
        out['sequence'] = sequence

    return jsdict(out)
Example #11
    def make_fold(preictal_X_train, preictal_X_cv, interictal_X_train,
                  interictal_X_cv):
        num_train_segments = (preictal_X_train.shape[0] +
                              interictal_X_train.shape[0])
        num_cv_segments = preictal_X_cv.shape[0] + interictal_X_cv.shape[0]
        assert (num_train_segments + num_cv_segments) == total_segments

        flattened_preictal_X_train = flatten(preictal_X_train)
        flattened_interictal_X_train = flatten(interictal_X_train)
        flattened_preictal_X_cv = (flatten(preictal_X_cv)
                                   if cv else np.empty((0,)))
        flattened_interictal_X_cv = (flatten(interictal_X_cv)
                                     if cv else np.empty((0,)))

        X_train = np.concatenate(
            (flattened_preictal_X_train, flattened_interictal_X_train), axis=0)
        X_cv = np.concatenate(
            (flattened_preictal_X_cv, flattened_interictal_X_cv), axis=0)

        preictal_y_train = np.ones((flattened_preictal_X_train.shape[0],))
        preictal_y_cv = np.ones((preictal_X_cv.shape[0],))
        interictal_y_train = np.zeros((flattened_interictal_X_train.shape[0],))
        interictal_y_cv = np.zeros((interictal_X_cv.shape[0],))

        y_train = np.concatenate((preictal_y_train, interictal_y_train),
                                 axis=0)
        y_cv = np.concatenate((preictal_y_cv, interictal_y_cv), axis=0)

        X_train, y_train = sklearn.utils.shuffle(X_train,
                                                 y_train,
                                                 random_state=0)

        return jsdict({
            'X_train': X_train,
            'y_train': y_train,
            'X_cv': X_cv,
            'y_cv': y_cv,
            'num_train_segments': num_train_segments,
            'num_cv_segments': num_cv_segments
        })
Example #12
    def process():

        data = load_training_data(settings,
                                  target,
                                  pipeline,
                                  strategy=strategy,
                                  cv_fold_number=fold,
                                  check_only=False,
                                  quiet=quiet)

        if feature_mask is not None:
            # Keep only the masked feature columns along the last axis.
            s = [slice(None)] * data.X_train.ndim
            s[-1] = np.where(np.asarray(feature_mask))[0]
            data['X_train'] = data.X_train[tuple(s)]
            data['X_cv'] = data.X_cv[tuple(s)]
            if not quiet:
                print(' feature mask', 'X_train', data.X_train.shape,
                      'y_train', data.y_train.shape, 'X_cv', data.X_cv.shape,
                      'y_cv', data.y_cv.shape)

        train(classifier, data, quiet=quiet)
        if not quiet: print("Making predictions...", end=' ')
        timer = time.Timer()  # project-local timer helper, not the stdlib time module
        mean_predictions, median_predictions, raw_predictions = make_predictions(
            classifier, data.X_cv, data.num_cv_segments)
        if not quiet: print(timer.pretty_str())

        mean_score = roc_auc_score(data.y_cv, mean_predictions)
        median_score = roc_auc_score(data.y_cv, median_predictions)

        return jsdict({
            'mean_score': mean_score,
            'median_score': median_score,
            'mean_predictions': mean_predictions,
            'median_predictions': median_predictions,
            'y_cv': data.y_cv
        })
Example #13
def cross_validation_score(settings, target, pipeline, classifier, classifier_name, strategy=None, pool=None, progress_str=None, feature_mask=None, return_data=True, quiet=False):
    if strategy is None:
        strategy = KFoldStrategy()

    if feature_mask is not None and np.count_nonzero(feature_mask) == len(feature_mask):
        feature_mask = None

    _, preictal_meta = load_pipeline_data(settings, target, 'preictal', pipeline, check_only=False, quiet=quiet, meta_only=True)
    cv_folds = strategy.get_folds(preictal_meta)

    if pool is not None:
        results = [pool.apply_async(cross_val_score_for_one_fold, [settings, target, pipeline, classifier, classifier_name, fold],
            {'strategy': strategy, 'feature_mask': feature_mask, 'progress_str': progress_str, 'quiet': quiet})
            for fold in cv_folds]
        if return_data:
            out = [r.get() for r in results]
    else:
        out = [cross_val_score_for_one_fold(settings, target, pipeline, classifier, classifier_name, strategy=strategy,
            fold=fold, feature_mask=feature_mask, progress_str=progress_str, quiet=quiet) for fold in cv_folds]

    if return_data:
        mean_scores = [d.mean_score for d in out]
        median_scores = [d.median_score for d in out]
        mean_predictions = [d.mean_predictions for d in out]
        median_predictions = [d.median_predictions for d in out]
        y_cvs = [d.y_cv for d in out]

        return jsdict({
            'mean_score': np.mean(mean_scores),
            'median_score': np.mean(median_scores),
            'mean_scores': np.array(mean_scores),
            'median_scores': np.array(median_scores),
            'mean_predictions': mean_predictions,
            'median_predictions': median_predictions,
            'y_cvs': y_cvs
        })
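
A hedged usage sketch: settings, pipeline, and the target name are stand-ins for whatever the surrounding project supplies, and the classifier is any scikit-learn-style estimator; none of these specific values come from the source.

from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=100, random_state=0)
result = cross_validation_score(settings, 'Patient_1', pipeline,
                                classifier, 'rf100')
print(result.mean_score, result.median_score)  # averaged over the CV folds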
Example #14
    change_to_target_dir()
    filename = paths[-1]
    if os.path.exists(filename):
        data = hdf5.read(filename)
        os.chdir(cwd)
        return data

    os.chdir(cwd)
    data = fn()
    change_to_target_dir()
    tmp = '%s.pid.%d.tmp' % (filename, os.getpid())
    hdf5.write(tmp, data)
    os.rename(tmp, filename)
    os.chdir(cwd)

    return jsdict(data)


# Fast process-if-not-yet-processed method for training data
def check_training_data_loaded(settings, target, pipeline, quiet=False):
    if not load_pipeline_data(
            settings, target, 'preictal', pipeline, check_only=True,
            quiet=quiet):
        load_pipeline_data(settings,
                           target,
                           'preictal',
                           pipeline,
                           check_only=False,
                           quiet=quiet)
    if not load_pipeline_data(
            settings, target, 'interictal', pipeline, check_only=True,
            quiet=quiet):
        load_pipeline_data(settings,
                           target,
                           'interictal',
                           pipeline,
                           check_only=False,
                           quiet=quiet)
Example #15
    change_to_target_dir()
    filename = paths[-1]
    if os.path.exists(filename):
        data = hdf5.read(filename)
        os.chdir(cwd)
        return data

    os.chdir(cwd)
    data = fn()
    change_to_target_dir()
    tmp = '%s.pid.%d.tmp' % (filename, os.getpid())
    hdf5.write(tmp, data)
    os.rename(tmp, filename)
    os.chdir(cwd)

    return jsdict(data)


# Fast process-if-not-yet-processed method for training data
def check_training_data_loaded(settings, target, pipeline, quiet=False):
    if not load_pipeline_data(settings, target, 'preictal', pipeline, check_only=True, quiet=quiet):
        load_pipeline_data(settings, target, 'preictal', pipeline, check_only=False, quiet=quiet)
    if not load_pipeline_data(settings, target, 'interictal', pipeline, check_only=True, quiet=quiet):
        load_pipeline_data(settings, target, 'interictal', pipeline, check_only=False, quiet=quiet)


# Fast process-if-not-yet-processed method for test data
def check_test_data_loaded(settings, target, pipeline, quiet=False):
    if not load_pipeline_data(settings, target, 'test', pipeline, check_only=True, quiet=quiet):
        load_pipeline_data(settings, target, 'test', pipeline, check_only=False, quiet=quiet)
Example #16
def accumulate_data(settings, target, data_type, tag, output_to_original_data_dir=False, quiet=False, meta_only=False):
    output_dir = settings.data_dir if output_to_original_data_dir else settings.cache_dir
    filename_out = single_filename_builder(target, data_type, output_dir, tag)
    orig_filename_in = single_filename_builder(target, data_type, settings.data_dir)

    def collect_meta(filename):
        meta = {}
        with h5py.File(filename, 'r') as f:
            meta['num_segments'] = f['X'].shape[0]
            if 'sequence' in f.keys():
                meta['sequence'] = f['sequence'][:]
            for k, v in f['X'].attrs.items():
                meta[k] = v
        return meta

    # load already processed output file
    if os.path.exists(filename_out):
        # pull meta off original data
        meta = collect_meta(orig_filename_in)

        # pull X data off processed data
        with h5py.File(filename_out, 'r') as f:
            meta['X_shape'] = f['X'].shape
            X = f['X'][:] if not meta_only else None
            if not quiet: print('from cache ...', end=' ')
            return X, jsdict(meta)
    else:
        # get ready to process all segments into 1 file, starting with getting the meta-data ready
        if not quiet: print('processing ...', end=' ')
        pid = os.getpid()
        filename_in_fmt = segment_filename_builder(target, data_type, output_dir, tag)

        orig_filename_in = single_filename_builder(target, data_type, settings.data_dir)

        # meta-data is collected differently when doing the first data conversion from mat to hdf5
        if output_to_original_data_dir:
            print('Collecting metadata...')
            # Creating original files... pull metadata off first one, and also collect sequences
            meta = None
            sequence = []
            num_segments = 0
            for i in range(0, sys.maxsize, 1):
                filename = filename_in_fmt % i
                if not os.path.exists(filename):
                    if num_segments == 0:
                        print('Could not find file', filename)
                        sys.exit(1)
                    break

                with h5py.File(filename, 'r') as f_in:
                    meta_attrs = f_in['__metadata'].attrs
                    if 'sequence' in meta_attrs:
                        sequence.append(meta_attrs['sequence'])

                    if meta is None:
                        meta = {}
                        meta['channels'] = f_in['channels'][:]
                        for key in meta_attrs.keys():
                            if key != 'sequence':
                                meta[key] = meta_attrs[key]
                num_segments += 1

            if len(sequence) > 0:
                meta['sequence'] = sequence

            meta['num_segments'] = num_segments

            print('Accumulating segments...')
        else:
            # pull metadata off the original data files
            meta = collect_meta(orig_filename_in)

        # now accumulate X data to a single file
        num_segments = meta['num_segments']
        filename_out_temp = '%s.pid.%d.tmp' % (filename_out, pid) if filename_out is not None else None
        with h5py.File(filename_out_temp, 'w-', libver='latest') as f_out:
            X_out = None
            for i in range(num_segments):
                with h5py.File(filename_in_fmt % i, 'r') as f_in:
                    X_in = f_in['X']
                    # init X_out
                    if X_out is None:
                        X_out = f_out.create_dataset('X', shape=[num_segments] + list(X_in.shape), dtype=X_in.dtype)
                        meta['X_shape'] = X_out.shape
                        for k, v in meta.items():
                            X_out.attrs[k] = v

                    X_out[i] = X_in[:]
            X = X_out[:]

        # finalize
        os.rename(filename_out_temp, filename_out)
        # clean up
        for i in range(num_segments):
            try:
                os.remove(filename_in_fmt % i)
            except OSError:
                pass  # segment file already gone

        return X, jsdict(meta)
Example #17
def cross_validation_score(settings,
                           target,
                           pipeline,
                           classifier,
                           classifier_name,
                           strategy=None,
                           pool=None,
                           progress_str=None,
                           feature_mask=None,
                           return_data=True,
                           quiet=False):
    if strategy is None:
        strategy = KFoldStrategy()

    if (feature_mask is not None
            and np.count_nonzero(feature_mask) == len(feature_mask)):
        feature_mask = None

    _, preictal_meta = load_pipeline_data(settings,
                                          target,
                                          'preictal',
                                          pipeline,
                                          check_only=False,
                                          quiet=quiet,
                                          meta_only=True)
    cv_folds = strategy.get_folds(preictal_meta)

    if pool is not None:
        results = [
            pool.apply_async(
                cross_val_score_for_one_fold,
                [settings, target, pipeline, classifier, classifier_name, fold],
                {'strategy': strategy, 'feature_mask': feature_mask,
                 'progress_str': progress_str, 'quiet': quiet})
            for fold in cv_folds
        ]
        if return_data:
            out = [r.get() for r in results]
    else:
        out = [
            cross_val_score_for_one_fold(settings,
                                         target,
                                         pipeline,
                                         classifier,
                                         classifier_name,
                                         strategy=strategy,
                                         fold=fold,
                                         feature_mask=feature_mask,
                                         progress_str=progress_str,
                                         quiet=quiet) for fold in cv_folds
        ]

    if return_data:
        mean_scores = [d.mean_score for d in out]
        median_scores = [d.median_score for d in out]
        mean_predictions = [d.mean_predictions for d in out]
        median_predictions = [d.median_predictions for d in out]
        y_cvs = [d.y_cv for d in out]

        return jsdict({
            'mean_score': np.mean(mean_scores),
            'median_score': np.mean(median_scores),
            'mean_scores': np.array(mean_scores),
            'median_scores': np.array(median_scores),
            'mean_predictions': mean_predictions,
            'median_predictions': median_predictions,
            'y_cvs': y_cvs
        })
Example #18
File: data.py, Project: Keesiu/meta-kaggle
def accumulate_data(settings, target, data_type, tag, output_to_original_data_dir=False, quiet=False, meta_only=False):
    output_dir = settings.data_dir if output_to_original_data_dir else settings.cache_dir
    filename_out = single_filename_builder(target, data_type, output_dir, tag)
    orig_filename_in = single_filename_builder(target, data_type, settings.data_dir)

    def collect_meta(filename):
        meta = {}
        with h5py.File(filename, 'r') as f:
            meta['num_segments'] = f['X'].shape[0]
            if 'sequence' in list(f.keys()):
                meta['sequence'] = f['sequence'][:]
            for k, v in f['X'].attrs.items():
                meta[k] = v
        return meta

    # load already processed output file
    if os.path.exists(filename_out):
        # pull meta off original data
        meta = collect_meta(orig_filename_in)

        # pull X data off processed data
        with h5py.File(filename_out, 'r') as f:
            meta['X_shape'] = f['X'].shape
            X = f['X'][:] if not meta_only else None
            if not quiet: print('from cache ...', end=' ')
            return X, jsdict(meta)
    else:
        # get ready to process all segments into 1 file, starting with getting the meta-data ready
        if not quiet: print('processing ...', end=' ')
        pid = os.getpid()
        filename_in_fmt = segment_filename_builder(target, data_type, output_dir, tag)

        orig_filename_in = single_filename_builder(target, data_type, settings.data_dir)

        # meta-data is collected differently when doing the first data conversion from mat to hdf5
        if output_to_original_data_dir:
            print('Collecting metadata...')
            # Creating original files... pull metadata off first one, and also collect sequences
            meta = None
            sequence = []
            num_segments = 0
            for i in range(0, sys.maxsize, 1):
                filename = filename_in_fmt % i
                if not os.path.exists(filename):
                    if num_segments == 0:
                        print('Could not find file', filename)
                        sys.exit(1)
                    break

                with h5py.File(filename, 'r') as f_in:
                    meta_attrs = f_in['__metadata'].attrs
                    if 'sequence' in meta_attrs:
                        sequence.append(meta_attrs['sequence'])

                    if meta is None:
                        meta = {}
                        meta['channels'] = f_in['channels'][:]
                        for key in list(meta_attrs.keys()):
                            if key != 'sequence':
                                meta[key] = meta_attrs[key]
                num_segments += 1

            if len(sequence) > 0:
                meta['sequence'] = sequence

            meta['num_segments'] = num_segments

            print('Accumulating segments...')
        else:
            # pull metadata off the original data files
            meta = collect_meta(orig_filename_in)

        # now accumulate X data to a single file
        num_segments = meta['num_segments']
        filename_out_temp = '%s.pid.%d.tmp' % (filename_out, pid) if filename_out is not None else None
        with h5py.File(filename_out_temp, 'w-', libver='latest') as f_out:
            X_out = None
            for i in range(num_segments):
                with h5py.File(filename_in_fmt % i, 'r') as f_in:
                    X_in = f_in['X']
                    # init X_out
                    if X_out is None:
                        X_out = f_out.create_dataset('X', shape=[num_segments] + list(X_in.shape), dtype=X_in.dtype)
                        meta['X_shape'] = X_out.shape
                        for k, v in meta.items():
                            X_out.attrs[k] = v

                    X_out[i] = X_in[:]
            X = X_out[:]

        # finalize
        os.rename(filename_out_temp, filename_out)
        # clean up
        for i in range(num_segments):
            try:
                os.remove(filename_in_fmt % i)
            except OSError:
                pass  # segment file already gone

        return X, jsdict(meta)