Example #1
0
def get_data(mode, max_samples, n_process, complex_feature_channels,
             max_bckg_samps_per_file, use_simple_hand_engineered_features,
             random_under_sample_data_gen, num_seconds, ref="01_tcp_ar",
             num_files=None, freq_bins=[0, 3.5, 7.5, 14, 20, 25, 40],
             include_simple_coherence=True):
    eds = getDataSampleGenerator()
    train_label_files_segs = eds.get_train_split()
    test_label_files_segs = eds.get_test_split()
    valid_label_files_segs = eds.get_valid_split()

    # use extra workers here since segment sampling is IO-bound
    def sample_split(segment_file_tuples):
        return er.EdfDatasetSegmentedSampler(
            segment_file_tuples=segment_file_tuples, mode=mode,
            random_under_sample=random_under_sample_data_gen,
            num_samples=max_samples,
            max_bckg_samps_per_file=max_bckg_samps_per_file,
            n_process=int(n_process * 2),
            gap=num_seconds * pd.Timedelta(seconds=1))[:]

    train_edss = sample_split(train_label_files_segs)
    valid_edss = sample_split(valid_label_files_segs)
    test_edss = sample_split(test_label_files_segs)
    def simple_edss(edss):
        '''
        Use only a few columns so that we don't make 21*20 coherence pairs
        '''
        all_channels = util_funcs.get_common_channel_names()
        subset_channels = [all_channels.index(channel) for channel in complex_feature_channels]
        return [(datum[0][:, subset_channels], datum[1]) for datum in edss]
    if include_simple_coherence:
        def coherence_features(edss):
            coher = wfdata.CoherenceTransformer(
                simple_edss(edss),
                columns_to_use=constants.SYMMETRIC_COLUMN_SUBSET,
                n_process=n_process, is_pandas=False)[:]
            return np.stack([datum[0].values for datum in coher])

        trainCoherData = coherence_features(train_edss)
        validCoherData = coherence_features(valid_edss)
        testCoherData = coherence_features(test_edss)
    if use_simple_hand_engineered_features:
        def shed_features(edss):
            return wfdata.SimpleHandEngineeredDataset(
                simple_edss(edss), n_process=n_process, is_pandas_data=False,
                features=[tsf.abs_energy, tsf.sample_entropy,
                          lambda x: tsf.number_cwt_peaks(
                              x, int(constants.COMMON_FREQ / 25))],
                f_names=["abs_energy", "entropy", "num_peaks"],
                vectorize="full")[:]

        trainSHED = shed_features(train_edss)
        validSHED = shed_features(valid_edss)
        testSHED = shed_features(test_edss)

    train_edss = read.Flattener(read.EdfFFTDatasetTransformer(
        train_edss, freq_bins=freq_bins, is_pandas_data=False),
        n_process=n_process)[:]
    valid_edss = read.Flattener(read.EdfFFTDatasetTransformer(
        valid_edss, freq_bins=freq_bins, is_pandas_data=False),
        n_process=n_process)[:]
    test_edss = read.Flattener(read.EdfFFTDatasetTransformer(
        test_edss, freq_bins=freq_bins, is_pandas_data=False),
        n_process=n_process)[:]
    def split_tuples(data):
        return np.stack([datum[0] for datum in data]), np.stack([datum[1] for datum in data])
    train_edss, train_labels = split_tuples(train_edss)
    valid_edss, valid_labels = split_tuples(valid_edss)
    test_edss, test_labels = split_tuples(test_edss)


    if include_simple_coherence:
        train_edss = np.hstack([train_edss, trainCoherData])
        valid_edss = np.hstack([valid_edss, validCoherData])
        test_edss = np.hstack([test_edss, testCoherData])

    if use_simple_hand_engineered_features:
        train_edss = np.hstack([train_edss, np.stack(trainSHED)])
        valid_edss = np.hstack([valid_edss, np.stack(validSHED)])
        test_edss = np.hstack([test_edss, np.stack(testSHED)])


    print("Data Shape:", train_edss.shape)

    # some features return NaNs (likely from a log applied to non-positive
    # values), so zero them out before returning
    return (np.nan_to_num(train_edss), train_labels), \
        (np.nan_to_num(valid_edss), valid_labels), \
        (np.nan_to_num(test_edss), test_labels)
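
# Minimal usage sketch for the loader above; every argument value here is an
# illustrative assumption (mode string, channel names, counts), not a value
# taken from this repo.
if __name__ == "__main__":
    (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = get_data(
        mode="detect",  # assumed mode label for EdfDatasetSegmentedSampler
        max_samples=1000,
        n_process=4,
        complex_feature_channels=["EEG FP1-REF", "EEG FP2-REF"],
        max_bckg_samps_per_file=10,
        use_simple_hand_engineered_features=True,
        random_under_sample_data_gen=True,
        num_seconds=4)
    print("train:", train_x.shape, "valid:", valid_x.shape,
          "test:", test_x.shape)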
Example #2
0
def get_data(split, ref="01_tcp_ar", num_files=None, freq_bins=[0,3.5,7.5,14,20,25,40], columns_to_use=constants.SYMMETRIC_COLUMN_SUBSET, n_process=4, include_simple_coherence=True, filter=True):
    genderDictItems = cta.getGenderAndFileNames(split, ref)
    clinicalTxtPaths = [genderDictItem[0]
                        for genderDictItem in genderDictItems]
    singGenders = [genderDictItem[1] for genderDictItem in genderDictItems]
    tokenFiles = []
    genders = []  # repeat each session's gender once per token file
    for i, txtPath in enumerate(clinicalTxtPaths):
        session_dir = path.dirname(txtPath)
        session_tkn_files = sorted(read.get_token_file_names(session_dir))
        tokenFiles += session_tkn_files
        genders += [singGenders[i]] * len(session_tkn_files)
    edfRawData = read.EdfDataset(
        split, ref, num_files=num_files, columns_to_use=columns_to_use, expand_tse=False, filter=filter)
    edfRawData.edf_tokens = tokenFiles[:num_files]
    edfFFTData = read.EdfFFTDatasetTransformer(
        edfRawData, n_process=n_process, freq_bins=freq_bins, return_ann=False)
    fullData = edfFFTData[:]
    # encode gender as a binary label: 1 for male, 0 for female
    genders = [1 if gender == 'm' else 0 for gender in genders]

    toReturnData = np.stack([datum.values.reshape(-1) for datum in fullData])

    if include_simple_coherence:
        coherData = wfdata.CoherenceTransformer(edfRawData, columns_to_use=columns_to_use, n_process=n_process)
        fullCoherData = [datum[0] for datum in coherData[:]]
        fullCoherData = np.stack([datum.values for datum in fullCoherData])
        toReturnData = np.hstack([toReturnData, fullCoherData])


    return toReturnData, \
        np.array(genders).reshape(-1, 1)[:num_files]
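
# Hypothetical usage sketch: the loader returns an (n_tokens, n_features)
# matrix plus an (n_tokens, 1) binary gender vector, which plugs straight
# into an sklearn estimator (the split name "train" is an assumption).
from sklearn.ensemble import RandomForestClassifier

X, y = get_data("train", num_files=50, n_process=4)
clf = RandomForestClassifier(n_estimators=100).fit(X, y.ravel())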
Example #3
0
def get_data(split,
             ref,
             n_process,
             precache,
             num_files,
             window,
             non_overlapping,
             return_mode,
             filter,
             use_multiple_tokens_per_session=False):
    if window is not None:
        window = window * pd.Timedelta(seconds=1)
    if return_mode == "age":
        ageData = cta.getAgesAndFileNames(split, ref)
    elif return_mode == "bpm":
        # not really agedata, bpmdata really
        ageData = cta.getBPMAndFileNames(split, ref)
    if num_files is not None:
        ageData = ageData[0:num_files]
    clinical_txt_paths = [ageDatum[0] for ageDatum in ageData]
    ages = [ageDatum[1] for ageDatum in ageData]

    # associate first token file with each session for now
    tokenFiles = []
    for session_file in clinical_txt_paths:
        session_dir = path.dirname(session_file)
        session_tkn_files = sorted(read.get_token_file_names(session_dir))
        tokenFiles.append(session_tkn_files[0])
    # discarding the annotations eventually
    edfReader = read.EdfDataset(split, ref, expand_tse=False, filter=filter)
    edfReader.edf_tokens = tokenFiles  # override to use only eegs with ages we have
    edfFFTData = read.EdfFFTDatasetTransformer(edf_dataset=edfReader,
                                               n_process=n_process,
                                               precache=precache,
                                               window_size=window,
                                               non_overlapping=non_overlapping,
                                               return_ann=False)
    data = edfFFTData[:]
    if window is not None:
        for i, datum in enumerate(data):
            data[i] = datum.transpose(1, 0, 2).reshape(datum.shape[1], -1)
    return data, ages, clinical_txt_paths
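
# Minimal numpy sketch of the windowed reshape above: an FFT tensor of shape
# (n_windows, n_channels, n_bins) becomes (n_channels, n_windows * n_bins),
# i.e. one flattened time-frequency row per channel.
import numpy as np

datum = np.zeros((5, 21, 7))  # 5 windows, 21 channels, 7 freq bins
flat = datum.transpose(1, 0, 2).reshape(datum.shape[1], -1)
assert flat.shape == (21, 5 * 7)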
Example #4
0
def get_data(num_eegs, data_split, ref, precached_pkl):
    """Returns the data to process.

    Parameters
    ----------
    num_eegs : type
        Description of parameter `num_eegs`.
    data_split : type
        Description of parameter `data_split`.
    ref : type
        Description of parameter `ref`.

    Returns
    -------
    tuple
        first elem is data freq coefficients of (num_instances, num_freq_bins),
        second elem is annotation percentages of (num_instances, annotations)

    """
    if precached_pkl is None:
        edf_dataset = read.EdfDataset(data_split, ref, num_files=num_eegs)
        fft_reader = read.EdfFFTDatasetTransformer(edf_dataset=edf_dataset,
                                                   precache=True)
        data = fft_reader[0:num_eegs]
        annotations = np.array([datum[1].mean().values for datum in data])
    else:
        data = pkl.load(open(precached_pkl, 'rb'))[0:num_eegs]
        annotations = []
        for i in range(len(data)):
            annotations.append(
                read.expand_tse_file(
                    data[i][1],
                    pd.Series(list(range(int(data[i][1].end.max())))) *
                    pd.Timedelta(seconds=1)).mean().values)
        annotations = np.array(annotations)
    # keep only channels that are NaN-free in every recording
    cols = [set(datum[0].dropna(axis=1).columns) for datum in data]
    common_cols = list(cols[0].intersection(*cols))
    x_data = np.array([datum[0][common_cols].values for datum in data])
    x_data = x_data.reshape(x_data.shape[0], -1)
    return x_data, annotations
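
# Minimal pandas sketch of the common-column step above: intersecting each
# recording's NaN-free columns yields one shared channel set, so np.array
# can stack rectangular matrices. The toy frames are illustrative only.
import numpy as np
import pandas as pd

frames = [pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]}),
          pd.DataFrame({"a": [5.0, 6.0], "b": [np.nan, 7.0]})]
cols = [set(f.dropna(axis=1).columns) for f in frames]
common = sorted(cols[0].intersection(*cols))  # -> ["a"]
x = np.array([f[common].values for f in frames])
assert x.shape == (2, 2, 1)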
Example #5
0
def run_rf(use_combined, use_random_ensemble, combined_split, freq_bins,
           max_train_rf_samps, n_process, rf_data_pickle, use_cached_pkl):
    if not use_combined or not use_random_ensemble:
        raise NotImplementedError("Have not completed this flow yet")
    else:
        if not use_cached_pkl or not path.exists(rf_data_pickle):
            trainEdfData, trainGender = split_data_gender(
                get_data(combined_split, is_test=False,
                         is_valid=False)[0][:max_train_rf_samps])
            trainEdfData = read.EdfFFTDatasetTransformer(trainEdfData,
                                                         return_numpy=True,
                                                         is_tuple_data=False,
                                                         is_pandas_data=False,
                                                         freq_bins=freq_bins,
                                                         n_process=n_process)
            trainEdfData.verbosity = 200
            trainEdfData = trainEdfData[:]
            validEdfData, validGender = split_data_gender(
                get_data(combined_split, is_test=False,
                         is_valid=True)[0][:max_train_rf_samps])
            validEdfData = read.EdfFFTDatasetTransformer(validEdfData,
                                                         return_numpy=True,
                                                         is_tuple_data=False,
                                                         is_pandas_data=False,
                                                         freq_bins=freq_bins,
                                                         n_process=n_process)
            validEdfData.verbosity = 200
            validEdfData = validEdfData[:]

            trainSize = len(trainEdfData)
            validSize = len(validEdfData)
            trainValidData = np.vstack(
                [np.stack(trainEdfData),
                 np.stack(validEdfData)])
            trainValidData = trainValidData.reshape(trainValidData.shape[0],
                                                    -1)
            trainValidGender = np.hstack(
                [np.array(trainGender),
                 np.array(validGender)]).reshape(-1, 1)
            # deallocate memory so o2 doesn't kick this out when we try to start training, etc.
            del trainEdfData
            del validEdfData
            pkl.dump((trainSize, validSize, trainValidData, trainValidGender),
                     open(rf_data_pickle, 'wb'))
        else:
            trainSize, validSize, trainValidData, trainValidGender = pkl.load(
                open(rf_data_pickle, 'rb'))

        rf = RandomForestClassifier()
        # -1 entries stay in the training set on every split; the 0 entries
        # form the single validation fold
        preSplit = PredefinedSplit([-1 for i in range(trainSize)] +
                                   [0 for i in range(validSize)])
        rf_parameters = {
            'criterion': ["gini", "entropy"],
            'n_estimators': [50, 100, 200, 400],
            'max_features': ['auto', 'log2', .1, .4],
            'max_depth': [None, 4, 8, 12],
            'min_samples_split': [2, 4, 8],
            'n_jobs': [1],
            'min_weight_fraction_leaf': [0, 0.2]
        }
        gridsearch = GridSearchCV(rf,
                                  rf_parameters,
                                  scoring=make_scorer(f1_score),
                                  cv=preSplit,
                                  n_jobs=n_process)
        # ravel to a 1-D label vector to avoid sklearn's column-vector warning
        gridsearch.fit(trainValidData, trainValidGender.ravel())

        if not use_cached_pkl or not path.exists("test_" + rf_data_pickle):
            testEdfData, testGender = split_data_gender(
                get_data(combined_split, is_test=True,
                         is_valid=False)[0][:max_train_rf_samps])
            testEdfData = read.EdfFFTDatasetTransformer(testEdfData,
                                                        return_numpy=True,
                                                        is_tuple_data=False,
                                                        is_pandas_data=False,
                                                        freq_bins=freq_bins,
                                                        n_process=n_process)
            testEdfData.verbosity = 200
            testEdfData = testEdfData[:]
            pkl.dump((testEdfData, testGender),
                     open("test_" + rf_data_pickle, 'wb'))
        else:
            testEdfData, testGender = pkl.load(
                open("test_" + rf_data_pickle, 'rb'))

        y_pred = gridsearch.predict(
            np.stack(testEdfData).reshape(len(testEdfData), -1))
        toReturn = {
            'f1_score': f1_score(testGender, y_pred),
            'auc': roc_auc_score(testGender, y_pred),
            'mcc': matthews_corrcoef(testGender, y_pred),
            'accuracy': accuracy_score(testGender, y_pred)
        }

        trainEdfTokens, validEdfTokens, testEdfTokens, trainGenders, validGenders, _testGendersCopy = get_test_train_split_from_combined(
        )

        testEdfEnsembler = get_base_dataset("combined",
                                            labels=_testGendersCopy,
                                            edfTokenPaths=testEdfTokens,
                                            is_test=True)
        y_pred = gridsearch.best_estimator_.predict_proba(
            np.stack(testEdfData).reshape(len(testEdfData), -1))
        label, average_pred = testEdfEnsembler.dataset.getEnsemblePrediction(
            y_pred, mode=er.EdfDatasetEnsembler.ENSEMBLE_PREDICTION_EQUAL_VOTE)

        pred = np.round(average_pred)
        toReturn["ensemble_score"] = {}
        toReturn["ensemble_score"]["auc"] = roc_auc_score(label, pred)
        toReturn["ensemble_score"]["acc"] = accuracy_score(label, pred)
        toReturn["ensemble_score"]["f1_score"] = f1_score(label, pred)
        toReturn["ensemble_score"]["discordance"] = np.abs(
            pred - average_pred).mean()
        toReturn["ensemble_score"]["equal_vote"]["auc"] = roc_auc_score(
            label, pred
        )  #keep auc here as well in top level for compatibility reasons when comparing
        toReturn["ensemble_score"]["equal_vote"]["acc"] = accuracy_score(
            label, pred)
        toReturn["ensemble_score"]["equal_vote"]["f1_score"] = f1_score(
            label, pred)
        toReturn["ensemble_score"]["equal_vote"]["discordance"] = np.abs(
            pred - average_pred).mean()

        label, average_over_all_pred = testEdfEnsembler.getEnsemblePrediction(
            y_pred,
            mode=er.EdfDatasetEnsembler.ENSEMBLE_PREDICTION_OVER_EACH_SAMP)
        pred = np.round(average_over_all_pred)
        toReturn["ensemble_score"]["over_all"] = {
            "auc": roc_auc_score(label, pred),
            "acc": accuracy_score(label, pred),
            "f1_score": f1_score(label, pred),
            "discordance": np.abs(pred - average_over_all_pred).mean(),
        }

        return toReturn
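
# Minimal sketch of the PredefinedSplit convention used above: entries of -1
# always stay in the training set, and entries of 0 form the single held-out
# fold, so GridSearchCV fits on train and scores on valid exactly once
# instead of cross-validating.
import numpy as np
from sklearn.model_selection import PredefinedSplit

test_fold = np.array([-1] * 3 + [0] * 2)  # 3 train samples, 2 valid samples
for train_idx, valid_idx in PredefinedSplit(test_fold).split():
    print(train_idx, valid_idx)  # -> [0 1 2] [3 4]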