def get_data(mode,
             max_samples,
             n_process,
             complex_feature_channels,
             max_bckg_samps_per_file,
             use_simple_hand_engineered_features,
             random_under_sample_data_gen,
             num_seconds,
             ref="01_tcp_ar",
             num_files=None,
             freq_bins=None,
             include_simple_coherence=True,):
    """Build train/valid/test feature matrices from segmented EDF samples.

    For each split the segments are sampled with
    ``er.EdfDatasetSegmentedSampler``, turned into flattened FFT band features
    and optionally extended (column-wise) with coherence features and simple
    hand engineered features computed on ``complex_feature_channels`` only.

    Parameters
    ----------
    mode : passed through to ``er.EdfDatasetSegmentedSampler``.
    max_samples : passed to the sampler as ``num_samples``.
    n_process : number of worker processes; the sampler gets twice as many.
    complex_feature_channels : channel names (must be present in
        ``util_funcs.get_common_channel_names()``) used for the coherence and
        hand engineered features.
    max_bckg_samps_per_file : passed through to the sampler.
    use_simple_hand_engineered_features : bool, append abs energy, sample
        entropy and CWT peak count features.
    random_under_sample_data_gen : passed to the sampler as
        ``random_under_sample``.
    num_seconds : segment gap in seconds (converted to ``pd.Timedelta``).
    ref : unused by this function; kept for interface compatibility.
    num_files : unused by this function; kept for interface compatibility.
    freq_bins : frequency band edges for the FFT transformer. ``None`` (the
        default) means ``[0, 3.5, 7.5, 14, 20, 25, 40]``.
    include_simple_coherence : bool, append coherence features.

    Returns
    -------
    tuple
        ``((train_x, train_y), (valid_x, valid_y), (test_x, test_y))`` where
        every ``*_x`` has been passed through ``np.nan_to_num``.
    """
    # Avoid a mutable default argument shared between calls.
    if freq_bins is None:
        freq_bins = [0, 3.5, 7.5, 14, 20, 25, 40]

    eds = getDataSampleGenerator()
    train_label_files_segs = eds.get_train_split()
    test_label_files_segs = eds.get_test_split()
    valid_label_files_segs = eds.get_valid_split()

    def sample_segments(segment_file_tuples):
        """Materialize the sampled segments of one split."""
        # increased n_process to deal with io processing
        return er.EdfDatasetSegmentedSampler(
            segment_file_tuples=segment_file_tuples,
            mode=mode,
            random_under_sample=random_under_sample_data_gen,
            num_samples=max_samples,
            max_bckg_samps_per_file=max_bckg_samps_per_file,
            n_process=int(n_process * 2),
            gap=num_seconds * pd.Timedelta(seconds=1))[:]

    def simple_edss(edss):
        '''
        Use only a few columns so that we don't make 21*20 coherence pairs
        '''
        all_channels = util_funcs.get_common_channel_names()
        subset_channels = [
            all_channels.index(channel) for channel in complex_feature_channels
        ]
        return [(datum[0][:, subset_channels], datum[1]) for datum in edss]

    def coherence_features(edss):
        """Stack the coherence features of one split into a single array."""
        transformed = wfdata.CoherenceTransformer(
            simple_edss(edss),
            columns_to_use=constants.SYMMETRIC_COLUMN_SUBSET,
            n_process=n_process,
            is_pandas=False)[:]
        return np.stack([datum[0].values for datum in transformed])

    def hand_engineered_features(edss):
        """Compute the simple hand engineered features of one split."""
        return wfdata.SimpleHandEngineeredDataset(
            simple_edss(edss),
            n_process=n_process,
            is_pandas_data=False,
            features=[
                tsf.abs_energy,
                tsf.sample_entropy,
                lambda x: tsf.number_cwt_peaks(
                    x, int(constants.COMMON_FREQ/25)),
            ],
            f_names=["abs_energy", "entropy", "num_peaks"],
            vectorize="full")[:]

    def fft_features(edss):
        """Return the flattened FFT band features of one split."""
        return read.Flattener(
            read.EdfFFTDatasetTransformer(
                edss, freq_bins=freq_bins, is_pandas_data=False),
            n_process=n_process)[:]

    def split_tuples(data):
        """Separate (features, label) pairs into two stacked arrays."""
        return (np.stack([datum[0] for datum in data]),
                np.stack([datum[1] for datum in data]))

    # Every stage runs train -> valid -> test, same as the original flow.
    train_edss = sample_segments(train_label_files_segs)
    valid_edss = sample_segments(valid_label_files_segs)
    test_edss = sample_segments(test_label_files_segs)

    # The extra features need the raw segments, so compute them before the
    # *_edss names are rebound to the FFT output below.
    if include_simple_coherence:
        trainCoherData = coherence_features(train_edss)
        validCoherData = coherence_features(valid_edss)
        testCoherData = coherence_features(test_edss)

    if use_simple_hand_engineered_features:
        trainSHED = hand_engineered_features(train_edss)
        validSHED = hand_engineered_features(valid_edss)
        testSHED = hand_engineered_features(test_edss)

    train_edss = fft_features(train_edss)
    valid_edss = fft_features(valid_edss)
    test_edss = fft_features(test_edss)

    train_edss, train_labels = split_tuples(train_edss)
    valid_edss, valid_labels = split_tuples(valid_edss)
    test_edss, test_labels = split_tuples(test_edss)

    if include_simple_coherence:
        train_edss = np.hstack([train_edss, trainCoherData])
        valid_edss = np.hstack([valid_edss, validCoherData])
        test_edss = np.hstack([test_edss, testCoherData])

    if use_simple_hand_engineered_features:
        train_edss = np.hstack([train_edss, np.stack(trainSHED)])
        valid_edss = np.hstack([valid_edss, np.stack(validSHED)])
        test_edss = np.hstack([test_edss, np.stack(testSHED)])

    print("Data Shape:", train_edss.shape)

    # some of the features are returning nans (assuming there is a log that
    # may not play well?)
    return (np.nan_to_num(train_edss), train_labels), \
        (np.nan_to_num(valid_edss), valid_labels), \
        (np.nan_to_num(test_edss), test_labels)
def get_data(split,
             ref="01_tcp_ar",
             num_files=None,
             freq_bins=None,
             columns_to_use=constants.SYMMETRIC_COLUMN_SUBSET,
             n_process=4,
             include_simple_coherence=True,
             filter=True):
    """Return FFT (and optionally coherence) features with gender labels.

    Parameters
    ----------
    split : data split name handed to ``cta.getGenderAndFileNames`` and
        ``read.EdfDataset``.
    ref : reference/montage name handed to the same two calls.
    num_files : optional cap on the number of token files (and labels) used;
        ``None`` keeps everything.
    freq_bins : frequency band edges for the FFT transformer. ``None`` (the
        default) means ``[0, 3.5, 7.5, 14, 20, 25, 40]``.
    columns_to_use : channel subset for the dataset and coherence transformer.
    n_process : number of worker processes.
    include_simple_coherence : bool, append coherence features column-wise.
    filter : passed through to ``read.EdfDataset``.

    Returns
    -------
    tuple
        ``(features, genders)``; ``features`` has one flattened row per token
        file and ``genders`` is a column vector with 1 for ``'m'`` and 0 for
        anything else, truncated to ``num_files`` rows.
    """
    # Avoid a mutable default argument shared between calls.
    if freq_bins is None:
        freq_bins = [0, 3.5, 7.5, 14, 20, 25, 40]

    genderDictItems = cta.getGenderAndFileNames(split, ref)

    tokenFiles = []
    genders = []
    # A session can own several token files: repeat the session's gender once
    # per token file so labels stay aligned with tokenFiles.
    for genderDictItem in genderDictItems:
        txtPath = genderDictItem[0]
        sessionGender = genderDictItem[1]
        session_tkn_files = sorted(
            read.get_token_file_names(path.dirname(txtPath)))
        tokenFiles += session_tkn_files
        genders += [sessionGender] * len(session_tkn_files)

    edfRawData = read.EdfDataset(
        split,
        ref,
        num_files=num_files,
        columns_to_use=columns_to_use,
        expand_tse=False,
        filter=filter)
    # Override the dataset's tokens so only files with a known gender are read.
    edfRawData.edf_tokens = tokenFiles[:num_files]
    edfFFTData = read.EdfFFTDatasetTransformer(
        edfRawData, n_process=n_process, freq_bins=freq_bins, return_ann=False)
    fullData = edfFFTData[:]

    # transform to number
    genders = [1 if gender == 'm' else 0 for gender in genders]
    toReturnData = np.stack([datum.values.reshape(-1) for datum in fullData])

    if include_simple_coherence:
        coherData = wfdata.CoherenceTransformer(
            edfRawData, columns_to_use=columns_to_use, n_process=n_process)
        fullCoherData = np.stack([datum[0].values for datum in coherData[:]])
        toReturnData = np.hstack([toReturnData, fullCoherData])

    return toReturnData, \
        np.array(genders).reshape(-1, 1)[:num_files]
def get_data(split,
             ref,
             n_process,
             precache,
             num_files,
             window,
             non_overlapping,
             return_mode,
             filter,
             use_multiple_tokens_per_session=False):
    """Load FFT features together with a per-session regression target.

    Parameters
    ----------
    split : data split name handed to ``cta`` and ``read.EdfDataset``.
    ref : reference/montage name handed to the same calls.
    n_process : number of worker processes for the FFT transformer.
    precache : currently ignored, the transformer is always built with
        ``precache=True``.
    num_files : optional cap on the number of sessions; ``None`` keeps all.
    window : window length in seconds, or ``None`` for no windowing.
    non_overlapping : passed through to the FFT transformer.
    return_mode : ``"age"`` or ``"bpm"``, selects which target is loaded.
    filter : passed through to ``read.EdfDataset``.
    use_multiple_tokens_per_session : currently ignored, only the first token
        file of each session is used.

    Returns
    -------
    tuple
        ``(data, targets, clinical_txt_paths)``, one entry per session.

    Raises
    ------
    ValueError
        If ``return_mode`` is neither ``"age"`` nor ``"bpm"``.
    """
    if window is not None:
        window = window * pd.Timedelta(seconds=1)

    if return_mode == "age":
        ageData = cta.getAgesAndFileNames(split, ref)
    elif return_mode == "bpm":
        # not really agedata, bpmdata really
        ageData = cta.getBPMAndFileNames(split, ref)
    else:
        # Previously this fell through and crashed later on an unbound local.
        raise ValueError(
            "Unsupported return_mode {!r}; expected 'age' or 'bpm'".format(
                return_mode))

    if num_files is not None:
        ageData = ageData[0:num_files]
    clinical_txt_paths = [ageDatum[0] for ageDatum in ageData]
    ages = [ageDatum[1] for ageDatum in ageData]

    # associate first token file with each session for now
    tokenFiles = [
        sorted(read.get_token_file_names(path.dirname(session_file)))[0]
        for session_file in clinical_txt_paths
    ]

    # discarding the annotations eventually
    edfReader = read.EdfDataset(split, ref, expand_tse=False, filter=filter)
    # override to use only eegs with ages we have
    edfReader.edf_tokens = tokenFiles
    # NOTE(review): precache is hard-coded to True here, so the ``precache``
    # argument has no effect; confirm whether that is intended.
    edfFFTData = read.EdfFFTDatasetTransformer(
        edf_dataset=edfReader,
        n_process=n_process,
        precache=True,
        window_size=window,
        non_overlapping=non_overlapping,
        return_ann=False)
    data = edfFFTData[:]

    if window is not None:
        # Swap the first two axes and flatten everything after the new
        # leading axis into one feature dimension.
        for i, datum in enumerate(data):
            data[i] = datum.transpose(1, 0, 2).reshape(datum.shape[1], -1)
    return data, ages, clinical_txt_paths
def get_data(num_eegs, data_split, ref, precached_pkl):
    """Returns the data to process.

    Parameters
    ----------
    num_eegs : int or None
        Maximum number of recordings to use (applied as a slice bound).
    data_split : str
        Data split name passed to ``read.EdfDataset``; only used when
        ``precached_pkl`` is None.
    ref : str
        Reference name passed to ``read.EdfDataset``; only used when
        ``precached_pkl`` is None.
    precached_pkl : str or None
        Path to a pickle holding a sequence of ``(fft_frame, annotation)``
        pairs. When None the data is read and transformed from scratch.

    Returns
    -------
    tuple
        first elem is data freq coefficients flattened to one row per
        instance, restricted to the columns every instance has in common;
        second elem is the per-instance mean of the annotations of
        (num_instances, annotations)
    """
    if precached_pkl is None:
        edf_dataset = read.EdfDataset(data_split, ref, num_files=num_eegs)
        fft_reader = read.EdfFFTDatasetTransformer(
            edf_dataset=edf_dataset, precache=True)
        data = fft_reader[0:num_eegs]
        annotations = np.array([datum[1].mean().values for datum in data])
    else:
        # NOTE: unpickling executes arbitrary code; only load trusted caches.
        with open(precached_pkl, 'rb') as pkl_file:
            data = pkl.load(pkl_file)[0:num_eegs]
        # Cached annotations are expanded onto a one-second grid up to the
        # largest ``end`` value before they are averaged.
        annotations = np.array([
            read.expand_tse_file(
                datum[1],
                pd.Series(list(range(int(datum[1].end.max())))) *
                pd.Timedelta(seconds=1)).mean().values
            for datum in data
        ])

    # Keep only the columns present (after dropna) in every instance so the
    # frames can be stacked into one array.
    cols = [set(datum[0].dropna().columns) for datum in data]
    common_cols = list(cols[0].intersection(*cols))
    x_data = np.array([datum[0][common_cols].values for datum in data])
    x_data = x_data.reshape(x_data.shape[0], -1)
    return x_data, annotations
def run_rf(use_combined, use_random_ensemble, combined_split, freq_bins,
           max_train_rf_samps, n_process, rf_data_pickle, use_cached_pkl):
    """Grid-search a random forest on FFT features and score it on test data.

    Parameters
    ----------
    use_combined, use_random_ensemble : bool
        Both must be truthy; any other combination is not implemented.
    combined_split : passed to ``get_data`` when loading each split.
    freq_bins : frequency band edges for ``read.EdfFFTDatasetTransformer``.
    max_train_rf_samps : cap on the number of samples taken from each split
        (train, valid and test alike).
    n_process : worker count for the FFT transformer and the grid search.
    rf_data_pickle : path of the train/valid cache; the test cache lives at
        ``"test_" + rf_data_pickle``.
    use_cached_pkl : bool, reuse the caches when they exist. The caches are
        not keyed on ``freq_bins`` or ``max_train_rf_samps``.

    Returns
    -------
    dict
        Per-sample ``f1_score``/``auc``/``mcc``/``accuracy`` plus an
        ``ensemble_score`` dict with top-level equal-vote metrics and
        ``equal_vote`` / ``over_all`` sub-dicts.

    Raises
    ------
    NotImplementedError
        If ``use_combined`` or ``use_random_ensemble`` is falsy.
    """
    if not use_combined or not use_random_ensemble:
        # NotImplemented is a comparison sentinel, not an exception class.
        raise NotImplementedError("Have not completed this flow yet")

    def load_fft_split(is_test, is_valid):
        """Load one split and return (fft_features, genders)."""
        edfData, gender = split_data_gender(
            get_data(combined_split, is_test=is_test,
                     is_valid=is_valid)[0][:max_train_rf_samps])
        transformer = read.EdfFFTDatasetTransformer(
            edfData,
            return_numpy=True,
            is_tuple_data=False,
            is_pandas_data=False,
            freq_bins=freq_bins,
            n_process=n_process)
        transformer.verbosity = 200
        return transformer[:], gender

    def ensemble_metrics(label, pred, average):
        """Score rounded ensemble predictions against their raw averages."""
        return {
            "auc": roc_auc_score(label, pred),
            "acc": accuracy_score(label, pred),
            "f1_score": f1_score(label, pred),
            "discordance": np.abs(pred - average).mean(),
        }

    if not use_cached_pkl or not path.exists(rf_data_pickle):
        trainEdfData, trainGender = load_fft_split(
            is_test=False, is_valid=False)
        validEdfData, validGender = load_fft_split(
            is_test=False, is_valid=True)
        trainSize = len(trainEdfData)
        validSize = len(validEdfData)
        trainValidData = np.vstack(
            [np.stack(trainEdfData),
             np.stack(validEdfData)])
        trainValidData = trainValidData.reshape(trainValidData.shape[0], -1)
        trainValidGender = np.hstack(
            [np.array(trainGender),
             np.array(validGender)]).reshape(-1, 1)
        # deallocate memory so o2 doesn't kick this out when we try to start
        # training, etc.
        del trainEdfData
        del validEdfData
        with open(rf_data_pickle, 'wb') as cache_file:
            pkl.dump((trainSize, validSize, trainValidData, trainValidGender),
                     cache_file)
    else:
        with open(rf_data_pickle, 'rb') as cache_file:
            trainSize, validSize, trainValidData, trainValidGender = pkl.load(
                cache_file)

    rf = RandomForestClassifier()
    # PredefinedSplit: -1 keeps a sample in every training set, 0 puts it in
    # test fold 0. Train rows must be -1 and validation rows 0; the previous
    # labels were swapped, so models were fit on valid and scored on train.
    preSplit = PredefinedSplit([-1] * trainSize + [0] * validSize)
    rf_parameters = {
        'criterion': ["gini", "entropy"],
        'n_estimators': [50, 100, 200, 400],
        'max_features': ['auto', 'log2', .1, .4],
        'max_depth': [None, 4, 8, 12],
        'min_samples_split': [2, 4, 8],
        'n_jobs': [1],
        'min_weight_fraction_leaf': [0, 0.2]
    }
    gridsearch = GridSearchCV(
        rf,
        rf_parameters,
        scoring=make_scorer(f1_score),
        cv=preSplit,
        n_jobs=n_process)
    gridsearch.fit(trainValidData, trainValidGender)

    test_pickle = "test_" + rf_data_pickle
    if not use_cached_pkl or not path.exists(test_pickle):
        testEdfData, testGender = load_fft_split(is_test=True, is_valid=False)
        with open(test_pickle, 'wb') as cache_file:
            pkl.dump((testEdfData, testGender), cache_file)
    else:
        with open(test_pickle, 'rb') as cache_file:
            testEdfData, testGender = pkl.load(cache_file)

    testFeatures = np.stack(testEdfData).reshape(len(testEdfData), -1)
    y_pred = gridsearch.predict(testFeatures)

    toReturn = {
        'f1_score': f1_score(testGender, y_pred),
        'auc': roc_auc_score(testGender, y_pred),
        'mcc': matthews_corrcoef(testGender, y_pred),
        'accuracy': accuracy_score(testGender, y_pred)
    }

    (trainEdfTokens, validEdfTokens, testEdfTokens, trainGenders,
     validGenders, _testGendersCopy) = get_test_train_split_from_combined()
    testEdfEnsembler = get_base_dataset(
        "combined",
        labels=_testGendersCopy,
        edfTokenPaths=testEdfTokens,
        is_test=True)

    y_pred = gridsearch.best_estimator_.predict_proba(testFeatures)

    # NOTE(review): this call goes through ``.dataset`` while the over-all
    # call below does not; both are kept as written, confirm which is right.
    label, average_pred = testEdfEnsembler.dataset.getEnsemblePrediction(
        y_pred, mode=er.EdfDatasetEnsembler.ENSEMBLE_PREDICTION_EQUAL_VOTE)
    pred = np.round(average_pred)
    equal_vote_scores = ensemble_metrics(label, pred, average_pred)
    # keep the equal vote metrics in the top level as well for compatibility
    # reasons when comparing
    toReturn["ensemble_score"] = dict(equal_vote_scores)
    toReturn["ensemble_score"]["equal_vote"] = equal_vote_scores

    label, average_over_all_pred = testEdfEnsembler.getEnsemblePrediction(
        y_pred, mode=er.EdfDatasetEnsembler.ENSEMBLE_PREDICTION_OVER_EACH_SAMP)
    pred = np.round(average_over_all_pred)
    # Discordance is measured against this mode's own averages (it used to
    # reuse the equal-vote averages by mistake).
    toReturn["ensemble_score"]["over_all"] = ensemble_metrics(
        label, pred, average_over_all_pred)
    return toReturn