def all_slides_tile_mat():
    all_samples = processing.get_list_of_samples()
    corr_mat = []
    for count, slide in enumerate(all_samples):
        utils.print_progress(count)
        cell_mat = np.load(slide).astype(int)
        corr_row = get_tile_correlation(cell_mat, dist=[50, 100, 200])
        corr_mat.append(corr_row)
    corr_mat = np.vstack(corr_mat)
    return corr_mat
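
# `get_tile_correlation` is defined elsewhere; the sketch below is only an
# assumption about the kind of statistic it might return (one value per
# distance in `dist`): the Pearson correlation between cell occupancy and a
# copy of the slide shifted by that distance. The name
# `tile_correlation_sketch` is hypothetical, not project code.
def tile_correlation_sketch(cell_mat, dist):
    occupancy = (cell_mat > 0).astype(float)
    row = []
    for d in dist:
        a = occupancy[:, :-d].ravel()  # original pixels
        b = occupancy[:, d:].ravel()   # pixels shifted by d along x
        row.append(np.corrcoef(a, b)[0, 1])
    return np.array(row)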
def map_cells_to_mat():
    '''
    Reconstructs original slides by reading the original cell processing text
    files and recording cell phenotypes in a 1040x1392 numpy matrix.
    '''
    outloc = os.path.join(DATA_PATH, 'processed')
    if not os.path.exists(outloc):
        os.makedirs(outloc)
    files = glob.glob(DATA_PATH + '*cell_seg_data.txt')
    files.sort()
    for i, item in enumerate(files):
        utils.print_progress(i)
        cells = processing.load_sample(item, confidence_thresh=0,
                                       verbose=False, radius_lim=0)
        mat = processing.map_phenotypes_to_mat(cells)
        name = item.split("EACRI HNSCC\\")[1].split("_cell_seg_data")[0]
        np.save(os.path.join(outloc, name), mat)
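
# Hedged sketch of what `processing.map_phenotypes_to_mat` is assumed to do:
# scatter integer phenotype codes into an empty 1040x1392 matrix at each
# cell's pixel coordinates. The column names ('x', 'y', 'phenotype_code') are
# assumptions about the cell table, not confirmed by this file.
def map_phenotypes_to_mat_sketch(cells, shape=(1040, 1392)):
    mat = np.zeros(shape, dtype=int)
    ys = cells['y'].astype(int).clip(0, shape[0] - 1).values
    xs = cells['x'].astype(int).clip(0, shape[1] - 1).values
    mat[ys, xs] = cells['phenotype_code'].astype(int).values
    return mat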
def main():
    all_samples = processing.get_list_of_samples()
    combined_features = []
    combined_response = []
    used_samples = []

    # iterate over all slides
    for count, slide in enumerate(all_samples):
        utils.print_progress(count)

        # load slide matrix and decision boundary
        cell_mat = np.load(slide)
        try:
            regions = np.load(slide.split(".npy")[0] + "_seg.npy")
        except IOError:
            print "skipping slide with no region info"
            continue

        # corr_row = texture_analysis.get_tile_correlation(cell_mat, dist=[50])

        # disregard unclassified regions for total area (float division
        # avoids Python 2 integer truncation)
        tumor_area = np.sum(regions == 1) / 1000.0
        stromal_area = np.sum(regions == 0) / 1000.0
        total_area = tumor_area + stromal_area
        tumor_mask = (regions == 1).astype(int)
        stromal_mask = (regions == 0).astype(int)

        # slide-level cell counts
        n_tumor = np.sum(cell_mat == 1)
        n_pdl1 = np.sum(cell_mat == 2)
        n_any_tumor = n_tumor + n_pdl1
        n_foxp3 = np.sum(cell_mat == 3)
        n_cd8 = np.sum(cell_mat == 4)
        n_cd4 = np.sum(cell_mat == 5)
        n_pdmac = np.sum(cell_mat == 6)
        n_other = np.sum(cell_mat == 7)
        n_macs = np.sum(cell_mat == 8)

        # all_densities = [n_any_tumor, n_foxp3, n_cd8, n_cd4,
        #                  n_pdmac, n_other, n_macs] / total_area
        # all_densities = [n_any_tumor, n_foxp3, n_cd8, n_cd4,
        #                  n_pdmac, n_other, n_macs] / np.sum(cell_mat > 0)
        # all_densities = [n_any_tumor, n_foxp3, n_cd8, n_cd4,
        #                  n_pdmac, n_other, n_macs]

        feature_row = [
            tumor_area / total_area, stromal_area / total_area, n_tumor,
            n_pdl1, n_any_tumor, n_foxp3, n_cd8, n_cd4, n_pdmac, n_other,
            n_macs
        ]

        # # stromal cell counts
        # ns_tumor = np.sum((cell_mat == 1) * stromal_mask)
        # ns_pdl1 = np.sum((cell_mat == 2) * stromal_mask)
        # ns_any_tumor = ns_tumor + ns_pdl1 + 1
        # ns_foxp3 = np.sum((cell_mat == 3) * stromal_mask) + 1
        # ns_cd8 = np.sum((cell_mat == 4) * stromal_mask) + 1
        # ns_cd4 = np.sum((cell_mat == 5) * stromal_mask) + 1
        # ns_pdmac = np.sum((cell_mat == 6) * stromal_mask) + 1
        # ns_other = np.sum((cell_mat == 7) * stromal_mask) + 1
        # ns_macs = np.sum((cell_mat == 8) * stromal_mask) + 1
        #
        # # stromal_densities = [ns_any_tumor, ns_foxp3, ns_cd8, ns_cd4,
        # #                      ns_pdmac, ns_other, ns_macs] / stromal_area
        # stromal_densities = [ns_any_tumor, ns_foxp3, ns_cd8, ns_cd4,
        #                      ns_pdmac, ns_other, ns_macs] / np.sum((cell_mat > 0) * stromal_area)
        # # stromal_densities = [ns_any_tumor, ns_foxp3, ns_cd8, ns_cd4,
        # #                      ns_pdmac, ns_other, ns_macs]

        # # in-tumor cell counts
        # nt_tumor = np.sum((cell_mat == 1) * tumor_mask)
        # nt_pdl1 = np.sum((cell_mat == 2) * tumor_mask)
        # nt_any_tumor = nt_tumor + nt_pdl1 + 1
        # nt_foxp3 = np.sum((cell_mat == 3) * tumor_mask) + 1
        # nt_cd8 = np.sum((cell_mat == 4) * tumor_mask) + 1
        # nt_cd4 = np.sum((cell_mat == 5) * tumor_mask) + 1
        # nt_pdmac = np.sum((cell_mat == 6) * tumor_mask) + 1
        # nt_other = np.sum((cell_mat == 7) * tumor_mask) + 1
        # nt_macs = np.sum((cell_mat == 8) * tumor_mask) + 1
        #
        # # tumor_densities = [nt_any_tumor, nt_foxp3, nt_cd8, nt_cd4,
        # #                    nt_pdmac, nt_other, nt_macs] / tumor_area
        # tumor_densities = [nt_any_tumor, nt_foxp3, nt_cd8, nt_cd4,
        #                    nt_pdmac, nt_other, nt_macs] / np.sum((cell_mat > 0) * tumor_area)
        # # tumor_densities = [nt_any_tumor, nt_foxp3, nt_cd8, nt_cd4,
        # #                    nt_pdmac, nt_other, nt_macs]

        if n_any_tumor < 100:
            print "skipping slide with {0} tumor cells".format(n_any_tumor)
            continue

        # if (tumor_area/stromal_area < 0.1) or (stromal_area/tumor_area < 0.1):
        #     print "skipping slide with tumor:stromal area ratio: ", tumor_area / stromal_area
        #     continue

        # feature_row = np.concatenate((stromal_densities, tumor_densities))
        # feature_row = np.concatenate((stromal_densities, tumor_densities, corr_row))
        # feature_row = corr_row

        # float cast avoids Python 2 integer division (ratio would always be 0)
        ratio = float(n_pdl1) / (n_tumor + n_pdl1)

        combined_features.append(feature_row)
        combined_response.append(ratio)
        used_samples.append(slide)

    # convert feature and response to numpy arrays for analysis
    combined_features = np.vstack(combined_features)
    combined_response = np.array(combined_response)
    combined_features.shape
    # used_samples = [x.replace("processed_orig_seg", "processed") for x in used_samples]

    #### Combine features with clinical information ####
    lookup = clinical.clinical_lookup_table()
    lookup.shape

    feature_names = [
        "".join(["f", str(x)]) for x in range(combined_features.shape[1])
    ]
    tmp = pd.DataFrame(combined_features, columns=feature_names)
    tmp['response'] = combined_response
    tmp['slide'] = used_samples
    all_data = pd.merge(tmp, lookup)
    all_data.shape

    # create index for patients from 1-n
    ids = list(set(all_data.id))
    id_convert = dict(zip(ids, np.arange(len(ids))))
    all_data['idx'] = [id_convert[str(x)] for x in all_data.id.values]

    # add logit-transformed response variable (needed below for y_ and dtr.undo)
    dtr = learning.VectorTransform(all_data.response)
    all_data['y*'] = dtr.zero_one_scale().apply('logit')

    all_data.head()

    # fig = display.dotplot(all_data.response, all_data.idx, n_patients=40)
    # fig.savefig(HOME_DIR + '/results/whole_slide/dotplot.png', format='png', dpi=150)
    # display.scatter_hist(all_data.response, all_data.response)
    # display.scatter_hist(all_data['y*'], all_data['y*'], xlims=[-8,8], ylims=[-8,8])

    # for setting aside a holdout set -- however, there are not enough
    # patients to reliably evaluate a holdout set. We rely on repeated k-fold
    # cross-validation and (potentially) .632+ bootstrap estimates.
    # patients = np.unique(all_data.idx)
    # holdout_patients = np.random.choice(patients, size=int(0.25 * len(patients)), replace=False)
    # holdout_data = all_data.loc[all_data.idx.isin(holdout_patients), :]
    # learn_data = all_data.loc[~all_data.idx.isin(holdout_patients), :]

    #### fit machine learning models ####
    from sklearn.linear_model import LassoCV
    from sklearn.linear_model import Lasso
    from sklearn.linear_model import RidgeCV
    from sklearn.linear_model import Ridge
    from sklearn.linear_model import ElasticNetCV
    from sklearn.linear_model import ElasticNet  # used below
    from sklearn.linear_model import LinearRegression
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVR
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import MaxAbsScaler
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import RepeatedKFold, GroupKFold

    X_ = all_data.loc[:, feature_names].values
    y_ = all_data['y*'].values
    patient_ids = all_data.idx.values

    X_[:, :14] = np.log(X_[:, :14])

    # sklearn's GroupKFold for grouped cross-validation does not implement
    # shuffling, so it returns the same train/test sets on each call. We tried
    # taking row-wise permutations of the patient dataframe on each iteration
    # as a workaround, but that did not change the splits, because GroupKFold
    # appears to use the ordered group values to split. Instead, we shuffle
    # the unique values of the group variable to obtain different group splits.
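
    # `utils.unique_value_shuffle` is project code not shown here; the nested
    # sketch below states the assumed contract: randomly remap the unique
    # group labels so that GroupKFold (which splits on ordered group values)
    # yields a different partition on each call, while rows from the same
    # patient still share one (new) label. The sketch name is ours.
    def unique_value_shuffle_sketch(groups):
        uniques = np.unique(groups)
        remap = dict(zip(uniques, np.random.permutation(uniques)))
        return np.array([remap[g] for g in groups])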
    alphas = np.logspace(-2, 3, 20)
    estimator = RidgeCV(alphas=alphas)
    # estimator = RandomForestRegressor(n_estimators=100)
    gcv = GroupKFold(n_splits=10)

    replications = 1
    rep_scores = []
    out_sample = {'pred': [], 'target': [], 'id': []}
    # pca = PCA()
    for rep in range(replications):
        # permute input arrays the same way
        ids_shuffle = utils.unique_value_shuffle(patient_ids)

        # perform grouped cross-validation estimation
        for train, test in gcv.split(X_, y_, ids_shuffle):
            X_train = X_[train]
            X_test = X_[test]
            y_train = y_[train]
            y_test = y_[test]

            # X_train = np.log(X_train)
            # X_test = np.log(X_test)

            scale = StandardScaler()
            X_train = scale.fit_transform(X_train)
            X_test = scale.transform(X_test)

            # X_train = pca.fit_transform(X_train)
            # X_test = pca.transform(X_test)

            preds = estimator.fit(X_train, y_train).predict(X_test)
            preds = dtr.undo(preds)
            y_test = dtr.undo(y_test)
            rep_scores.append(metrics.rmse(preds, y_test))
            # rep_scores.append(estimator.score(X_test, y_test))
            out_sample['pred'].extend(preds)
            out_sample['target'].extend(y_test)
            out_sample['id'].extend(ids_shuffle[test])

    print np.mean(rep_scores), np.std(rep_scores)

    # TEST POLYNOMIAL FEATURES
    # take polynomial combinations of the inputs
    from sklearn.preprocessing import PolynomialFeatures
    poly = PolynomialFeatures(2)
    X_p = np.hstack((X_, 1 / X_))
    X_p = poly.fit_transform(X_p)
    X_p = np.log(X_p)
    X_p.shape

    estimator = RidgeCV()
    gcv = GroupKFold(n_splits=10)
    scale = MaxAbsScaler()

    replications = 10
    rep_scores = []
    out_sample = {'pred': [], 'target': [], 'id': []}
    for rep in range(replications):
        # permute input arrays the same way
        ids_shuffle = utils.unique_value_shuffle(patient_ids)

        # perform grouped cross-validation estimation
        for train, test in gcv.split(X_p, y_, ids_shuffle):
            X_train = X_p[train]
            X_test = X_p[test]
            y_train = y_[train]
            y_test = y_[test]

            X_train = scale.fit_transform(X_train)
            X_test = scale.transform(X_test)

            preds = estimator.fit(X_train, y_train).predict(X_test)
            preds = dtr.undo(preds)
            y_test = dtr.undo(y_test)
            rep_scores.append(metrics.rmse(preds, y_test))
            out_sample['pred'].extend(preds)
            out_sample['target'].extend(y_test)
            out_sample['id'].extend(ids_shuffle[test])

    print np.mean(rep_scores), np.std(rep_scores)

    # dict elements from list to array
    for key, value in out_sample.iteritems():
        out_sample[key] = np.array(value)

    metrics.rmse(out_sample['pred'], out_sample['target'])
    fig = display.fixed_scatter(out_sample['pred'], out_sample['target'])

    def aggregate_by_patient(prediction, target, patient_id):
        gdf = pd.DataFrame({
            'preds': prediction,
            'targs': target,
            'id': patient_id
        })
        by_pt_results = gdf.groupby('id').mean()
        return by_pt_results.preds.values, by_pt_results.targs.values

    pt_pred, pt_targ = aggregate_by_patient(out_sample['pred'],
                                            out_sample['target'],
                                            out_sample['id'])
    fig = display.fixed_scatter(pt_pred, pt_targ)
    fig.suptitle('Patient level pdl1')
    fig.savefig(HOME_DIR + '/results/whole_slide/patient_sqrt_ratios.png',
                format='png', dpi=150)
    metrics.rmse(pt_pred, pt_targ)
    metrics.corr(pt_pred, pt_targ)

    plt.scatter(pt_targ - pt_pred, pt_pred)

    # try: converting cell counts to some kind of probability measure as a feature
    # --> correcting cell counts via slide-level cellular density

    # from sklearn.feature_selection import RFECV
    # rfecv = RFECV(estimator=estimator, step=10, cv=10, scoring='mean_squared_error')
    # X_new = rfecv.fit_transform(X_all, y_all)
    # print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features vs. cross-validation scores
    # plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)

    ##########################################
    ##########################################

    learning_data = all_data.loc[:, feature_names + ['response', 'y*', 'idx']]

    # create training and test set data split by patient (not slide)
    from sklearn.model_selection import train_test_split
    n_patients = np.max(all_data.idx) + 1  # idx runs 0..n-1
    idx_train, idx_test = train_test_split(range(n_patients), test_size=0.1)
    train_data = learning_data[learning_data.idx.isin(idx_train)]
    test_data = learning_data[learning_data.idx.isin(idx_test)]

    from sklearn.preprocessing import PolynomialFeatures
    poly = PolynomialFeatures(2)
    X_all = tmp.loc[:, feature_names]
    X_all = np.array(X_all)
    X_all[X_all == 0] = 1  # guard against division by zero below
    X_all = np.hstack((X_all, 1 / X_all))
    X_all = poly.fit_transform(X_all)

    # separate out train/test arrays by X/y/y*
    X_train, y_train = train_data.loc[:, feature_names], train_data.response
    X_test, y_test = test_data.loc[:, feature_names], test_data.response
    ystar_train, ystar_test = train_data['y*'], test_data['y*']

    # standardize X features using X_train and apply to X_test
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # test regression or learning algorithms (only the last assignment is used)
    from sklearn.neural_network import MLPRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import LinearRegression
    estimator = RandomForestRegressor(n_estimators=300, oob_score=True,
                                      bootstrap=True)
    estimator = LinearRegression()
    estimator = MLPRegressor(hidden_layer_sizes=(2,), max_iter=10000,
                             alpha=0.005, solver='lbfgs',
                             activation='logistic', warm_start=False)
    estimator = ElasticNet(alpha=0.005, l1_ratio=.1, max_iter=10000)

    ##### SLIDE-LEVEL METRICS #####
    X_preds = estimator.fit(X_train, y_train).predict(X_test)
    metrics.rmse(X_preds, y_test)
    metrics.corr(X_preds, y_test)
    display.fixed_scatter(X_preds, y_test)

    # repeat process to test *transformed* response var
    Xstar_preds = estimator.fit(X_train, ystar_train).predict(X_test)

    # undo the transform
    Xstar_restore = dtr.undo(Xstar_preds)
    ystar_test_restore = dtr.undo(ystar_test)
    if not np.allclose(ystar_test_restore, y_test):
        raise ValueError(
            "Undo on transformed response does not match original.")
    metrics.rmse(Xstar_restore, y_test)
    metrics.corr(Xstar_restore, y_test)
    display.fixed_scatter(Xstar_restore, y_test)

    ##### PATIENT-LEVEL METRICS #####
    def aggregate_by_patient(prediction, target, patient_id):
        gdf = pd.DataFrame({
            'preds': prediction,
            'targs': target,
            'id': patient_id
        })
        by_pt_results = gdf.groupby('id').mean()
        return by_pt_results.preds.values, by_pt_results.targs.values

    # test untransformed y-var
    pt_pred, pt_response = aggregate_by_patient(X_preds, y_test,
                                                patient_id=test_data.idx)
    metrics.rmse(pt_pred, pt_response)
    metrics.corr(pt_pred, pt_response)
    display.fixed_scatter(pt_pred, pt_response)

    # test transformed y-var
    pt_pred, pt_response = aggregate_by_patient(Xstar_restore, y_test,
                                                patient_id=test_data.idx)
    metrics.rmse(pt_pred, pt_response)
    metrics.corr(pt_pred, pt_response)
    display.fixed_scatter(pt_pred, pt_response)

    estimator.coef_

    ################################################################

    # def apply_boxcox(X):
    #     return np.apply_along_axis(_boxcox_transform, 0, X)
    #
    # def _boxcox_transform(arr):
    #     return stats.boxcox(arr)[0]

    def confusion_matrix(x, y, threshold=0.5):
        q1 = np.sum((x > threshold) & (y > threshold))
        q2 = np.sum((x > threshold) & (y <= threshold))
        q3 = np.sum((x <= threshold) & (y > threshold))
        q4 = np.sum((x <= threshold) & (y <= threshold))
        return np.array([[q1, q2], [q3, q4]])

    confusion_matrix(pt_pred, pt_response, threshold=0.3)

    import helper.metrics as metrics
    tmp = all_data[all_data.STAGE == 4]
    metrics.corr_nan(tmp.response, tmp.radiation)
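
# `learning.VectorTransform` (used as `dtr` above) is defined elsewhere; this
# is a hedged sketch of the assumed behaviour: squeeze the response into
# (0, 1), apply the logit, and invert both steps in `undo`. The class name
# and the `eps` clipping are our assumptions, not project code.
class VectorTransformSketch(object):
    def __init__(self, y, eps=1e-3):
        self.lo, self.hi = np.min(y), np.max(y)
        self.eps = eps

    def zero_one_logit(self, y):
        scaled = (y - self.lo) / float(self.hi - self.lo)  # zero-one scale
        scaled = np.clip(scaled, self.eps, 1 - self.eps)   # keep logit finite
        return np.log(scaled / (1 - scaled))               # logit

    def undo(self, z):
        scaled = 1.0 / (1.0 + np.exp(-z))                  # inverse logit
        return scaled * (self.hi - self.lo) + self.lo      # undo scaling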
def extract_dataset(diams, sample_diam, flag):
    np.random.seed(1000)

    # set sampling parameters
    N_SLIDES = 314  # number of slides to use
    N_SAMPLES = 30  # max samples to take from a single slide

    # set tile feature extraction parameters
    sample_tile_width = sample_diam
    feature_tile_width = 1
    Nx, Ny = int(1392 / sample_tile_width), int(1040 / sample_tile_width)  # no. sample tiles
    nx, ny = Nx * sample_tile_width, Ny * sample_tile_width  # no. feature tiles
    offset_px = int((max(diams) - sample_diam) / 2)
    # float cast avoids Python 2 integer division before the ceil
    offset_tiles = int(np.ceil(float(offset_px) / sample_diam))

    # get pre-processed slide matrices and select a random sample of slides
    all_slides = processing.get_list_of_samples(processed_slides)
    SLIDES = [all_slides[i] for i in
              np.random.choice(len(all_slides), N_SLIDES, replace=False)]

    # initialize processed variable storage
    ncells_all = []
    slides_all = []
    X_all = []
    y_all = []
    overlap_all = []

    # process samples in batches
    batch_size = 10
    for idx in range(0, N_SLIDES, batch_size):
        BATCH = SLIDES[idx:idx + batch_size]

        # iterate over sampled slides to extract feature and response
        # variables via tile sampling
        batch_ncells = []
        batch_features = []
        batch_response = []
        batch_slides = []
        batch_overlap = []
        for i, slide in enumerate(BATCH):
            print_progress(i)

            # load slide and reshape into sample and feature tile stacks
            cell_mat = np.load(slide)
            sample_tile_stack = utils.restack_to_tiles(
                cell_mat, tile_width=sample_tile_width, nx=Nx, ny=Ny)
            feature_tile_stack = utils.restack_to_tiles(
                cell_mat, tile_width=feature_tile_width, nx=nx, ny=ny)

            # load seg file to compute ratio of processed area to total slide
            # area (float cast avoids Python 2 integer division)
            seg = np.load(slide.split(".npy")[0] + "_seg.npy")
            correction = float(1392 * 1040) / np.sum(seg != -1)
            n_cells_total = np.sum(cell_mat != 0)
            n_cells_corrected = n_cells_total * correction

            # make unprocessed-region matrix from seg file
            seg_map = (seg == -1).astype(int)
            seg_tile_stack = utils.restack_to_tiles(
                seg_map, tile_width=feature_tile_width, nx=nx, ny=ny)

            ### used for limiting tile sampling to 'edge regions' between
            ### tumor and stroma. For now I think it is simpler and more
            ### explainable to permit sampling anywhere in the tumor, not just
            ### on the edge. I may revisit this in the future.
            # # load tumor edge matrix (skipping slide if no matrix is found)
            # try:
            #     edges = np.load(slide.split(".npy")[0] + "_edges.npy")
            #     edges_tile_stack = utils.restack_to_tiles(edges, tile_width=sample_tile_width,
            #                                               nx=Nx, ny=Ny)
            # except IOError:
            #     print 'No edge matrix. Skipping slide...'
            #     continue
            # # select valid tiles for sampling, skipping slide if no valid tiles are available
            # tile_mask = utils.tile_stack_mask(Nx, Ny, L=sample_layers, db_stack=edges_tile_stack)

            # get set of valid sampling tiles (tiles with enough offset from the edges)
            tile_mask = utils.tile_stack_mask(Nx, Ny, L=offset_tiles,
                                              db_stack=None)
            n_tiles = int(min(N_SAMPLES, np.sum(tile_mask)))
            if n_tiles == 0:
                print('0 valid samples. Skipping slide...')
                continue

            # store batch cell numbers and slide names
            batch_ncells.extend([n_cells_corrected] * n_tiles)
            batch_slides.extend([slide] * n_tiles)

            # uniformly sample tiles from the valid sample space of size
            # n_samples; in this case, set to take all available samples from
            # each slide (float cast keeps p a probability vector in Python 2)
            sampled_indices = np.random.choice(
                a=Nx * Ny, size=n_tiles,
                p=tile_mask / float(np.sum(tile_mask)), replace=False)
            sampled_tiles = sample_tile_stack[sampled_indices, :, :]

            # compute response variable over sampled tiles
            response, nts = utils.get_pdl1_response(sampled_tiles, circle=True,
                                                    diameter=sample_tile_width,
                                                    diagnostic=True)

            # compute feature arrays over sampled tiles from neighboring tiles
            feature_rows = []
            overlap = []
            for j in sampled_indices:
                feature_tiles = utils.get_feature_array(j, feature_tile_stack,
                                                        Nx, sample_tile_width,
                                                        offset_px, flag)
                seg_map_tiles = utils.get_feature_array(j, seg_tile_stack, Nx,
                                                        sample_tile_width,
                                                        offset_px, flag)
                # store feature tile and overlap with unprocessed regions
                feature_rows.append(feature_tiles)
                overlap.append(np.sum(seg_map_tiles))

            del feature_tile_stack
            del seg_tile_stack

            # add to growing array as long as any valid samples were collected
            if len(feature_rows) > 0:
                feature_rows = np.vstack(feature_rows)
                overlap = np.array(overlap)

                # # remove observations with significant overlap (>10%) with unprocessed regions
                # mask = (np.array(overlap) <= 0.1 * max(diams) ** 2)
                # feature_rows = feature_rows[mask, :]
                # response = response[mask]
                # nts = nts[mask]

                batch_response.extend(response)
                batch_features.append(feature_rows)
                batch_overlap.extend(overlap)

        # convert feature and response to numpy arrays for analysis
        batch_features = np.vstack(batch_features)
        batch_response = np.array(batch_response)
        batch_overlap = np.array(batch_overlap)

        # ----- variable processing ----- #
        # # remove all cases with no tumor cells in the sampled tile
        # mask = combined_response == -1
        # combined_response = combined_response[~mask]
        # combined_features = combined_features[~mask, :]
        # # alternatively, remove all cases with <K tumor cells in the sampled tile
        # # print combined_nts.shape, combined_response.shape, combined_features.shape
        # mask = combined_nts < 10
        # combined_response = combined_response[~mask]
        # combined_features = combined_features[~mask, :]

        # aggregate tiles within arbitrary shapes (e.g. discs or squares of increasing size)
        n_obs = batch_features.shape[0]
        side_len = sample_tile_width + 2 * offset_px
        n_tiles = side_len ** 2
        if flag == 'n':
            phens = ['tumor', 'cd4', 'cd8', 'foxp3', 'pdmac', 'other']
        elif flag == 'a':
            phens = ['tumor', 'pdl1', 'cd4', 'cd8', 'foxp3', 'pdmac', 'other']
        elif flag == 't':
            phens = ['tumor', 'pdl1']

        phen_columns = []
        for phen in range(len(phens)):
            # iterate process over each phenotype
            tmp_tiles = batch_features[:, phen * n_tiles:(phen + 1) * n_tiles]
            tmp_3d = tmp_tiles.reshape(n_obs, side_len, side_len)

            range_columns = []
            diams_0 = [0] + diams
            for i in range(len(diams)):
                print_progress('{0}: {1}'.format(phens[phen], diams[i]))
                if (flag in ['a', 't']) and (phens[phen] in ['tumor', 'pdl1']) \
                        and (diams[i] <= sample_tile_width):
                    print("skipping.")
                    continue
                mask = utils.shape_mask(grid_dim=side_len, type='circle',
                                        S=diams_0[i + 1], s=diams_0[i])
                t = np.sum(np.multiply(tmp_3d, mask), axis=(1, 2)).reshape(-1, 1)
                # sigma = np.std(np.multiply(tmp_3d, mask), axis=(1,2)).reshape(-1, 1)
                range_columns.append(t)
                # range_columns.append(sigma)
            per_phen_features = np.hstack(range_columns)
            phen_columns.append(per_phen_features)

        del batch_features

        ncells_all.extend(batch_ncells)
        slides_all.extend(batch_slides)
        X_all.append(np.hstack(phen_columns))
        y_all.extend(batch_response)
        overlap_all.extend(batch_overlap)

    ncells_all = np.array(ncells_all)
    X_all = np.vstack(X_all)
    y_all = np.array(y_all)
    overlap_all = np.array(overlap_all)

    # save processed data as csv
    # note: this naming assumes no diameters were skipped above, so that the
    # number of feature columns equals len(phens) * len(diams)
    feature_names = ["_".join([a, str(b)]) for a in phens for b in diams]
    feature_names.append('y')
    tmp = pd.DataFrame(np.hstack((X_all, y_all.reshape(-1, 1))))
    tmp.columns = feature_names
    tmp['slide'] = slides_all
    tmp['n_cells_corrected'] = ncells_all
    tmp['unscored_overlap'] = overlap_all
    tmp = tmp.set_index('slide')
    tmp.to_csv(os.path.join(HOME_PATH, 'data', 'local_discs.csv'))
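
# `utils.shape_mask` is assumed (not confirmed here) to return a 0/1 mask of
# shape (grid_dim, grid_dim) selecting the annulus with outer diameter S and
# inner diameter s around the grid centre, so that consecutive diameters in
# `diams` carve the disc into non-overlapping rings. A minimal sketch:
def shape_mask_sketch(grid_dim, S, s=0):
    c = (grid_dim - 1) / 2.0
    yy, xx = np.mgrid[0:grid_dim, 0:grid_dim]
    r = np.sqrt((yy - c) ** 2 + (xx - c) ** 2)
    return ((r >= s / 2.0) & (r < S / 2.0)).astype(int)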
def extract_dataset(diams, sample_diam, flag):
    np.random.seed(1000)

    # set sampling parameters
    N_SLIDES = 260
    N_SAMPLES = 15

    # set feature extraction parameters
    sample_tile_width = sample_diam
    feature_tile_width = 1
    feature_layers = 75

    # compute other parameters based on input parameters
    scale = int(sample_tile_width / feature_tile_width)
    assert (
        scale == sample_tile_width / feature_tile_width
    ), "sample_tile_width must be an integer multiple of feature_tile_width"
    Nx, Ny = int(1392 / sample_tile_width), int(1040 / sample_tile_width)
    nx, ny = Nx * scale, Ny * scale
    # float cast avoids Python 2 integer division before the ceil
    sample_layers = int(
        np.ceil(float(feature_layers) * feature_tile_width / sample_tile_width))

    # get pre-processed slide matrices and select a random sample of slides
    all_samples = helper.processing.get_list_of_samples(DIR)
    SAMPLES = [
        all_samples[i]
        for i in np.random.choice(len(all_samples), N_SLIDES, replace=False)
    ]

    # iterate over sampled slides to extract feature and response variables
    # via tile sampling
    combined_features = []
    combined_response = []
    combined_nts = []
    for i, slide in enumerate(SAMPLES):
        print_progress(i)

        # load slide and reshape into tile stacks
        cell_mat = np.load(slide)
        sample_tile_stack = utils.restack_to_tiles(
            cell_mat, tile_width=sample_tile_width, nx=Nx, ny=Ny)
        feature_tile_stack = utils.restack_to_tiles(
            cell_mat, tile_width=feature_tile_width, nx=nx, ny=ny)

        # load tumor edge matrix (skipping slide if no matrix is found)
        try:
            edges = np.load(slide.split(".npy")[0] + "_edges.npy")
            edges_tile_stack = utils.restack_to_tiles(
                edges, tile_width=sample_tile_width, nx=Nx, ny=Ny)
        except IOError:
            print 'No edge matrix. Skipping slide...'
            continue

        # select valid tiles for sampling, skipping slide if none are available
        tile_mask = utils.tile_stack_mask(Nx, Ny, L=sample_layers,
                                          db_stack=edges_tile_stack)
        if np.sum(tile_mask) == 0:
            print '0 valid samples. Skipping slide...'
            continue

        # uniformly sample tiles from the valid sample space of size n_samples
        sampled_indices = np.random.choice(
            a=Nx * Ny, size=int(min(N_SAMPLES, np.sum(tile_mask))),
            p=tile_mask / float(np.sum(tile_mask)), replace=False)
        sampled_tiles = sample_tile_stack[sampled_indices, :, :]

        # compute response variable over sampled tiles
        response, nts = utils.get_pdl1_response(sampled_tiles, circle=True,
                                                diameter=sample_tile_width,
                                                diagnostic=True)

        # compute feature arrays over sampled tiles from neighboring tiles
        feature_rows = np.vstack([
            utils.get_feature_array(idx, feature_tile_stack, Nx, scale,
                                    feature_layers, flag)
            for idx in sampled_indices
        ])

        # add outputs to growing arrays
        combined_response.extend(response)
        combined_features.append(feature_rows)
        combined_nts.extend(nts)

    # convert feature and response to numpy arrays for analysis
    combined_features = np.vstack(combined_features)
    combined_features[np.isnan(combined_features)] = -1
    combined_response = np.array(combined_response)
    combined_nts = np.array(combined_nts)

    # ----- variable processing ----- #
    # # remove all cases with no tumor cells in the sampled tile
    # mask = combined_response == -1
    # combined_response = combined_response[~mask]
    # combined_features = combined_features[~mask, :]

    # alternatively, remove all cases with <K tumor cells in the sampled tile
    print combined_nts.shape, combined_response.shape, combined_features.shape
    mask = combined_nts < 10
    combined_response = combined_response[~mask]
    combined_features = combined_features[~mask, :]

    # aggregate tiles within arbitrary shapes (e.g. discs or squares of increasing size)
    n_obs = combined_features.shape[0]
    side_len = scale + 2 * feature_layers
    n_tiles = side_len ** 2
    if flag == 'n':
        phens = ['tumor', 'cd4', 'cd8', 'foxp3', 'pdmac', 'other']
    elif flag == 'a':
        phens = ['tumor', 'pdl1', 'cd4', 'cd8', 'foxp3', 'pdmac', 'other']
    elif flag == 't':
        phens = ['tumor', 'pdl1']

    phen_columns = []
    for phen in range(len(phens)):
        # iterate process over each phenotype
        tmp_tiles = combined_features[:, phen * n_tiles:(phen + 1) * n_tiles]
        tmp_3d = tmp_tiles.reshape(n_obs, side_len, side_len)

        range_columns = []
        d_seq_0 = [0] + diams  # `diams` replaces the previously undefined name `d_seq`
        for i in range(len(d_seq_0) - 1):
            # utils.print_progress(i)
            print phens[phen], diams[i]
            if (flag in ['a', 't']) and (phens[phen] in ['tumor', 'pdl1']) \
                    and (diams[i] <= sample_tile_width):
                print "skipping."
                continue
            mask = utils.shape_mask(grid_dim=side_len, type='circle',
                                    S=d_seq_0[i + 1], s=d_seq_0[i])
            t = np.sum(np.multiply(tmp_3d, mask), axis=(1, 2)).reshape(-1, 1)
            # sigma = np.std(np.multiply(tmp_3d, mask), axis=(1,2)).reshape(-1, 1)
            range_columns.append(t)
            # range_columns.append(sigma)
        per_phen_features = np.hstack(range_columns)
        phen_columns.append(per_phen_features)

    X = np.hstack(phen_columns)

    np.save(STORE_DIR + "data_x", X)
    np.save(STORE_DIR + "data_y", combined_response)
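
# Hedged sketch of the assumed `utils.get_pdl1_response` contract: per sampled
# tile, the response is the PD-L1+ fraction of tumor cells (phenotype codes 1
# and 2), optionally restricted to the inscribed circle; with diagnostic=True
# the tumor-cell counts are returned too, and tiles without tumor cells get
# the -1 sentinel filtered out above. Details here are our assumptions.
def get_pdl1_response_sketch(tile_stack, circle=True, diameter=None):
    w = tile_stack.shape[1]
    if circle:
        c = (w - 1) / 2.0
        yy, xx = np.mgrid[0:w, 0:w]
        mask = np.sqrt((yy - c) ** 2 + (xx - c) ** 2) < diameter / 2.0
    else:
        mask = np.ones((w, w), dtype=bool)
    responses, counts = [], []
    for tile in tile_stack:
        n_tumor = np.sum((tile == 1) & mask)
        n_pdl1 = np.sum((tile == 2) & mask)
        nt = n_tumor + n_pdl1
        responses.append(float(n_pdl1) / nt if nt > 0 else -1.0)
        counts.append(nt)
    return np.array(responses), np.array(counts)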
def automate_tile_extraction(SAMPLES):
    # note: relies on module-level parameters (sample_tile_width,
    # feature_tile_width, Nx, Ny, nx, ny, sample_layers, N_SAMPLES, scale,
    # feature_layers, flag) being defined before this is called

    # iterate over sampled slides to extract feature and response variables
    # via tile sampling
    combined_features = []
    combined_response = []
    combined_nts = []
    for i, slide in enumerate(SAMPLES):
        print_progress(i)

        # load slide and reshape into tile stacks
        cell_mat = np.load(slide)
        sample_tile_stack = utils.restack_to_tiles(
            cell_mat, tile_width=sample_tile_width, nx=Nx, ny=Ny)
        feature_tile_stack = utils.restack_to_tiles(
            cell_mat, tile_width=feature_tile_width, nx=nx, ny=ny)

        # load tumor edge matrix (skipping slide if no matrix is found)
        try:
            edges = np.load(slide.split(".npy")[0] + "_edges.npy")
            edges_tile_stack = utils.restack_to_tiles(
                edges, tile_width=sample_tile_width, nx=Nx, ny=Ny)
        except IOError:
            print 'No edge matrix. Skipping slide...'
            continue

        # select valid tiles for sampling, skipping slide if none are available
        tile_mask = utils.tile_stack_mask(Nx, Ny, L=sample_layers,
                                          db_stack=edges_tile_stack)
        if np.sum(tile_mask) == 0:
            print '0 valid samples. Skipping slide...'
            continue

        # uniformly sample tiles from the valid sample space of size n_samples
        sampled_indices = np.random.choice(
            a=Nx * Ny, size=int(min(N_SAMPLES, np.sum(tile_mask))),
            p=tile_mask / float(np.sum(tile_mask)), replace=False)
        sampled_tiles = sample_tile_stack[sampled_indices, :, :]

        # compute response variable over sampled tiles
        response, nts = utils.get_pdl1_response(sampled_tiles, circle=True,
                                                diameter=sample_tile_width,
                                                diagnostic=True)

        # compute feature arrays over sampled tiles from neighboring tiles
        feature_rows = np.vstack([
            utils.get_feature_array(idx, feature_tile_stack, Nx, scale,
                                    feature_layers, flag)
            for idx in sampled_indices
        ])

        # add outputs to growing arrays
        combined_response.extend(response)
        combined_features.append(feature_rows)
        combined_nts.extend(nts)

    # convert feature and response to numpy arrays for analysis
    combined_features = np.vstack(combined_features)
    combined_features[np.isnan(combined_features)] = -1
    combined_response = np.array(combined_response)
    combined_nts = np.array(combined_nts)

    return combined_features, combined_response, combined_nts
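
# `utils.restack_to_tiles` is used throughout; the sketch below captures the
# assumed contract: crop the slide to an integer number of tiles and reshape
# it into an (nx * ny, tile_width, tile_width) stack in row-major tile order,
# which is what the flat `sampled_indices` above index into.
def restack_to_tiles_sketch(mat, tile_width, nx, ny):
    cropped = mat[:ny * tile_width, :nx * tile_width]
    tiles = cropped.reshape(ny, tile_width, nx, tile_width)
    return tiles.transpose(0, 2, 1, 3).reshape(nx * ny, tile_width, tile_width)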