Example #1
def all_slides_tile_mat():
    all_samples = processing.get_list_of_samples()

    corr_mat = []
    for count, slide in enumerate(all_samples):
        utils.print_progress(count)

        cell_mat = np.load(slide).astype(int)

        corr_row = get_tile_correlation(cell_mat, dist=[50, 100, 200])

        corr_mat.append(corr_row)

    corr_mat = np.vstack(corr_mat)

    return corr_mat
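# get_tile_correlation is a repo helper (called as
# texture_analysis.get_tile_correlation elsewhere in this file) whose
# definition is not shown. A hypothetical sketch of the kind of feature row it
# could return -- spatial autocorrelation of the cell map at each pixel lag in
# dist -- purely for illustration:
#
#     def get_tile_correlation(cell_mat, dist):
#         row = []
#         for d in dist:
#             a = cell_mat[:-d, :].ravel().astype(float)
#             b = cell_mat[d:, :].ravel().astype(float)
#             row.append(np.corrcoef(a, b)[0, 1])
#         return np.array(row)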
Example #2
def map_cells_to_mat():
    ''' Reconstructs original slides by reading original cell processing text files
        and recording cell phenotypes in a 1040x1392 numpy matrix.
    '''
    outloc = os.path.join(DATA_PATH, 'processed')
    if not os.path.exists(outloc):
        os.makedirs(outloc)

    files = glob.glob(DATA_PATH + '*cell_seg_data.txt')
    files.sort()

    for i, item in enumerate(files):
        utils.print_progress(i)

        cells = processing.load_sample(item,
                                       confidence_thresh=0,
                                       verbose=False,
                                       radius_lim=0)

        mat = processing.map_phenotypes_to_mat(cells)

        name = item.split("EACRI HNSCC\\")[1].split("_cell_seg_data")[0]
        np.save(os.path.join(outloc, name), mat)
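# A minimal sketch of what processing.map_phenotypes_to_mat plausibly does,
# based on the docstring above (the real helper lives in the repo's processing
# module; the column names below are hypothetical, and numpy as np is assumed
# imported, as elsewhere in this file):
def _map_phenotypes_to_mat_sketch(cells, shape=(1040, 1392)):
    # place each cell's numeric phenotype code at its integer pixel position
    mat = np.zeros(shape, dtype=int)
    ys = cells['cell_y_position'].astype(int).values
    xs = cells['cell_x_position'].astype(int).values
    mat[ys, xs] = cells['phenotype_code'].values
    return mat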
Example #3
def main():
    all_samples = processing.get_list_of_samples()

    combined_features = []
    combined_response = []
    used_samples = []

    # iterate over all slides
    for count, slide in enumerate(all_samples):
        utils.print_progress(count)

        # load slide matrix and decision boundary
        cell_mat = np.load(slide)
        try:
            regions = np.load(slide.split(".npy")[0] + "_seg.npy")
        except IOError:
            print "skipping slide with no region info"
            continue

        # corr_row = texture_analysis.get_tile_correlation(cell_mat, dist=[50])

        # disregard unclassified regions for total area (areas in thousands of px)
        tumor_area = np.sum(regions == 1) / 1000
        stromal_area = np.sum(regions == 0) / 1000
        total_area = tumor_area + stromal_area

        tumor_mask = (regions == 1).astype(int)
        stromal_mask = (regions == 0).astype(int)

        # slide-level cell counts
        n_tumor = np.sum(cell_mat == 1)
        n_pdl1 = np.sum(cell_mat == 2)
        n_any_tumor = n_tumor + n_pdl1
        n_foxp3 = np.sum(cell_mat == 3)
        n_cd8 = np.sum(cell_mat == 4)
        n_cd4 = np.sum(cell_mat == 5)
        n_pdmac = np.sum(cell_mat == 6)
        n_other = np.sum(cell_mat == 7)
        n_macs = np.sum(cell_mat == 8)

        # all_densities = [n_any_tumor, n_foxp3, n_cd8, n_cd4,
        #                             n_pdmac, n_other, n_macs] / total_area
        # all_densities = [n_any_tumor, n_foxp3, n_cd8, n_cd4,
        #                             n_pdmac, n_other, n_macs] / np.sum(cell_mat > 0)
        # all_densities = [n_any_tumor, n_foxp3, n_cd8, n_cd4,
        #                             n_pdmac, n_other, n_macs]

        feature_row = [
            tumor_area / total_area, stromal_area / total_area, n_tumor,
            n_pdl1, n_any_tumor, n_foxp3, n_cd8, n_cd4, n_pdmac, n_other,
            n_macs
        ]

        # # stromal cell counts
        # ns_tumor = np.sum((cell_mat == 1) * stromal_mask)
        # ns_pdl1 = np.sum((cell_mat == 2) * stromal_mask)
        # ns_any_tumor = ns_tumor + ns_pdl1 + 1
        # ns_foxp3 = np.sum((cell_mat == 3) * stromal_mask) + 1
        # ns_cd8 = np.sum((cell_mat == 4) * stromal_mask) + 1
        # ns_cd4 = np.sum((cell_mat == 5) * stromal_mask) + 1
        # ns_pdmac = np.sum((cell_mat == 6) * stromal_mask) + 1
        # ns_other = np.sum((cell_mat == 7) * stromal_mask) + 1
        # ns_macs = np.sum((cell_mat == 8) * stromal_mask) + 1
        #
        # # stromal_densities = [ns_any_tumor, ns_foxp3, ns_cd8, ns_cd4,
        # #                                 ns_pdmac, ns_other, ns_macs] / stromal_area
        # stromal_densities = [ns_any_tumor, ns_foxp3, ns_cd8, ns_cd4,
        #                                 ns_pdmac, ns_other, ns_macs] / np.sum((cell_mat>0)*stromal_area)
        # # stromal_densities = [ns_any_tumor, ns_foxp3, ns_cd8, ns_cd4,
        # #                                 ns_pdmac, ns_other, ns_macs]
        # # in-tumor cell counts
        # nt_tumor = np.sum((cell_mat == 1) * tumor_mask)
        # nt_pdl1 = np.sum((cell_mat == 2) * tumor_mask)
        # nt_any_tumor = nt_tumor + nt_pdl1 + 1
        # nt_foxp3 = np.sum((cell_mat == 3) * tumor_mask) + 1
        # nt_cd8 = np.sum((cell_mat == 4) * tumor_mask) + 1
        # nt_cd4 = np.sum((cell_mat == 5) * tumor_mask) + 1
        # nt_pdmac = np.sum((cell_mat == 6) * tumor_mask) + 1
        # nt_other = np.sum((cell_mat == 7) * tumor_mask) + 1
        # nt_macs = np.sum((cell_mat == 8) * tumor_mask) + 1
        #
        # # tumor_densities = [nt_any_tumor, nt_foxp3, nt_cd8, nt_cd4,
        # #                                 nt_pdmac, nt_other, nt_macs] / tumor_area
        # tumor_densities = [nt_any_tumor, nt_foxp3, nt_cd8, nt_cd4,
        #                                 nt_pdmac, nt_other, nt_macs] / np.sum((cell_mat>0)*tumor_area)
        # # tumor_densities = [nt_any_tumor, nt_foxp3, nt_cd8, nt_cd4,
        # #                                 nt_pdmac, nt_other, nt_macs]

        if n_any_tumor < 100:
            print("skipping slide with {0} tumor cells".format(n_any_tumor))
            continue
        # if (tumor_area/stromal_area < 0.1) or (stromal_area/tumor_area < 0.1):
        #     print("skipping slide with tumor:stromal area ratio:", tumor_area / stromal_area)
        #     continue

        # feature_row = np.concatenate((stromal_densities, tumor_densities))
        # feature_row = np.concatenate((stromal_densities, tumor_densities, corr_row))
        # feature_row = corr_row

        ratio = n_pdl1 / (n_tumor + n_pdl1)

        combined_features.append(feature_row)
        combined_response.append(ratio)
        used_samples.append(slide)

    # convert feature and response to numpy arrays for analysis
    combined_features = np.vstack(combined_features)
    combined_response = np.array(combined_response)
    combined_features.shape
    # used_samples = [x.replace("processed_orig_seg", "processed") for x in used_samples]

    #### Combine features with clinical information ####

    lookup = clinical.clinical_lookup_table()
    lookup.shape

    feature_names = [
        "".join(["f", str(x)]) for x in range(combined_features.shape[1])
    ]
    tmp = pd.DataFrame(combined_features, columns=feature_names)
    tmp['response'] = combined_response
    tmp['slide'] = used_samples
    all_data = pd.merge(tmp, lookup)
    all_data.shape

    # create index for patients from 1-n
    ids = list(set(all_data.id))
    id_convert = dict(zip(ids, np.arange(len(ids))))
    all_data['idx'] = [id_convert[str(x)] for x in all_data.id.values]

    # add logit-transformed response variable (dtr is reused below to undo the transform)
    dtr = learning.VectorTransform(all_data.response)
    all_data['y*'] = dtr.zero_one_scale().apply('logit')
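    # learning.VectorTransform is a repo helper; roughly (an assumption, not
    # its actual code), zero_one_scale() squeezes the response into (0, 1) as
    #     z = (y - y_min) / (y_max - y_min), nudged away from {0, 1},
    # apply('logit') stores y* = log(z / (1 - z)), and undo() inverts both
    # steps: z = 1 / (1 + exp(-y*)), then y = z * (y_max - y_min) + y_min.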

    all_data.head()
    # fig = display.dotplot(all_data.response, all_data.idx, n_patients=40)
    # fig.savefig(HOME_DIR + '/results/whole_slide/dotplot.png', format='png', dpi=150)

    # display.scatter_hist(all_data.response, all_data.response)
    # display.scatter_hist(all_data['y*'], all_data['y*'], xlims=[-8,8], ylims=[-8,8])

    # for setting aside a holdout set -- however, there are not enough patients to reliably
    # evaluate a holdout set. We rely on repeated k-fold cross validation and (potentially)
    # .632+ bootstrap estimates

    # patients = np.unique(all_data.idx)
    # holdout_patients = np.random.choice(patients, size=int(0.25 * len(patients)), replace=False)
    # holdout_data = all_data.loc[all_data.idx.isin(holdout_patients), :]
    # learn_data = all_data.loc[~all_data.idx.isin(holdout_patients), :]

    #### fit machine learning models ####

    from sklearn.linear_model import LassoCV
    from sklearn.linear_model import Lasso
    from sklearn.linear_model import RidgeCV
    from sklearn.linear_model import Ridge

    from sklearn.linear_model import ElasticNet, ElasticNetCV
    from sklearn.linear_model import LinearRegression
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVR
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import MaxAbsScaler

    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import RepeatedKFold, GroupKFold

    X_ = all_data.loc[:, feature_names].values
    y_ = all_data['y*'].values
    patient_ids = all_data.idx.values

    # log-transform the features; X_ has only len(feature_names) columns, so the
    # :14 slice takes all of them, and any zero counts will map to -inf
    X_[:, :14] = np.log(X_[:, :14])

    # sklearn's GroupKFold for grouped cross-validation does not implement shuffling so
    # returns the same train/test sets each call. We attempted taking row-wise permutations
    # of the patient dataframe for each iteration as a workaround, but that did not change
    # the splits, because GroupKFold seems to be using the ordered group values to split.
    # Instead, we shuffle the unique values of the group variable to make different group splits.
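    # A sketch of what utils.unique_value_shuffle plausibly does (an assumption;
    # the helper's definition is not shown here): remap each unique group value
    # to a randomly drawn unique value, preserving which rows belong together
    # while changing the ordering GroupKFold sees, e.g.
    #     def unique_value_shuffle(groups):
    #         uniq = np.unique(groups)
    #         remap = dict(zip(uniq, np.random.permutation(uniq)))
    #         return np.array([remap[g] for g in groups])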
    alphas = np.logspace(-2, 3, 20)
    estimator = RidgeCV(alphas=alphas)
    # estimator = RandomForestRegressor(n_estimators=100)
    gcv = GroupKFold(n_splits=10)
    replications = 1
    rep_scores = []
    out_sample = {'pred': [], 'target': [], 'id': []}

    # pca = PCA()

    for _ in range(replications):

        # permute input arrays the same way
        ids_shuffle = utils.unique_value_shuffle(patient_ids)

        # perform grouped cross-validation estimation
        for train, test in gcv.split(X_, y_, ids_shuffle):

            X_train = X_[train]
            X_test = X_[test]
            y_train = y_[train]
            y_test = y_[test]

            # X_train = np.log(X_train)
            # X_test = np.log(X_test)

            scale = StandardScaler()
            X_train = scale.fit_transform(X_train)
            X_test = scale.transform(X_test)

            # X_train = pca.fit_transform(X_train)
            # X_test = pca.transform(X_test)

            preds = estimator.fit(X_train, y_train).predict(X_test)
            preds = dtr.undo(preds)
            y_test = dtr.undo(y_test)

            rep_scores.append(metrics.rmse(preds, y_test))
            # rep_scores.append(estimator.score(X_test, y_test))

            out_sample['pred'].extend(preds)
            out_sample['target'].extend(y_test)
            out_sample['id'].extend(ids_shuffle[test])

    print(np.mean(rep_scores), np.std(rep_scores))

    # TEST POLYNOMIAL FEATURES
    # take polynomial combinations of the inputs
    from sklearn.preprocessing import PolynomialFeatures
    poly = PolynomialFeatures(2)
    X_p = np.hstack((X_, 1 / X_))
    X_p = poly.fit_transform(X_p)
    X_p = np.log(X_p)
    X_p.shape

    estimator = RidgeCV()
    gcv = GroupKFold(n_splits=10)
    scale = MaxAbsScaler()
    replications = 10
    rep_scores = []
    out_sample = {'pred': [], 'target': [], 'id': []}

    for _ in range(replications):

        # permute input arrays the same way
        ids_shuffle = utils.unique_value_shuffle(patient_ids)

        # perform grouped cross-validation estimation
        for train, test in gcv.split(X_p, y_, ids_shuffle):

            X_train = X_p[train]
            X_test = X_p[test]
            y_train = y_[train]
            y_test = y_[test]

            X_train = scale.fit_transform(X_train)

            X_test = scale.transform(X_test)

            preds = estimator.fit(X_train, y_train).predict(X_test)
            preds = dtr.undo(preds)
            y_test = dtr.undo(y_test)

            rep_scores.append(metrics.rmse(preds, y_test))

            out_sample['pred'].extend(preds)
            out_sample['target'].extend(y_test)
            out_sample['id'].extend(ids_shuffle[test])

    print(np.mean(rep_scores), np.std(rep_scores))

    # dict elements from list to array
    for key, value in out_sample.items():
        out_sample[key] = np.array(value)

    metrics.rmse(out_sample['pred'], out_sample['target'])
    fig = display.fixed_scatter(out_sample['pred'], out_sample['target'])

    def aggregate_by_patient(prediction, target, patient_id):
        gdf = pd.DataFrame({
            'preds': prediction,
            'targs': target,
            'id': patient_id
        })
        by_pt_results = gdf.groupby('id').mean()
        return by_pt_results.preds.values, by_pt_results.targs.values

    pt_pred, pt_targ = aggregate_by_patient(out_sample['pred'],
                                            out_sample['target'],
                                            out_sample['id'])

    fig = display.fixed_scatter(pt_pred, pt_targ)
    fig.suptitle('Patient level pdl1')
    fig.savefig(HOME_DIR + '/results/whole_slide/patient_sqrt_ratios.png',
                format='png',
                dpi=150)

    metrics.rmse(pt_pred, pt_targ)
    metrics.corr(pt_pred, pt_targ)

    plt.scatter(pt_targ - pt_pred, pt_pred)

    # try: converting cell counts to some kind of probability measure as feature
    #   --> correcting cell counts via slide-level cellular density

    # from sklearn.feature_selection import RFECV
    # rfecv = RFECV(estimator=estimator, step=10, cv=10, scoring='mean_squared_error')
    # X_new = rfecv.fit_transform(X_all, y_all)
    # print("Optimal number of features : %d" % rfecv.n_features_)
    # # Plot number of features VS. cross-validation scores
    # plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)

    ##########################################
    ##########################################

    learning_data = all_data.loc[:, feature_names + ['response', 'y*', 'idx']]

    # create training and test set data split by patient (not slide)
    from sklearn.model_selection import train_test_split
    n_patients = np.max(all_data.idx) + 1  # idx runs 0..n-1, so add 1 for the count
    idx_train, idx_test = train_test_split(range(n_patients), test_size=0.1)
    train_data = learning_data[learning_data.idx.isin(idx_train)]
    test_data = learning_data[learning_data.idx.isin(idx_test)]

    from sklearn.preprocessing import PolynomialFeatures
    poly = PolynomialFeatures(2)
    X_all = tmp.loc[:, feature_names]
    X_all = np.array(X_all)

    X_all[X_all == 0] = 1
    X_all = np.hstack((X_all, 1 / X_all))
    X_all = poly.fit_transform(X_all)

    # separate out train/test arrays by X/y/y*
    X_train, y_train = train_data.loc[:, feature_names], train_data.response
    X_test, y_test = test_data.loc[:, feature_names], test_data.response
    ystar_train, ystar_test = train_data['y*'], test_data['y*']

    # standardize X features using X_train and apply to X_test
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # candidate regression/learning algorithms; the last assignment (ElasticNet) is used below
    from sklearn.neural_network import MLPRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import LinearRegression
    estimator = RandomForestRegressor(n_estimators=300,
                                      oob_score=True,
                                      bootstrap=True)
    estimator = LinearRegression()
    estimator = MLPRegressor(hidden_layer_sizes=(2,),
                             max_iter=10000,
                             alpha=0.005,
                             solver='lbfgs',
                             activation='logistic',
                             warm_start=False)
    estimator = ElasticNet(alpha=0.005, l1_ratio=.1, max_iter=10000)

    ##### SLIDE-LEVEL METRICS #####
    X_preds = estimator.fit(X_train, y_train).predict(X_test)
    metrics.rmse(X_preds, y_test)
    metrics.corr(X_preds, y_test)
    display.fixed_scatter(X_preds, y_test)

    # repeat process to test *transformed* response var
    Xstar_preds = estimator.fit(X_train, ystar_train).predict(X_test)

    # undo the transform
    Xstar_restore = dtr.undo(Xstar_preds)
    ystar_test_restore = dtr.undo(ystar_test)
    if not np.allclose(ystar_test_restore, y_test):
        raise ValueError(
            "Undo on transformed response does not match original.")

    metrics.rmse(Xstar_restore, y_test)
    metrics.corr(Xstar_restore, y_test)
    display.fixed_scatter(Xstar_restore, y_test)

    ##### PATIENT-LEVEL METRICS #####
    def aggregate_by_patient(prediction, target, patient_id):
        gdf = pd.DataFrame({
            'preds': prediction,
            'targs': target,
            'id': patient_id
        })
        by_pt_results = gdf.groupby('id').mean()
        return by_pt_results.preds.values, by_pt_results.targs.values

    # test untransformed y-var
    pt_pred, pt_response = aggregate_by_patient(X_preds,
                                                y_test,
                                                patient_id=test_data.idx)
    metrics.rmse(pt_pred, pt_response)
    metrics.corr(pt_pred, pt_response)
    display.fixed_scatter(pt_pred, pt_response)

    # test transformed y-var
    pt_pred, pt_response = aggregate_by_patient(Xstar_restore,
                                                y_test,
                                                patient_id=test_data.idx)
    metrics.rmse(pt_pred, pt_response)
    metrics.corr(pt_pred, pt_response)
    display.fixed_scatter(pt_pred, pt_response)

    estimator.coef_

    ################################################################

    #
    # def apply_boxcox(X):
    #     return np.apply_along_axis(_boxcox_transform, 0, X)
    #
    # def _boxcox_transform(arr):
    #     return stats.boxcox(arr)[0]

    def confusion_matrix(x, y, threshold=0.5):
        q1 = np.sum((x > threshold) & (y > threshold))
        q2 = np.sum((x > threshold) & (y <= threshold))
        q3 = np.sum((x <= threshold) & (y > threshold))
        q4 = np.sum((x <= threshold) & (y <= threshold))
        return np.array([[q1, q2], [q3, q4]])

    confusion_matrix(pt_pred, pt_response, threshold=0.3)

    import helper.metrics as metrics
    tmp = all_data[all_data.STAGE == 4]
    metrics.corr_nan(tmp.response, tmp.radiation)
Example #4
def extract_dataset(diams, sample_diam, flag):

    np.random.seed(1000)

    # set sampling parameters
    N_SLIDES = 314     # number of slides to use
    N_SAMPLES = 30      # max samples to take from a single slide

    # set tile feature extraction parameters
    sample_tile_width = sample_diam
    feature_tile_width = 1
    Nx, Ny = int(1392 / sample_tile_width), int(1040 / sample_tile_width)       # no. sample tiles
    nx, ny = Nx * sample_tile_width, Ny * sample_tile_width                     # no. feature tiles
    offset_px = int((max(diams) - sample_diam) / 2)
    offset_tiles = int(np.ceil(offset_px / sample_diam))
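    # worked example (for illustration only): diams=[50, 100, 200], sample_diam=50
    # gives offset_px = (200 - 50) / 2 = 75 and offset_tiles = ceil(75 / 50) = 2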

    # get pre-processed slide matrices and select random sample of slides
    all_slides = processing.get_list_of_samples(processed_slides)
    SLIDES = [all_slides[i] for i in np.random.choice(len(all_slides), N_SLIDES, replace=False)]

    # initialize processed variables storage
    ncells_all = []
    slides_all = []
    X_all = []
    y_all = []
    overlap_all = []

    # process samples in batches
    batch_size = 10
    for idx in range(0, N_SLIDES, batch_size):
        BATCH = SLIDES[idx:idx + batch_size]

        # iterate over sampled slides to extract feature and response variables via tile sampling
        batch_ncells = []
        batch_features = []
        batch_response = []
        batch_slides = []
        batch_overlap = []
        for i, slide in enumerate(BATCH):
            print_progress(i)

            # load slide and reshape into sample and feature tile stacks
            cell_mat = np.load(slide)
            sample_tile_stack = utils.restack_to_tiles(cell_mat, tile_width=sample_tile_width,
                                                       nx=Nx, ny=Ny)
            feature_tile_stack = utils.restack_to_tiles(cell_mat, tile_width=feature_tile_width,
                                                        nx=nx, ny=ny)

            # load seg file to compute ratio of processed area to total slide area
            seg = np.load(slide.split(".npy")[0] + "_seg.npy")
            correction = 1392 * 1040 / np.sum(seg != -1)

            n_cells_total = np.sum(cell_mat != 0)
            n_cells_corrected = n_cells_total * correction

            # make unprocessed region matrix from seg file
            seg_map = (seg == -1).astype(int)
            seg_tile_stack = utils.restack_to_tiles(seg_map, tile_width=feature_tile_width,
                                                    nx=nx, ny=ny)

            ### Used for limiting tile sampling to 'edge regions' between tumor and stroma.
            ### For now I think it is simpler and more explainable to permit sampling anywhere
            ### in the tumor, not just on the edge. I may revisit this in the future.
            # # load tumor edge matrix (skipping slide if no matrix is found)
            # try:
            #     edges = np.load(slide.split(".npy")[0] + "_edges.npy")
            #     edges_tile_stack = utils.restack_to_tiles(edges, tile_width=sample_tile_width,
            #                                               nx=Nx, ny=Ny)
            # except IOError:
            #     print('No edge matrix. Skipping slide...')
            #     continue

            # # select valid tiles for sampling, skipping slide if no valid tiles are available
            # tile_mask = utils.tile_stack_mask(Nx, Ny, L=sample_layers, db_stack=edges_tile_stack)
            # get set of valid sampling tiles (tiles with enough offset from the edges)
            tile_mask = utils.tile_stack_mask(Nx, Ny, L=offset_tiles, db_stack=None)
            n_tiles = int(min(N_SAMPLES, np.sum(tile_mask)))
            if n_tiles == 0:
                print('0 valid samples. Skipping slide...')
                continue

            # store batch cell numbers and slide names
            batch_ncells.extend([n_cells_corrected] * n_tiles)
            batch_slides.extend([slide] * n_tiles)

            # uniformly sample up to N_SAMPLES tiles from the valid sample space;
            # in this case, set to take all available samples from each slide
            sampled_indices = np.random.choice(a=Nx * Ny, size=n_tiles,
                                               p=tile_mask / np.sum(tile_mask), replace=False)
            sampled_tiles = sample_tile_stack[sampled_indices, :, :]

            # compute response variable over sampled tiles
            response, nts = utils.get_pdl1_response(sampled_tiles, circle=True,
                                                    diameter=sample_tile_width, diagnostic=True)

            # compute feature arrays over sampled tiles from neighboring tiles
            feature_rows = []
            overlap = []
            for j in sampled_indices:
                feature_tiles = utils.get_feature_array(j, feature_tile_stack, Nx,
                                                        sample_tile_width, offset_px, flag)
                seg_map_tiles = utils.get_feature_array(j, seg_tile_stack, Nx,
                                                        sample_tile_width, offset_px, flag)

                # store feature tile and overlap with unprocessed regions
                feature_rows.append(feature_tiles)
                overlap.append(np.sum(seg_map_tiles))

            del feature_tile_stack
            del seg_tile_stack

            # add to growing array as long as any valid samples have been collected
            if len(feature_rows) > 0:
                feature_rows = np.vstack(feature_rows)
                overlap = np.array(overlap)
                # # remove observations with significant overlap (>10%) with unprocessed regions
                # mask = (np.array(overlap) <= 0.1 * max(diams) ** 2)
                # feature_rows = feature_rows[mask, :]
                # response = response[mask]
                # nts = nts[mask]

                batch_response.extend(response)
                batch_features.append(feature_rows)
                batch_overlap.extend(overlap)

        # convert feature and response to numpy arrays for analysis
        batch_features = np.vstack(batch_features)
        batch_response = np.array(batch_response)
        batch_overlap = np.array(batch_overlap)

        # ----- variable processing ----- #

        # # remove all cases with no tumor cells in the sampled tile
        # mask = combined_response == -1
        # combined_response = combined_response[~mask]
        # combined_features = combined_features[~mask, :]

        # # alternatively, remove all cases with <K tumor cells in the sampled tile
        # # print combined_nts.shape, combined_response.shape, combined_features.shape
        # mask = combined_nts < 10
        # combined_response = combined_response[~mask]
        # combined_features = combined_features[~mask, :]


        # aggregate tiles within arbitrary shapes (e.g. discs or squares of increasing size)
        n_obs = batch_features.shape[0]
        side_len = sample_tile_width + 2 * offset_px
        n_tiles = side_len ** 2

        if flag == 'n':
            phens = ['tumor','cd4','cd8','foxp3','pdmac','other']
        elif flag == 'a':
            phens = ['tumor','pdl1','cd4','cd8','foxp3','pdmac','other']
        elif flag == 't':
            phens = ['tumor','pdl1']

        phen_columns = []
        for phen in range(len(phens)):    # iterate process over each phenotype
            tmp_tiles = batch_features[:, phen * n_tiles:(phen + 1) * n_tiles]
            tmp_3d = tmp_tiles.reshape(n_obs, side_len, side_len)

            range_columns = []

            diams_0 = [0] + diams
            for i in range(len(diams)):
                print_progress('{0}: {1}'.format(phens[phen], diams[i]))
                if (flag in ['a','t']) and (phens[phen] in ['tumor','pdl1']) and (diams[i] <= sample_tile_width):
                    print("skipping.")
                    continue

                mask = utils.shape_mask(grid_dim=side_len, type='circle',
                                        S=diams_0[i + 1], s=diams_0[i])

                t = np.sum(np.multiply(tmp_3d, mask), axis=(1,2)).reshape(-1, 1)
                # sigma = np.std(np.multiply(tmp_3d, mask), axis=(1,2)).reshape(-1,1)
                range_columns.append(t)
                # range_columns.append(sigma)

            per_phen_features = np.hstack(range_columns)
            phen_columns.append(per_phen_features)

        del batch_features
        ncells_all.extend(batch_ncells)
        slides_all.extend(batch_slides)
        X_all.append(np.hstack(phen_columns))
        y_all.extend(batch_response)
        overlap_all.extend(batch_overlap)

    ncells_all = np.array(ncells_all)
    X_all = np.vstack(X_all)
    y_all = np.array(y_all)
    overlap_all = np.array(overlap_all)

    # save processed data as csv
    # note: these names assume no (phenotype, diameter) pairs were skipped above
    # (i.e. flag == 'n'); otherwise the column count will not match X_all
    feature_names = ["_".join([a, str(b)]) for a in phens for b in diams]
    feature_names.append('y')
    tmp = pd.DataFrame(np.hstack((X_all, y_all.reshape(-1,1))))
    tmp.columns = feature_names
    tmp['slide'] = slides_all
    tmp['n_cells_corrected'] = ncells_all
    tmp['unscored_overlap'] = overlap_all
    tmp = tmp.set_index('slide')
    tmp.to_csv(os.path.join(HOME_PATH, 'data', 'local_discs.csv'))
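# The annulus masks above come from utils.shape_mask, whose definition is not
# shown here. A self-contained sketch of the assumed behavior -- a circular
# annulus centered on the grid, with outer diameter S and inner diameter s
# (boundary handling is a guess; numpy as np assumed imported, as elsewhere):
def _shape_mask_sketch(grid_dim, type='circle', S=100, s=0):
    c = (grid_dim - 1) / 2.0
    yy, xx = np.mgrid[:grid_dim, :grid_dim]
    r = np.sqrt((yy - c) ** 2 + (xx - c) ** 2)
    return ((r <= S / 2.0) & (r >= s / 2.0)).astype(int)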
Example #5
def extract_dataset(diams, sample_diam, flag):

    np.random.seed(1000)

    # set sampling parameters
    N_SLIDES = 260
    N_SAMPLES = 15

    # set feature extraction parameters
    sample_tile_width = sample_diam
    feature_tile_width = 1
    feature_layers = 75

    # compute other parameters based on input parameters
    scale = int(sample_tile_width / feature_tile_width)
    assert (
        scale == sample_tile_width / feature_tile_width
    ), "sample_tile_width must be integer multiple of feature_tile_width"
    Nx, Ny = int(1392 / sample_tile_width), int(1040 / sample_tile_width)
    nx, ny = Nx * scale, Ny * scale
    sample_layers = int(
        np.ceil(feature_layers * feature_tile_width / sample_tile_width))
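    # worked example (for illustration only): feature_layers=75, feature_tile_width=1,
    # sample_diam=50 gives scale = 50 and sample_layers = ceil(75 * 1 / 50) = 2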

    # get pre-processed slide matrices and select random sample of slides
    all_samples = helper.processing.get_list_of_samples(DIR)
    SAMPLES = [
        all_samples[i]
        for i in np.random.choice(len(all_samples), N_SLIDES, replace=False)
    ]

    # iterate over sampled slides to extract feature and response variables via tile sampling
    combined_features = []
    combined_response = []
    combined_nts = []
    for i, slide in enumerate(SAMPLES):
        print_progress(i)

        # load slide and reshape into tile stacks
        cell_mat = np.load(slide)
        sample_tile_stack = utils.restack_to_tiles(
            cell_mat, tile_width=sample_tile_width, nx=Nx, ny=Ny)
        feature_tile_stack = utils.restack_to_tiles(
            cell_mat, tile_width=feature_tile_width, nx=nx, ny=ny)

        # load tumor edge matrix (skipping slide if no matrix is found)
        try:
            edges = np.load(slide.split(".npy")[0] + "_edges.npy")
            edges_tile_stack = utils.restack_to_tiles(
                edges, tile_width=sample_tile_width, nx=Nx, ny=Ny)
        except IOError:
            print('No edge matrix. Skipping slide...')
            continue

        # select valid tiles for sampling, skipping slide if no valid tiles are available
        tile_mask = utils.tile_stack_mask(Nx,
                                          Ny,
                                          L=sample_layers,
                                          db_stack=edges_tile_stack)
        if np.sum(tile_mask) == 0:
        print('0 valid samples. Skipping slide...')
            continue

        # uniformly sample tiles from the valid sample space of size n_samples
        sampled_indices = np.random.choice(a=Nx * Ny,
                                           size=int(
                                               min(N_SAMPLES,
                                                   np.sum(tile_mask))),
                                           p=tile_mask / np.sum(tile_mask),
                                           replace=False)
        sampled_tiles = sample_tile_stack[sampled_indices, :, :]

        # compute response variable over sampled tiles
        response, nts = utils.get_pdl1_response(sampled_tiles,
                                                circle=True,
                                                diameter=sample_tile_width,
                                                diagnostic=True)

        # compute feature arrays over sampled tiles from neighboring tiles
        feature_rows = np.vstack([
            utils.get_feature_array(idx, feature_tile_stack, Nx, scale,
                                    feature_layers, flag)
            for idx in sampled_indices
        ])

        # add outputs to growing array
        combined_response.extend(response)
        combined_features.append(feature_rows)
        combined_nts.extend(nts)

    # convert feature and response to numpy arrays for analysis
    combined_features = np.vstack(combined_features)
    combined_features[np.isnan(combined_features)] = -1
    combined_response = np.array(combined_response)
    combined_nts = np.array(combined_nts)

    # ----- variable processing ----- #

    # # remove all cases with no tumor cells in the sampled tile
    # mask = combined_response == -1
    # combined_response = combined_response[~mask]
    # combined_features = combined_features[~mask, :]

    # alternatively, remove all cases with <K tumor cells in the sampled tile
    print(combined_nts.shape, combined_response.shape, combined_features.shape)
    mask = combined_nts < 10
    combined_response = combined_response[~mask]
    combined_features = combined_features[~mask, :]

    # aggregate tiles within arbitrary shapes (e.g. discs or squares of increasing size)
    n_obs = combined_features.shape[0]
    side_len = scale + 2 * feature_layers
    n_tiles = side_len**2

    if flag == 'n':
        phens = ['tumor', 'cd4', 'cd8', 'foxp3', 'pdmac', 'other']
    elif flag == 'a':
        phens = ['tumor', 'pdl1', 'cd4', 'cd8', 'foxp3', 'pdmac', 'other']
    elif flag == 't':
        phens = ['tumor', 'pdl1']

    phen_columns = []
    for phen in range(len(phens)):  # iterate process over each phenotype
        tmp_tiles = combined_features[:, phen * n_tiles:(phen + 1) * n_tiles]
        tmp_3d = tmp_tiles.reshape(n_obs, side_len, side_len)

        range_columns = []

        diams_0 = [0] + diams
        for i in range(len(diams_0) - 1):
            # utils.print_progress(i)
            print(phens[phen], diams[i])
            if (flag in ['a', 't']) and (phens[phen] in [
                    'tumor', 'pdl1'
            ]) and (diams[i] <= sample_tile_width):
                print("skipping.")
                continue

            mask = utils.shape_mask(grid_dim=side_len,
                                    type='circle',
                                    S=diams_0[i + 1],
                                    s=diams_0[i])

            t = np.sum(np.multiply(tmp_3d, mask), axis=(1, 2)).reshape(-1, 1)
            # sigma = np.std(np.multiply(tmp_3d, mask), axis=(1,2)).reshape(-1,1)
            range_columns.append(t)
            # range_columns.append(sigma)

        per_phen_features = np.hstack(range_columns)
        phen_columns.append(per_phen_features)
    X = np.hstack(phen_columns)

    np.save(STORE_DIR + "data_x", X)
    np.save(STORE_DIR + "data_y", combined_response)
Example #6
def automate_tile_extraction(SAMPLES):
    ''' Extracts feature and response variables from SAMPLES via tile sampling.
        Relies on sampling parameters defined at module level by the surrounding
        script (sample_tile_width, feature_tile_width, Nx, Ny, nx, ny,
        sample_layers, N_SAMPLES, scale, feature_layers, flag).
    '''
    # iterate over sampled slides to extract feature and response variables via tile sampling
    combined_features = []
    combined_response = []
    combined_nts = []
    for i, slide in enumerate(SAMPLES):
        print_progress(i)

        # load slide and reshape into tile stacks
        cell_mat = np.load(slide)
        sample_tile_stack = utils.restack_to_tiles(
            cell_mat, tile_width=sample_tile_width, nx=Nx, ny=Ny)
        feature_tile_stack = utils.restack_to_tiles(
            cell_mat, tile_width=feature_tile_width, nx=nx, ny=ny)

        # load tumor edge matrix (skipping slide if no matrix is found)
        try:
            edges = np.load(slide.split(".npy")[0] + "_edges.npy")
            edges_tile_stack = utils.restack_to_tiles(
                edges, tile_width=sample_tile_width, nx=Nx, ny=Ny)
        except IOError:
            print('No edge matrix. Skipping slide...')
            continue

        # select valid tiles for sampling, skipping slide if no valid tiles are available
        tile_mask = utils.tile_stack_mask(Nx,
                                          Ny,
                                          L=sample_layers,
                                          db_stack=edges_tile_stack)
        if np.sum(tile_mask) == 0:
        print('0 valid samples. Skipping slide...')
            continue

        # uniformly sample tiles from the valid sample space of size n_samples
        sampled_indices = np.random.choice(a=Nx * Ny,
                                           size=int(
                                               min(N_SAMPLES,
                                                   np.sum(tile_mask))),
                                           p=tile_mask / np.sum(tile_mask),
                                           replace=False)
        sampled_tiles = sample_tile_stack[sampled_indices, :, :]

        # compute response variable over sampled tiles
        response, nts = utils.get_pdl1_response(sampled_tiles,
                                                circle=True,
                                                diameter=sample_tile_width,
                                                diagnostic=True)

        # compute feature arrays over sampled tiles from neighboring tiles
        feature_rows = np.vstack([
            utils.get_feature_array(idx, feature_tile_stack, Nx, scale,
                                    feature_layers, flag)
            for idx in sampled_indices
        ])

        # add outputs to growing array
        combined_response.extend(response)
        combined_features.append(feature_rows)
        combined_nts.extend(nts)

    # convert feature and response to numpy arrays for analysis
    combined_features = np.vstack(combined_features)
    combined_features[np.isnan(combined_features)] = -1
    combined_response = np.array(combined_response)
    combined_nts = np.array(combined_nts)

    return combined_features, combined_response, combined_nts
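# All three extraction routines lean on utils.restack_to_tiles, whose
# definition is not shown here. A self-contained sketch of the assumed reshape
# -- each tile_width x tile_width block becomes one slice of an
# (nx * ny, tile_width, tile_width) stack -- not necessarily the repo's
# actual implementation:
import numpy as np

def _restack_to_tiles_sketch(mat, tile_width, nx, ny):
    # crop to a whole number of tiles, then reorder axes so each square
    # block becomes one slice of the stack
    m = mat[:ny * tile_width, :nx * tile_width]
    return (m.reshape(ny, tile_width, nx, tile_width)
             .transpose(0, 2, 1, 3)
             .reshape(nx * ny, tile_width, tile_width))

# e.g. a 1040x1392 slide with tile_width=50 gives Nx=27, Ny=20 and a
# stack of shape (540, 50, 50)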