Ejemplo n.º 1
0
    def test_restacking(self):
        print "Running:", sys._getframe().f_code.co_name
        slide = helper.processing.get_list_of_samples()[100]

        cell_mat = np.load(slide)

        sample_tile_stack = utils.restack_to_tiles(cell_mat,
                                                   tile_width=100,
                                                   nx=5,
                                                   ny=5)
        feature_tile_stack = utils.restack_to_tiles(cell_mat,
                                                    tile_width=50,
                                                    nx=10,
                                                    ny=10)

        self.assertTrue(
            np.all(
                sample_tile_stack[0, :50, :50] == feature_tile_stack[0, :, :]))
        self.assertTrue(
            np.all(
                sample_tile_stack[1, :50, :50] == feature_tile_stack[2, :, :]))
        self.assertTrue(
            np.all(sample_tile_stack[0, :50,
                                     50:100] == feature_tile_stack[1, :, :]))

        self.assertTrue(
            np.all(sample_tile_stack[5, :50, :50] == feature_tile_stack[
                20, :, :]))
        self.assertTrue(
            np.all(sample_tile_stack[24, 50:100, :50] == feature_tile_stack[
                98, :, :]))
Ejemplo n.º 2
0
    def test_tile_pdl1_count(self):
        print "Running:", sys._getframe().f_code.co_name
        tile_width = 200
        protected_edge_layers = 1
        Nx, Ny = int(1392 / tile_width), int(1040 / tile_width)

        slide = helper.processing.get_list_of_samples()[26]

        cell_mat = np.load(slide)
        imloc = helper.processing.get_original_image(slide)

        cell_mat = cell_mat[:Ny * tile_width, :Nx * tile_width]
        sample_tile_stack = utils.restack_to_tiles(cell_mat,
                                                   tile_width=tile_width,
                                                   nx=Nx,
                                                   ny=Ny)

        tile_mask = utils.tile_stack_mask(Nx,
                                          Ny,
                                          L=protected_edge_layers,
                                          db_stack=None)

        # uniformly sample tiles from the valid sample space (where tile_mask == 1)
        np.random.seed(30)
        sample_index = np.random.choice(a=Nx * Ny,
                                        size=1,
                                        p=tile_mask / np.sum(tile_mask),
                                        replace=False)
        # only keep the selected tile
        tile_mask[:sample_index[0]] = 0
        tile_mask[sample_index[0] + 1:] = 0

        # select sample tile and compute response variable
        sampled_tiles = sample_tile_stack[sample_index, :, :]
        response, nts = utils.get_pdl1_response(sampled_tiles,
                                                circle=True,
                                                diameter=tile_width,
                                                diagnostic=True)

        # form visualization mask
        tmp = tile_mask.reshape((Ny, Nx))
        tmp2 = np.repeat(tmp, tile_width, axis=0)
        expanded_mask = np.repeat(tmp2, tile_width, axis=1)

        # convert selected tile shape from square to a circle
        expanded_mask_tiles = utils.restack_to_tiles(expanded_mask,
                                                     tile_width=tile_width,
                                                     nx=Nx,
                                                     ny=Ny)
        mask = utils.shape_mask(tile_width, type='circle', S=tile_width, s=0)
        expanded_mask_tiles_masked = np.multiply(expanded_mask_tiles, mask)
        expanded_mask_circle = utils.flatten_tile_stack(
            expanded_mask_tiles_masked, tile_width=tile_width, nx=Nx, ny=Ny)

        print 'Percent pdl1+ (red): ', response[0]
        print 'Total no. of tumor cells (blue and red)', nts[0]
        display.visualize_sampling(db=expanded_mask_circle, cell_mat=cell_mat)
Ejemplo n.º 3
0
    def test_edge_masking_visualization(self):
        print "Running:", sys._getframe().f_code.co_name
        tile_width = 200
        protected_edge_layers = 1
        Nx, Ny = int(1392 / tile_width), int(1040 / tile_width)

        slide = helper.processing.get_list_of_samples()[100]

        cell_mat = np.load(slide)
        imloc = helper.processing.get_original_image(slide)

        edges = np.load(slide.split(".npy")[0] + "_seg.npy")

        cell_mat = cell_mat[:Ny * tile_width, :Nx * tile_width]
        edges = edges[:Ny * tile_width, :Nx * tile_width]

        edges_tile_stack = utils.restack_to_tiles(edges,
                                                  tile_width=tile_width,
                                                  nx=Nx,
                                                  ny=Ny)

        tile_mask = utils.tile_stack_mask(Nx,
                                          Ny,
                                          L=protected_edge_layers,
                                          db_stack=edges_tile_stack)

        tmp = tile_mask.reshape((Ny, Nx))
        tmp2 = np.repeat(tmp, tile_width, axis=0)
        expanded_mask = np.repeat(tmp2, tile_width, axis=1)

        display.visualize_sampling(image=imloc)
        display.visualize_sampling(db=expanded_mask, cell_mat=cell_mat)
Ejemplo n.º 4
0
    def test_feature_tile_matching_offset(self):
        # TODO
        print "Running:", sys._getframe().f_code.co_name
        feature_diameter = 10
        sample_diam = 5
        feature_tile_width = 1
        sample_tile_width = sample_diam

        nx, ny = 50, 40  # no. feature tiles
        Nx, Ny = int(50 / sample_tile_width), int(
            40 / sample_tile_width)  # no. sample tiles
        offset_px = int((feature_diameter - sample_diam) / 2)
        offset_tiles = int(np.ceil(offset_px / sample_diam))

        a = np.random.permutation(ny * nx).reshape(ny, nx)
        feature_tile_stack = utils.restack_to_tiles(
            a, tile_width=feature_tile_width, nx=nx, ny=ny)
        feature_tile_stack.shape

        idx = 11
        scale = sample_tile_width / feature_tile_width
        output = utils.id_feature_tiles(idx,
                                        Nx,
                                        scale,
                                        feature_layers=offset_px)
        output
Ejemplo n.º 5
0
    def test_feature_tile_matching(self):
        print "Running:", sys._getframe().f_code.co_name
        feature_diameter = 5
        sample_diam = 5
        feature_tile_width = 1
        sample_tile_width = sample_diam

        nx, ny = 25, 20  # no. feature tiles
        Nx, Ny = int(25 / sample_tile_width), int(
            20 / sample_tile_width)  # no. sample tiles
        offset_px = int((feature_diameter - sample_diam) / 2)
        offset_tiles = int(np.ceil(offset_px / sample_diam))

        a = np.random.permutation(ny * nx).reshape(ny, nx)
        feature_tile_stack = utils.restack_to_tiles(
            a, tile_width=feature_tile_width, nx=nx, ny=ny)
        feature_tile_stack.shape

        idx = 6
        scale = sample_tile_width / feature_tile_width
        output = utils.id_feature_tiles(idx,
                                        Nx,
                                        scale,
                                        feature_layers=offset_px)

        self.assertTrue(np.all(output[4:7] == np.array([134, 155, 156])))
Ejemplo n.º 6
0
    def test_stack_and_flatten(self):
        print "Running:", sys._getframe().f_code.co_name
        nx, ny = 8, 5
        tw = 3

        a = np.random.permutation(tw * tw * nx * ny).reshape(tw * ny, tw * nx)
        tiles = utils.restack_to_tiles(a, tile_width=tw, nx=nx, ny=ny)
        b = utils.flatten_tile_stack(tiles, tile_width=tw, nx=nx,
                                     ny=ny).astype(int)

        self.assertTrue(np.all(a == b))
Ejemplo n.º 7
0
    def test_tile_feature_extraction(self):
        """Sample one tile from slide 26, aggregate its features and display it.

        One 200 px tile is drawn (seed 30), its response is computed, the
        feature array for that tile is fetched with ``utils.get_feature_array``
        (flag ``'n'``) and summed per phenotype inside the circular mask(s)
        defined by ``diams``.  The per-phenotype sums are printed and the
        sampled tile is visualised.  Nothing is asserted.
        """
        print "Running:", sys._getframe().f_code.co_name
        tile_width = 200
        protected_edge_layers = 1
        feature_tile_width = 1
        diams = [100]
        # max(diams) is smaller than tile_width here, so offset_px is -50 and
        # the feature window below (side_len) is 100 px wide.
        offset_px = int((max(diams) - tile_width) / 2)
        # offset_tiles is not used in the rest of this test.
        offset_tiles = int(np.ceil(offset_px / tile_width))
        Nx, Ny = int(1392 / tile_width), int(1040 / tile_width)
        # one feature tile per pixel of the region covered by whole sample tiles
        nx, ny = Nx * tile_width, Ny * tile_width

        slide = helper.processing.get_list_of_samples()[26]

        cell_mat = np.load(slide)
        # imloc is not used in the rest of this test.
        imloc = helper.processing.get_original_image(slide)

        # cell_mat = cell_mat[:Ny * tile_width, :Nx * tile_width]
        # NOTE(review): the crop above is disabled, so restack_to_tiles gets
        # the full matrix -- confirm it tolerates the leftover rows/columns.
        sample_tile_stack = utils.restack_to_tiles(cell_mat,
                                                   tile_width=tile_width,
                                                   nx=Nx,
                                                   ny=Ny)
        feature_tile_stack = utils.restack_to_tiles(
            cell_mat, tile_width=feature_tile_width, nx=nx, ny=ny)
        tile_mask = utils.tile_stack_mask(Nx,
                                          Ny,
                                          L=protected_edge_layers,
                                          db_stack=None)

        # uniformly sample tiles from the valid sample space (where tile_mask == 1)
        np.random.seed(30)
        sample_index = np.random.choice(a=Nx * Ny,
                                        size=1,
                                        p=tile_mask / np.sum(tile_mask),
                                        replace=False)
        # only keep the selected tile
        tile_mask[:sample_index[0]] = 0
        tile_mask[sample_index[0] + 1:] = 0

        # select sample tile and compute response variable
        # (response and nts are not used further in this test)
        sampled_tiles = sample_tile_stack[sample_index, :, :]
        response, nts = utils.get_pdl1_response(sampled_tiles,
                                                circle=True,
                                                diameter=tile_width,
                                                diagnostic=True)
        # compute feature arrays over sampled tiles from neighboring tiles
        feature_rows = np.vstack([
            utils.get_feature_array(idx, feature_tile_stack, Nx, tile_width,
                                    offset_px, 'n') for idx in sample_index
        ])

        # form visualization mask: expand the (Ny, Nx) tile mask to pixels
        tmp = tile_mask.reshape((Ny, Nx))
        tmp2 = np.repeat(tmp, tile_width, axis=0)
        expanded_mask = np.repeat(tmp2, tile_width, axis=1)

        # convert selected tile shape from square to a circle
        expanded_mask_tiles = utils.restack_to_tiles(expanded_mask,
                                                     tile_width=tile_width,
                                                     nx=Nx,
                                                     ny=Ny)
        # S=100 matches diams[0], not tile_width
        mask = utils.shape_mask(tile_width, type='circle', S=100, s=0)
        expanded_mask_tiles_masked = np.multiply(expanded_mask_tiles, mask)
        expanded_mask_circle = utils.flatten_tile_stack(
            expanded_mask_tiles_masked, tile_width=tile_width, nx=Nx, ny=Ny)

        # aggregate tiles within arbitrary shapes (e.g. discs or squares of increasing size)
        n_obs = 1
        side_len = tile_width + 2 * offset_px
        n_tiles = side_len**2

        # NOTE(review): confirm that this label order and count match the
        # column layout utils.get_feature_array produces for flag 'n'; each
        # phenotype is assumed to occupy n_tiles consecutive columns.
        phens = ['tumor', 'foxp3', 'cd8', 'cd4', 'pdmac', 'other', 'mac']

        phen_columns = []
        for phen in range(len(phens)):  # iterate process over each phenotype
            # phen = 0
            # slice this phenotype's columns and view them as a square window
            tmp_tiles = feature_rows[:, phen * n_tiles:(phen + 1) * n_tiles]
            tmp_3d = tmp_tiles.reshape(n_obs, side_len, side_len)

            range_columns = []

            # prepend 0 so ring i is passed s=diams_0[i] and S=diams_0[i + 1]
            diams_0 = [0] + diams
            for i in range(len(diams)):
                print phens[phen], diams[i]

                mask = utils.shape_mask(grid_dim=side_len,
                                        type='circle',
                                        S=diams_0[i + 1],
                                        s=diams_0[i])

                # sum of the masked window per observation, as a column vector
                t = np.sum(np.multiply(tmp_3d, mask),
                           axis=(1, 2)).reshape(-1, 1)
                # sigma = np.std(np.multiply(tmp_3d, mask), axis=(1,2)).reshape(-1,1)
                range_columns.append(t)
                # range_columns.append(sigma)

            per_phen_features = np.hstack(range_columns)
            phen_columns.append(per_phen_features)
        # print (phenotype, first ring sum of the single observation) pairs
        print zip(phens, [x[0][0] for x in phen_columns])
        display.visualize_sampling(db=expanded_mask_circle,
                                   cell_mat=cell_mat,
                                   phen='all')
def extract_dataset(diams, sample_diam, flag):
    """Sample tiles from processed slides and write ring features to CSV.

    A fixed random selection of slides (seed 1000) is processed in batches.
    Up to ``N_SAMPLES`` sample tiles are drawn per slide from the tiles
    allowed by ``utils.tile_stack_mask``.  For each sampled tile the response
    from ``utils.get_pdl1_response`` is stored, and the 1 px feature tiles
    returned by ``utils.get_feature_array`` are summed per phenotype inside
    the masks produced by ``utils.shape_mask`` -- one mask per entry of
    ``diams``.  The table is written to ``<HOME_PATH>/data/local_discs.csv``
    with columns ``<phenotype>_<diam>``, ``y``, ``n_cells_corrected`` and
    ``unscored_overlap``, indexed by slide path.

    Args:
        diams: sequence of ring diameters.  Ring ``i`` is built with
            ``S=diams[i]`` and ``s=diams[i - 1]`` (``0`` for the first ring);
            presumably increasing pixel diameters -- confirm with callers.
        sample_diam: width of a sample tile, also passed as ``diameter`` to
            ``utils.get_pdl1_response``.
        flag: phenotype set selector, one of ``'n'``, ``'a'`` or ``'t'``;
            forwarded to ``utils.get_feature_array``.

    Raises:
        ValueError: if ``flag`` is unknown, if every ring is skipped for the
            given ``flag``/``diams``, or if no slide yields a sample.
    """
    phens_by_flag = {
        'n': ['tumor', 'cd4', 'cd8', 'foxp3', 'pdmac', 'other'],
        'a': ['tumor', 'pdl1', 'cd4', 'cd8', 'foxp3', 'pdmac', 'other'],
        't': ['tumor', 'pdl1'],
    }
    if flag not in phens_by_flag:
        raise ValueError(
            "flag must be one of 'n', 'a' or 't', got {0!r}".format(flag))
    phens = phens_by_flag[flag]
    diams = list(diams)

    np.random.seed(1000)

    # set sampling parameters
    N_SLIDES = 314     # number of slides to use
    N_SAMPLES = 30     # max samples to take from a single slide

    # set tile feature extraction parameters
    sample_tile_width = sample_diam
    feature_tile_width = 1
    Nx, Ny = int(1392 / sample_tile_width), int(1040 / sample_tile_width)  # no. sample tiles
    nx, ny = Nx * sample_tile_width, Ny * sample_tile_width                # no. feature tiles
    offset_px = int((max(diams) - sample_diam) / 2)
    # float() forces true division so the ceil is not defeated by integer
    # division; the protected border must cover at least offset_px pixels.
    offset_tiles = int(np.ceil(float(offset_px) / sample_diam))
    side_len = sample_tile_width + 2 * offset_px
    window_size = side_len ** 2

    # (inner, outer) diameter of every ring
    rings = list(zip([0] + diams[:-1], diams))

    def skip_ring(phen_name, outer_diam):
        # For flags 'a' and 't', tumor/pdl1 rings no wider than the sample
        # tile are left out of the feature set.
        return (flag in ('a', 't') and phen_name in ('tumor', 'pdl1')
                and outer_diam <= sample_tile_width)

    # Name only the columns that are actually produced, so the header always
    # matches the width of the feature matrix.
    feature_names = ["_".join([a, str(b)])
                     for a in phens for b in diams if not skip_ring(a, b)]
    if not feature_names:
        raise ValueError('every ring is skipped for this flag/diams combination')

    # get pre-processed slide matrices and select random sample of slides
    all_slides = processing.get_list_of_samples(processed_slides)
    SLIDES = [all_slides[i]
              for i in np.random.choice(len(all_slides), N_SLIDES, replace=False)]

    # initialize processed variables storage
    ncells_all = []
    slides_all = []
    X_all = []
    y_all = []
    overlap_all = []

    # process samples in batches
    batch_size = 10
    for idx in range(0, N_SLIDES, batch_size):
        BATCH = SLIDES[idx:idx + batch_size]

        # iterate over sampled slides to extract feature and response
        # variables via tile sampling
        batch_ncells = []
        batch_features = []
        batch_response = []
        batch_slides = []
        batch_overlap = []
        for i, slide in enumerate(BATCH):
            print_progress(i)

            # load slide and reshape into sample and feature tile stacks
            cell_mat = np.load(slide)
            sample_tile_stack = utils.restack_to_tiles(
                cell_mat, tile_width=sample_tile_width, nx=Nx, ny=Ny)
            feature_tile_stack = utils.restack_to_tiles(
                cell_mat, tile_width=feature_tile_width, nx=nx, ny=ny)

            # load seg file to compute ratio of total slide area to processed
            # area (entries equal to -1 mark unprocessed pixels)
            seg = np.load(slide.split(".npy")[0] + "_seg.npy")
            correction = float(1392 * 1040) / np.sum(seg != -1)

            n_cells_total = np.sum(cell_mat != 0)
            n_cells_corrected = n_cells_total * correction

            # make unprocessed region matrix from seg file
            seg_map = (seg == -1).astype(int)
            seg_tile_stack = utils.restack_to_tiles(
                seg_map, tile_width=feature_tile_width, nx=nx, ny=ny)

            # Sampling is allowed anywhere with enough offset from the slide
            # border; it is deliberately not restricted to tumor/stroma edge
            # regions (db_stack=None).
            tile_mask = utils.tile_stack_mask(Nx, Ny, L=offset_tiles, db_stack=None)
            n_valid = np.sum(tile_mask)
            n_samples = int(min(N_SAMPLES, n_valid))
            if n_samples == 0:
                print('0 valid samples. Skipping slide...')
                continue

            # store batch cell numbers and slide names
            batch_ncells.extend([n_cells_corrected] * n_samples)
            batch_slides.extend([slide] * n_samples)

            # uniformly sample tiles from the valid sample space; float()
            # keeps the probabilities fractional even for an integer mask
            sampled_indices = np.random.choice(a=Nx * Ny, size=n_samples,
                                               p=tile_mask / float(n_valid),
                                               replace=False)
            sampled_tiles = sample_tile_stack[sampled_indices, :, :]

            # compute response variable over sampled tiles
            response, _nts = utils.get_pdl1_response(
                sampled_tiles, circle=True, diameter=sample_tile_width,
                diagnostic=True)

            # compute feature arrays over sampled tiles from neighboring
            # tiles, plus the overlap with unprocessed regions
            feature_rows = []
            overlap = []
            for j in sampled_indices:
                feature_tiles = utils.get_feature_array(
                    j, feature_tile_stack, Nx, sample_tile_width, offset_px, flag)
                seg_map_tiles = utils.get_feature_array(
                    j, seg_tile_stack, Nx, sample_tile_width, offset_px, flag)
                feature_rows.append(feature_tiles)
                overlap.append(np.sum(seg_map_tiles))

            # release the large per-slide stacks before loading the next slide
            del feature_tile_stack
            del seg_tile_stack

            batch_response.extend(response)
            batch_features.append(np.vstack(feature_rows))
            batch_overlap.extend(overlap)

        # np.vstack raises on an empty list, so skip batches in which every
        # slide was skipped
        if not batch_features:
            continue

        # convert feature rows to a single array for aggregation
        batch_features = np.vstack(batch_features)

        # aggregate tiles within arbitrary shapes (e.g. discs or squares of
        # increasing size)
        n_obs = batch_features.shape[0]

        phen_columns = []
        for p, phen_name in enumerate(phens):  # iterate over each phenotype
            tmp_tiles = batch_features[:, p * window_size:(p + 1) * window_size]
            tmp_3d = tmp_tiles.reshape(n_obs, side_len, side_len)

            range_columns = []
            for inner, outer in rings:
                print_progress('{0}: {1}'.format(phen_name, outer))
                if skip_ring(phen_name, outer):
                    print("skipping.")
                    continue

                mask = utils.shape_mask(grid_dim=side_len, type='circle',
                                        S=outer, s=inner)
                t = np.sum(np.multiply(tmp_3d, mask), axis=(1, 2)).reshape(-1, 1)
                range_columns.append(t)

            # a phenotype whose rings were all skipped contributes no columns
            if range_columns:
                phen_columns.append(np.hstack(range_columns))

        del batch_features
        ncells_all.extend(batch_ncells)
        slides_all.extend(batch_slides)
        X_all.append(np.hstack(phen_columns))
        y_all.extend(batch_response)
        overlap_all.extend(batch_overlap)

    if not X_all:
        raise ValueError('no samples were collected from any slide')

    ncells_all = np.array(ncells_all)
    X_all = np.vstack(X_all)
    y_all = np.array(y_all)
    overlap_all = np.array(overlap_all)

    # save processed data as csv
    tmp = pd.DataFrame(np.hstack((X_all, y_all.reshape(-1, 1))))
    tmp.columns = feature_names + ['y']
    tmp['slide'] = slides_all
    tmp['n_cells_corrected'] = ncells_all
    tmp['unscored_overlap'] = overlap_all
    tmp = tmp.set_index('slide')
    tmp.to_csv(os.path.join(HOME_PATH, 'data', 'local_discs.csv'))
def extract_dataset(diams, sample_diam, flag):
    """Sample edge tiles from slides and save ring features and responses.

    A fixed random selection of slides (seed 1000) is processed.  Slides
    without an ``_edges.npy`` matrix, or without any tile allowed by
    ``utils.tile_stack_mask``, are skipped.  Up to ``N_SAMPLES`` tiles are
    drawn per slide; observations whose ``nts`` value from
    ``utils.get_pdl1_response`` is below 10 are dropped.  The remaining
    feature windows are summed per phenotype inside the masks produced by
    ``utils.shape_mask`` (one per entry of ``diams``) and saved as
    ``STORE_DIR + "data_x"``; the responses go to ``STORE_DIR + "data_y"``.

    Args:
        diams: sequence of ring diameters.  Ring ``i`` is built with
            ``S=diams[i]`` and ``s=diams[i - 1]`` (``0`` for the first ring);
            presumably increasing pixel diameters -- confirm with callers.
        sample_diam: width of a sample tile, also passed as ``diameter`` to
            ``utils.get_pdl1_response``.
        flag: phenotype set selector, one of ``'n'``, ``'a'`` or ``'t'``;
            forwarded to ``utils.get_feature_array``.

    Raises:
        ValueError: if ``flag`` is unknown, if every ring is skipped for the
            given ``flag``/``diams``, or if no slide yields a sample.
        AssertionError: if the sample tile width is not an integer multiple
            of the feature tile width.
    """
    phens_by_flag = {
        'n': ['tumor', 'cd4', 'cd8', 'foxp3', 'pdmac', 'other'],
        'a': ['tumor', 'pdl1', 'cd4', 'cd8', 'foxp3', 'pdmac', 'other'],
        't': ['tumor', 'pdl1'],
    }
    if flag not in phens_by_flag:
        raise ValueError(
            "flag must be one of 'n', 'a' or 't', got {0!r}".format(flag))
    phens = phens_by_flag[flag]
    diams = list(diams)

    np.random.seed(1000)

    # set sampling parameters
    N_SLIDES = 260
    N_SAMPLES = 15

    # set feature extraction parameters
    sample_tile_width = sample_diam
    feature_tile_width = 1
    feature_layers = 75

    # compute other parameters based on input parameters
    scale = int(sample_tile_width / feature_tile_width)
    # the remainder test is meaningful under integer and true division alike
    assert (
        sample_tile_width % feature_tile_width == 0
    ), "sample_tile_width must be integer multiple of feature_tile_width"
    Nx, Ny = int(1392 / sample_tile_width), int(1040 / sample_tile_width)
    nx, ny = Nx * scale, Ny * scale
    # float() forces true division so the ceil is not defeated by integer
    # division; the protected border must cover all feature layers.
    sample_layers = int(
        np.ceil(float(feature_layers * feature_tile_width) / sample_tile_width))

    # (inner, outer) diameter of every ring
    rings = list(zip([0] + diams[:-1], diams))

    def skip_ring(phen_name, outer_diam):
        # For flags 'a' and 't', tumor/pdl1 rings no wider than the sample
        # tile are left out of the feature set.
        return (flag in ('a', 't') and phen_name in ('tumor', 'pdl1')
                and outer_diam <= sample_tile_width)

    if all(skip_ring(a, b) for a in phens for b in diams):
        raise ValueError('every ring is skipped for this flag/diams combination')

    # get pre-processed slide matrices and select random sample of slides
    all_samples = helper.processing.get_list_of_samples(DIR)
    SAMPLES = [
        all_samples[i]
        for i in np.random.choice(len(all_samples), N_SLIDES, replace=False)
    ]

    # iterate over sampled slides to extract feature and response variables
    # via tile sampling
    combined_features = []
    combined_response = []
    combined_nts = []
    for i, slide in enumerate(SAMPLES):
        print_progress(i)

        # load slide and reshape into tile stacks
        cell_mat = np.load(slide)
        sample_tile_stack = utils.restack_to_tiles(
            cell_mat, tile_width=sample_tile_width, nx=Nx, ny=Ny)
        feature_tile_stack = utils.restack_to_tiles(
            cell_mat, tile_width=feature_tile_width, nx=nx, ny=ny)

        # load tumor edge matrix (skipping slide if no matrix is found)
        try:
            edges = np.load(slide.split(".npy")[0] + "_edges.npy")
            edges_tile_stack = utils.restack_to_tiles(
                edges, tile_width=sample_tile_width, nx=Nx, ny=Ny)
        except IOError:
            print('No edge matrix. Skipping slide...')
            continue

        # select valid tiles for sampling, skipping slide if no valid tiles
        # are available
        tile_mask = utils.tile_stack_mask(Nx,
                                          Ny,
                                          L=sample_layers,
                                          db_stack=edges_tile_stack)
        n_valid = np.sum(tile_mask)
        if n_valid == 0:
            print('0 valid samples. Skipping slide...')
            continue

        # uniformly sample tiles from the valid sample space; float() keeps
        # the probabilities fractional even for an integer mask
        sampled_indices = np.random.choice(a=Nx * Ny,
                                           size=int(min(N_SAMPLES, n_valid)),
                                           p=tile_mask / float(n_valid),
                                           replace=False)
        sampled_tiles = sample_tile_stack[sampled_indices, :, :]

        # compute response variable over sampled tiles
        response, nts = utils.get_pdl1_response(sampled_tiles,
                                                circle=True,
                                                diameter=sample_tile_width,
                                                diagnostic=True)

        # compute feature arrays over sampled tiles from neighboring tiles
        feature_rows = np.vstack([
            utils.get_feature_array(idx, feature_tile_stack, Nx, scale,
                                    feature_layers, flag)
            for idx in sampled_indices
        ])

        # add outputs to growing array
        combined_response.extend(response)
        combined_features.append(feature_rows)
        combined_nts.extend(nts)

    # np.vstack raises an uninformative error on an empty list
    if not combined_features:
        raise ValueError('no samples were collected from any slide')

    # convert feature and response to numpy arrays for analysis
    combined_features = np.vstack(combined_features)
    combined_features[np.isnan(combined_features)] = -1
    combined_response = np.array(combined_response)
    combined_nts = np.array(combined_nts)

    # ----- variable processing ----- #

    # remove all cases with <10 tumor cells in the sampled tile
    print('{0} {1} {2}'.format(combined_nts.shape, combined_response.shape,
                               combined_features.shape))
    mask = combined_nts < 10
    combined_response = combined_response[~mask]
    combined_features = combined_features[~mask, :]

    # aggregate tiles within arbitrary shapes (e.g. discs or squares of
    # increasing size)
    n_obs = combined_features.shape[0]
    side_len = scale + 2 * feature_layers
    n_tiles = side_len**2

    phen_columns = []
    for p, phen_name in enumerate(phens):  # iterate over each phenotype
        tmp_tiles = combined_features[:, p * n_tiles:(p + 1) * n_tiles]
        tmp_3d = tmp_tiles.reshape(n_obs, side_len, side_len)

        range_columns = []
        for inner, outer in rings:
            print('{0} {1}'.format(phen_name, outer))
            if skip_ring(phen_name, outer):
                print("skipping.")
                continue

            ring_mask = utils.shape_mask(grid_dim=side_len,
                                         type='circle',
                                         S=outer,
                                         s=inner)

            t = np.sum(np.multiply(tmp_3d, ring_mask),
                       axis=(1, 2)).reshape(-1, 1)
            range_columns.append(t)

        # a phenotype whose rings were all skipped contributes no columns
        if range_columns:
            phen_columns.append(np.hstack(range_columns))
    X = np.hstack(phen_columns)

    np.save(STORE_DIR + "data_x", X)
    np.save(STORE_DIR + "data_y", combined_response)
def automate_tile_extraction(SAMPLES):
    # iterate over sampled slides to extract feature and response variables via tile sampling
    combined_features = []
    combined_response = []
    combined_nts = []
    for i, slide in enumerate(SAMPLES):
        print_progress(i)

        # load slide and reshape into tile stacks
        cell_mat = np.load(slide)
        sample_tile_stack = utils.restack_to_tiles(
            cell_mat, tile_width=sample_tile_width, nx=Nx, ny=Ny)
        feature_tile_stack = utils.restack_to_tiles(
            cell_mat, tile_width=feature_tile_width, nx=nx, ny=ny)

        # load tumor edge matrix (skipping slide if no matrix is found)
        try:
            edges = np.load(slide.split(".npy")[0] + "_edges.npy")
            edges_tile_stack = utils.restack_to_tiles(
                edges, tile_width=sample_tile_width, nx=Nx, ny=Ny)
        except IOError:
            print 'No edge matrix. Skipping slide...'
            continue

        # select valid tiles for sampling, skipping slide if no valid tiles are available
        tile_mask = utils.tile_stack_mask(Nx,
                                          Ny,
                                          L=sample_layers,
                                          db_stack=edges_tile_stack)
        if np.sum(tile_mask) == 0:
            print '0 valid samples. Skipping slide...'
            continue

        # uniformly sample tiles from the valid sample space of size n_samples
        sampled_indices = np.random.choice(a=Nx * Ny,
                                           size=int(
                                               min(N_SAMPLES,
                                                   np.sum(tile_mask))),
                                           p=tile_mask / np.sum(tile_mask),
                                           replace=False)
        sampled_tiles = sample_tile_stack[sampled_indices, :, :]

        # compute response variable over sampled tiles
        response, nts = utils.get_pdl1_response(sampled_tiles,
                                                circle=True,
                                                diameter=sample_tile_width,
                                                diagnostic=True)

        # compute feature arrays over sampled tiles from neighboring tiles
        feature_rows = np.vstack([
            utils.get_feature_array(idx, feature_tile_stack, Nx, scale,
                                    feature_layers, flag)
            for idx in sampled_indices
        ])

        # add outputs to growing array
        combined_response.extend(response)
        combined_features.append(feature_rows)
        combined_nts.extend(nts)

    # convert feature and response to numpy arrays for analysis
    combined_features = np.vstack(combined_features)
    combined_features[np.isnan(combined_features)] = -1
    combined_response = np.array(combined_response)
    combined_nts = np.array(combined_nts)

    return combined_features, combined_response, combined_nts