Example #1
0
def test_continous_generator():
    """Continuous mode should produce no '_agree' feature names.

    Builds a generator over the first ten training indices with K562 as the
    held-out cell type and an all-ones similarity matrix for DNase, then
    inspects the first yielded item.
    """
    cells = ['K562', 'HepG2', 'H1', 'A549', 'HeLa-S3']
    targets = ['DNase', 'CTCF', 'RAD21', 'LARP7']
    dataset = EpitomeDataset(targets=targets, cells=cells)

    held_out = ['K562']
    # The held-out cell is dropped in place, after the dataset was built
    # from the full list.
    cells.remove(held_out[0])
    radii = [1, 10]

    # Fake DNase similarity data: all ones, sized by the second axis of
    # the training data.
    n_columns = dataset.get_data(Dataset.TRAIN).shape[1]
    similarity_matrix = np.ones(n_columns)

    train_data = dataset.get_data(Dataset.TRAIN)
    make_generator = load_data(train_data,
                               held_out,
                               cells,
                               dataset.matrix,
                               dataset.targetmap,
                               dataset.cellmap,
                               radii,
                               mode=Dataset.RUNTIME,
                               continuous=True,
                               similarity_matrix=similarity_matrix,
                               similarity_targets='DNase',
                               return_feature_names=True,
                               indices=np.arange(10))
    first_item = list(make_generator())[0]
    feature_names = first_item[1][0]

    # Continuous mode: 20 features and none of them an agreement feature.
    assert len(first_item[0][0]) == 20
    assert not any('_agree' in name for name in feature_names)
Example #2
0
    def test_generator_only_H3(self):
        """Using H3K27ac as the only similarity target must not add DNase.

        Also checks that the first returned feature name belongs to HepG2's
        H3K27ac track.
        """
        # Deterministic input: zeros everywhere, every other row set to 1.
        data = np.zeros(self.train_shape)
        data[::2] = 1

        cells = ['K562', 'HepG2', 'H1', 'HeLa-S3']
        targets = ['H3K27ac', 'CTCF']
        feature_data = self.getFeatureData(targets,
                                           cells,
                                           similarity_targets=['H3K27ac'])
        matrix, cellmap, targetmap = feature_data

        # Only the two requested targets: DNase should not have been added.
        assert len(list(targetmap)) == 2

        held_out = ['K562']
        cells.remove(held_out[0])

        make_generator = load_data(data, ['K562'],
                                   cells,
                                   matrix,
                                   targetmap,
                                   cellmap,
                                   radii=[1, 3],
                                   mode=Dataset.VALID,
                                   similarity_targets=['H3K27ac'],
                                   indices=np.arange(2),
                                   return_feature_names=True)
        first_item = list(make_generator())[0]

        feature_names = first_item[1][0]
        assert feature_names[0] == 'HepG2_H3K27ac'
Example #3
0
def test_generator_dnase_array():
    """A bare 'DNase' string with a 1-D similarity array must not fail.

    Regression test for https://github.com/YosefLab/epitome/issues/4:
    similarity_targets is given as a single string and the similarity
    matrix as a single array.
    """
    cells = ['K562', 'HepG2', 'H1', 'A549', 'HeLa-S3']
    targets = ['DNase', 'CTCF', 'RAD21', 'LARP7']
    dataset = EpitomeDataset(targets=targets, cells=cells)

    held_out = ['K562']
    # Dropped in place, after the dataset was built from the full list.
    cells.remove(held_out[0])
    radii = [1, 10]

    # Fake DNase similarity data: all ones, sized by the second axis of
    # the training data.
    n_columns = dataset.get_data(Dataset.TRAIN).shape[1]
    similarity_matrix = np.ones(n_columns)

    train_data = dataset.get_data(Dataset.TRAIN)
    make_generator = load_data(train_data,
                               held_out,
                               cells,
                               dataset.matrix,
                               dataset.targetmap,
                               dataset.cellmap,
                               radii,
                               mode=Dataset.RUNTIME,
                               similarity_matrix=similarity_matrix,
                               similarity_targets='DNase',
                               return_feature_names=True,
                               indices=np.arange(10))
    items = list(make_generator())

    # Getting this far means no error was raised while generating.
    assert len(items) == 10
    assert len(items[0][0][0]) == 28

    # One agreement feature is expected per (radius, remaining cell) pair.
    feature_names = items[0][1][0]
    agree_count = sum(1 for name in feature_names if '_agree' in name)
    assert agree_count == len(radii) * len(cells)
Example #4
0
    def test_generator_no_dnase(self):
        """With empty radii, the item at index 6 should be labelled all 1s."""
        # Deterministic input: zeros everywhere, every other row set to 1.
        data = np.zeros(self.train_shape)
        data[::2] = 1

        cells = ['K562', 'HepG2', 'H1', 'A549', 'HeLa-S3']
        targets = ['DNase', 'CTCF']
        matrix, cellmap, targetmap = self.getFeatureData(targets, cells)

        held_out = ['K562']
        cells.remove(held_out[0])

        make_generator = load_data(
            data,
            ['K562'],
            cells,
            matrix,
            targetmap,
            cellmap,
            radii=[],  # empty radii: no dnase
            mode=Dataset.VALID,
            indices=np.arange(10))
        items = list(make_generator())

        # The item at this position is expected to be a positive.
        positive_index = 6
        second_to_last = items[positive_index][-2]
        print(second_to_last)
        assert np.all(second_to_last == 1)
Example #5
0
def test_generator_multiple_sim():
    """Check the feature length when two similarity targets are used."""
    cells = ['K562', 'HepG2', 'H1', 'A549', 'HeLa-S3']
    targets = ['DNase', 'CTCF', 'RAD21']
    dataset = EpitomeDataset(targets=targets, cells=cells)

    held_out = ['K562']
    # Dropped in place, after the dataset was built from the full list.
    cells.remove(held_out[0])

    # One all-ones row per similarity target (DNase and CTCF).
    n_columns = dataset.get_data(Dataset.TRAIN).shape[1]
    similarity_matrix = np.ones([2, n_columns])

    radii = [1, 10]

    train_data = dataset.get_data(Dataset.TRAIN)
    make_generator = load_data(train_data, ['K562'],
                               cells,
                               dataset.matrix,
                               dataset.targetmap,
                               dataset.cellmap,
                               radii=radii,
                               mode=Dataset.RUNTIME,
                               similarity_matrix=similarity_matrix,
                               similarity_targets=['DNase', 'CTCF'],
                               indices=np.arange(10))
    items = list(make_generator())

    # Per remaining cell: every target, plus a positive and an agreement
    # feature per radius for each of the 2 similarity targets (hence * 4).
    per_cell = len(targets) + len(radii) * 4
    assert len(items[0][0]) == len(cells) * per_cell
Example #6
0
def test_generator_sparse_data():
    """Check features, mask and label names when some targets are missing.

    The dataset is built with min_cells_per_target=1 and
    min_targets_per_cell=1, and HepG2 is used as the label cell type.
    """
    cells = ['K562', 'HepG2', 'H1', 'A549', 'HeLa-S3']
    targets = ['DNase', 'CTCF', 'RAD21', 'LARP7']
    dataset = EpitomeDataset(targets=targets,
                             cells=cells,
                             min_cells_per_target=1,
                             min_targets_per_cell=1)

    label_cells = ['HepG2']
    # Dropped in place, after the dataset was built from the full list.
    cells.remove(label_cells[0])

    make_generator = load_data(dataset.get_data(Dataset.TRAIN),
                               label_cells,
                               cells,
                               dataset.matrix,
                               dataset.targetmap,
                               dataset.cellmap,
                               radii=[],
                               mode=Dataset.VALID,
                               return_feature_names=True,
                               indices=np.arange(10))
    first_item = list(make_generator())[0]

    # First element holds the feature values, second the matching names.
    features = first_item[0]
    names = first_item[1]

    # Every remaining cell type except K562 lacks LARP7 data, so three
    # entries are missing from the full cells-by-targets grid.
    assert len(features[0]) == len(cells) * len(targets) - 3

    # The mask should zero out LARP7 for HepG2.
    assert np.all(features[-1] == [1., 0., 1.])

    # Label names should belong to the label cell HepG2, in this order.
    expected_labels = ['lbl_HepG2_RAD21', 'lbl_HepG2_LARP7', 'lbl_HepG2_CTCF']
    for position, expected in enumerate(expected_labels):
        assert names[-2][position] == expected
Example #7
0
def test_generator_radius():
    """Check the feature length when radii are supplied in VALID mode."""
    cells = ['K562', 'HepG2', 'H1', 'A549', 'HeLa-S3']
    targets = ['DNase', 'CTCF', 'RAD21']

    dataset = EpitomeDataset(targets=targets, cells=cells)

    held_out = ['K562']
    # Dropped in place, after the dataset was built from the full list.
    cells.remove(held_out[0])

    radii = [1, 10]

    make_generator = load_data(dataset.get_data(Dataset.TRAIN), ['K562'],
                               cells,
                               dataset.matrix,
                               dataset.targetmap,
                               dataset.cellmap,
                               radii=radii,
                               mode=Dataset.VALID,
                               indices=np.arange(10))
    items = list(make_generator())

    # Per remaining cell: every target, plus a positive and an agreement
    # feature for each radius (hence * 2).
    per_cell = len(targets) + len(radii) * 2
    assert len(items[0][0]) == len(cells) * per_cell