Ejemplo n.º 1
0
def test_large_dataframe(pca_large_dataframe, kwargs):
    """DecompositionViz should keep only the most extreme PC loadings.

    For each of the two plotted PCs, when there are more features than
    ``n_top_pc_features``, only the half most-negative and half
    most-positive loadings (and their labels) should be retained.
    """
    from flotilla.visualize.decomposition import DecompositionViz

    dv = DecompositionViz(pca_large_dataframe.reduced_space,
                          pca_large_dataframe.components_,
                          pca_large_dataframe.explained_variance_ratio_,
                          **kwargs)
    x_pc = kwargs['x_pc']
    y_pc = kwargs['y_pc']
    pcs = [x_pc, y_pc]

    true_top_features = set([])
    true_pc_loadings_labels = {}
    true_pc_loadings = {}

    for pc in pcs:
        # BUG FIX: '.ix' and in-place 'Series.sort(ascending=...)' were
        # removed from pandas; use '.loc' and 'sort_values' instead.
        x = pca_large_dataframe.components_.loc[pc].sort_values(
            ascending=True)
        half_features = int(kwargs['n_top_pc_features'] / 2)
        if len(x) > kwargs['n_top_pc_features']:
            # Keep the extremes: smallest half and largest half by loading.
            a = x.iloc[:half_features]
            b = x.iloc[-half_features:]
            labels = np.r_[a.index, b.index]
            true_pc_loadings[pc] = np.r_[a, b]
        else:
            labels = x.index
            true_pc_loadings[pc] = x

        true_pc_loadings_labels[pc] = labels
        true_top_features.update(labels)
    pdt.assert_array_equal(dv.top_features, true_top_features)
    pdt.assert_dict_equal(dv.pc_loadings_labels, true_pc_loadings_labels)
    pdt.assert_dict_equal(dv.pc_loadings, true_pc_loadings)
Ejemplo n.º 2
0
    def test_init(self, gene_ontology_data, gene_ontology):
        """GeneOntology init should drop NAs and index terms by accession."""
        expected_data = gene_ontology_data.dropna()
        expected_genes = expected_data['Ensembl Gene ID'].unique()

        # Build the expected ontology: one record per GO term accession.
        expected_ontology = defaultdict(dict)
        for term, subset in expected_data.groupby('GO Term Accession'):
            genes = set(subset['Ensembl Gene ID'])
            expected_ontology[term]['genes'] = genes
            expected_ontology[term]['name'] = subset['GO Term Name'].values[0]
            expected_ontology[term]['domain'] = subset['GO domain'].values[0]
            expected_ontology[term]['n_genes'] = len(genes)

        pdt.assert_frame_equal(expected_data, gene_ontology.data)
        pdt.assert_array_equal(sorted(expected_genes),
                               sorted(gene_ontology.all_genes))

        # Neither side may have extra GO terms.
        pdt.assert_contains_all(expected_ontology.keys(),
                                gene_ontology.ontology)
        pdt.assert_contains_all(gene_ontology.ontology.keys(),
                                expected_ontology)

        for term, expected in expected_ontology.items():
            observed = gene_ontology.ontology[term]
            # Compare gene sets order-independently.
            pdt.assert_array_equal(sorted(expected['genes']),
                                   sorted(observed['genes']))
            pdt.assert_equal(expected['name'], observed['name'])
            pdt.assert_equal(expected['domain'], observed['domain'])
            pdt.assert_equal(expected['n_genes'], observed['n_genes'])
Ejemplo n.º 3
0
    def test_init(self, gene_ontology_data, gene_ontology):
        """GeneOntology should drop NA rows and build a per-term ontology."""
        true_data = gene_ontology_data.dropna()
        true_all_genes = true_data['Ensembl Gene ID'].unique()
        true_ontology = defaultdict(dict)

        # Expected ontology: one record per GO term accession.
        for go, df in true_data.groupby('GO Term Accession'):
            true_ontology[go]['genes'] = set(df['Ensembl Gene ID'])
            true_ontology[go]['name'] = df['GO Term Name'].values[0]
            true_ontology[go]['domain'] = df['GO domain'].values[0]
            true_ontology[go]['n_genes'] = len(true_ontology[go]['genes'])

        pdt.assert_frame_equal(true_data, gene_ontology.data)
        pdt.assert_array_equal(sorted(true_all_genes),
                               sorted(gene_ontology.all_genes))

        # Check key sets in both directions: no missing, no extra terms.
        pdt.assert_contains_all(true_ontology.keys(), gene_ontology.ontology)
        pdt.assert_contains_all(gene_ontology.ontology.keys(), true_ontology)

        for go, true_attributes in true_ontology.items():
            test_attributes = gene_ontology.ontology[go]
            # Sort so the gene-set comparison is order-independent.
            true_genes = sorted(true_attributes['genes'])
            test_genes = sorted(test_attributes['genes'])
            pdt.assert_array_equal(true_genes, test_genes)
            pdt.assert_equal(true_attributes['name'], test_attributes['name'])
            pdt.assert_equal(true_attributes['domain'],
                             test_attributes['domain'])
            pdt.assert_equal(true_attributes['n_genes'],
                             test_attributes['n_genes'])
Ejemplo n.º 4
0
    def test_feature_subset_to_feature_ids(self, expression_data_no_na,
                                           expression_feature_data,
                                           feature_subset):
        """Feature subsets resolve to feature ids.

        Mirrors the expected resolution: a known subset name maps to its
        stored ids, an 'all*' name maps to every data column, and a
        list-like subset is registered as a new boolean 'custom_N' column.
        """
        from flotilla.data_model.base import BaseData

        expression = BaseData(expression_data_no_na,
                              feature_data=expression_feature_data)
        test_feature_ids = expression.feature_subset_to_feature_ids(
            feature_subset, rename=False)

        true_feature_ids = pd.Index([])
        if feature_subset is not None:
            try:
                # Named-subset lookup; an unhashable (list-like) subset
                # raises TypeError and falls through to the branch below.
                if feature_subset in expression.feature_subsets:
                    true_feature_ids = expression.feature_subsets[
                        feature_subset]
                elif feature_subset.startswith('all'):
                    true_feature_ids = expression.data.columns
            except TypeError:
                if not isinstance(feature_subset, str):
                    feature_ids = feature_subset
                    # Register the ids under the next free 'custom_N' column.
                    n_custom = expression.feature_data.columns.map(
                        lambda x: x.startswith('custom')).sum()
                    ind = 'custom_{}'.format(n_custom + 1)
                    expression.feature_data[ind] = \
                        expression.feature_data.index.isin(feature_ids)
                else:
                    # NOTE(review): 'self' in this message is the test class
                    # instance; presumably intended to name the data object
                    # under test — confirm.
                    raise ValueError(
                        "There are no {} features in this data: "
                        "{}".format(feature_subset, self))
        else:
            # No subset requested: every feature id qualifies.
            true_feature_ids = expression.data.columns
        pdt.assert_array_equal(test_feature_ids, true_feature_ids)
Ejemplo n.º 5
0
def test_order(pca, kwargs):
    """A user-supplied group `order` should drive ordering and colors."""
    from flotilla.visualize.decomposition import DecompositionViz

    # Drop the keys this test supplies explicitly (KeyError if absent,
    # matching the fixture contract).
    kw = dict(kwargs)
    del kw['order']
    del kw['groupby']
    groups = ['group1', 'group2', 'group3']

    # Assign each sample in the reduced space a random group label.
    labels = [np.random.choice(groups) for _ in pca.reduced_space.index]
    groupby = pd.Series(labels, index=pca.reduced_space.index)
    order = ['group3', 'group1', 'group2']

    dv = DecompositionViz(pca.reduced_space,
                          pca.components_,
                          pca.explained_variance_ratio_,
                          order=order,
                          groupby=groupby,
                          **kw)

    # Colors must follow the requested group order.
    expected_colors = [dv.label_to_color[label] for label in order]

    pdt.assert_series_equal(dv.groupby, groupby)
    pdt.assert_array_equal(dv.order, order)
    pdt.assert_array_equal(dv.color_ordered, expected_colors)
Ejemplo n.º 6
0
    def test_sample_subset_to_sample_ids(self, study, sample_subset):
        """sample_subset may be a subset name, phenotype, column, or ids.

        Reproduces the expected resolution order: named subset lookup,
        then phenotype-value match, then metadata boolean column
        (optionally negated with a leading '~'), else the input is
        assumed to already be sample ids.
        """
        test_sample_subset = study.sample_subset_to_sample_ids(sample_subset)

        try:
            # First: a pre-defined, named sample subset.
            true_sample_subset = study.metadata.sample_subsets[sample_subset]
        except (KeyError, TypeError):
            try:
                # Second: treat the subset as a phenotype value.
                ind = study.metadata.sample_id_to_phenotype == sample_subset
                if ind.sum() > 0:
                    true_sample_subset = \
                        study.metadata.sample_id_to_phenotype.index[ind]
                else:
                    if sample_subset is None or 'all_samples'.startswith(
                            sample_subset):
                        # None or any prefix of 'all_samples' selects all.
                        sample_ind = np.ones(study.metadata.data.shape[0],
                                             dtype=bool)
                    elif sample_subset.startswith("~"):
                        # A leading '~' negates a boolean metadata column.
                        sample_ind = ~pd.Series(
                            study.metadata.data[sample_subset.lstrip("~")],
                            dtype='bool')

                    else:
                        sample_ind = pd.Series(
                            study.metadata.data[sample_subset], dtype='bool')
                    true_sample_subset = study.metadata.data.index[sample_ind]
            except (AttributeError, ValueError):
                # Last resort: assume it is already a list of sample ids.
                true_sample_subset = sample_subset

        pdt.assert_array_equal(true_sample_subset, test_sample_subset)
Ejemplo n.º 7
0
    def test_sample_subset_to_sample_ids(self, study, sample_subset):
        """Resolve `sample_subset` to sample ids and compare to the study.

        Tries, in order: named subset, phenotype value, boolean metadata
        column ('~'-prefixed for negation), then falls back to treating
        the input as literal sample ids.
        """
        test_sample_subset = study.sample_subset_to_sample_ids(sample_subset)

        try:
            true_sample_subset = study.metadata.sample_subsets[sample_subset]
        except (KeyError, TypeError):
            try:
                # Does the subset name match a phenotype value?
                ind = study.metadata.sample_id_to_phenotype == sample_subset
                if ind.sum() > 0:
                    true_sample_subset = \
                        study.metadata.sample_id_to_phenotype.index[ind]
                else:
                    if sample_subset is None or 'all_samples'.startswith(
                            sample_subset):
                        # Select every sample.
                        sample_ind = np.ones(study.metadata.data.shape[0],
                                             dtype=bool)
                    elif sample_subset.startswith("~"):
                        # Negated boolean column lookup.
                        sample_ind = ~pd.Series(
                            study.metadata.data[sample_subset.lstrip("~")],
                            dtype='bool')

                    else:
                        sample_ind = pd.Series(
                            study.metadata.data[sample_subset], dtype='bool')
                    true_sample_subset = study.metadata.data.index[sample_ind]
            except (AttributeError, ValueError):
                # Not a name at all: assume literal sample ids.
                true_sample_subset = sample_subset

        pdt.assert_array_equal(true_sample_subset, test_sample_subset)
Ejemplo n.º 8
0
    def test_feature_subset_to_feature_ids(self, expression_data_no_na,
                                           expression_feature_data,
                                           feature_subset):
        """Resolve a feature subset name or id list into feature ids."""
        from flotilla.data_model.base import BaseData

        expression = BaseData(expression_data_no_na,
                              feature_data=expression_feature_data)
        test_feature_ids = expression.feature_subset_to_feature_ids(
            feature_subset, rename=False)

        true_feature_ids = pd.Index([])
        if feature_subset is not None:
            try:
                if feature_subset in expression.feature_subsets:
                    # A known, named subset.
                    true_feature_ids = expression.feature_subsets[
                        feature_subset]
                elif feature_subset.startswith('all'):
                    # 'all*' means every feature column.
                    true_feature_ids = expression.data.columns
            except TypeError:
                # Unhashable input: treat it as an explicit list of ids and
                # store it as the next 'custom_N' boolean feature column.
                if not isinstance(feature_subset, str):
                    feature_ids = feature_subset
                    n_custom = expression.feature_data.columns.map(
                        lambda x: x.startswith('custom')).sum()
                    ind = 'custom_{}'.format(n_custom + 1)
                    expression.feature_data[ind] = \
                        expression.feature_data.index.isin(feature_ids)
                else:
                    # NOTE(review): 'self' here is the test class instance;
                    # looks like it should reference the data — confirm.
                    raise ValueError("There are no {} features in this data: "
                                     "{}".format(feature_subset, self))
        else:
            true_feature_ids = expression.data.columns
        pdt.assert_array_equal(test_feature_ids, true_feature_ids)
Ejemplo n.º 9
0
def test_large_dataframe(pca_large_dataframe, kwargs):
    """DecompositionViz should keep only the most extreme PC loadings.

    When a PC has more features than ``n_top_pc_features``, only the half
    most-negative and half most-positive loadings survive.
    """
    from flotilla.visualize.decomposition import DecompositionViz

    dv = DecompositionViz(pca_large_dataframe.reduced_space,
                          pca_large_dataframe.components_,
                          pca_large_dataframe.explained_variance_ratio_,
                          **kwargs)
    x_pc = kwargs['x_pc']
    y_pc = kwargs['y_pc']
    pcs = [x_pc, y_pc]

    true_top_features = set([])
    true_pc_loadings_labels = {}
    true_pc_loadings = {}

    for pc in pcs:
        # BUG FIX: '.ix' and in-place 'Series.sort(ascending=...)' were
        # removed from pandas; use '.loc' and 'sort_values' instead.
        x = pca_large_dataframe.components_.loc[pc].sort_values(
            ascending=True)
        half_features = int(kwargs['n_top_pc_features'] / 2)
        if len(x) > kwargs['n_top_pc_features']:
            # Keep the extremes: smallest half and largest half by loading.
            a = x.iloc[:half_features]
            b = x.iloc[-half_features:]
            labels = np.r_[a.index, b.index]
            true_pc_loadings[pc] = np.r_[a, b]
        else:
            labels = x.index
            true_pc_loadings[pc] = x

        true_pc_loadings_labels[pc] = labels
        true_top_features.update(labels)
    pdt.assert_array_equal(dv.top_features, true_top_features)
    pdt.assert_dict_equal(dv.pc_loadings_labels, true_pc_loadings_labels)
    pdt.assert_dict_equal(dv.pc_loadings, true_pc_loadings)
Ejemplo n.º 10
0
    def test_save(self, study, tmpdir):
        """Saving a study writes a datapackage whose resources round-trip.

        Checks that exactly one directory (named after the study) is
        created, that per-resource keys in datapackage.json match the
        study's attributes, that every resource CSV round-trips to the
        original data, and that the patch version is bumped.
        """
        from flotilla.datapackage import name_to_resource

        study_name = 'test_save'
        study.supplemental.expression_corr = study.expression.data.corr()
        study.save(study_name, flotilla_dir=tmpdir)

        # Exactly one output directory, named after the study.
        assert len(tmpdir.listdir()) == 1
        save_dir = tmpdir.listdir()[0]

        with open('{}/datapackage.json'.format(save_dir)) as f:
            test_datapackage = json.load(f)

        assert study_name == save_dir.purebasename

        # resource_keys_to_ignore = ('compression', 'format', 'path',
        #                            'url')
        keys_from_study = {
            'splicing': [],
            'expression': ['thresh', 'log_base', 'plus_one'],
            'metadata': [
                'phenotype_order', 'phenotype_to_color', 'phenotype_col',
                'phenotype_to_marker', 'pooled_col', 'minimum_samples'
            ],
            'mapping_stats': ['number_mapped_col', 'min_reads'],
            'expression_feature': ['rename_col', 'ignore_subset_cols'],
            'splicing_feature':
            ['rename_col', 'ignore_subset_cols', 'expression_id_col'],
            'gene_ontology': []
        }
        resource_names = keys_from_study.keys()

        # Add auto-generated attributes into the true datapackage.
        # BUG FIX: dict.iteritems() exists only on Python 2; items() works
        # on both Python 2 and 3.
        for name, keys in keys_from_study.items():
            resource = name_to_resource(test_datapackage, name)
            for key in keys:
                command = self.get_data_eval_command(name, key)
                test_value = resource[key]
                # NOTE: eval on an internally-built command string;
                # test-only code, never fed untrusted input.
                true_value = eval(command)
                if isinstance(test_value, dict):
                    pdt.assert_dict_equal(test_value, true_value)
                elif isinstance(test_value, Iterable):
                    pdt.assert_array_equal(test_value, true_value)

        for name in resource_names:
            resource = name_to_resource(test_datapackage, name)
            path = '{}.csv.gz'.format(name)
            assert resource['path'] == path
            # Each resource CSV must round-trip back to the original data.
            test_df = pd.read_csv('{}/{}/{}'.format(tmpdir, study_name, path),
                                  index_col=0,
                                  compression='gzip')
            command = self.get_data_eval_command(name, 'data_original')
            true_df = eval(command)
            pdt.assert_frame_equal(test_df, true_df)

        # Saving bumps the patch component of the semantic version.
        version = semantic_version.Version(study.version)
        version.patch += 1
        assert str(version) == test_datapackage['datapackage_version']
        assert study_name == test_datapackage['name']
Ejemplo n.º 11
0
def test_chr_start_stop_to_sj_ind(chr_start_stop, sj):
    """Junctions strictly inside 'chrom:start-stop' should be flagged."""
    from sj2psi import chr_start_stop_to_sj_ind

    test_output = chr_start_stop_to_sj_ind(chr_start_stop, sj)

    # Strip thousands separators, then parse 'chrom:start-stop'.
    chrom, coords = chr_start_stop.replace(',', '').split(':')
    start, stop = (int(position) for position in coords.split('-'))

    # Strict inequalities: the junction must lie fully inside the window.
    expected = (sj.chrom == chrom)
    expected = expected & (start < sj.intron_start)
    expected = expected & (sj.intron_stop < stop)
    pdt.assert_array_equal(test_output, expected)
Ejemplo n.º 12
0
def test_chr_start_stop_to_sj_ind(chr_start_stop, sj):
    """Junctions strictly inside 'chrom:start-stop' should be flagged."""
    from sj2psi import chr_start_stop_to_sj_ind
    test_output = chr_start_stop_to_sj_ind(chr_start_stop, sj)

    # Strip thousands separators, then split 'chrom:start-stop'.
    chrom, startstop = chr_start_stop.replace(',', '').split(':')
    start, stop = map(int, startstop.split('-'))
    # Strict inequalities: the junction must lie fully inside the window.
    true_output = (sj.chrom == chrom) & (start < sj.intron_start) \
        & (sj.intron_stop < stop)
    pdt.assert_array_equal(test_output, true_output)
Ejemplo n.º 13
0
    def test_too_few_mapped(self, mapping_stats, mapping_stats_data,
                            mapping_stats_kws):
        """Samples below the min-read threshold are reported as too few."""
        from flotilla.data_model.quality_control import MIN_READS

        # Fall back to the package default when no threshold was given.
        threshold = mapping_stats_kws.get('min_reads', MIN_READS)
        mapped_col = mapping_stats_kws.get('number_mapped_col')
        n_mapped = mapping_stats_data[mapped_col]
        expected = n_mapped.index[n_mapped < threshold]
        pdt.assert_array_equal(mapping_stats.too_few_mapped, expected)
Ejemplo n.º 14
0
    def test__variant(self, expression_data):
        """Variant features have variance above mean + 2 standard devs."""
        from flotilla.data_model.base import BaseData

        base_data = BaseData(expression_data)

        variances = expression_data.var()
        # Cutoff: two standard deviations above the mean variance.
        cutoff = variances.mean() + 2 * variances.std()
        expected_variant = expression_data.columns[variances > cutoff]

        pdt.assert_equal(base_data._var_cut, cutoff)
        pdt.assert_array_equal(base_data.variant, expected_variant)
Ejemplo n.º 15
0
    def test__variant(self, expression_data):
        """Variant features: variance greater than mean + 2 standard devs."""
        from flotilla.data_model.base import BaseData

        base_data = BaseData(expression_data)

        var = expression_data.var()
        # The cutoff is two standard deviations above the mean variance.
        var_cut = var.mean() + 2 * var.std()
        variant = expression_data.columns[var > var_cut]

        pdt.assert_equal(base_data._var_cut, var_cut)
        pdt.assert_array_equal(base_data.variant, variant)
Ejemplo n.º 16
0
 def test_feature_subset_to_feature_ids(self, study, data_type,
                                        feature_subset):
     """Study dispatches feature-subset resolution by data-type prefix."""
     test_feature_subset = study.feature_subset_to_feature_ids(
         data_type, feature_subset)
     # Any prefix of 'expression'/'splicing' selects that data set.
     if 'expression'.startswith(data_type):
         true_feature_subset = study.expression.feature_subset_to_feature_ids(
             feature_subset, rename=False)
     elif 'splicing'.startswith(data_type):
         true_feature_subset = \
             study.splicing.feature_subset_to_feature_ids(feature_subset,
                                                          rename=False)
     pdt.assert_array_equal(test_feature_subset, true_feature_subset)
Ejemplo n.º 17
0
 def test_feature_subset_to_feature_ids(self, study, data_type,
                                        feature_subset):
     """Study delegates feature-subset lookup to the matching data set."""
     test_feature_subset = study.feature_subset_to_feature_ids(
         data_type, feature_subset)
     # Any prefix of 'expression'/'splicing' selects the matching data set.
     if 'expression'.startswith(data_type):
         true_feature_subset = \
             study.expression.feature_subset_to_feature_ids(feature_subset,
                                                            rename=False)
     elif 'splicing'.startswith(data_type):
         true_feature_subset = study.splicing.feature_subset_to_feature_ids(
             feature_subset, rename=False)
     pdt.assert_array_equal(test_feature_subset, true_feature_subset)
Ejemplo n.º 18
0
    def test_group_var_generic_2d_all_finite(self):
        """group_var on 2 columns, 5 groups, 2 rows per group, no NaNs."""
        rng = RandomState(1234)

        # In-place output buffers the grouped algo fills.
        counts = np.zeros(5, dtype=int)
        out = (np.nan * np.ones((5, 2))).astype(self.dtype)
        values = 10 * rng.rand(10, 2).astype(self.dtype)
        labels = np.tile(np.arange(5), 2)

        # Expected: per-group sample variance (ddof=1) along the rows.
        expected_counts = counts + 2
        expected_out = values.reshape(2, 5, 2).std(axis=0, ddof=1) ** 2

        self.algo(out, counts, values, labels)
        np.testing.assert_allclose(out, expected_out, self.rtol)
        tm.assert_array_equal(counts, expected_counts)
Ejemplo n.º 19
0
def test_unique_label_indices():
    """unique_label_indices should match np.unique(..., return_index=True).

    Also checks that -1 labels (the missing-value marker) are excluded
    from the result.
    """
    # BUG FIX: the C hashtable module moved to pandas._libs in pandas 0.20;
    # fall back to the old location for older pandas.
    try:
        from pandas._libs.hashtable import unique_label_indices
    except ImportError:
        from pandas.hashtable import unique_label_indices

    a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8')

    left = unique_label_indices(a)
    right = np.unique(a, return_index=True)[1]

    tm.assert_array_equal(left, right)

    # -1 labels are treated as missing and must not be reported.
    a[np.random.choice(len(a), 10)] = -1
    left = unique_label_indices(a)
    right = np.unique(a, return_index=True)[1][1:]
    tm.assert_array_equal(left, right)
Ejemplo n.º 20
0
    def test_group_var_generic_2d_all_finite(self):
        """group_var on 2 columns, 5 groups (int64 labels), no NaNs."""
        rng = RandomState(1234)

        counts = np.zeros(5, dtype="int64")
        out = (np.nan * np.ones((5, 2))).astype(self.dtype)
        values = 10 * rng.rand(10, 2).astype(self.dtype)
        labels = np.tile(np.arange(5), 2).astype("int64")

        # Expected: per-group sample variance (ddof=1) over the two rows.
        expected_counts = counts + 2
        expected_out = values.reshape(2, 5, 2).std(axis=0, ddof=1) ** 2

        self.algo(out, counts, values, labels)
        np.testing.assert_allclose(out, expected_out, self.rtol)
        tm.assert_array_equal(counts, expected_counts)
Ejemplo n.º 21
0
    def test_group_var_generic_1d(self):
        """group_var on one column: 5 groups with 3 observations each."""
        rng = RandomState(1234)

        out = (np.nan * np.ones((5, 1))).astype(self.dtype)
        counts = np.zeros(5, dtype="int64")
        values = 10 * rng.rand(15, 1).astype(self.dtype)
        labels = np.tile(np.arange(5), 3).astype("int64")

        # Per-group sample variance, computed group-wise via Fortran order.
        grouped = np.squeeze(values).reshape((5, 3), order="F")
        expected_out = (grouped.std(axis=1, ddof=1) ** 2)[:, np.newaxis]
        expected_counts = counts + 3

        self.algo(out, counts, values, labels)
        np.testing.assert_allclose(out, expected_out, self.rtol)
        tm.assert_array_equal(counts, expected_counts)
Ejemplo n.º 22
0
def test_unique_label_indices():
    """unique_label_indices should match np.unique(..., return_index=True),
    and exclude -1 (missing) labels."""
    # BUG FIX: the C hashtable module moved to pandas._libs in pandas 0.20;
    # fall back to the old location for older pandas.
    try:
        from pandas._libs.hashtable import unique_label_indices
    except ImportError:
        from pandas.hashtable import unique_label_indices

    a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8')

    left = unique_label_indices(a)
    right = np.unique(a, return_index=True)[1]

    tm.assert_array_equal(left, right)

    # -1 labels are treated as missing and must not be reported.
    a[np.random.choice(len(a), 10)] = -1
    left = unique_label_indices(a)
    right = np.unique(a, return_index=True)[1][1:]
    tm.assert_array_equal(left, right)
Ejemplo n.º 23
0
    def test_init_splicing(self, metadata_data, metadata_kws, splicing_data,
                           splicing_kws):
        """Study construction must leave the splicing input untouched."""
        from flotilla import Study

        metadata = metadata_data.copy()
        splicing = splicing_data.copy()

        # Flatten per-dataset kwargs into 'metadata_*' / 'splicing_*' keys.
        kwargs = {}
        for prefix, kws in (('metadata', metadata_kws),
                            ('splicing', splicing_kws)):
            for key, value in kws.items():
                kwargs['{}_{}'.format(prefix, key)] = value

        study = Study(metadata, splicing_data=splicing, **kwargs)
        pdt.assert_array_equal(study.splicing.data_original, splicing_data)
Ejemplo n.º 24
0
    def test_init_expression(self, metadata_data, metadata_kws,
                             expression_data, expression_kws):
        """Study construction must leave the expression input untouched."""
        from flotilla import Study

        metadata = metadata_data.copy()
        expression = expression_data.copy()

        # Flatten per-dataset kwargs into 'metadata_*' / 'expression_*' keys.
        kwargs = {}
        for prefix, kws in (('metadata', metadata_kws),
                            ('expression', expression_kws)):
            for key, value in kws.items():
                kwargs['{}_{}'.format(prefix, key)] = value

        study = Study(metadata, expression_data=expression, **kwargs)
        pdt.assert_array_equal(study.expression.data_original, expression_data)
Ejemplo n.º 25
0
    def test_group_var_generic_1d_flat_labels(self):
        """All rows in one group: result is the overall sample variance."""
        rng = RandomState(1234)

        out = (np.nan * np.ones((1, 1))).astype(self.dtype)
        counts = np.zeros(1, dtype=int)
        values = 10 * rng.rand(5, 1).astype(self.dtype)
        # Every observation belongs to group 0.
        labels = np.zeros(5, dtype=int)

        expected_counts = counts + 5
        expected_out = np.array([[values.std(ddof=1) ** 2]])

        self.algo(out, counts, values, labels)

        np.testing.assert_allclose(out, expected_out, self.rtol)
        tm.assert_array_equal(counts, expected_counts)
Ejemplo n.º 26
0
    def test_group_var_generic_1d(self):
        """group_var on a single column: 5 groups x 3 observations each."""
        prng = RandomState(1234)

        # Output/count buffers the algo fills in place.
        out = (np.nan * np.ones((5, 1))).astype(self.dtype)
        counts = np.zeros(5, dtype=int)
        values = 10 * prng.rand(15, 1).astype(self.dtype)
        labels = np.tile(np.arange(5), (3, ))

        # Expected: per-group sample variance (ddof=1) as a column vector.
        expected_out = (np.squeeze(values).reshape(
            (5, 3), order='F').std(axis=1, ddof=1)**2)[:, np.newaxis]
        expected_counts = counts + 3

        self.algo(out, counts, values, labels)
        np.testing.assert_allclose(out, expected_out, self.rtol)
        tm.assert_array_equal(counts, expected_counts)
Ejemplo n.º 27
0
    def test_group_var_generic_1d_flat_labels(self):
        """With a single group, group_var equals the overall variance."""
        prng = RandomState(1234)

        out = (np.nan * np.ones((1, 1))).astype(self.dtype)
        counts = np.zeros(1, dtype='int64')
        values = 10 * prng.rand(5, 1).astype(self.dtype)
        # Every observation belongs to group 0.
        labels = np.zeros(5, dtype='int64')

        expected_out = np.array([[values.std(ddof=1) ** 2]])
        expected_counts = counts + 5

        self.algo(out, counts, values, labels)

        np.testing.assert_allclose(out, expected_out, self.rtol)
        tm.assert_array_equal(counts, expected_counts)
Ejemplo n.º 28
0
    def test_init_splicing(self, metadata_data, metadata_kws,
                           splicing_data, splicing_kws):
        """Study construction must not modify the input splicing data."""
        from flotilla import Study

        metadata = metadata_data.copy()
        splicing = splicing_data.copy()

        # Flatten per-dataset kwargs into 'metadata_*' / 'splicing_*' keys.
        kw_pairs = (('metadata', metadata_kws),
                    ('splicing', splicing_kws))
        kwargs = {}
        for name, kws in kw_pairs:
            for k, v in kws.items():
                kwargs['{}_{}'.format(name, k)] = v
        study = Study(metadata, splicing_data=splicing,
                      **kwargs)
        pdt.assert_array_equal(study.splicing.data_original,
                               splicing_data)
Ejemplo n.º 29
0
    def test_init_expression(self, metadata_data, metadata_kws,
                             expression_data, expression_kws):
        """Study construction must not modify the input expression data."""
        from flotilla import Study

        metadata = metadata_data.copy()
        expression = expression_data.copy()

        # Flatten per-dataset kwargs into 'metadata_*' / 'expression_*' keys.
        kw_pairs = (('metadata', metadata_kws),
                    ('expression', expression_kws))
        kwargs = {}
        for name, kws in kw_pairs:
            for k, v in kws.items():
                kwargs['{}_{}'.format(name, k)] = v
        study = Study(metadata, expression_data=expression,
                      **kwargs)
        pdt.assert_array_equal(study.expression.data_original,
                               expression_data)
Ejemplo n.º 30
0
    def test_group_var_generic_2d_some_nan(self):
        """A fully-NaN column should produce NaN variances for every group."""
        rng = RandomState(1234)

        out = (np.nan * np.ones((5, 2))).astype(self.dtype)
        counts = np.zeros(5, dtype="int64")
        values = 10 * rng.rand(10, 2).astype(self.dtype)
        values[:, 1] = np.nan  # second column is entirely missing
        labels = np.tile(np.arange(5), 2).astype("int64")

        # Column 0: per-group sample variance; column 1: all NaN.
        finite_var = (values[:, 0].reshape(5, 2, order="F")
                      .std(ddof=1, axis=1) ** 2)
        expected_out = np.vstack([finite_var, np.nan * np.ones(5)]).T
        expected_counts = counts + 2

        self.algo(out, counts, values, labels)
        np.testing.assert_allclose(out, expected_out, self.rtol)
        tm.assert_array_equal(counts, expected_counts)
Ejemplo n.º 31
0
    def test_init_technical_outlier(self, metadata_data, metadata_kws,
                                    technical_outliers, mapping_stats_data,
                                    mapping_stats_kws):
        """Low-quality samples from mapping stats become technical outliers."""
        from flotilla import Study

        metadata = metadata_data.copy()

        # Flatten per-dataset kwargs into prefixed Study keyword arguments.
        kwargs = {}
        for prefix, kws in (('metadata', metadata_kws),
                            ('mapping_stats', mapping_stats_kws)):
            for key, value in kws.items():
                kwargs['{}_{}'.format(prefix, key)] = value

        study = Study(metadata, mapping_stats_data=mapping_stats_data,
                      **kwargs)
        # Compare outlier sample ids order-independently.
        pdt.assert_array_equal(sorted(study.technical_outliers),
                               sorted(technical_outliers))
Ejemplo n.º 32
0
    def test_group_var_generic_2d_some_nan(self):
        """An all-NaN column must yield NaN group variances for that column."""
        prng = RandomState(1234)

        out = (np.nan * np.ones((5, 2))).astype(self.dtype)
        counts = np.zeros(5, dtype='int64')
        values = 10 * prng.rand(10, 2).astype(self.dtype)
        # Make the second column entirely missing.
        values[:, 1] = np.nan
        labels = np.tile(np.arange(5), (2, )).astype('int64')

        # Column 0: per-group sample variance; column 1: all NaN.
        expected_out = np.vstack([
            values[:, 0].reshape(5, 2, order='F').std(ddof=1, axis=1) ** 2,
            np.nan * np.ones(5)
        ]).T
        expected_counts = counts + 2

        self.algo(out, counts, values, labels)
        np.testing.assert_allclose(out, expected_out, self.rtol)
        tm.assert_array_equal(counts, expected_counts)
Ejemplo n.º 33
0
    def test_init_technical_outlier(self, metadata_data, metadata_kws,
                                    technical_outliers, mapping_stats_data,
                                    mapping_stats_kws):
        """Studies built with mapping stats should flag technical outliers."""
        from flotilla import Study

        metadata = metadata_data.copy()

        # Flatten per-dataset kwargs into prefixed Study keyword arguments.
        kw_pairs = (('metadata', metadata_kws), ('mapping_stats',
                                                 mapping_stats_kws))
        kwargs = {}
        for name, kws in kw_pairs:
            for k, v in kws.items():
                kwargs['{}_{}'.format(name, k)] = v
        study = Study(metadata,
                      mapping_stats_data=mapping_stats_data,
                      **kwargs)
        # Order-independent comparison of outlier sample ids.
        pdt.assert_array_equal(sorted(study.technical_outliers),
                               sorted(technical_outliers))
Ejemplo n.º 34
0
    def test_change_phenotype_col(self, phenotype_order, phenotype_to_color,
                                  phenotype_to_marker):
        """Switching phenotype_col re-derives phenotypes, colors, markers."""
        from flotilla.data_model.metadata import MetaData

        metadata = self.metadata.copy()
        # Add an alternative phenotype column to switch to.
        metadata['phenotype2'] = np.random.choice(list('QXYZ'), size=self.n)

        test_metadata = MetaData(metadata, phenotype_order,
                                 phenotype_to_color, phenotype_to_marker,
                                 phenotype_col='phenotype')
        test_metadata.phenotype_col = 'phenotype2'

        new_phenotypes = metadata.phenotype2.unique()
        pdt.assert_array_equal(test_metadata.unique_phenotypes,
                               new_phenotypes)
        pdt.assert_contains_all(new_phenotypes,
                                test_metadata.phenotype_to_color)
        pdt.assert_contains_all(new_phenotypes,
                                test_metadata.phenotype_to_marker)
        pdt.assert_array_equal(test_metadata.phenotype_order,
                               list(sorted(new_phenotypes)))
Ejemplo n.º 35
0
def test_order(pca, kwargs):
    """A user-supplied group order should be respected by DecompositionViz."""
    from flotilla.visualize.decomposition import DecompositionViz

    kw = kwargs.copy()
    kw.pop('order')
    kw.pop('groupby')

    group_names = ['group1', 'group2', 'group3']
    sample_index = pca.reduced_space.index
    # Randomly assign each sample to one of the three groups
    groupby = pd.Series([np.random.choice(group_names)
                         for _ in sample_index],
                        index=sample_index)
    order = ['group3', 'group1', 'group2']

    dv = DecompositionViz(pca.reduced_space, pca.components_,
                          pca.explained_variance_ratio_, order=order,
                          groupby=groupby, **kw)

    expected_colors = [dv.label_to_color[group] for group in order]

    pdt.assert_series_equal(dv.groupby, groupby)
    pdt.assert_array_equal(dv.order, order)
    pdt.assert_array_equal(dv.color_ordered, expected_colors)
Ejemplo n.º 36
0
    def test__init(self, expression_data_no_na, outliers):
        """BaseData construction should keep data intact and record outliers."""
        from flotilla.data_model.base import BaseData
        from flotilla.compute.predict import PredictorConfigManager, \
            PredictorDataSetManager

        base_data = BaseData(expression_data_no_na, outliers=outliers)

        if outliers is None:
            expected_outlier_samples = []
        else:
            expected_outlier_samples = outliers.copy()
        expected_outliers_df = expression_data_no_na.ix[
            expected_outlier_samples]

        columns = expression_data_no_na.columns
        # By default every feature renames to itself
        expected_renamer = pd.Series(columns, index=columns)

        pdt.assert_frame_equal(base_data.data_original, expression_data_no_na)
        pdt.assert_equal(base_data.feature_data, None)
        pdt.assert_frame_equal(base_data.data, expression_data_no_na)
        pdt.assert_series_equal(base_data.feature_renamer_series,
                                expected_renamer)
        pdt.assert_frame_equal(base_data.outliers, expected_outliers_df)
        pdt.assert_array_equal(base_data.outlier_samples,
                               expected_outlier_samples)
        assert isinstance(base_data.predictor_config_manager,
                          PredictorConfigManager)
        assert isinstance(base_data.predictor_dataset_manager,
                          PredictorDataSetManager)
Ejemplo n.º 37
0
    def test__init(self, expression_data_no_na, outliers):
        """Constructing BaseData should expose the expected default state."""
        from flotilla.data_model.base import BaseData
        from flotilla.compute.predict import PredictorConfigManager, \
            PredictorDataSetManager

        base_data = BaseData(expression_data_no_na, outliers=outliers)

        # No outliers argument means an empty outlier sample list
        true_outlier_samples = [] if outliers is None else outliers.copy()
        true_outliers = expression_data_no_na.ix[true_outlier_samples]

        cols = expression_data_no_na.columns
        true_renamer = pd.Series(cols, index=cols)

        pdt.assert_frame_equal(base_data.data_original, expression_data_no_na)
        pdt.assert_equal(base_data.feature_data, None)
        pdt.assert_frame_equal(base_data.data, expression_data_no_na)
        pdt.assert_series_equal(base_data.feature_renamer_series,
                                true_renamer)
        pdt.assert_frame_equal(base_data.outliers, true_outliers)
        pdt.assert_array_equal(base_data.outlier_samples,
                               true_outlier_samples)
        assert isinstance(base_data.predictor_config_manager,
                          PredictorConfigManager)
        assert isinstance(base_data.predictor_dataset_manager,
                          PredictorDataSetManager)
Ejemplo n.º 38
0
    def test_init(self, metadata_data):
        """A Study built from metadata alone should use sensible defaults."""
        from flotilla import Study

        metadata = metadata_data.copy()
        study = Study(metadata)

        # Study marks all samples as non-outliers on construction
        metadata['outlier'] = False

        meta = study.metadata
        default_subset = study.default_sample_subset
        other_subsets = set(meta.sample_subsets.keys()).difference(
            set(default_subset))
        # Default subset comes first, the rest alphabetically
        true_default_sample_subsets = \
            [default_subset] + sorted(other_subsets)

        pdt.assert_frame_equal(meta.data, metadata)
        pdt.assert_equal(study.version, '0.1.0')
        pdt.assert_equal(study.pooled, None)
        pdt.assert_equal(study.technical_outliers, None)
        pdt.assert_equal(study.phenotype_col, meta.phenotype_col)
        pdt.assert_equal(study.phenotype_order, meta.phenotype_order)
        pdt.assert_equal(study.phenotype_to_color,
                         meta.phenotype_to_color)
        pdt.assert_equal(study.phenotype_to_marker,
                         meta.phenotype_to_marker)
        pdt.assert_series_equal(study.sample_id_to_phenotype,
                                meta.sample_id_to_phenotype)
        pdt.assert_series_equal(study.sample_id_to_color,
                                meta.sample_id_to_color)
        pdt.assert_array_equal(study.phenotype_transitions,
                               meta.phenotype_transitions)
        pdt.assert_array_equal(study.phenotype_color_ordered,
                               meta.phenotype_color_order)
        pdt.assert_equal(study.default_sample_subset, 'all_samples')
        pdt.assert_equal(study.default_feature_subset, 'variant')
        pdt.assert_array_equal(study.default_sample_subsets,
                               true_default_sample_subsets)
        pdt.assert_dict_equal(study.default_feature_subsets, {})
Ejemplo n.º 39
0
    def test_init(self, metadata_data):
        """Default Study attributes should mirror its MetaData object."""
        from flotilla import Study

        metadata = metadata_data.copy()
        study = Study(metadata)

        metadata['outlier'] = False

        remaining = set(study.metadata.sample_subsets.keys())
        remaining = remaining.difference(set(study.default_sample_subset))
        true_default_sample_subsets = sorted(remaining)
        # The default subset is always listed first
        true_default_sample_subsets.insert(0, study.default_sample_subset)

        pdt.assert_frame_equal(study.metadata.data, metadata)
        pdt.assert_equal(study.version, '0.1.0')
        pdt.assert_equal(study.pooled, None)
        pdt.assert_equal(study.technical_outliers, None)
        pdt.assert_equal(study.phenotype_col, study.metadata.phenotype_col)
        pdt.assert_equal(study.phenotype_order,
                         study.metadata.phenotype_order)
        pdt.assert_equal(study.phenotype_to_color,
                         study.metadata.phenotype_to_color)
        pdt.assert_equal(study.phenotype_to_marker,
                         study.metadata.phenotype_to_marker)
        pdt.assert_series_equal(study.sample_id_to_phenotype,
                                study.metadata.sample_id_to_phenotype)
        pdt.assert_series_equal(study.sample_id_to_color,
                                study.metadata.sample_id_to_color)
        pdt.assert_array_equal(study.phenotype_transitions,
                               study.metadata.phenotype_transitions)
        pdt.assert_array_equal(study.phenotype_color_ordered,
                               study.metadata.phenotype_color_order)
        pdt.assert_equal(study.default_sample_subset, 'all_samples')
        pdt.assert_equal(study.default_feature_subset, 'variant')
        pdt.assert_array_equal(study.default_sample_subsets,
                               true_default_sample_subsets)
        pdt.assert_dict_equal(study.default_feature_subsets, {})
Ejemplo n.º 40
0
    def test_save(self, study, tmpdir):
        """Saving a Study should produce a datapackage whose resources
        round-trip the study's data and keyword arguments.

        Checks that:
        - exactly one directory is created, named after the study
        - per-resource keyword arguments are recorded in datapackage.json
        - each resource's gzipped CSV reloads to the original data
        - the datapackage version is the study version with patch bumped
        """
        from flotilla.datapackage import name_to_resource

        study_name = 'test_save'
        study.supplemental.expression_corr = study.expression.data.corr()
        study.save(study_name, flotilla_dir=tmpdir)

        assert len(tmpdir.listdir()) == 1
        save_dir = tmpdir.listdir()[0]

        with open('{}/datapackage.json'.format(save_dir)) as f:
            test_datapackage = json.load(f)

        assert study_name == save_dir.purebasename

        # resource_keys_to_ignore = ('compression', 'format', 'path',
        #                            'url')
        # Keyword arguments that Study.save must write into each resource
        keys_from_study = {'splicing': [],
                           'expression': ['thresh',
                                          'log_base',
                                          'plus_one'],
                           'metadata': ['phenotype_order',
                                        'phenotype_to_color',
                                        'phenotype_col',
                                        'phenotype_to_marker',
                                        'pooled_col',
                                        'minimum_samples'],
                           'mapping_stats': ['number_mapped_col',
                                             'min_reads'],
                           'expression_feature': ['rename_col',
                                                  'ignore_subset_cols'],
                           'splicing_feature': ['rename_col',
                                                'ignore_subset_cols',
                                                'expression_id_col'],
                           'gene_ontology': []}
        resource_names = keys_from_study.keys()

        # Add auto-generated attributes into the true datapackage
        # NOTE: .items() instead of Python-2-only .iteritems() so this
        # test also runs on Python 3; iteration order/content is identical.
        for name, keys in keys_from_study.items():
            resource = name_to_resource(test_datapackage, name)
            for key in keys:
                command = self.get_data_eval_command(name, key)
                test_value = resource[key]
                true_value = eval(command)
                if isinstance(test_value, dict):
                    pdt.assert_dict_equal(test_value, true_value)
                elif isinstance(test_value, Iterable):
                    pdt.assert_array_equal(test_value, true_value)

        for name in resource_names:
            resource = name_to_resource(test_datapackage, name)
            path = '{}.csv.gz'.format(name)
            assert resource['path'] == path
            test_df = pd.read_csv('{}/{}/{}'.format(tmpdir, study_name, path),
                                  index_col=0, compression='gzip')
            command = self.get_data_eval_command(name, 'data_original')
            true_df = eval(command)
            pdt.assert_frame_equal(test_df, true_df)

        # Saving bumps the patch component of the semantic version
        version = semantic_version.Version(study.version)
        version.patch += 1
        assert str(version) == test_datapackage['datapackage_version']
        assert study_name == test_datapackage['name']
Ejemplo n.º 41
0
    def test_init(self, phenotype_order, phenotype_to_color,
                  phenotype_to_marker):
        """MetaData construction should derive phenotype ordering, colors,
        markers, transitions, and sample subsets from the raw metadata.

        Each of phenotype_order / phenotype_to_color / phenotype_to_marker
        may be None, in which case MetaData generates a default; this test
        reconstructs the expected value for either case.
        """
        from flotilla.data_model.metadata import MetaData
        from flotilla.data_model.base import subsets_from_metadata
        from flotilla.visualize.color import str_to_color

        test_metadata = MetaData(self.metadata,
                                 phenotype_order=phenotype_order,
                                 phenotype_to_color=phenotype_to_color,
                                 phenotype_to_marker=phenotype_to_marker,
                                 **self.kws)

        if phenotype_order is None:
            true_phenotype_order = list(sorted(
                test_metadata.unique_phenotypes))
        else:
            true_phenotype_order = phenotype_order

        if phenotype_to_color is None:
            default_phenotype_to_color = \
                test_metadata._default_phenotype_to_color
            true_phenotype_to_color = dict(
                (k, default_phenotype_to_color[k])
                for k in true_phenotype_order)
        else:
            true_phenotype_to_color = {}
            # .items() instead of Python-2-only .iteritems() for py2/py3
            # compatibility; behavior is identical.
            for phenotype, color in phenotype_to_color.items():
                # Named colors (e.g. 'red') are translated; unknown names
                # pass through unchanged.
                try:
                    color = str_to_color[color]
                except KeyError:
                    pass
                true_phenotype_to_color[phenotype] = color

        if phenotype_to_marker is None:
            markers = cycle(['o', '^', 's', 'v', '*', 'D', ])

            def marker_factory():
                # next(markers), not markers.next(), works on py2 and py3
                return next(markers)
            true_phenotype_to_marker = defaultdict(marker_factory)
            # Touch each phenotype so the defaultdict assigns its marker
            for x in true_phenotype_order:
                true_phenotype_to_marker[x]

        else:
            true_phenotype_to_marker = phenotype_to_marker

        # list(...) so zip yields a concrete sequence on py3 as well
        true_phenotype_transitions = list(zip(true_phenotype_order[:-1],
                                              true_phenotype_order[1:]))
        true_unique_phenotypes = self.metadata[self.phenotype_col].unique()
        true_n_phenotypes = len(true_unique_phenotypes)

        true_colors = list(map(mpl.colors.rgb2hex,
                               sns.color_palette(
                                   'husl', n_colors=true_n_phenotypes)))
        colors = iter(true_colors)
        true_default_phenotype_to_color = defaultdict(lambda: next(colors))

        true_sample_id_to_phenotype = self.metadata[self.phenotype_col]
        true_phenotype_color_order = [true_phenotype_to_color[p]
                                      for p in true_phenotype_order]
        true_sample_id_to_color = \
            dict((i, true_phenotype_to_color[true_sample_id_to_phenotype[i]])
                 for i in self.metadata.index)

        true_sample_subsets = subsets_from_metadata(
            self.metadata, self.kws['minimum_sample_subset'], 'samples')

        pdt.assert_frame_equal(test_metadata.data, self.metadata)
        pdt.assert_series_equal(test_metadata.sample_id_to_phenotype,
                                true_sample_id_to_phenotype)
        pdt.assert_array_equal(test_metadata.unique_phenotypes,
                               true_unique_phenotypes)
        pdt.assert_array_equal(test_metadata.n_phenotypes,
                               len(true_unique_phenotypes))
        pdt.assert_array_equal(test_metadata._default_phenotype_order,
                               list(sorted(true_unique_phenotypes)))
        pdt.assert_array_equal(test_metadata.phenotype_order,
                               true_phenotype_order)
        pdt.assert_array_equal(test_metadata.phenotype_transitions,
                               true_phenotype_transitions)
        pdt.assert_array_equal(test_metadata._colors, true_colors)
        pdt.assert_array_equal(test_metadata._default_phenotype_to_color,
                               true_default_phenotype_to_color)
        pdt.assert_dict_equal(test_metadata.phenotype_to_color,
                              true_phenotype_to_color)
        pdt.assert_dict_equal(test_metadata.phenotype_to_marker,
                              true_phenotype_to_marker)
        pdt.assert_array_equal(test_metadata.phenotype_color_order,
                               true_phenotype_color_order)
        pdt.assert_dict_equal(test_metadata.sample_id_to_color,
                              true_sample_id_to_color)
        pdt.assert_dict_equal(test_metadata.sample_subsets,
                              true_sample_subsets)