# Shared imports assumed by the tests below (the originals lived in separate
# test modules; ``pdt`` and ``tm`` are both aliases for pandas' testing
# utilities, kept as each test's source used them).
import json
from collections import defaultdict
from collections.abc import Iterable
from itertools import cycle

import matplotlib as mpl
import numpy as np
import pandas as pd
import pandas.util.testing as pdt
import pandas.util.testing as tm
import seaborn as sns
import semantic_version
from numpy.random import RandomState


def test_large_dataframe(pca_large_dataframe, kwargs):
    from flotilla.visualize.decomposition import DecompositionViz

    dv = DecompositionViz(pca_large_dataframe.reduced_space,
                          pca_large_dataframe.components_,
                          pca_large_dataframe.explained_variance_ratio_,
                          **kwargs)
    x_pc = kwargs['x_pc']
    y_pc = kwargs['y_pc']
    pcs = [x_pc, y_pc]

    true_top_features = set()
    true_pc_loadings_labels = {}
    true_pc_loadings = {}
    for pc in pcs:
        # ``.loc``/``sort_values`` replace the deprecated ``.ix``/``.sort``.
        x = pca_large_dataframe.components_.loc[pc].copy()
        x = x.sort_values(ascending=True)
        half_features = int(kwargs['n_top_pc_features'] / 2)
        if len(x) > kwargs['n_top_pc_features']:
            a = x[:half_features]
            b = x[-half_features:]
            labels = np.r_[a.index, b.index]
            true_pc_loadings[pc] = np.r_[a, b]
        else:
            labels = x.index
            true_pc_loadings[pc] = x
        true_pc_loadings_labels[pc] = labels
        true_top_features.update(labels)

    pdt.assert_array_equal(dv.top_features, true_top_features)
    pdt.assert_dict_equal(dv.pc_loadings_labels, true_pc_loadings_labels)
    pdt.assert_dict_equal(dv.pc_loadings, true_pc_loadings)

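# Illustrative sketch (not part of the original suite): how the loop above
# picks extreme loadings, taking n/2 features from each tail of the sorted
# Series. The data and function name here are hypothetical.
def _example_top_loading_selection():
    loadings = pd.Series([0.9, -0.8, 0.1, -0.2, 0.5],
                         index=list('abcde')).sort_values()
    half = 2
    # Two most negative plus two most positive loadings: ['b', 'd', 'e', 'a']
    return np.r_[loadings.index[:half], loadings.index[-half:]]
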
def test_init(self, gene_ontology_data, gene_ontology):
    true_data = gene_ontology_data.dropna()
    true_all_genes = true_data['Ensembl Gene ID'].unique()
    true_ontology = defaultdict(dict)
    for go, df in true_data.groupby('GO Term Accession'):
        true_ontology[go]['genes'] = set(df['Ensembl Gene ID'])
        true_ontology[go]['name'] = df['GO Term Name'].values[0]
        true_ontology[go]['domain'] = df['GO domain'].values[0]
        true_ontology[go]['n_genes'] = len(true_ontology[go]['genes'])

    pdt.assert_frame_equal(true_data, gene_ontology.data)
    pdt.assert_array_equal(sorted(true_all_genes),
                           sorted(gene_ontology.all_genes))
    pdt.assert_contains_all(true_ontology.keys(), gene_ontology.ontology)
    pdt.assert_contains_all(gene_ontology.ontology.keys(), true_ontology)
    for go, true_attributes in true_ontology.items():
        test_attributes = gene_ontology.ontology[go]
        true_genes = sorted(true_attributes['genes'])
        test_genes = sorted(test_attributes['genes'])
        pdt.assert_array_equal(true_genes, test_genes)
        pdt.assert_equal(true_attributes['name'], test_attributes['name'])
        pdt.assert_equal(true_attributes['domain'],
                         test_attributes['domain'])
        pdt.assert_equal(true_attributes['n_genes'],
                         test_attributes['n_genes'])

def test_feature_subset_to_feature_ids(self, expression_data_no_na,
                                       expression_feature_data,
                                       feature_subset):
    from flotilla.data_model.base import BaseData

    expression = BaseData(expression_data_no_na,
                          feature_data=expression_feature_data)
    test_feature_ids = expression.feature_subset_to_feature_ids(
        feature_subset, rename=False)

    true_feature_ids = pd.Index([])
    if feature_subset is not None:
        try:
            if feature_subset in expression.feature_subsets:
                true_feature_ids = expression.feature_subsets[feature_subset]
            elif feature_subset.startswith('all'):
                true_feature_ids = expression.data.columns
        except TypeError:
            if not isinstance(feature_subset, str):
                feature_ids = feature_subset
                n_custom = expression.feature_data.columns.map(
                    lambda x: x.startswith('custom')).sum()
                ind = 'custom_{}'.format(n_custom + 1)
                expression.feature_data[ind] = \
                    expression.feature_data.index.isin(feature_ids)
            else:
                raise ValueError("There are no {} features in this data: "
                                 "{}".format(feature_subset, self))
    else:
        true_feature_ids = expression.data.columns
    pdt.assert_array_equal(test_feature_ids, true_feature_ids)

def test_order(pca, kwargs):
    from flotilla.visualize.decomposition import DecompositionViz

    kw = kwargs.copy()
    kw.pop('order')
    kw.pop('groupby')

    groups = ['group1', 'group2', 'group3']
    groupby = pd.Series([np.random.choice(groups)
                         for i in pca.reduced_space.index],
                        index=pca.reduced_space.index)
    order = ['group3', 'group1', 'group2']
    dv = DecompositionViz(pca.reduced_space, pca.components_,
                          pca.explained_variance_ratio_,
                          order=order, groupby=groupby, **kw)
    color_ordered = [dv.label_to_color[x] for x in order]

    pdt.assert_series_equal(dv.groupby, groupby)
    pdt.assert_array_equal(dv.order, order)
    pdt.assert_array_equal(dv.color_ordered, color_ordered)

def test_sample_subset_to_sample_ids(self, study, sample_subset):
    test_sample_subset = study.sample_subset_to_sample_ids(sample_subset)

    try:
        true_sample_subset = study.metadata.sample_subsets[sample_subset]
    except (KeyError, TypeError):
        try:
            ind = study.metadata.sample_id_to_phenotype == sample_subset
            if ind.sum() > 0:
                true_sample_subset = \
                    study.metadata.sample_id_to_phenotype.index[ind]
            else:
                if sample_subset is None or 'all_samples'.startswith(
                        sample_subset):
                    sample_ind = np.ones(study.metadata.data.shape[0],
                                         dtype=bool)
                elif sample_subset.startswith("~"):
                    sample_ind = ~pd.Series(
                        study.metadata.data[sample_subset.lstrip("~")],
                        dtype='bool')
                else:
                    sample_ind = pd.Series(
                        study.metadata.data[sample_subset], dtype='bool')
                true_sample_subset = study.metadata.data.index[sample_ind]
        except (AttributeError, ValueError):
            true_sample_subset = sample_subset

    pdt.assert_array_equal(true_sample_subset, test_sample_subset)

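# Illustrative sketch (assumption inferred from the branching above): a
# sample subset named with a leading "~" selects the complement of a boolean
# metadata column. Data and names here are hypothetical.
def _example_tilde_subset():
    metadata = pd.DataFrame({'outlier': [True, False, True]},
                            index=['s1', 's2', 's3'])
    subset = '~outlier'
    # Strip the "~" to find the column, then invert its boolean values.
    ind = ~pd.Series(metadata[subset.lstrip('~')], dtype='bool')
    return metadata.index[ind]  # Index(['s2'])
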
def test_save(self, study, tmpdir):
    from flotilla.datapackage import name_to_resource

    study_name = 'test_save'
    study.supplemental.expression_corr = study.expression.data.corr()
    study.save(study_name, flotilla_dir=tmpdir)

    assert len(tmpdir.listdir()) == 1
    save_dir = tmpdir.listdir()[0]

    with open('{}/datapackage.json'.format(save_dir)) as f:
        test_datapackage = json.load(f)

    assert study_name == save_dir.purebasename

    # resource_keys_to_ignore = ('compression', 'format', 'path', 'url')
    keys_from_study = {
        'splicing': [],
        'expression': ['thresh', 'log_base', 'plus_one'],
        'metadata': ['phenotype_order', 'phenotype_to_color',
                     'phenotype_col', 'phenotype_to_marker',
                     'pooled_col', 'minimum_samples'],
        'mapping_stats': ['number_mapped_col', 'min_reads'],
        'expression_feature': ['rename_col', 'ignore_subset_cols'],
        'splicing_feature': ['rename_col', 'ignore_subset_cols',
                             'expression_id_col'],
        'gene_ontology': []}
    resource_names = keys_from_study.keys()

    # Add auto-generated attributes into the true datapackage
    # (``items`` replaces the Python 2-only ``iteritems``).
    for name, keys in keys_from_study.items():
        resource = name_to_resource(test_datapackage, name)
        for key in keys:
            command = self.get_data_eval_command(name, key)
            test_value = resource[key]
            true_value = eval(command)
            if isinstance(test_value, dict):
                pdt.assert_dict_equal(test_value, true_value)
            elif isinstance(test_value, Iterable):
                pdt.assert_array_equal(test_value, true_value)

    for name in resource_names:
        resource = name_to_resource(test_datapackage, name)
        path = '{}.csv.gz'.format(name)
        assert resource['path'] == path
        test_df = pd.read_csv('{}/{}/{}'.format(tmpdir, study_name, path),
                              index_col=0, compression='gzip')
        command = self.get_data_eval_command(name, 'data_original')
        true_df = eval(command)
        pdt.assert_frame_equal(test_df, true_df)

    version = semantic_version.Version(study.version)
    version.patch += 1
    assert str(version) == test_datapackage['datapackage_version']
    assert study_name == test_datapackage['name']

def test_chr_start_stop_to_sj_ind(chr_start_stop, sj):
    from sj2psi import chr_start_stop_to_sj_ind

    test_output = chr_start_stop_to_sj_ind(chr_start_stop, sj)

    chrom, startstop = chr_start_stop.replace(',', '').split(':')
    start, stop = map(int, startstop.split('-'))
    true_output = (sj.chrom == chrom) & (start < sj.intron_start) \
        & (sj.intron_stop < stop)
    pdt.assert_array_equal(test_output, true_output)

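# Illustrative sketch: the coordinate string parsed above follows the
# UCSC-style 'chrom:start-stop' form, optionally with thousands separators.
# The example string is hypothetical.
def _example_parse_coordinates():
    chrom, startstop = 'chr1:100,000-200,000'.replace(',', '').split(':')
    start, stop = map(int, startstop.split('-'))
    return chrom, start, stop  # ('chr1', 100000, 200000)
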
def test_too_few_mapped(self, mapping_stats, mapping_stats_data,
                        mapping_stats_kws):
    from flotilla.data_model.quality_control import MIN_READS

    min_reads = mapping_stats_kws.get('min_reads', MIN_READS)
    number_mapped_col = mapping_stats_kws.get('number_mapped_col')
    number_mapped = mapping_stats_data[number_mapped_col]
    too_few_mapped = number_mapped.index[number_mapped < min_reads]
    pdt.assert_array_equal(mapping_stats.too_few_mapped, too_few_mapped)

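# Illustrative sketch: the too-few-mapped check is a plain threshold on the
# mapped-read counts. The threshold and sample names here are hypothetical,
# not flotilla's actual MIN_READS default.
def _example_min_reads_filter(min_reads=100):
    number_mapped = pd.Series([50, 500, 99], index=['s1', 's2', 's3'])
    return number_mapped.index[number_mapped < min_reads]  # ['s1', 's3']
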
def test__variant(self, expression_data):
    from flotilla.data_model.base import BaseData

    base_data = BaseData(expression_data)
    var = expression_data.var()
    var_cut = var.mean() + 2 * var.std()
    variant = expression_data.columns[var > var_cut]
    pdt.assert_equal(base_data._var_cut, var_cut)
    pdt.assert_array_equal(base_data.variant, variant)

def test_feature_subset_to_feature_ids(self, study, data_type,
                                       feature_subset):
    test_feature_subset = study.feature_subset_to_feature_ids(
        data_type, feature_subset)
    if 'expression'.startswith(data_type):
        true_feature_subset = \
            study.expression.feature_subset_to_feature_ids(feature_subset,
                                                           rename=False)
    elif 'splicing'.startswith(data_type):
        true_feature_subset = study.splicing.feature_subset_to_feature_ids(
            feature_subset, rename=False)
    pdt.assert_array_equal(test_feature_subset, true_feature_subset)

def test_unique_label_indices():
    from pandas.hashtable import unique_label_indices

    a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8')

    left = unique_label_indices(a)
    right = np.unique(a, return_index=True)[1]
    tm.assert_array_equal(left, right)

    a[np.random.choice(len(a), 10)] = -1
    left = unique_label_indices(a)
    right = np.unique(a, return_index=True)[1][1:]
    tm.assert_array_equal(left, right)

def test_group_var_generic_2d_all_finite(self):
    prng = RandomState(1234)

    out = (np.nan * np.ones((5, 2))).astype(self.dtype)
    counts = np.zeros(5, dtype="int64")
    values = 10 * prng.rand(10, 2).astype(self.dtype)
    labels = np.tile(np.arange(5), (2,)).astype("int64")

    expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2
    expected_counts = counts + 2

    self.algo(out, counts, values, labels)
    np.testing.assert_allclose(out, expected_out, self.rtol)
    tm.assert_array_equal(counts, expected_counts)

def test_group_var_generic_1d(self):
    prng = RandomState(1234)

    out = (np.nan * np.ones((5, 1))).astype(self.dtype)
    counts = np.zeros(5, dtype="int64")
    values = 10 * prng.rand(15, 1).astype(self.dtype)
    labels = np.tile(np.arange(5), (3,)).astype("int64")

    expected_out = (np.squeeze(values)
                    .reshape((5, 3), order="F")
                    .std(axis=1, ddof=1) ** 2)[:, np.newaxis]
    expected_counts = counts + 3

    self.algo(out, counts, values, labels)
    np.testing.assert_allclose(out, expected_out, self.rtol)
    tm.assert_array_equal(counts, expected_counts)

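# Illustrative sketch: np.tile(np.arange(5), (3,)) repeats the labels 0..4
# three times, so consecutive rows belong to *different* groups; the
# Fortran-order reshape above re-collects each group's observations into one
# row before taking the variance. Names here are hypothetical.
def _example_fortran_reshape_groups():
    values = np.arange(15)
    labels = np.tile(np.arange(5), (3,))
    grouped = values.reshape((5, 3), order='F')
    # grouped[i] holds exactly the values whose label == i,
    # e.g. grouped[0] == [0, 5, 10].
    return labels, grouped
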
def test_init_splicing(self, metadata_data, metadata_kws, splicing_data,
                       splicing_kws):
    from flotilla import Study

    metadata = metadata_data.copy()
    splicing = splicing_data.copy()
    kw_pairs = (('metadata', metadata_kws), ('splicing', splicing_kws))
    kwargs = {}
    for name, kws in kw_pairs:
        for k, v in kws.items():
            kwargs['{}_{}'.format(name, k)] = v

    study = Study(metadata, splicing_data=splicing, **kwargs)
    pdt.assert_array_equal(study.splicing.data_original, splicing_data)

def test_init_expression(self, metadata_data, metadata_kws, expression_data,
                         expression_kws):
    from flotilla import Study

    metadata = metadata_data.copy()
    expression = expression_data.copy()
    kw_pairs = (('metadata', metadata_kws), ('expression', expression_kws))
    kwargs = {}
    for name, kws in kw_pairs:
        for k, v in kws.items():
            kwargs['{}_{}'.format(name, k)] = v

    study = Study(metadata, expression_data=expression, **kwargs)
    pdt.assert_array_equal(study.expression.data_original, expression_data)

def test_group_var_generic_1d_flat_labels(self):
    prng = RandomState(1234)

    out = (np.nan * np.ones((1, 1))).astype(self.dtype)
    counts = np.zeros(1, dtype='int64')
    values = 10 * prng.rand(5, 1).astype(self.dtype)
    labels = np.zeros(5, dtype='int64')

    expected_out = np.array([[values.std(ddof=1) ** 2]])
    expected_counts = counts + 5

    self.algo(out, counts, values, labels)
    np.testing.assert_allclose(out, expected_out, self.rtol)
    tm.assert_array_equal(counts, expected_counts)

def test_group_var_generic_2d_some_nan(self):
    prng = RandomState(1234)

    out = (np.nan * np.ones((5, 2))).astype(self.dtype)
    counts = np.zeros(5, dtype="int64")
    values = 10 * prng.rand(10, 2).astype(self.dtype)
    values[:, 1] = np.nan
    labels = np.tile(np.arange(5), (2,)).astype("int64")

    expected_out = np.vstack(
        [values[:, 0].reshape(5, 2, order="F").std(ddof=1, axis=1) ** 2,
         np.nan * np.ones(5)]).T
    expected_counts = counts + 2

    self.algo(out, counts, values, labels)
    np.testing.assert_allclose(out, expected_out, self.rtol)
    tm.assert_array_equal(counts, expected_counts)

def test_init_technical_outlier(self, metadata_data, metadata_kws,
                                technical_outliers, mapping_stats_data,
                                mapping_stats_kws):
    from flotilla import Study

    metadata = metadata_data.copy()
    kw_pairs = (('metadata', metadata_kws),
                ('mapping_stats', mapping_stats_kws))
    kwargs = {}
    for name, kws in kw_pairs:
        for k, v in kws.items():
            kwargs['{}_{}'.format(name, k)] = v

    study = Study(metadata, mapping_stats_data=mapping_stats_data, **kwargs)
    pdt.assert_array_equal(sorted(study.technical_outliers),
                           sorted(technical_outliers))

def test_change_phenotype_col(self, phenotype_order, phenotype_to_color,
                              phenotype_to_marker):
    from flotilla.data_model.metadata import MetaData

    metadata = self.metadata.copy()
    metadata['phenotype2'] = np.random.choice(list('QXYZ'), size=self.n)
    test_metadata = MetaData(metadata, phenotype_order, phenotype_to_color,
                             phenotype_to_marker, phenotype_col='phenotype')
    test_metadata.phenotype_col = 'phenotype2'

    pdt.assert_array_equal(test_metadata.unique_phenotypes,
                           metadata.phenotype2.unique())
    pdt.assert_contains_all(metadata.phenotype2.unique(),
                            test_metadata.phenotype_to_color)
    pdt.assert_contains_all(metadata.phenotype2.unique(),
                            test_metadata.phenotype_to_marker)
    pdt.assert_array_equal(test_metadata.phenotype_order,
                           list(sorted(metadata.phenotype2.unique())))

def test__init(self, expression_data_no_na, outliers):
    from flotilla.data_model.base import BaseData
    from flotilla.compute.predict import PredictorConfigManager, \
        PredictorDataSetManager

    base_data = BaseData(expression_data_no_na, outliers=outliers)

    outlier_samples = outliers.copy() if outliers is not None else []
    # ``.loc`` replaces the deprecated ``.ix`` for label-based indexing.
    outliers_df = expression_data_no_na.loc[outlier_samples]
    feature_renamer_series = pd.Series(expression_data_no_na.columns,
                                       index=expression_data_no_na.columns)

    pdt.assert_frame_equal(base_data.data_original, expression_data_no_na)
    pdt.assert_equal(base_data.feature_data, None)
    pdt.assert_frame_equal(base_data.data, expression_data_no_na)
    pdt.assert_series_equal(base_data.feature_renamer_series,
                            feature_renamer_series)
    pdt.assert_frame_equal(base_data.outliers, outliers_df)
    pdt.assert_array_equal(base_data.outlier_samples, outlier_samples)
    assert isinstance(base_data.predictor_config_manager,
                      PredictorConfigManager)
    assert isinstance(base_data.predictor_dataset_manager,
                      PredictorDataSetManager)

def test_init(self, metadata_data):
    from flotilla import Study

    metadata = metadata_data.copy()
    study = Study(metadata)
    metadata['outlier'] = False

    true_default_sample_subsets = list(sorted(list(
        set(study.metadata.sample_subsets.keys()).difference(
            set(study.default_sample_subset)))))
    true_default_sample_subsets.insert(0, study.default_sample_subset)

    pdt.assert_frame_equal(study.metadata.data, metadata)
    pdt.assert_equal(study.version, '0.1.0')
    pdt.assert_equal(study.pooled, None)
    pdt.assert_equal(study.technical_outliers, None)
    pdt.assert_equal(study.phenotype_col, study.metadata.phenotype_col)
    pdt.assert_equal(study.phenotype_order, study.metadata.phenotype_order)
    pdt.assert_equal(study.phenotype_to_color,
                     study.metadata.phenotype_to_color)
    pdt.assert_equal(study.phenotype_to_marker,
                     study.metadata.phenotype_to_marker)
    pdt.assert_series_equal(study.sample_id_to_phenotype,
                            study.metadata.sample_id_to_phenotype)
    pdt.assert_series_equal(study.sample_id_to_color,
                            study.metadata.sample_id_to_color)
    pdt.assert_array_equal(study.phenotype_transitions,
                           study.metadata.phenotype_transitions)
    pdt.assert_array_equal(study.phenotype_color_ordered,
                           study.metadata.phenotype_color_order)
    pdt.assert_equal(study.default_sample_subset, 'all_samples')
    pdt.assert_equal(study.default_feature_subset, 'variant')
    pdt.assert_array_equal(study.default_sample_subsets,
                           true_default_sample_subsets)
    pdt.assert_dict_equal(study.default_feature_subsets, {})

def test_init(self, phenotype_order, phenotype_to_color,
              phenotype_to_marker):
    from flotilla.data_model.metadata import MetaData
    from flotilla.data_model.base import subsets_from_metadata
    from flotilla.visualize.color import str_to_color

    test_metadata = MetaData(self.metadata,
                             phenotype_order=phenotype_order,
                             phenotype_to_color=phenotype_to_color,
                             phenotype_to_marker=phenotype_to_marker,
                             **self.kws)

    if phenotype_order is None:
        true_phenotype_order = list(sorted(test_metadata.unique_phenotypes))
    else:
        true_phenotype_order = phenotype_order

    if phenotype_to_color is None:
        default_phenotype_to_color = \
            test_metadata._default_phenotype_to_color
        true_phenotype_to_color = dict(
            (k, default_phenotype_to_color[k]) for k in true_phenotype_order)
    else:
        true_phenotype_to_color = {}
        # ``items``/``next`` replace the Python 2-only
        # ``iteritems``/``.next()``.
        for phenotype, color in phenotype_to_color.items():
            try:
                color = str_to_color[color]
            except KeyError:
                pass
            true_phenotype_to_color[phenotype] = color

    if phenotype_to_marker is None:
        markers = cycle(['o', '^', 's', 'v', '*', 'D'])

        def marker_factory():
            return next(markers)

        true_phenotype_to_marker = defaultdict(marker_factory)
        for x in true_phenotype_order:
            true_phenotype_to_marker[x]
    else:
        true_phenotype_to_marker = phenotype_to_marker

    true_phenotype_transitions = list(zip(true_phenotype_order[:-1],
                                          true_phenotype_order[1:]))
    true_unique_phenotypes = self.metadata[self.phenotype_col].unique()
    true_n_phenotypes = len(true_unique_phenotypes)
    true_colors = list(map(mpl.colors.rgb2hex,
                           sns.color_palette('husl',
                                             n_colors=true_n_phenotypes)))
    colors = iter(true_colors)
    true_default_phenotype_to_color = defaultdict(lambda: next(colors))
    true_sample_id_to_phenotype = self.metadata[self.phenotype_col]
    true_phenotype_color_order = [true_phenotype_to_color[p]
                                  for p in true_phenotype_order]
    true_sample_id_to_color = dict(
        (i, true_phenotype_to_color[true_sample_id_to_phenotype[i]])
        for i in self.metadata.index)
    true_sample_subsets = subsets_from_metadata(
        self.metadata, self.kws['minimum_sample_subset'], 'samples')

    pdt.assert_frame_equal(test_metadata.data, self.metadata)
    pdt.assert_series_equal(test_metadata.sample_id_to_phenotype,
                            true_sample_id_to_phenotype)
    pdt.assert_array_equal(test_metadata.unique_phenotypes,
                           true_unique_phenotypes)
    pdt.assert_array_equal(test_metadata.n_phenotypes,
                           len(true_unique_phenotypes))
    pdt.assert_array_equal(test_metadata._default_phenotype_order,
                           list(sorted(true_unique_phenotypes)))
    pdt.assert_array_equal(test_metadata.phenotype_order,
                           true_phenotype_order)
    pdt.assert_array_equal(test_metadata.phenotype_transitions,
                           true_phenotype_transitions)
    pdt.assert_array_equal(test_metadata._colors, true_colors)
    pdt.assert_array_equal(test_metadata._default_phenotype_to_color,
                           true_default_phenotype_to_color)
    pdt.assert_dict_equal(test_metadata.phenotype_to_color,
                          true_phenotype_to_color)
    pdt.assert_dict_equal(test_metadata.phenotype_to_marker,
                          true_phenotype_to_marker)
    pdt.assert_array_equal(test_metadata.phenotype_color_order,
                           true_phenotype_color_order)
    pdt.assert_dict_equal(test_metadata.sample_id_to_color,
                          true_sample_id_to_color)
    pdt.assert_dict_equal(test_metadata.sample_subsets, true_sample_subsets)

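# Illustrative sketch: the marker assignment above hinges on a defaultdict
# whose factory draws from an endless cycle, so each phenotype receives a
# distinct marker on first access. Phenotype names here are hypothetical.
def _example_cycling_defaultdict():
    markers = cycle(['o', '^', 's'])
    phenotype_to_marker = defaultdict(lambda: next(markers))
    for phenotype in ['WT', 'KO', 'WT']:
        phenotype_to_marker[phenotype]  # first access assigns a marker
    return dict(phenotype_to_marker)  # {'WT': 'o', 'KO': '^'}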