def test_from_file_error(self):
    for test_path in self.fferror_test_paths:
        with open(get_data_path(test_path), 'U') as f:
            with npt.assert_raises(FileFormatError):
                OrdinationResults.from_file(f)

    for test_path in self.verror_test_paths:
        with open(get_data_path(test_path), 'U') as f:
            with npt.assert_raises(ValueError):
                OrdinationResults.from_file(f)
def test_from_file(self):
    for exp_scores, test_path in zip(self.scores, self.test_paths):
        for file_type in ('file like', 'file name'):
            fname = get_data_path(test_path)
            if file_type == 'file like':
                with open(fname) as fh:
                    obs = OrdinationResults.from_file(fh)
            elif file_type == 'file name':
                obs = OrdinationResults.from_file(fname)
            yield self.check_OrdinationResults_equal, obs, exp_scores
def test_assert_ordination_results_equal(self):
    minimal1 = OrdinationResults([1, 2])

    # a minimal set of results should be equal to itself
    assert_ordination_results_equal(minimal1, minimal1)

    # type mismatch
    with npt.assert_raises(AssertionError):
        assert_ordination_results_equal(minimal1, 'foo')

    # numeric values should be checked that they're almost equal
    almost_minimal1 = OrdinationResults([1.0000001, 1.9999999])
    assert_ordination_results_equal(minimal1, almost_minimal1)

    # species_ids missing in one, present in the other
    almost_minimal1.species_ids = ['abc', 'def']
    with npt.assert_raises(AssertionError):
        assert_ordination_results_equal(minimal1, almost_minimal1)
    almost_minimal1.species_ids = None

    # site_ids missing in one, present in the other
    almost_minimal1.site_ids = ['abc', 'def']
    with npt.assert_raises(AssertionError):
        assert_ordination_results_equal(minimal1, almost_minimal1)
    almost_minimal1.site_ids = None

    # test each of the optional numeric attributes
    for attr in ('species', 'site', 'biplot', 'site_constraints',
                 'proportion_explained'):
        # missing optional numeric attribute in one, present in the other
        setattr(almost_minimal1, attr, [[1, 2], [3, 4]])
        with npt.assert_raises(AssertionError):
            assert_ordination_results_equal(minimal1, almost_minimal1)
        setattr(almost_minimal1, attr, None)

        # optional numeric attributes present in both, but not almost equal
        setattr(minimal1, attr, [[1, 2], [3, 4]])
        setattr(almost_minimal1, attr, [[1, 2], [3.00002, 4]])
        with npt.assert_raises(AssertionError):
            assert_ordination_results_equal(minimal1, almost_minimal1)
        setattr(minimal1, attr, None)
        setattr(almost_minimal1, attr, None)

        # optional numeric attributes present in both, and almost equal
        setattr(minimal1, attr, [[1, 2], [3, 4]])
        setattr(almost_minimal1, attr, [[1, 2], [3.00000002, 4]])
        assert_ordination_results_equal(minimal1, almost_minimal1)
        setattr(minimal1, attr, None)
        setattr(almost_minimal1, attr, None)
def parse_coords(lines):
    """Parse skbio's ordination results file into coords, labels,
    eigvals, pct_explained.

    Returns:
    - list of sample labels in order
    - array of coords (rows = samples, cols = axes in descending order)
    - list of eigenvalues
    - list of percent variance explained

    For the file format check
    skbio.stats.ordination.OrdinationResults.read

    Strategy: read the file using skbio's parser and return the objects
    we want.
    """
    try:
        pcoa_results = OrdinationResults.read(lines)
        return (pcoa_results.site_ids, pcoa_results.site,
                pcoa_results.eigvals, pcoa_results.proportion_explained)
    except FileFormatError:
        try:
            lines.seek(0)
        except AttributeError:
            # looks like we have a list of lines, not a file-like object
            pass
        return qiime_parse_coords(lines)
def setUpClass(cls):
    axis_labels = ['PC1', 'PC2', 'PC3']
    cls.test_df1 = pd.DataFrame.from_dict(
        {
            's1': [0.1, 0.2, 7],
            's2': [0.9, 0.2, 7],
        },
        orient='index',
        columns=axis_labels,
    )
    cls.test_df1.index.name = 'Sample ID'
    cls.pcoa1 = OrdinationResults(
        'pcoa1', 'pcoa1',
        eigvals=pd.Series([7, 2, 1], index=axis_labels),
        samples=cls.test_df1,
        proportion_explained=pd.Series([0.7, 0.2, 0.1],
                                       index=axis_labels),
    )
    cls.test_metadata = pd.DataFrame(
        {
            'age_cat': ['30s', '40s', '50s', '30s', None],
            'num_cat': [7.24, 7.24, 8.25, 7.24, None],
            'other': [1, 2, 3, 4, None],
        },
        index=pd.Series(['s1', 's2', 'c', 'd', 'e'], name='#SampleID'))
def community_plot(
    tree: str,
    table: str,
    sample_metadata: str,
    output_dir: str,
    pcoa: str,
    feature_metadata: str,
    ignore_missing_samples: bool,
    filter_extra_samples: bool,
    filter_missing_features: bool,
    number_of_pcoa_features: int,
    shear_to_table: bool,
) -> None:
    tree_newick, fm = check_and_process_files(output_dir, tree,
                                              feature_metadata)
    table = load_table(table)
    sample_metadata = pd.read_csv(sample_metadata, sep="\t", index_col=0)

    if pcoa is not None:
        pcoa = OrdinationResults.read(pcoa)
        pcoa = prepare_pcoa(pcoa, number_of_pcoa_features)

    viz = Empress(
        tree_newick,
        table=table,
        sample_metadata=sample_metadata,
        feature_metadata=fm,
        ordination=pcoa,
        ignore_missing_samples=ignore_missing_samples,
        filter_extra_samples=filter_extra_samples,
        filter_missing_features=filter_missing_features,
        shear_to_table=shear_to_table,
    )
    os.makedirs(output_dir)
    save_viz(viz, output_dir, q2=False)
def setUp(self):
    self.test_dm = DistanceMatrix(
        np.array([
            [0, 1, 2, 3, 4],
            [1, 0, 4, 5, 6],
            [2, 4, 0, 6, 7],
            [3, 5, 6, 0, 8],
            [4, 6, 7, 8, 0],
        ]),
        ids=[f'S{i}' for i in range(5)],
    )
    n_samples = 100
    np.random.seed(825)
    sample_embedding = np.random.normal(size=(n_samples, 3)) + 2
    sample_embedding[:, 1] *= 3
    sample_embedding[:, 2] *= 6
    sample_df = pd.DataFrame(
        sample_embedding,
        index=[f'S{i}' for i in range(n_samples)],
        columns=[f'C{i}' for i in range(3)],
    )
    self.test_ord_results = OrdinationResults(
        'foo', 'bar',
        eigvals=pd.Series(np.arange(n_samples)),
        samples=sample_df,
    )
def test_str(self):
    exp = ("Ordination results:\n"
           "\tEigvals: 2\n"
           "\tProportion explained: N/A\n"
           "\tSpecies: 3x2\n"
           "\tSite: 3x2\n"
           "\tBiplot: N/A\n"
           "\tSite constraints: N/A\n"
           "\tSpecies IDs: 'Species1', 'Species2', 'Species3'\n"
           "\tSite IDs: 'Site1', 'Site2', 'Site3'")
    obs = str(self.ordination_results)
    self.assertEqual(obs, exp)

    # all optional attributes missing
    exp = ("Ordination results:\n"
           "\tEigvals: 1\n"
           "\tProportion explained: N/A\n"
           "\tSpecies: N/A\n"
           "\tSite: N/A\n"
           "\tBiplot: N/A\n"
           "\tSite constraints: N/A\n"
           "\tSpecies IDs: N/A\n"
           "\tSite IDs: N/A")
    obs = str(OrdinationResults(np.array([4.2])))
    self.assertEqual(obs, exp)
def test_get_procrustes_results(self):
    sample_id_map = {
        'CP3A1': 'S1',
        'CC1A1': 'S2',
        'CC2A1': 'S3',
        'CP1A1': 'S4'
    }
    actual = get_procrustes_results(StringIO(pcoa1_f), StringIO(pcoa1_f),
                                    sample_id_map=sample_id_map,
                                    randomize=None, max_dimensions=None)
    # just some sanity checks as the individual components are already
    # tested -- these are based on looking at the output of the run, and
    # testing to ensure that it hasn't changed
    eigvals = array([
        8976580.24393, 6044862.67619, 4372581.39431, 3161360.10319,
        2583594.45275, 2407555.39787
    ])
    prop_expl = array([
        23.1764657118, 15.6071186064, 11.2894866423, 8.16225689998,
        6.67053450426, 6.21602253997
    ])
    site = array([
        [-0.199225958574, -0.250846540029, -0.119813087305,
         -0.155652031006, 0.18495315824, -0.160875399364],
        [-0.238263544222, -0.37724227779, -0.169458651217,
         0.0305157004776, 0.112181007345, 0.0677415967093],
        [0.116737988534, 0.414627960015, 0.201315243115,
         0.113769076804, -0.283025353088, -0.144278863311],
        [0.320751514262, 0.213460857804, 0.0879564954067,
         0.0113672537238, -0.0141088124974, 0.237412665966]
    ])
    site_ids = ['S3', 'S2', 'S1', 'S4']
    expected = OrdinationResults(eigvals=eigvals,
                                 proportion_explained=prop_expl,
                                 site=site, site_ids=site_ids)

    assert_almost_equal(actual[0].eigvals, expected.eigvals)
    assert_almost_equal(actual[0].proportion_explained,
                        expected.proportion_explained)
    self.assertEqual(actual[0].site_ids, expected.site_ids)
    assert_almost_equal(actual[0].site, expected.site)

    assert_almost_equal(actual[1].eigvals, expected.eigvals)
    assert_almost_equal(actual[1].proportion_explained,
                        expected.proportion_explained)
    assert_almost_equal(actual[1].site, expected.site)
    self.assertEqual(actual[1].site_ids, expected.site_ids)

    self.assertTrue(actual[2] < 6e-30)
def setUpClass(cls):
    axis_labels = ['PC1', 'PC2', 'PC3']
    cls.test_df1 = pd.DataFrame.from_dict(
        {
            's1': [0.1, 0.2, 7],
            's2': [0.9, 0.2, 7],
        },
        orient='index',
        columns=axis_labels,
    )
    cls.test_df1.index.name = 'Sample ID'
    cls.pcoa1 = OrdinationResults(
        'pcoa1', 'pcoa1',
        eigvals=pd.Series([7, 2, 1], index=axis_labels),
        samples=cls.test_df1,
        proportion_explained=pd.Series([0.7, 0.2, 0.1],
                                       index=axis_labels),
    )
    cls.test_metadata = pd.DataFrame(
        {
            'age_cat': ['30s', '40s', '50s', '30s', None],
            'num_cat': [7.24, 7.24, 8.25, 7.24, None],
            'other': [1, 2, 3, 4, None],
        },
        index=pd.Series(['s1', 's2', 'c', 'd', 'e'], name='#SampleID'))

    cls.resources = DictElement({
        'datasets': DictElement({
            'dataset1': DictElement({
                '__metadata__': MockMetadataElement(cls.test_metadata),
                '__pcoa__': PCOAElement({
                    'sample_set': DictElement({
                        'beta_metric': cls.pcoa1,
                    }),
                })
            }),
            'dataset2': DictElement({
                '__metadata__': MockMetadataElement(cls.test_metadata),
            }),
        }),
    })
    cls.resources.accept(TrivialVisitor())
    cls.res_patcher = patch(
        'microsetta_public_api.api.emperor.get_resources')
    cls.mock_resources = cls.res_patcher.start()
    cls.mock_resources.return_value = cls.resources
def test_io(self):
    # Very basic check that read/write public API is present and appears
    # to be functioning. Roundtrip from memory -> disk -> memory and
    # ensure results match.
    fh = StringIO()
    self.ordination_results.write(fh)
    fh.seek(0)
    deserialized = OrdinationResults.read(fh)
    assert_ordination_results_equal(deserialized, self.ordination_results)
    self.assertTrue(type(deserialized) == OrdinationResults)
def body_site(coords, mapping_file):
    """Generates as many figures as samples in the coordinates file"""
    o = OrdinationResults.from_file(coords)

    # coordinates
    c_df = pd.DataFrame(o.site, o.site_ids)

    # mapping file
    mf = pd.read_csv(mapping_file, '\t', converters=defaultdict(str),
                     index_col='#SampleID')
    mf = mf.loc[o.site_ids]

    color_hmp_fecal = sns.color_palette('Paired', 12)[10]  # light brown
    color_agp_fecal = sns.color_palette('Paired', 12)[11]  # dark brown
    color_hmp_oral = sns.color_palette('Paired', 12)[0]    # light blue
    color_agp_oral = sns.color_palette('Paired', 12)[1]    # dark blue
    color_hmp_skin = sns.color_palette('Paired', 12)[2]    # light green
    color_agp_skin = sns.color_palette('Paired', 12)[3]    # dark green

    grp_colors = {'AGP-FECAL': color_agp_fecal,
                  'AGP-ORAL': color_agp_oral,
                  'AGP-SKIN': color_agp_skin,
                  'HMP-FECAL': color_hmp_fecal,
                  'GG-FECAL': color_hmp_fecal,
                  'PGP-FECAL': color_hmp_fecal,
                  'HMP-ORAL': color_hmp_oral,
                  'PGP-ORAL': color_hmp_oral,
                  'HMP-SKIN': color_hmp_skin,
                  'PGP-SKIN': color_hmp_skin}

    for sample in mf.index:
        # plot categories as 50 slices with random zorder
        for grp, color in grp_colors.iteritems():
            sub_coords = c_df[mf.TITLE_BODY_SITE == grp].values
            for i in np.array_split(sub_coords, 50):
                plt.scatter(i[:, 0], i[:, 1], color=color,
                            edgecolor=np.asarray(color)*0.6,
                            lw=LINE_WIDTH, alpha=ALPHA,
                            zorder=np.random.rand())

        # plot participant's dot
        plt.scatter(c_df.loc[sample][0], c_df.loc[sample][1],
                    color=grp_colors[mf.loc[sample]['TITLE_BODY_SITE']],
                    s=270, edgecolor='w', zorder=1)
        plt.scatter(c_df.loc[sample][0], c_df.loc[sample][1],
                    color=grp_colors[mf.loc[sample]['TITLE_BODY_SITE']],
                    s=250, edgecolor=np.asarray(
                        grp_colors[mf.loc[sample]['TITLE_BODY_SITE']])*0.6,
                    zorder=2)

        plt.axis('off')
        my_dpi = 72
        plt.savefig(sample+'.pdf', figsize=(1000/my_dpi, 1000/my_dpi),
                    dpi=my_dpi)
        plt.close()
def setUp(self):
    # Define in-memory CA results to serialize and deserialize.
    eigvals = np.array([0.0961330159181, 0.0409418140138])
    species = np.array([[0.408869425742, 0.0695518116298],
                        [-0.1153860437, -0.299767683538],
                        [-0.309967102571, 0.187391917117]])
    site = np.array([[-0.848956053187, 0.882764759014],
                     [-0.220458650578, -1.34482000302],
                     [1.66697179591, 0.470324389808]])
    biplot = None
    site_constraints = None
    prop_explained = None
    species_ids = ['Species1', 'Species2', 'Species3']
    site_ids = ['Site1', 'Site2', 'Site3']
    self.ordination_results = OrdinationResults(
        eigvals=eigvals, species=species, site=site, biplot=biplot,
        site_constraints=site_constraints,
        proportion_explained=prop_explained, species_ids=species_ids,
        site_ids=site_ids)

    # DataFrame for testing plot method. Has a categorical column with a
    # mix of numbers and strings. Has a numeric column with a mix of
    # ints, floats, and strings that can be converted to floats. Has a
    # numeric column with missing data (np.nan).
    self.df = pd.DataFrame([['foo', '42', 10],
                            [22, 0, 8],
                            [22, -4.2, np.nan],
                            ['foo', '42.19', 11]],
                           index=['A', 'B', 'C', 'D'],
                           columns=['categorical', 'numeric', 'nancolumn'])

    # Minimal ordination results for easier testing of plotting method.
    # Paired with df above.
    eigvals = np.array([0.50, 0.25, 0.25])
    site = np.array([[0.1, 0.2, 0.3],
                     [0.2, 0.3, 0.4],
                     [0.3, 0.4, 0.5],
                     [0.4, 0.5, 0.6]])
    self.min_ord_results = OrdinationResults(
        eigvals=eigvals, site=site, site_ids=['A', 'B', 'C', 'D'])
def gradient(coords, mapping_file, color):
    """Generates as many figures as samples in the coordinates file"""
    o = OrdinationResults.from_file(coords)

    # coordinates
    c_df = pd.DataFrame(o.site, o.site_ids)

    # mapping file
    mf = pd.read_csv(mapping_file, '\t', converters=defaultdict(str),
                     index_col='#SampleID')
    mf = mf.loc[o.site_ids]
    mf[color] = mf[color].convert_objects(convert_numeric=True)

    numeric = mf[~pd.isnull(mf[color])]
    non_numeric = mf[pd.isnull(mf[color])]

    color_array = plt.cm.RdBu(numeric[color]/max(numeric[color]))

    for sample in mf.index:
        # plot numeric metadata as colored gradient
        ids = numeric.index
        x, y = c_df.loc[ids][0], c_df.loc[ids][1]
        plt.scatter(x, y, c=numeric[color], cmap=plt.get_cmap('RdBu'),
                    alpha=ALPHA, lw=LINE_WIDTH, edgecolor=color_array*0.6)
        # plt.colorbar()

        # plot non-numeric metadata as gray
        ids = non_numeric.index
        x, y = c_df.loc[ids][0], c_df.loc[ids][1]
        plt.scatter(x, y, c='0.5', alpha=ALPHA, lw=LINE_WIDTH,
                    edgecolor='0.3')

        # plot individual's dot
        try:
            color_index = numeric.index.tolist().index(sample)
        except ValueError:
            color_index = None

        if color_index is None:
            _color = (0.5, 0.5, 0.5)
        else:
            _color = color_array[color_index]

        plt.scatter(c_df.loc[sample][0], c_df.loc[sample][1],
                    color=_color, s=270, edgecolor='w')
        plt.scatter(c_df.loc[sample][0], c_df.loc[sample][1],
                    color=_color, s=250, edgecolor=np.asarray(_color)*0.6)

        plt.axis('off')
        my_dpi = 72
        plt.savefig(sample+'.pdf', figsize=(1000/my_dpi, 1000/my_dpi),
                    dpi=my_dpi)
        plt.close()
class TestOrdinationResults(unittest.TestCase):
    def setUp(self):
        # Define in-memory CA results to serialize and deserialize.
        eigvals = np.array([0.0961330159181, 0.0409418140138])
        species = np.array([[0.408869425742, 0.0695518116298],
                            [-0.1153860437, -0.299767683538],
                            [-0.309967102571, 0.187391917117]])
        site = np.array([[-0.848956053187, 0.882764759014],
                         [-0.220458650578, -1.34482000302],
                         [1.66697179591, 0.470324389808]])
        biplot = None
        site_constraints = None
        prop_explained = None
        species_ids = ['Species1', 'Species2', 'Species3']
        site_ids = ['Site1', 'Site2', 'Site3']
        self.ordination_results = OrdinationResults(
            eigvals=eigvals, species=species, site=site, biplot=biplot,
            site_constraints=site_constraints,
            proportion_explained=prop_explained, species_ids=species_ids,
            site_ids=site_ids)

    def test_io(self):
        # Very basic check that read/write public API is present and
        # appears to be functioning. Roundtrip from memory -> disk ->
        # memory and ensure results match.
        fh = StringIO()
        self.ordination_results.write(fh)
        fh.seek(0)
        deserialized = OrdinationResults.read(fh)
        assert_ordination_results_equal(deserialized,
                                        self.ordination_results)
        self.assertTrue(type(deserialized) == OrdinationResults)

    def test_deprecated_io(self):
        fh = StringIO()
        npt.assert_warns(UserWarning, self.ordination_results.to_file, fh)
        fh.seek(0)
        deserialized = npt.assert_warns(UserWarning,
                                        OrdinationResults.from_file, fh)
        assert_ordination_results_equal(deserialized,
                                        self.ordination_results)
        self.assertTrue(type(deserialized) == OrdinationResults)
def setUp(self):
    or_f = StringIO(PCOA_STRING)
    self.ord_res = OrdinationResults.read(or_f)
    self.data = [
        ['PC.354', 'Control', '20061218', 'Ctrol_mouse_I.D._354'],
        ['PC.355', 'Control', '20061218', 'Control_mouse_I.D._355'],
        ['PC.356', 'Control', '20061126', 'Control_mouse_I.D._356'],
        ['PC.481', 'Control', '20070314', 'Control_mouse_I.D._481'],
        ['PC.593', 'Control', '20071210', 'Control_mouse_I.D._593'],
        ['PC.607', 'Fast', '20071112', 'Fasting_mouse_I.D._607'],
        ['PC.634', 'Fast', '20080116', 'Fasting_mouse_I.D._634'],
        ['PC.635', 'Fast', '20080116', 'Fasting_mouse_I.D._635'],
        ['PC.636', 'Fast', '20080116', 'Fasting_mouse_I.D._636']]
    self.headers = ['SampleID', 'Treatment', 'DOB', 'Description']
def embed(
    distance_matrix: DistanceMatrix,
    n_neighbors: int,
    min_dist: float = 1,
    number_of_dimensions: int = 2,
    random_state: int = 724,
) -> OrdinationResults:
    n_samples = len(distance_matrix.ids)
    if number_of_dimensions > n_samples:
        raise ValueError(
            f'number_of_dimensions ({number_of_dimensions}) must not '
            f'exceed the number of samples ({n_samples})'
        )
    transformer = UMAP(
        n_neighbors=n_neighbors,
        n_components=number_of_dimensions,
        min_dist=min_dist,
        random_state=random_state,
        metric='precomputed',
    )
    embedding = transformer.fit_transform(distance_matrix[:, :])

    # pad the embedding out to at least three axes so downstream viewers
    # always have three dimensions to plot
    if embedding.shape[1] < 3:
        difference = 3 - embedding.shape[1]
        embedding = np.hstack((embedding,
                               np.zeros((len(embedding), difference))))
    number_of_dimensions = embedding.shape[1]

    embedding_df = pd.DataFrame(
        embedding,
        index=distance_matrix.ids,
        columns=[f'UMAP-{i}' for i in range(embedding.shape[1])],
    )
    null_eigvals = pd.Series(np.zeros(number_of_dimensions))

    ord_results = OrdinationResults(
        'umap',
        'Uniform Manifold Approximation and Projection',
        eigvals=null_eigvals,
        samples=embedding_df,
        proportion_explained=null_eigvals,
    )
    return center(ord_results)
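# A minimal usage sketch for embed() above, assuming umap-learn is
# installed and the center() helper defined elsewhere in this module is
# in scope; the distance values are made up for illustration.
import numpy as np
from skbio import DistanceMatrix

dm = DistanceMatrix(
    np.array([
        [0.0, 1.0, 2.0, 3.0],
        [1.0, 0.0, 4.0, 5.0],
        [2.0, 4.0, 0.0, 6.0],
        [3.0, 5.0, 6.0, 0.0],
    ]),
    ids=['s1', 's2', 's3', 's4'],
)
res = embed(dm, n_neighbors=2)
print(res.samples)  # one row of UMAP coordinates per sample id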
def parse_coords(lines):
    """Parse skbio's ordination results file into coords, labels,
    eigvals, pct_explained.

    Returns:
    - list of sample labels in order
    - array of coords (rows = samples, cols = axes in descending order)
    - list of eigenvalues
    - list of percent variance explained

    For the file format check
    skbio.stats.ordination.OrdinationResults.read

    Strategy: read the file using skbio's parser and return the objects
    we want.
    """
    pcoa_results = OrdinationResults.read(lines)
    return (pcoa_results.site_ids, pcoa_results.site,
            pcoa_results.eigvals, pcoa_results.proportion_explained)
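# Hedged round-trip sketch for parse_coords(), using the older
# scikit-bio API (site/site_ids) that this function expects: serialize
# an OrdinationResults to a StringIO, then parse it back.
import numpy as np
from io import StringIO

ord_res = OrdinationResults(
    eigvals=np.array([0.7, 0.3]),
    site=np.array([[0.1, 0.2], [0.3, 0.4]]),
    proportion_explained=np.array([70.0, 30.0]),
    site_ids=['s1', 's2'])
fh = StringIO()
ord_res.write(fh)
fh.seek(0)
labels, coords, eigvals, pct = parse_coords(fh)
assert labels == ['s1', 's2']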
def setUp(self):
    # Define in-memory CA results to serialize and deserialize.
    eigvals = np.array([0.0961330159181, 0.0409418140138])
    species = np.array([[0.408869425742, 0.0695518116298],
                        [-0.1153860437, -0.299767683538],
                        [-0.309967102571, 0.187391917117]])
    site = np.array([[-0.848956053187, 0.882764759014],
                     [-0.220458650578, -1.34482000302],
                     [1.66697179591, 0.470324389808]])
    biplot = None
    site_constraints = None
    prop_explained = None
    species_ids = ['Species1', 'Species2', 'Species3']
    site_ids = ['Site1', 'Site2', 'Site3']
    self.ordination_results = OrdinationResults(
        eigvals=eigvals, species=species, site=site, biplot=biplot,
        site_constraints=site_constraints,
        proportion_explained=prop_explained, species_ids=species_ids,
        site_ids=site_ids)
def emperor_output(sklearn_output, full_file_list, eigenvalues,
                   percent_variance, output_file, new_files=None):
    print("Made it to Emperor Function!")

    # read in sklearn output and format accordingly for emperor intake
    eigvals = pd.Series(data=eigenvalues)
    samples = pd.DataFrame(data=sklearn_output, index=full_file_list)
    p_explained = pd.Series(data=percent_variance)
    ores = OrdinationResults(
        long_method_name="principal component analysis",
        short_method_name="pcoa",
        eigvals=eigvals, samples=samples,
        proportion_explained=p_explained)

    # this first part is for the global metadata file
    global_metadata = pd.read_csv(config.PATH_TO_ORIGINAL_MAPPING_FILE,
                                  sep="\t")
    global_metadata_headers = global_metadata.columns.tolist()
    global_metadata.rename(columns={'filename': 'SampleID'}, inplace=True)
    global_metadata["type"] = "Global Data"
    global_metadata.set_index("SampleID", inplace=True)
    common = global_metadata

    # this part is for the user-uploaded metadata file
    if new_files is not None:
        metadata_uploaded = pd.DataFrame(
            {"SampleID": new_files,
             "type": ["Your Data"] * len(new_files)})
        for item in global_metadata_headers:
            metadata_uploaded[item] = ["Your Data"] * len(new_files)
        metadata_uploaded.set_index("SampleID", inplace=True)
        common = pd.concat([global_metadata, metadata_uploaded])

    # the metadata must be aligned with the samples in the ordination
    # object BEFORE it is fed to Emperor, otherwise no results are output
    final_metadata, unused = common.align(samples, join="right", axis=0)

    # output an emperor plot
    emp = Emperor(ores, final_metadata, remote=True)

    # create an output directory
    os.makedirs(output_file, exist_ok=True)
    with open(os.path.join(output_file, 'index.html'), 'w') as f:
        f.write(emp.make_emperor(standalone=True))
        emp.copy_support_files(output_file)
def center(embedding: OrdinationResults) -> OrdinationResults:
    short_name = embedding.short_method_name
    long_name = embedding.long_method_name
    n_dimensions = embedding.samples.shape[1]
    transformer = PCA(n_components=n_dimensions)
    new_embedding = transformer.fit_transform(embedding.samples)

    embedding_df = pd.DataFrame(new_embedding,
                                index=embedding.samples.index,
                                columns=embedding.samples.columns)
    null_eigvals = pd.Series(np.zeros(n_dimensions))

    ord_results = OrdinationResults(
        short_name,
        long_name,
        eigvals=null_eigvals,
        samples=embedding_df,
        proportion_explained=null_eigvals,
    )
    return ord_results
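# center() is a pure re-orientation: fitting PCA with as many components
# as input dimensions rotates the embedding onto its principal axes and
# subtracts the mean, leaving pairwise geometry intact. A small sanity
# sketch (assumes scikit-learn and the newer skbio API; the coordinates
# are arbitrary):
import numpy as np
import pandas as pd

emb = OrdinationResults(
    'demo', 'demo',
    eigvals=pd.Series(np.zeros(2)),
    samples=pd.DataFrame(
        np.random.RandomState(0).normal(size=(5, 2)),
        index=list('abcde'), columns=['A0', 'A1']),
    proportion_explained=pd.Series(np.zeros(2)),
)
centered = center(emb)
print(centered.samples.mean(axis=0))  # approximately zero on each axis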
def emperor_output(sklearn_output, full_file_list, eigenvalues,
                   percent_variance, output_file, new_files=[]):
    eigvals = pd.Series(data=eigenvalues)
    samples = pd.DataFrame(data=sklearn_output, index=full_file_list)
    samples.index.rename("SampleID", inplace=True)
    p_explained = pd.Series(data=percent_variance)
    ores = OrdinationResults(
        long_method_name="principal component analysis",
        short_method_name="pcoa",
        eigvals=eigvals, samples=samples,
        proportion_explained=p_explained)

    # read in all sample metadata
    df = pd.read_table(config.PATH_TO_ORIGINAL_MAPPING_FILE)
    df.rename(columns={"filename": "SampleID"}, inplace=True)
    df.set_index("SampleID", inplace=True)

    # handling the case in which the pca is a projection
    if len(new_files) != 0:
        df["Type"] = "Global"
        new_meta = pd.DataFrame({"SampleID": new_files,
                                 "Type": "Your Data"})
        new_meta.set_index("SampleID", inplace=True)
        df = pd.concat([df, new_meta], axis=0, join="outer")

    final_metadata, unused = df.align(samples, join="right", axis=0)

    # output an emperor plot
    emp = Emperor(ores, final_metadata, remote=True)

    # create an output directory
    os.makedirs(output_file, exist_ok=True)
    with open(os.path.join(output_file, 'index.html'), 'w') as f:
        f.write(emp.make_emperor(standalone=True))
        emp.copy_support_files(output_file)
def _ordination_to_ordination_results(fh):
    eigvals = _parse_vector_section(fh, 'Eigvals')
    if eigvals is None:
        raise OrdinationFormatError("At least one eigval must be present.")
    _check_empty_line(fh)

    prop_expl = _parse_vector_section(fh, 'Proportion explained')
    _check_length_against_eigvals(prop_expl, eigvals,
                                  'proportion explained values')
    _check_empty_line(fh)

    species, species_ids = _parse_array_section(fh, 'Species')
    _check_length_against_eigvals(species, eigvals,
                                  'coordinates per species')
    _check_empty_line(fh)

    site, site_ids = _parse_array_section(fh, 'Site')
    _check_length_against_eigvals(site, eigvals,
                                  'coordinates per site')
    _check_empty_line(fh)

    # biplot does not have ids to parse (the other arrays do)
    biplot, _ = _parse_array_section(fh, 'Biplot', has_ids=False)
    _check_empty_line(fh)

    cons, cons_ids = _parse_array_section(fh, 'Site constraints')

    if cons_ids is not None and site_ids is not None:
        if cons_ids != site_ids:
            raise OrdinationFormatError(
                "Site constraints ids and site ids must be equal: %s != %s"
                % (cons_ids, site_ids))

    return OrdinationResults(
        eigvals=eigvals, species=species, site=site, biplot=biplot,
        site_constraints=cons, proportion_explained=prop_expl,
        species_ids=species_ids, site_ids=site_ids)
def _ordination_to_ordination_results(fh):
    eigvals = _parse_vector_section(fh, 'Eigvals')
    if eigvals is None:
        raise OrdinationFormatError("At least one eigval must be present.")
    _check_empty_line(fh)

    prop_expl = _parse_vector_section(fh, 'Proportion explained')
    _check_length_against_eigvals(prop_expl, eigvals,
                                  'proportion explained values')
    _check_empty_line(fh)

    species = _parse_array_section(fh, 'Species')
    _check_length_against_eigvals(species, eigvals,
                                  'coordinates per species')
    _check_empty_line(fh)

    site = _parse_array_section(fh, 'Site')
    _check_length_against_eigvals(site, eigvals,
                                  'coordinates per site')
    _check_empty_line(fh)

    # biplot does not have ids to parse (the other arrays do)
    biplot = _parse_array_section(fh, 'Biplot', has_ids=False)
    _check_empty_line(fh)

    cons = _parse_array_section(fh, 'Site constraints')

    if cons is not None and site is not None:
        if not np.array_equal(cons.index, site.index):
            raise OrdinationFormatError(
                "Site constraints ids and site ids must be equal: %s != %s"
                % (cons.index, site.index))

    return OrdinationResults(
        short_method_name='', long_method_name='',
        eigvals=eigvals, features=species, samples=site,
        biplot_scores=biplot, sample_constraints=cons,
        proportion_explained=prop_expl)
if __name__ == '__main__':
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    ord_fp = opts.input_fp
    mapping_fp = opts.map_fp
    categories = opts.categories.split(',')
    output_dir = opts.output_dir
    sort_by = opts.sort_by
    algorithm = opts.algorithm
    axes = opts.axes
    weighted = opts.weight_by_vector
    window_size = opts.window_size

    # Parse the ordination results
    with open(ord_fp, 'U') as f:
        ord_res = OrdinationResults.read(f)

    # Parse the mapping file
    with open(mapping_fp, 'U') as f:
        map_dict = parse_mapping_file_to_dict(f)[0]
    metamap = pd.DataFrame.from_dict(map_dict, orient='index')

    for category in categories:
        if category not in metamap.keys():
            option_parser.error("Category %s does not exist in the "
                                "mapping file" % category)

    sort_category = None
    if sort_by:
        if sort_by == 'SampleID':
            sort_category = None
def country(coords, mapping_file):
    """Generates as many figures as samples in the coordinates file"""
    o = OrdinationResults.from_file(coords)
    x, y = o.site[:, 0], o.site[:, 1]

    # coordinates
    c_df = pd.DataFrame(o.site, o.site_ids)

    # mapping file
    mf = pd.read_csv(mapping_file, '\t', converters=defaultdict(str),
                     index_col='#SampleID')
    mf = mf.loc[o.site_ids]

    color_Venezuela = sns.color_palette('Paired', 12)[10]
    color_Malawi = sns.color_palette('Paired', 12)[1]
    color_Western = sns.color_palette('Paired', 12)[4]
    color_Highlight = sns.color_palette('Paired', 12)[5]
    color_no_data = (0.5, 0.5, 0.5)

    grp_colors = OrderedDict()
    grp_colors['no_data'] = color_no_data
    grp_colors['Australia'] = color_Western
    grp_colors['Belgium'] = color_Western
    grp_colors['Canada'] = color_Western
    grp_colors['China'] = color_Western
    grp_colors['Finland'] = color_Western
    grp_colors['France'] = color_Western
    grp_colors['Germany'] = color_Western
    grp_colors['Great Britain'] = color_Western
    grp_colors['Ireland'] = color_Western
    grp_colors['Japan'] = color_Western
    grp_colors['Netherlands'] = color_Western
    grp_colors['New Zealand'] = color_Western
    grp_colors['Norway'] = color_Western
    grp_colors['Scotland'] = color_Western
    grp_colors['Spain'] = color_Western
    grp_colors['Switzerland'] = color_Western
    grp_colors['Thailand'] = color_Western
    grp_colors['United Arab Emirates'] = color_Western
    grp_colors['United Kingdom'] = color_Western
    grp_colors['United States of America'] = color_Western
    grp_colors['Malawi'] = color_Malawi
    grp_colors['Venezuela'] = color_Venezuela

    for sample in mf.index:
        # contour plot superimposed
        sns.kdeplot(x, y, cmap='bone')
        sns.set_context(rc={"lines.linewidth": 0.75})

        # change participant's country's color to color_Highlight unless
        # country is Venezuela or Malawi
        if (mf.loc[sample]['COUNTRY'] != 'Malawi') & (
                mf.loc[sample]['COUNTRY'] != 'Venezuela'):
            grp_colors[mf.loc[sample]['COUNTRY']] = color_Highlight

        # plot each country except participant's according to colors above
        for grp, color in grp_colors.iteritems():
            if grp == mf.loc[sample]['COUNTRY']:
                continue
            sub_coords = c_df[mf.COUNTRY == grp]
            plt.scatter(sub_coords[0], sub_coords[1], color=color,
                        edgecolor=np.asarray(color)*0.6, lw=LINE_WIDTH,
                        alpha=ALPHA)

        # now plot participant's country
        grp = mf.loc[sample]['COUNTRY']
        color = grp_colors[grp]
        sub_coords = c_df[mf.COUNTRY == grp]
        plt.scatter(sub_coords[0], sub_coords[1], color=color,
                    edgecolor=np.asarray(color)*0.6, lw=LINE_WIDTH,
                    alpha=ALPHA)

        # plot participant's dot
        plt.scatter(c_df.loc[sample][0], c_df.loc[sample][1],
                    color=grp_colors[mf.loc[sample]['COUNTRY']],
                    s=270, edgecolor='w', zorder=1)
        plt.scatter(c_df.loc[sample][0], c_df.loc[sample][1],
                    color=grp_colors[mf.loc[sample]['COUNTRY']],
                    s=250,
                    edgecolor=np.asarray(
                        grp_colors[mf.loc[sample]['COUNTRY']])*0.6,
                    zorder=2)

        # reset participant's country's color to color_Western unless
        # country is Venezuela or Malawi
        if (mf.loc[sample]['COUNTRY'] != 'Malawi') & (
                mf.loc[sample]['COUNTRY'] != 'Venezuela'):
            grp_colors[mf.loc[sample]['COUNTRY']] = color_Western

        plt.axis('off')
        my_dpi = 72
        plt.savefig(sample+'.pdf', figsize=(1000/my_dpi, 1000/my_dpi),
                    dpi=my_dpi)
        plt.close()
class TestOrdinationResults(unittest.TestCase):
    def setUp(self):
        # Define in-memory CA results to serialize and deserialize.
        eigvals = np.array([0.0961330159181, 0.0409418140138])
        species = np.array([[0.408869425742, 0.0695518116298],
                            [-0.1153860437, -0.299767683538],
                            [-0.309967102571, 0.187391917117]])
        site = np.array([[-0.848956053187, 0.882764759014],
                         [-0.220458650578, -1.34482000302],
                         [1.66697179591, 0.470324389808]])
        biplot = None
        site_constraints = None
        prop_explained = None
        species_ids = ['Species1', 'Species2', 'Species3']
        site_ids = ['Site1', 'Site2', 'Site3']
        self.ordination_results = OrdinationResults(
            eigvals=eigvals, species=species, site=site, biplot=biplot,
            site_constraints=site_constraints,
            proportion_explained=prop_explained, species_ids=species_ids,
            site_ids=site_ids)

        # DataFrame for testing plot method. Has a categorical column
        # with a mix of numbers and strings. Has a numeric column with a
        # mix of ints, floats, and strings that can be converted to
        # floats. Has a numeric column with missing data (np.nan).
        self.df = pd.DataFrame([['foo', '42', 10],
                                [22, 0, 8],
                                [22, -4.2, np.nan],
                                ['foo', '42.19', 11]],
                               index=['A', 'B', 'C', 'D'],
                               columns=['categorical', 'numeric',
                                        'nancolumn'])

        # Minimal ordination results for easier testing of plotting
        # method. Paired with df above.
        eigvals = np.array([0.50, 0.25, 0.25])
        site = np.array([[0.1, 0.2, 0.3],
                         [0.2, 0.3, 0.4],
                         [0.3, 0.4, 0.5],
                         [0.4, 0.5, 0.6]])
        self.min_ord_results = OrdinationResults(
            eigvals=eigvals, site=site, site_ids=['A', 'B', 'C', 'D'])

    def test_str(self):
        exp = ("Ordination results:\n"
               "\tEigvals: 2\n"
               "\tProportion explained: N/A\n"
               "\tSpecies: 3x2\n"
               "\tSite: 3x2\n"
               "\tBiplot: N/A\n"
               "\tSite constraints: N/A\n"
               "\tSpecies IDs: 'Species1', 'Species2', 'Species3'\n"
               "\tSite IDs: 'Site1', 'Site2', 'Site3'")
        obs = str(self.ordination_results)
        self.assertEqual(obs, exp)

        # all optional attributes missing
        exp = ("Ordination results:\n"
               "\tEigvals: 1\n"
               "\tProportion explained: N/A\n"
               "\tSpecies: N/A\n"
               "\tSite: N/A\n"
               "\tBiplot: N/A\n"
               "\tSite constraints: N/A\n"
               "\tSpecies IDs: N/A\n"
               "\tSite IDs: N/A")
        obs = str(OrdinationResults(np.array([4.2])))
        self.assertEqual(obs, exp)

    def check_basic_figure_sanity(self, fig, exp_num_subplots, exp_title,
                                  exp_legend_exists, exp_xlabel,
                                  exp_ylabel, exp_zlabel):
        # check type
        assert_is_instance(fig, mpl.figure.Figure)

        # check number of subplots
        axes = fig.get_axes()
        npt.assert_equal(len(axes), exp_num_subplots)

        # check title
        ax = axes[0]
        npt.assert_equal(ax.get_title(), exp_title)

        # shouldn't have tick labels
        for tick_label in (ax.get_xticklabels() + ax.get_yticklabels() +
                           ax.get_zticklabels()):
            npt.assert_equal(tick_label.get_text(), '')

        # check if legend is present
        legend = ax.get_legend()
        if exp_legend_exists:
            assert_true(legend is not None)
        else:
            assert_true(legend is None)

        # check axis labels
        npt.assert_equal(ax.get_xlabel(), exp_xlabel)
        npt.assert_equal(ax.get_ylabel(), exp_ylabel)
        npt.assert_equal(ax.get_zlabel(), exp_zlabel)

    def test_plot_no_metadata(self):
        fig = self.min_ord_results.plot()
        self.check_basic_figure_sanity(fig, 1, '', False, '0', '1', '2')

    def test_plot_with_numeric_metadata_and_plot_options(self):
        fig = self.min_ord_results.plot(
            self.df, 'numeric', axes=(1, 0, 2),
            axis_labels=['PC 2', 'PC 1', 'PC 3'], title='a title',
            cmap='Reds')
        self.check_basic_figure_sanity(
            fig, 2, 'a title', False, 'PC 2', 'PC 1', 'PC 3')

    def test_plot_with_categorical_metadata_and_plot_options(self):
        fig = self.min_ord_results.plot(
            self.df, 'categorical', axes=[2, 0, 1], title='a title',
            cmap='Accent')
        self.check_basic_figure_sanity(fig, 1, 'a title', True,
                                       '2', '0', '1')

    def test_plot_with_invalid_axis_labels(self):
        with assert_raises_regexp(ValueError, 'axis_labels.*4'):
            self.min_ord_results.plot(axes=[2, 0, 1],
                                      axis_labels=('a', 'b', 'c', 'd'))

    def test_validate_plot_axes_valid_input(self):
        # shouldn't raise an error on valid input. nothing is returned,
        # so nothing to check here
        self.min_ord_results._validate_plot_axes(
            self.min_ord_results.site.T, (1, 2, 0))

    def test_validate_plot_axes_invalid_input(self):
        # not enough dimensions
        with assert_raises_regexp(ValueError, '2 dimension\(s\)'):
            self.min_ord_results._validate_plot_axes(
                np.asarray([[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]]), (0, 1, 2))

        coord_matrix = self.min_ord_results.site.T

        # wrong number of axes
        with assert_raises_regexp(ValueError, 'exactly three.*found 0'):
            self.min_ord_results._validate_plot_axes(coord_matrix, [])
        with assert_raises_regexp(ValueError, 'exactly three.*found 4'):
            self.min_ord_results._validate_plot_axes(coord_matrix,
                                                     (0, 1, 2, 3))

        # duplicate axes
        with assert_raises_regexp(ValueError, 'must be unique'):
            self.min_ord_results._validate_plot_axes(coord_matrix,
                                                     (0, 1, 0))

        # out of range axes
        with assert_raises_regexp(ValueError, 'axes\[1\].*3'):
            self.min_ord_results._validate_plot_axes(coord_matrix,
                                                     (0, -1, 2))
        with assert_raises_regexp(ValueError, 'axes\[2\].*3'):
            self.min_ord_results._validate_plot_axes(coord_matrix,
                                                     (0, 2, 3))

    def test_get_plot_point_colors_invalid_input(self):
        # column provided without df
        with npt.assert_raises(ValueError):
            self.min_ord_results._get_plot_point_colors(
                None, 'numeric', ['B', 'C'], 'jet')

        # df provided without column
        with npt.assert_raises(ValueError):
            self.min_ord_results._get_plot_point_colors(
                self.df, None, ['B', 'C'], 'jet')

        # column not in df
        with assert_raises_regexp(ValueError, 'missingcol'):
            self.min_ord_results._get_plot_point_colors(
                self.df, 'missingcol', ['B', 'C'], 'jet')

        # id not in df
        with assert_raises_regexp(ValueError, 'numeric'):
            self.min_ord_results._get_plot_point_colors(
                self.df, 'numeric', ['B', 'C', 'missingid', 'A'], 'jet')

        # missing data in df
        with assert_raises_regexp(ValueError, 'nancolumn'):
            self.min_ord_results._get_plot_point_colors(
                self.df, 'nancolumn', ['B', 'C', 'A'], 'jet')

    def test_get_plot_point_colors_no_df_or_column(self):
        obs = self.min_ord_results._get_plot_point_colors(
            None, None, ['B', 'C'], 'jet')
        npt.assert_equal(obs, (None, None))

    def test_get_plot_point_colors_numeric_column(self):
        # subset of the ids in df
        exp = [0.0, -4.2, 42.0]
        obs = self.min_ord_results._get_plot_point_colors(
            self.df, 'numeric', ['B', 'C', 'A'], 'jet')
        npt.assert_almost_equal(obs[0], exp)
        assert_true(obs[1] is None)

        # all ids in df
        exp = [0.0, 42.0, 42.19, -4.2]
        obs = self.min_ord_results._get_plot_point_colors(
            self.df, 'numeric', ['B', 'A', 'D', 'C'], 'jet')
        npt.assert_almost_equal(obs[0], exp)
        assert_true(obs[1] is None)

    def test_get_plot_point_colors_categorical_column(self):
        # subset of the ids in df
        exp_colors = [[0., 0., 0.5, 1.], [0., 0., 0.5, 1.],
                      [0.5, 0., 0., 1.]]
        exp_color_dict = {
            'foo': [0.5, 0., 0., 1.],
            22: [0., 0., 0.5, 1.]
        }
        obs = self.min_ord_results._get_plot_point_colors(
            self.df, 'categorical', ['B', 'C', 'A'], 'jet')
        npt.assert_almost_equal(obs[0], exp_colors)
        npt.assert_equal(obs[1], exp_color_dict)

        # all ids in df
        exp_colors = [[0., 0., 0.5, 1.], [0.5, 0., 0., 1.],
                      [0.5, 0., 0., 1.], [0., 0., 0.5, 1.]]
        obs = self.min_ord_results._get_plot_point_colors(
            self.df, 'categorical', ['B', 'A', 'D', 'C'], 'jet')
        npt.assert_almost_equal(obs[0], exp_colors)
        # should get same color dict as before
        npt.assert_equal(obs[1], exp_color_dict)

    def test_plot_categorical_legend(self):
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')

        # we shouldn't have a legend yet
        assert_true(ax.get_legend() is None)

        self.min_ord_results._plot_categorical_legend(
            ax, {'foo': 'red', 'bar': 'green'})

        # make sure we have a legend now
        legend = ax.get_legend()
        assert_true(legend is not None)

        # do some light sanity checking to make sure our input labels
        # and colors are present. we're not using
        # nose.tools.assert_items_equal because it isn't available in
        # Python 3.
        labels = [t.get_text() for t in legend.get_texts()]
        npt.assert_equal(sorted(labels), ['bar', 'foo'])
        colors = [l.get_color() for l in legend.get_lines()]
        npt.assert_equal(sorted(colors), ['green', 'red'])

    def test_repr_png(self):
        obs = self.min_ord_results._repr_png_()
        assert_is_instance(obs, binary_type)
        assert_true(len(obs) > 0)

    def test_repr_svg(self):
        obs = self.min_ord_results._repr_svg_()
        # print_figure(format='svg') can return text or bytes depending
        # on the version of IPython
        assert_true(isinstance(obs, text_type) or
                    isinstance(obs, binary_type))
        assert_true(len(obs) > 0)

    def test_png(self):
        assert_is_instance(self.min_ord_results.png, Image)

    def test_svg(self):
        assert_is_instance(self.min_ord_results.svg, SVG)
class TestOrdinationResults(unittest.TestCase):
    def setUp(self):
        # Define in-memory CA results to serialize and deserialize.
        eigvals = np.array([0.0961330159181, 0.0409418140138])
        species = np.array([[0.408869425742, 0.0695518116298],
                            [-0.1153860437, -0.299767683538],
                            [-0.309967102571, 0.187391917117]])
        site = np.array([[-0.848956053187, 0.882764759014],
                         [-0.220458650578, -1.34482000302],
                         [1.66697179591, 0.470324389808]])
        biplot = None
        site_constraints = None
        prop_explained = None
        species_ids = ['Species1', 'Species2', 'Species3']
        site_ids = ['Site1', 'Site2', 'Site3']
        self.ordination_results = OrdinationResults(
            eigvals=eigvals, species=species, site=site, biplot=biplot,
            site_constraints=site_constraints,
            proportion_explained=prop_explained, species_ids=species_ids,
            site_ids=site_ids)

        # DataFrame for testing plot method. Has a categorical column
        # with a mix of numbers and strings. Has a numeric column with a
        # mix of ints, floats, and strings that can be converted to
        # floats. Has a numeric column with missing data (np.nan).
        self.df = pd.DataFrame([['foo', '42', 10],
                                [22, 0, 8],
                                [22, -4.2, np.nan],
                                ['foo', '42.19', 11]],
                               index=['A', 'B', 'C', 'D'],
                               columns=['categorical', 'numeric',
                                        'nancolumn'])

        # Minimal ordination results for easier testing of plotting
        # method. Paired with df above.
        eigvals = np.array([0.50, 0.25, 0.25])
        site = np.array([[0.1, 0.2, 0.3],
                         [0.2, 0.3, 0.4],
                         [0.3, 0.4, 0.5],
                         [0.4, 0.5, 0.6]])
        self.min_ord_results = OrdinationResults(
            eigvals=eigvals, site=site, site_ids=['A', 'B', 'C', 'D'])

    def test_deprecated_io(self):
        fh = StringIO()
        npt.assert_warns(UserWarning, self.ordination_results.to_file, fh)
        fh.seek(0)
        deserialized = npt.assert_warns(UserWarning,
                                        OrdinationResults.from_file, fh)
        assert_ordination_results_equal(deserialized,
                                        self.ordination_results)
        self.assertTrue(type(deserialized) == OrdinationResults)

    def check_basic_figure_sanity(self, fig, exp_num_subplots, exp_title,
                                  exp_legend_exists, exp_xlabel,
                                  exp_ylabel, exp_zlabel):
        # check type
        assert_is_instance(fig, mpl.figure.Figure)

        # check number of subplots
        axes = fig.get_axes()
        npt.assert_equal(len(axes), exp_num_subplots)

        # check title
        ax = axes[0]
        npt.assert_equal(ax.get_title(), exp_title)

        # shouldn't have tick labels
        for tick_label in (ax.get_xticklabels() + ax.get_yticklabels() +
                           ax.get_zticklabels()):
            npt.assert_equal(tick_label.get_text(), '')

        # check if legend is present
        legend = ax.get_legend()
        if exp_legend_exists:
            assert_true(legend is not None)
        else:
            assert_true(legend is None)

        # check axis labels
        npt.assert_equal(ax.get_xlabel(), exp_xlabel)
        npt.assert_equal(ax.get_ylabel(), exp_ylabel)
        npt.assert_equal(ax.get_zlabel(), exp_zlabel)

    def test_plot_no_metadata(self):
        fig = self.min_ord_results.plot()
        self.check_basic_figure_sanity(fig, 1, '', False, '0', '1', '2')

    def test_plot_with_numeric_metadata_and_plot_options(self):
        fig = self.min_ord_results.plot(
            self.df, 'numeric', axes=(1, 0, 2),
            axis_labels=['PC 2', 'PC 1', 'PC 3'], title='a title',
            cmap='Reds')
        self.check_basic_figure_sanity(fig, 2, 'a title', False,
                                       'PC 2', 'PC 1', 'PC 3')

    def test_plot_with_categorical_metadata_and_plot_options(self):
        fig = self.min_ord_results.plot(
            self.df, 'categorical', axes=[2, 0, 1], title='a title',
            cmap='Accent')
        self.check_basic_figure_sanity(fig, 1, 'a title', True,
                                       '2', '0', '1')

    def test_plot_with_invalid_axis_labels(self):
        with assert_raises_regexp(ValueError, 'axis_labels.*4'):
            self.min_ord_results.plot(axes=[2, 0, 1],
                                      axis_labels=('a', 'b', 'c', 'd'))

    def test_validate_plot_axes_valid_input(self):
        # shouldn't raise an error on valid input. nothing is returned,
        # so nothing to check here
        self.min_ord_results._validate_plot_axes(
            self.min_ord_results.site.T, (1, 2, 0))

    def test_validate_plot_axes_invalid_input(self):
        # not enough dimensions
        with assert_raises_regexp(ValueError, '2 dimension\(s\)'):
            self.min_ord_results._validate_plot_axes(
                np.asarray([[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]]), (0, 1, 2))

        coord_matrix = self.min_ord_results.site.T

        # wrong number of axes
        with assert_raises_regexp(ValueError, 'exactly three.*found 0'):
            self.min_ord_results._validate_plot_axes(coord_matrix, [])
        with assert_raises_regexp(ValueError, 'exactly three.*found 4'):
            self.min_ord_results._validate_plot_axes(coord_matrix,
                                                     (0, 1, 2, 3))

        # duplicate axes
        with assert_raises_regexp(ValueError, 'must be unique'):
            self.min_ord_results._validate_plot_axes(coord_matrix,
                                                     (0, 1, 0))

        # out of range axes
        with assert_raises_regexp(ValueError, 'axes\[1\].*3'):
            self.min_ord_results._validate_plot_axes(coord_matrix,
                                                     (0, -1, 2))
        with assert_raises_regexp(ValueError, 'axes\[2\].*3'):
            self.min_ord_results._validate_plot_axes(coord_matrix,
                                                     (0, 2, 3))

    def test_get_plot_point_colors_invalid_input(self):
        # column provided without df
        with npt.assert_raises(ValueError):
            self.min_ord_results._get_plot_point_colors(
                None, 'numeric', ['B', 'C'], 'jet')

        # df provided without column
        with npt.assert_raises(ValueError):
            self.min_ord_results._get_plot_point_colors(
                self.df, None, ['B', 'C'], 'jet')

        # column not in df
        with assert_raises_regexp(ValueError, 'missingcol'):
            self.min_ord_results._get_plot_point_colors(
                self.df, 'missingcol', ['B', 'C'], 'jet')

        # id not in df
        with assert_raises_regexp(ValueError, 'numeric'):
            self.min_ord_results._get_plot_point_colors(
                self.df, 'numeric', ['B', 'C', 'missingid', 'A'], 'jet')

        # missing data in df
        with assert_raises_regexp(ValueError, 'nancolumn'):
            self.min_ord_results._get_plot_point_colors(
                self.df, 'nancolumn', ['B', 'C', 'A'], 'jet')

    def test_get_plot_point_colors_no_df_or_column(self):
        obs = self.min_ord_results._get_plot_point_colors(
            None, None, ['B', 'C'], 'jet')
        npt.assert_equal(obs, (None, None))

    def test_get_plot_point_colors_numeric_column(self):
        # subset of the ids in df
        exp = [0.0, -4.2, 42.0]
        obs = self.min_ord_results._get_plot_point_colors(
            self.df, 'numeric', ['B', 'C', 'A'], 'jet')
        npt.assert_almost_equal(obs[0], exp)
        assert_true(obs[1] is None)

        # all ids in df
        exp = [0.0, 42.0, 42.19, -4.2]
        obs = self.min_ord_results._get_plot_point_colors(
            self.df, 'numeric', ['B', 'A', 'D', 'C'], 'jet')
        npt.assert_almost_equal(obs[0], exp)
        assert_true(obs[1] is None)

    def test_get_plot_point_colors_categorical_column(self):
        # subset of the ids in df
        exp_colors = [[0., 0., 0.5, 1.], [0., 0., 0.5, 1.],
                      [0.5, 0., 0., 1.]]
        exp_color_dict = {'foo': [0.5, 0., 0., 1.],
                          22: [0., 0., 0.5, 1.]}
        obs = self.min_ord_results._get_plot_point_colors(
            self.df, 'categorical', ['B', 'C', 'A'], 'jet')
        npt.assert_almost_equal(obs[0], exp_colors)
        npt.assert_equal(obs[1], exp_color_dict)

        # all ids in df
        exp_colors = [[0., 0., 0.5, 1.], [0.5, 0., 0., 1.],
                      [0.5, 0., 0., 1.], [0., 0., 0.5, 1.]]
        obs = self.min_ord_results._get_plot_point_colors(
            self.df, 'categorical', ['B', 'A', 'D', 'C'], 'jet')
        npt.assert_almost_equal(obs[0], exp_colors)
        # should get same color dict as before
        npt.assert_equal(obs[1], exp_color_dict)

    def test_plot_categorical_legend(self):
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')

        # we shouldn't have a legend yet
        assert_true(ax.get_legend() is None)

        self.min_ord_results._plot_categorical_legend(
            ax, {'foo': 'red', 'bar': 'green'})

        # make sure we have a legend now
        legend = ax.get_legend()
        assert_true(legend is not None)

        # do some light sanity checking to make sure our input labels
        # and colors are present. we're not using
        # nose.tools.assert_items_equal because it isn't available in
        # Python 3.
        labels = [t.get_text() for t in legend.get_texts()]
        npt.assert_equal(sorted(labels), ['bar', 'foo'])
        colors = [l.get_color() for l in legend.get_lines()]
        npt.assert_equal(sorted(colors), ['green', 'red'])

    def test_repr_png(self):
        obs = self.min_ord_results._repr_png_()
        assert_is_instance(obs, binary_type)
        assert_true(len(obs) > 0)

    def test_repr_svg(self):
        obs = self.min_ord_results._repr_svg_()
        assert_is_instance(obs, text_type)
        assert_true(len(obs) > 0)

    def test_png(self):
        assert_is_instance(self.min_ord_results.png, Image)

    def test_svg(self):
        assert_is_instance(self.min_ord_results.svg, SVG)
def get_pair_cmds(self, omics_pairs):
    crowdeds = [0, 1]
    pc_sb_correlations = []
    for keys, values in self.mmvec_res.items():
        pair, case, omic1, omic2, filt1, filt2, sams, mmvec = keys
        ranks_fp, ordi_fp, meta_fp, omic1_common, omic2_common = values
        order_omics = get_order_omics(omic1, omic2, filt1, filt2, case,
                                      omics_pairs)
        omic1 = order_omics[0]
        omic2 = order_omics[1]
        filt1 = order_omics[2]
        filt2 = order_omics[3]
        omic_feature = order_omics[4]
        omic_sample = order_omics[5]
        omic_microbe = order_omics[6]
        omic_metabolite = order_omics[7]

        # get differentials
        meta1, meta_pd1, diff_cols1 = self.metas[(pair, case, omic1,
                                                  filt1, omic2, filt2)]
        meta2, meta_pd2, diff_cols2 = self.metas[(pair, case, omic2,
                                                  filt2, omic1, filt1)]

        # features are biplot, samples are dots
        ordi = OrdinationResults.read(ordi_fp)
        cur_pc_sb_correlations, max_r = get_pc_sb_correlations(
            pair, case, ordi, omic1, omic2, filt1, filt2, diff_cols1,
            meta_pd1, diff_cols2, meta_pd2, meta_fp, omic1_common,
            omic2_common, ranks_fp)
        pc_sb_correlations.append(cur_pc_sb_correlations)

        cmd = ''
        if pair in self.highlights:
            pair_highlights = self.highlights[pair]
            for highlight, regexes_list in pair_highlights.items():
                n_edit, meta_edit, ordi_edit_fp = edit_ordi_qzv(
                    ordi, ordi_fp, highlight, regexes_list, meta1,
                    meta_pd1)
                if n_edit:
                    qza, qzv = get_qzs(ordi_edit_fp)
                    cmd += get_biplot_commands(
                        ordi_edit_fp, qza, qzv, omic_feature, omic_sample,
                        meta_edit, meta2, n_edit, max_r)

        ordi_edit_fp = ordi_fp
        qza, qzv = get_qzs(ordi_edit_fp)
        for crowded in crowdeds:
            if crowded:
                n_ordi_feats = ordi.features.shape[0]
                qzv = qzv.replace('.qzv', '_crowded.qzv')
            else:
                n_ordi_feats = 15
                # heat_qza, heat_qzv = get_heatmap_qzs(ranks_fp)
                # cmd += get_heatmap_commands(
                #     ranks_fp, heat_qza, heat_qzv, meta1,
                #     meta2, meta_pd1, meta_pd2)
            cmd += get_biplot_commands(
                ordi_edit_fp, qza, qzv, omic_feature, omic_sample,
                meta1, meta2, n_ordi_feats, max_r)
        cmd += get_xmmvec_commands(ordi_edit_fp, omic1, omic2,
                                   meta1, meta2, self.xmmvecs, pair)

        topn = 5
        features_names = []
        if features_names:
            heat = '%s_paired_heatmaps_custom.qzv' % splitext(ranks_fp)[0]
        else:
            heat = '%s_paired_heatmaps_top%s.qzv' % (
                splitext(ranks_fp)[0], topn)
        cmd += get_paired_heatmaps_command(
            ranks_fp, omic1_common, omic2_common, meta1, features_names,
            topn, heat)
        self.cmds.setdefault(pair, []).append(cmd)
    return pc_sb_correlations
def get_procrustes_results(coords_f1, coords_f2, sample_id_map=None,
                           randomize=None, max_dimensions=None,
                           get_eigenvalues=get_mean_eigenvalues,
                           get_percent_variation_explained=get_mean_percent_variation):
    """Run a Procrustes analysis on two sets of PCoA coordinates."""
    # Parse the PCoA files
    ord_res_1 = OrdinationResults.read(coords_f1)
    ord_res_2 = OrdinationResults.read(coords_f2)

    sample_ids1 = ord_res_1.site_ids
    coords1 = ord_res_1.site
    eigvals1 = ord_res_1.eigvals
    pct_var1 = ord_res_1.proportion_explained

    sample_ids2 = ord_res_2.site_ids
    coords2 = ord_res_2.site
    eigvals2 = ord_res_2.eigvals
    pct_var2 = ord_res_2.proportion_explained

    if sample_id_map:
        sample_ids1 = map_sample_ids(sample_ids1, sample_id_map)
        sample_ids2 = map_sample_ids(sample_ids2, sample_id_map)

    # rearrange the order of coords in coords2 to correspond to
    # the order of coords in coords1
    order = list(set(sample_ids1) & set(sample_ids2))
    coords1 = reorder_coords(coords1, sample_ids1, order)
    coords2 = reorder_coords(coords2, sample_ids2, order)
    if len(order) == 0:
        raise ValueError('No overlapping samples in the two files')

    # If this is a random trial, apply the shuffling function passed as
    # randomize()
    if randomize:
        coords2 = randomize(coords2)
        randomized_coords2 = OrdinationResults(
            eigvals=eigvals2, proportion_explained=pct_var2,
            site=coords2, site_ids=order)
    else:
        randomized_coords2 = None

    coords1, coords2 = pad_coords_matrices(coords1, coords2)
    if max_dimensions:
        coords1 = filter_coords_matrix(coords1, max_dimensions)
        coords2 = filter_coords_matrix(coords2, max_dimensions)
        pct_var1 = pct_var1[:max_dimensions]
        pct_var2 = pct_var2[:max_dimensions]
        eigvals1 = eigvals1[:max_dimensions]
        eigvals2 = eigvals2[:max_dimensions]
    else:
        if len(pct_var1) > len(pct_var2):
            pct_var2 = append(pct_var2,
                              zeros(len(pct_var1) - len(pct_var2)))
            eigvals2 = append(eigvals2,
                              zeros(len(eigvals1) - len(eigvals2)))
        elif len(pct_var1) < len(pct_var2):
            pct_var1 = append(pct_var1,
                              zeros(len(pct_var2) - len(pct_var1)))
            eigvals1 = append(eigvals1,
                              zeros(len(eigvals2) - len(eigvals1)))

    # Run the Procrustes analysis
    transformed_coords_m1, transformed_coords_m2, m_squared = \
        procrustes(coords1, coords2)

    eigvals = get_eigenvalues(eigvals1, eigvals2)
    pct_var = get_percent_variation_explained(pct_var1, pct_var2)

    transformed_coords1 = OrdinationResults(
        eigvals=asarray(eigvals),
        proportion_explained=asarray(pct_var),
        site=asarray(transformed_coords_m1),
        site_ids=order)
    transformed_coords2 = OrdinationResults(
        eigvals=asarray(eigvals),
        proportion_explained=asarray(pct_var),
        site=asarray(transformed_coords_m2),
        site_ids=order)

    # Return the results
    return (transformed_coords1, transformed_coords2, m_squared,
            randomized_coords2)
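# Hedged self-check for get_procrustes_results(), mirroring the unit
# test earlier in this file: running Procrustes on an ordination against
# itself should give an M^2 near zero. Assumes the old scikit-bio API;
# the coordinates are arbitrary.
import numpy as np
from io import StringIO

ord_res = OrdinationResults(
    eigvals=np.array([0.7, 0.3]),
    site=np.array([[0.1, 0.2], [0.3, 0.4], [-0.4, 0.2]]),
    proportion_explained=np.array([70.0, 30.0]),
    site_ids=['s1', 's2', 's3'])
fh1, fh2 = StringIO(), StringIO()
ord_res.write(fh1)
ord_res.write(fh2)
fh1.seek(0)
fh2.seek(0)
coords1, coords2, m_squared, _ = get_procrustes_results(fh1, fh2)
assert m_squared < 1e-10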
def setUp(self):
    super().setUp()
    axis_labels = ['PC1', 'PC2', 'PC3']
    self.resources = ResourceManager()
    self.fh1 = self.create_tempfile(suffix='.qza')
    self.fh2 = self.create_tempfile(suffix='.qza')
    self.pcoa_path1 = self.fh1.name
    self.pcoa_path2 = self.fh2.name
    self.test_df1 = pd.DataFrame.from_dict(
        {
            's1': [0.1, 0.2, 7],
            's2': [0.9, 0.2, 7],
        },
        orient='index',
        columns=axis_labels,
    )
    self.test_df1.index.name = 'Sample ID'
    self.test_df2 = pd.DataFrame.from_dict(
        {
            's1': [0.1, 0.2, 7],
            's2': [0.9, 0.2, 7],
            's3': [0.2, -0.3, 0],
            's4': [0.111, -4, 0.2],
        },
        orient='index',
        columns=axis_labels,
    )
    self.test_df2.index.name = 'Sample ID'
    self.pcoa1 = OrdinationResults(
        'pcoa1', 'pcoa1',
        eigvals=pd.Series([7, 2, 1], index=axis_labels),
        samples=self.test_df1,
        proportion_explained=pd.Series([0.7, 0.2, 0.1],
                                       index=axis_labels),
    )
    self.pcoa2 = OrdinationResults(
        'pcoa2', 'pcoa2',
        eigvals=pd.Series([6, 3, 1], index=axis_labels),
        samples=self.test_df2,
        proportion_explained=pd.Series([0.6, 0.3, 0.1],
                                       index=axis_labels),
    )
    imported_artifact = Artifact.import_data("PCoAResults", self.pcoa1)
    imported_artifact.save(self.pcoa_path1)
    imported_artifact = Artifact.import_data("PCoAResults", self.pcoa2)
    imported_artifact.save(self.pcoa_path2)
def load_mp_data(use_artifact_api=True, is_empire=True):
    """Loads data from the QIIME 2 moving pictures tutorial for
    visualization.

    It's assumed that this data is already stored in docs/moving-pictures/,
    aka the PREFIX_DIR global variable set above, which should be located
    relative to where this function is being run from. If this directory
    or the data files within it cannot be accessed, this function will
    (probably) break.

    Parameters
    ----------
    use_artifact_api: bool, optional (default True)
        If True, this will load the artifacts using the QIIME 2 Artifact
        API, and the returned objects will have types corresponding to
        the first listed types (before the | characters) shown below.
        If False, this will instead load the artifacts without using
        QIIME 2's APIs; in this case, the returned objects will have
        types corresponding to the second listed types (after the |
        characters) shown below.
    is_empire: bool, optional (default True)
        If True, this will return an ordination.
        If False, will return None in place of an ordination.

    Returns
    -------
    (tree, table, md, fmd, ordination)
        tree: qiime2.Artifact | skbio.tree.TreeNode
            Phylogenetic tree.
        table: qiime2.Artifact | biom.Table
            Feature table.
        md: qiime2.Metadata | pandas.DataFrame
            Sample metadata.
        fmd: qiime2.Metadata | pandas.DataFrame
            Feature metadata. (Although this is stored in the repository
            as a FeatureData[Taxonomy] artifact, we transform it to
            Metadata if use_artifact_api is True.)
        pcoa: qiime2.Artifact | skbio.OrdinationResults | None
    """
    q2_tree_loc = os.path.join(PREFIX_DIR, "rooted-tree.qza")
    q2_table_loc = os.path.join(PREFIX_DIR, "table.qza")
    q2_pcoa_loc = os.path.join(PREFIX_DIR,
                               "unweighted_unifrac_pcoa_results.qza")
    q2_tax_loc = os.path.join(PREFIX_DIR, "taxonomy.qza")
    md_loc = os.path.join(PREFIX_DIR, "sample_metadata.tsv")
    if use_artifact_api:
        from qiime2 import Artifact, Metadata

        tree = Artifact.load(q2_tree_loc)
        table = Artifact.load(q2_table_loc)
        pcoa = Artifact.load(q2_pcoa_loc) if is_empire else None
        md = Metadata.load(md_loc)
        # We have to transform the taxonomy QZA to Metadata ourselves
        fmd = Artifact.load(q2_tax_loc).view(Metadata)
    else:
        import biom
        import pandas as pd
        from skbio.stats.ordination import OrdinationResults
        from skbio.tree import TreeNode

        with tempfile.TemporaryDirectory() as _tmp:
            tree_loc = extract_q2_artifact_to_path(_tmp, q2_tree_loc,
                                                   "tree.nwk")
            tree = TreeNode.read(tree_loc)
            tbl_loc = extract_q2_artifact_to_path(_tmp, q2_table_loc,
                                                  "feature-table.biom")
            table = biom.load_table(tbl_loc)
            if is_empire:
                pcoa_loc = extract_q2_artifact_to_path(_tmp, q2_pcoa_loc,
                                                       "ordination.txt")
                pcoa = OrdinationResults.read(pcoa_loc)
            else:
                pcoa = None
            tax_loc = extract_q2_artifact_to_path(_tmp, q2_tax_loc,
                                                  "taxonomy.tsv")
            fmd = pd.read_csv(tax_loc, sep="\t", index_col=0)
            md = pd.read_csv(md_loc, sep="\t", index_col=0, skiprows=[1])
    return tree, table, md, fmd, pcoa
def setUp(self):
    super(OrdinationResultsReaderWriterTests, self).setUp()

    # define in-memory results, one for each of the valid files in
    # self.valid_fps

    # CA results
    eigvals = np.array([0.0961330159181, 0.0409418140138])
    species = np.array([[0.408869425742, 0.0695518116298],
                        [-0.1153860437, -0.299767683538],
                        [-0.309967102571, 0.187391917117]])
    site = np.array([[-0.848956053187, 0.882764759014],
                     [-0.220458650578, -1.34482000302],
                     [1.66697179591, 0.470324389808]])
    biplot = None
    site_constraints = None
    prop_explained = None
    species_ids = ['Species1', 'Species2', 'Species3']
    site_ids = ['Site1', 'Site2', 'Site3']
    ca_scores = OrdinationResults(eigvals=eigvals, species=species,
                                  site=site, biplot=biplot,
                                  site_constraints=site_constraints,
                                  proportion_explained=prop_explained,
                                  species_ids=species_ids,
                                  site_ids=site_ids)

    # CCA results
    eigvals = np.array([
        0.366135830393, 0.186887643052, 0.0788466514249, 0.082287840501,
        0.0351348475787, 0.0233265839374, 0.0099048981912,
        0.00122461669234, 0.000417454724117
    ])
    species = np.loadtxt(get_data_path('ordres_exp_OrdRes_CCA_species'))
    site = np.loadtxt(get_data_path('ordres_exp_OrdRes_CCA_site'))
    biplot = np.array(
        [[-0.169746767979, 0.63069090084, 0.760769036049],
         [-0.994016563505, 0.0609533148724, -0.0449369418179],
         [0.184352565909, -0.974867543612, 0.0309865007541]])
    site_constraints = np.loadtxt(
        get_data_path('ordres_exp_OrdRes_CCA_site_constraints'))
    prop_explained = None
    species_ids = [
        'Species0', 'Species1', 'Species2', 'Species3', 'Species4',
        'Species5', 'Species6', 'Species7', 'Species8'
    ]
    site_ids = [
        'Site0', 'Site1', 'Site2', 'Site3', 'Site4', 'Site5', 'Site6',
        'Site7', 'Site8', 'Site9'
    ]
    cca_scores = OrdinationResults(eigvals=eigvals, species=species,
                                   site=site, biplot=biplot,
                                   site_constraints=site_constraints,
                                   proportion_explained=prop_explained,
                                   species_ids=species_ids,
                                   site_ids=site_ids)

    # PCoA results
    eigvals = np.array([
        0.512367260461, 0.300719094427, 0.267912066004, 0.208988681078,
        0.19169895326, 0.16054234528, 0.15017695712, 0.122457748167, 0.0
    ])
    species = None
    site = np.loadtxt(get_data_path('ordres_exp_OrdRes_PCoA_site'))
    biplot = None
    site_constraints = None
    prop_explained = np.array([
        0.267573832777, 0.15704469605, 0.139911863774, 0.109140272454,
        0.100111048503, 0.0838401161912, 0.0784269939011,
        0.0639511763509, 0.0
    ])
    species_ids = None
    site_ids = [
        'PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354', 'PC.593',
        'PC.355', 'PC.607', 'PC.634'
    ]
    pcoa_scores = OrdinationResults(eigvals=eigvals, species=species,
                                    site=site, biplot=biplot,
                                    site_constraints=site_constraints,
                                    proportion_explained=prop_explained,
                                    species_ids=species_ids,
                                    site_ids=site_ids)

    # RDA results
    eigvals = np.array([
        25.8979540892, 14.9825779819, 8.93784077262, 6.13995623072,
        1.68070536498, 0.57735026919, 0.275983624351
    ])
    species = np.loadtxt(get_data_path('ordres_exp_OrdRes_RDA_species'))
    site = np.loadtxt(get_data_path('ordres_exp_OrdRes_RDA_site'))
    biplot = np.array([[0.422650019179, -0.559142585857, -0.713250678211],
                       [0.988495963777, 0.150787422017, -0.0117848614073],
                       [-0.556516618887, 0.817599992718, 0.147714267459],
                       [-0.404079676685, -0.9058434809, -0.127150316558]])
    site_constraints = np.loadtxt(
        get_data_path('ordres_exp_OrdRes_RDA_site_constraints'))
    prop_explained = None
    species_ids = [
        'Species0', 'Species1', 'Species2', 'Species3', 'Species4',
        'Species5'
    ]
    site_ids = [
        'Site0', 'Site1', 'Site2', 'Site3', 'Site4', 'Site5', 'Site6',
        'Site7', 'Site8', 'Site9'
    ]
    rda_scores = OrdinationResults(eigvals=eigvals, species=species,
                                   site=site, biplot=biplot,
                                   site_constraints=site_constraints,
                                   proportion_explained=prop_explained,
                                   species_ids=species_ids,
                                   site_ids=site_ids)

    self.ordination_results_objs = [
        ca_scores, cca_scores, pcoa_scores, rda_scores
    ]
def setUp(self):
    eigvals = np.array([0.512367260461, 0.300719094427, 0.267912066004,
                        0.208988681078, 0.19169895326, 0.16054234528,
                        0.15017695712, 0.122457748167, 0.0])
    site = np.array(
        [[-0.212230626531, 0.216034194368, 0.03532727349,
          -0.254450494129, -0.0687468542543, 0.231895596562,
          0.00496549154314, -0.0026246871695, 9.73837390723e-10],
         [-0.277487312135, -0.0295483215975, -0.0744173437992,
          0.0957182357964, 0.204714844022, -0.0055407341857,
          -0.190287966833, 0.16307126638, 9.73837390723e-10],
         [0.220886492631, 0.0874848360559, -0.351990132198,
          -0.00316535032886, 0.114635191853, -0.00019194106125,
          0.188557853937, 0.030002427212, 9.73837390723e-10],
         [0.0308923744062, -0.0446295973489, 0.133996451689,
          0.29318228566, -0.167812539312, 0.130996149793,
          0.113551017379, 0.109987942454, 9.73837390723e-10],
         [0.27616778138, -0.0341866951102, 0.0633000238256,
          0.100446653327, 0.123802521199, 0.1285839664,
          -0.132852841046, -0.217514322505, 9.73837390723e-10],
         [0.202458130052, -0.115216120518, 0.301820871723,
          -0.18300251046, 0.136208248567, -0.0989435556722,
          0.0927738484879, 0.0909429797672, 9.73837390723e-10],
         [0.236467470907, 0.21863434374, -0.0301637746424,
          -0.0225473129718, -0.205287183891, -0.180224615141,
          -0.165277751908, 0.0411933458557, 9.73837390723e-10],
         [-0.105517545144, -0.41405687433, -0.150073017617,
          -0.116066751485, -0.158763393475, -0.0223918378516,
          -0.0263068046112, -0.0501209518091, 9.73837390723e-10],
         [-0.371636765565, 0.115484234741, 0.0721996475289,
          0.0898852445906, 0.0212491652909, -0.184183028843,
          0.114877153051, -0.164938000185, 9.73837390723e-10]])
    prop_expl = np.array([25.6216900347, 15.7715955926, 14.1215046787,
                          11.6913885817, 9.83044890697, 8.51253468595,
                          7.88775505332, 6.56308246609,
                          4.42499350906e-16])
    site_ids = ['PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354',
                'PC.593', 'PC.355', 'PC.607', 'PC.634']
    self.ord_res = OrdinationResults(eigvals=eigvals, site=site,
                                     proportion_explained=prop_expl,
                                     site_ids=site_ids)

    metadata_map = {
        'PC.354': {'Treatment': 'Control', 'DOB': '20061218',
                   'Weight': '60',
                   'Description': 'Control_mouse_I.D._354'},
        'PC.355': {'Treatment': 'Control', 'DOB': '20061218',
                   'Weight': '55',
                   'Description': 'Control_mouse_I.D._355'},
        'PC.356': {'Treatment': 'Control', 'DOB': '20061126',
                   'Weight': '50',
                   'Description': 'Control_mouse_I.D._356'},
        'PC.481': {'Treatment': 'Control', 'DOB': '20070314',
                   'Weight': '52',
                   'Description': 'Control_mouse_I.D._481'},
        'PC.593': {'Treatment': 'Control', 'DOB': '20071210',
                   'Weight': '57',
                   'Description': 'Control_mouse_I.D._593'},
        'PC.607': {'Treatment': 'Fast', 'DOB': '20071112',
                   'Weight': '65',
                   'Description': 'Fasting_mouse_I.D._607'},
        'PC.634': {'Treatment': 'Fast', 'DOB': '20080116',
                   'Weight': '68',
                   'Description': 'Fasting_mouse_I.D._634'},
        'PC.635': {'Treatment': 'Fast', 'DOB': '20080116',
                   'Weight': '70',
                   'Description': 'Fasting_mouse_I.D._635'},
        'PC.636': {'Treatment': 'Fast', 'DOB': '20080116',
                   'Weight': '72',
                   'Description': 'Fasting_mouse_I.D._636'}}
    self.metadata_map = pd.DataFrame.from_dict(metadata_map,
                                               orient='index')
    self.categories = ['Treatment']
    self.sort_by = 'Weight'
def format_coords(coord_header, coords, eigvals, pct_var, headers=True):
    """formats coords given specified coords matrix etc."""
    result = []
    if headers:
        result.append('pc vector number\t' +
                      '\t'.join(map(str, range(1, len(coords[0]) + 1))))
        for name, row in zip(coord_header, coords):
            result.append('\t'.join([name] + list(map(str, row))))
        result.append('')
        result.append('')
        result.append('eigvals\t' + '\t'.join(map(str, eigvals)))
        result.append('% variation explained\t' +
                      '\t'.join(map(str, pct_var)))
    else:
        result = ['\t'.join(map(str, row)) for row in coords]
        result.append('')
    return '\n'.join(result)


if __name__ == "__main__":
    old_file = argv[1]
    new_file = argv[2]

    with open(old_file, 'U') as infile:
        with open(new_file, 'w') as outfile:
            res = OrdinationResults.from_file(infile)
            lines = format_coords(res.site_ids, res.site, res.eigvals,
                                  res.proportion_explained)
            outfile.write(lines)
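# Quick illustration of format_coords() on a tiny made-up matrix; it
# prints the legacy QIIME coordinates layout that parse_coords() above
# falls back to: a 'pc vector number' header, one tab-separated row per
# sample, two blank lines, then the eigvals and % variation lines.
print(format_coords(['s1', 's2'],
                    [[0.1, 0.2], [0.3, 0.4]],
                    [0.7, 0.3],
                    [70.0, 30.0]))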
        distances[dataset_][(fold_, Nsamp_)]['Bray_Curtis'] = table_

        table_ = pd.read_table(os.path.join(subpath_, sub_set,
                                            'Robust_Aitchison_Distance.tsv'),
                               index_col=0, low_memory=False)
        table_.index = table_.index.astype(str)
        table_.columns = table_.columns.astype(str)
        table_ = table_.reindex(index=index_me, columns=index_me)
        distances[dataset_][(fold_, Nsamp_)]['Robust_Aitchison'] = table_

        # ordination type file
        in_ord = os.path.join(subpath_, sub_set, 'RPCA_Ordination.txt')
        # get loadings from ordination files
        ordinations[dataset_][(fold_, Nsamp_)]['RPCA_Samples'] = \
            OrdinationResults.read(in_ord).samples
        ordinations[dataset_][(fold_, Nsamp_)]['RPCA_Features'] = \
            OrdinationResults.read(in_ord).features

# permanova analysis
from skbio import DistanceMatrix
from skbio.stats.distance import permanova

both_perm_res = {}
perm_res = {}
perm_res_tmp = {}
for dataset_, subs in distances.items():
    perm_res[dataset_] = {}
    perm_res_tmp[dataset_] = {}
    for (fold_, Nsamp_), methods_ in subs.items():