def test_parse_sample_metadata(self):
    """parse_sample_metadata loads a TSV mapping file into a DataFrame.

    Sample IDs must remain strings (leading zeros preserved) and keep
    their file order; the index is named after the header's first column.
    """
    mapping_text = "#SampleID\tCol1\tCol2\n01\ta\t1\n00\tb\t2\n"
    result = parse_sample_metadata(io.StringIO(mapping_text))
    sample_index = pd.Index(['01', '00'], name='#SampleID')
    expected = pd.DataFrame([['a', '1'], ['b', '2']],
                            index=sample_index,
                            columns=['Col1', 'Col2'])
    pdt.assert_frame_equal(result, expected)
def gibbs(table_fp: Table, mapping_fp: pd.DataFrame, output_dir: str,
          loo: bool, jobs: int, alpha1: float, alpha2: float, beta: float,
          source_rarefaction_depth: int, sink_rarefaction_depth: int,
          restarts: int, draws_per_restart: int, burnin: int, delay: int,
          per_sink_feature_assignments: bool, sample_with_replacement: bool,
          source_sink_column: str, source_column_value: str,
          sink_column_value: str, source_category_column: str):
    '''Gibb's sampler for Bayesian estimation of microbial sample sources.

    For details, see the project README file.
    '''
    # Create results directory. Click has already checked if it exists, and
    # failed if so.
    os.mkdir(output_dir)

    # Load the metadata file and feature table. The 'U' open mode used
    # previously was removed in Python 3.11; universal-newline handling is
    # the default in text mode, and the context manager closes the handle.
    with open(mapping_fp) as map_f:
        sample_metadata = parse_sample_metadata(map_f)
    feature_table = biom_to_df(load_table(table_fp))

    # Run the Gibbs sampler helper function (same one used for q2).
    results = gibbs_helper(feature_table, sample_metadata, loo, jobs,
                           alpha1, alpha2, beta, source_rarefaction_depth,
                           sink_rarefaction_depth, restarts,
                           draws_per_restart, burnin, delay,
                           per_sink_feature_assignments,
                           sample_with_replacement, source_sink_column,
                           source_column_value, sink_column_value,
                           source_category_column)

    # Unpack the results; a third element (per-sink feature assignments) is
    # present only when per_sink_feature_assignments was requested.
    if len(results) == 3:
        mpm, mps, fas = results
        # Write one feature table per sink.
        for sink, fa in zip(mpm.columns, fas):
            fa.to_csv(os.path.join(output_dir, sink + '.feature_table.txt'),
                      sep='\t')
    else:
        mpm, mps = results

    # Write results.
    mpm.to_csv(os.path.join(output_dir, 'mixing_proportions.txt'), sep='\t')
    mps.to_csv(os.path.join(output_dir, 'mixing_proportions_stds.txt'),
               sep='\t')

    # Plot contributions.
    fig, ax = plot_heatmap(mpm.T)
    fig.savefig(os.path.join(output_dir, 'mixing_proportions.pdf'), dpi=300)
def gibbs_cli(table_fp, mapping_fp, output_dir, loo, jobs, alpha1, alpha2,
              beta, source_rarefaction_depth, sink_rarefaction_depth,
              restarts, draws_per_restart, burnin, delay,
              per_sink_feature_assignments, sample_with_replacement,
              source_sink_column, source_column_value, sink_column_value,
              source_category_column, diagnostics, limit):
    '''Gibb's sampler for Bayesian estimation of microbial sample sources.

    For details, see the project README file.
    '''
    # Create results directory. Click has already checked if it exists, and
    # failed if so.
    os.mkdir(output_dir)

    # Load the metadata file and feature table. The 'U' open mode used
    # previously was removed in Python 3.11; text mode already does
    # universal-newline translation, and the handle is closed on exit.
    with open(mapping_fp) as map_f:
        sample_metadata = parse_sample_metadata(map_f)
    feature_table = biom_to_df(load_table(table_fp))

    # Do high level check on feature data.
    feature_table = validate_gibbs_input(feature_table)

    # Remove samples not shared by both feature and metadata tables and order
    # rows equivalently.
    sample_metadata, feature_table = \
        intersect_and_sort_samples(sample_metadata, feature_table)

    # Identify source and sink samples.
    source_samples = get_samples(sample_metadata, source_sink_column,
                                 source_column_value)
    sink_samples = get_samples(sample_metadata, source_sink_column,
                               sink_column_value)

    # If we have no source samples neither normal operation or loo will work.
    # Will also likely get strange errors.
    if len(source_samples) == 0:
        raise ValueError(('You passed %s as the `source_sink_column` and %s '
                          'as the `source_column_value`. There are no samples '
                          'which are sources under these values. Please see '
                          'the help documentation and check your mapping '
                          'file.') % (source_sink_column, source_column_value))

    # Prepare the 'sources' matrix by collapsing the `source_samples` by their
    # metadata values.
    csources = collapse_source_data(sample_metadata, feature_table,
                                    source_samples, source_category_column,
                                    'mean')

    # Rarify collapsed source data if requested.
    if source_rarefaction_depth > 0:
        d = (csources.sum(1) >= source_rarefaction_depth)
        if not d.all():
            count_too_shallow = (~d).sum()
            shallowest = csources.sum(1).min()
            raise ValueError(
                ('You requested rarefaction of source samples at '
                 '%s, but there are %s collapsed source samples '
                 'that have less sequences than that. The '
                 'shallowest of these is %s sequences.') %
                (source_rarefaction_depth, count_too_shallow, shallowest))
        else:
            csources = subsample_dataframe(csources, source_rarefaction_depth,
                                           replace=sample_with_replacement)

    # Prepare to rarify sink data if we are not doing LOO. If we are doing
    # loo, we skip the rarefaction, and set sinks to `None`.
    if not loo:
        sinks = feature_table.loc[sink_samples, :]
        if sink_rarefaction_depth > 0:
            d = (sinks.sum(1) >= sink_rarefaction_depth)
            if not d.all():
                count_too_shallow = (~d).sum()
                shallowest = sinks.sum(1).min()
                raise ValueError(
                    ('You requested rarefaction of sink samples '
                     'at %s, but there are %s sink samples that '
                     'have less sequences than that. The '
                     'shallowest of these is %s sequences.') %
                    (sink_rarefaction_depth, count_too_shallow, shallowest))
            else:
                sinks = subsample_dataframe(sinks, sink_rarefaction_depth,
                                            replace=sample_with_replacement)
    else:
        sinks = None

    # Run the computations.
    mpm, mps, fas = gibbs(csources, sinks, alpha1, alpha2, beta, restarts,
                          draws_per_restart, burnin, delay, jobs,
                          create_feature_tables=per_sink_feature_assignments)

    # Write results.
    mpm.to_csv(os.path.join(output_dir, 'mixing_proportions.txt'), sep='\t')
    mps.to_csv(os.path.join(output_dir, 'mixing_proportions_stds.txt'),
               sep='\t')
    if per_sink_feature_assignments:
        for sink, fa in zip(mpm.index, fas):
            fa.to_csv(os.path.join(output_dir, sink + '.feature_table.txt'),
                      sep='\t')

    # Plot contributions.
    fig, ax = plot_heatmap(mpm)
    fig.savefig(os.path.join(output_dir, 'mixing_proportions.pdf'), dpi=300)

    # Optional convergence diagnostics: plot per-source running means for
    # each sink, flag chains whose running-mean spread exceeds `limit`.
    if diagnostics:
        # Was `output_dir + 'diagnostics'`: without a trailing separator on
        # output_dir that creates a mis-named *sibling* directory; join the
        # paths properly instead.
        file_path = os.path.join(output_dir, 'diagnostics')
        os.mkdir(file_path)
        # NOTE(review): these .npy files appear to be dropped into the CWD
        # by the sampler as a side channel — fragile; confirm the producer
        # before relying on this in concurrent runs.
        data = np.load('envcounts.npy')
        sink_ids = np.load('sink_ids.npy')
        source_ids = np.load('source_ids.npy')
        # The sampler reports an extra implicit 'unknown' source column.
        source_ids = np.append(source_ids, ['unknown'])
        df = pandas.DataFrame(source_ids)
        for sink_index, array in enumerate(data):
            sink_df = []
            sink_id = sink_ids[sink_index]
            for source_index, sources in enumerate(source_ids):
                source_array = array[:, source_index]
                # One chunk per restart's draws.
                split_array = np.array_split(source_array, draws_per_restart)
                plt.figure(figsize=(8, 6), dpi=300)
                plt.title(sink_id, fontsize=(16))
                flagged = []
                for splits in split_array:
                    # Running mean of the chain, scaled by alpha1.
                    data_sum = np.cumsum(splits)
                    restart_num = np.size(data_sum)
                    vector = np.linspace(1, restart_num, restart_num)
                    rolling = np.true_divide(data_sum, vector)
                    scalar = [(endpoint * alpha1) for endpoint in rolling]
                    line_average = np.average(scalar)
                    line_average = np.round(line_average, decimals=4)
                    flagged.append(line_average)
                    plt.plot(scalar, label=line_average)
                    plt.legend()
                    plt.ylabel(sources, fontsize=(16))
                # Spread of the per-chain averages; large spread suggests
                # poor mixing, so save the trace plot for inspection.
                absolutes = [abs(chains) for chains in flagged]
                difference = (max(absolutes) - min(absolutes))
                sink_df.append(difference)
                if difference >= limit:
                    file_name = sink_id + '_' + sources + '.png'
                    plt.savefig(os.path.join(file_path, file_name))
                plt.close()
            df[sink_id] = pandas.DataFrame(sink_df)
        df.columns.values[0] = ''
        # (A dead `df.set_index('').T` no-op — result discarded — was
        # removed here; the written table is unchanged.)
        df.to_csv(os.path.join(file_path, 'table.txt'), sep='\t', index=False)
        # Clean up the sampler's temporary side-channel files.
        os.remove('envcounts.npy')
        os.remove('sink_ids.npy')
        os.remove('source_ids.npy')
def gibbs_cli(table_fp, mapping_fp, output_dir, loo, jobs, alpha1, alpha2,
              beta, source_rarefaction_depth, sink_rarefaction_depth,
              restarts, draws_per_restart, burnin, delay,
              cluster_start_delay, per_sink_feature_assignments,
              sample_with_replacement, source_sink_column,
              source_column_value, sink_column_value, source_category_column):
    '''Gibb's sampler for Bayesian estimation of microbial sample sources.

    For details, see the project README file.
    '''
    # Create results directory. Click has already checked if it exists, and
    # failed if so.
    os.mkdir(output_dir)

    # Load the metadata file and feature table. The 'U' open mode used
    # previously was removed in Python 3.11; text mode already does
    # universal-newline translation, and the handle is closed on exit.
    with open(mapping_fp) as map_f:
        sample_metadata = parse_sample_metadata(map_f)
    feature_table = biom_to_df(load_table(table_fp))

    # Do high level check on feature data.
    feature_table = validate_gibbs_input(feature_table)

    # Remove samples not shared by both feature and metadata tables and order
    # rows equivalently.
    sample_metadata, feature_table = \
        intersect_and_sort_samples(sample_metadata, feature_table)

    # Identify source and sink samples.
    source_samples = get_samples(sample_metadata, source_sink_column,
                                 source_column_value)
    sink_samples = get_samples(sample_metadata, source_sink_column,
                               sink_column_value)

    # If we have no source samples neither normal operation or loo will work.
    # Will also likely get strange errors.
    if len(source_samples) == 0:
        raise ValueError(('You passed %s as the `source_sink_column` and %s '
                          'as the `source_column_value`. There are no samples '
                          'which are sources under these values. Please see '
                          'the help documentation and check your mapping '
                          'file.') % (source_sink_column, source_column_value))

    # Prepare the 'sources' matrix by collapsing the `source_samples` by their
    # metadata values.
    csources = collapse_source_data(sample_metadata, feature_table,
                                    source_samples, source_category_column,
                                    'mean')

    # Rarify collapsed source data if requested.
    if source_rarefaction_depth > 0:
        d = (csources.sum(1) >= source_rarefaction_depth)
        if not d.all():
            count_too_shallow = (~d).sum()
            shallowest = csources.sum(1).min()
            raise ValueError(('You requested rarefaction of source samples at '
                              '%s, but there are %s collapsed source samples '
                              'that have less sequences than that. The '
                              'shallowest of these is %s sequences.') %
                             (source_rarefaction_depth, count_too_shallow,
                              shallowest))
        else:
            csources = subsample_dataframe(csources, source_rarefaction_depth,
                                           replace=sample_with_replacement)

    # Prepare to rarify sink data if we are not doing LOO. If we are doing
    # loo, we skip the rarefaction, and set sinks to `None`.
    if not loo:
        sinks = feature_table.loc[sink_samples, :]
        if sink_rarefaction_depth > 0:
            d = (sinks.sum(1) >= sink_rarefaction_depth)
            if not d.all():
                count_too_shallow = (~d).sum()
                shallowest = sinks.sum(1).min()
                raise ValueError(('You requested rarefaction of sink samples '
                                  'at %s, but there are %s sink samples that '
                                  'have less sequences than that. The '
                                  'shallowest of these is %s sequences.') %
                                 (sink_rarefaction_depth, count_too_shallow,
                                  shallowest))
            else:
                sinks = subsample_dataframe(sinks, sink_rarefaction_depth,
                                            replace=sample_with_replacement)
    else:
        sinks = None

    # If we've been asked to do multiple jobs, we need to spin up a cluster.
    if jobs > 1:
        # Launch the ipcluster and wait for it to come up. NOTE(review):
        # shell=True with an interpolated string; jobs comes from a click
        # int option so injection risk is low, but an argument list with
        # shell=False would be safer.
        subprocess.Popen('ipcluster start -n %s --quiet' % jobs, shell=True)
        time.sleep(cluster_start_delay)
        cluster = Client()
    else:
        cluster = None

    # Run the computations; try/finally ensures the ipcluster we started is
    # shut down even if the sampler raises (previously it leaked on error).
    try:
        mpm, mps, fas = gibbs(csources, sinks, alpha1, alpha2, beta, restarts,
                              draws_per_restart, burnin, delay,
                              cluster=cluster,
                              create_feature_tables=per_sink_feature_assignments)
    finally:
        # If we started a cluster, shut it down.
        if cluster is not None:
            cluster.shutdown(hub=True)

    # Write results.
    mpm.to_csv(os.path.join(output_dir, 'mixing_proportions.txt'), sep='\t')
    mps.to_csv(os.path.join(output_dir, 'mixing_proportions_stds.txt'),
               sep='\t')
    if per_sink_feature_assignments:
        for sink, fa in zip(mpm.index, fas):
            fa.to_csv(os.path.join(output_dir, sink + '.feature_table.txt'),
                      sep='\t')

    # Plot contributions.
    fig, ax = plot_heatmap(mpm)
    fig.savefig(os.path.join(output_dir, 'mixing_proportions.pdf'), dpi=300)
def gibbs_cli(table_fp, mapping_fp, output_dir, loo, jobs, alpha1, alpha2,
              beta, source_rarefaction_depth, sink_rarefaction_depth,
              restarts, draws_per_restart, burnin, delay,
              cluster_start_delay, per_sink_feature_assignments,
              sample_with_replacement, source_sink_column,
              source_column_value, sink_column_value, source_category_column):
    '''Gibb's sampler for Bayesian estimation of microbial sample sources.

    For details, see the project README file.
    '''
    # Create results directory. Click has already checked if it exists, and
    # failed if so.
    os.mkdir(output_dir)

    # Load the metadata file and feature table. The 'U' open mode used
    # previously was removed in Python 3.11; text mode already does
    # universal-newline translation, and the handle is closed on exit.
    with open(mapping_fp) as map_f:
        sample_metadata = parse_sample_metadata(map_f)
    feature_table = biom_to_df(load_table(table_fp))

    # Do high level check on feature data.
    feature_table = validate_gibbs_input(feature_table)

    # Remove samples not shared by both feature and metadata tables and order
    # rows equivalently.
    sample_metadata, feature_table = \
        intersect_and_sort_samples(sample_metadata, feature_table)

    # Identify source and sink samples.
    source_samples = get_samples(sample_metadata, source_sink_column,
                                 source_column_value)
    sink_samples = get_samples(sample_metadata, source_sink_column,
                               sink_column_value)

    # If we have no source samples neither normal operation or loo will work.
    # Will also likely get strange errors.
    if len(source_samples) == 0:
        raise ValueError(('You passed %s as the `source_sink_column` and %s '
                          'as the `source_column_value`. There are no samples '
                          'which are sources under these values. Please see '
                          'the help documentation and check your mapping '
                          'file.') % (source_sink_column, source_column_value))

    # Prepare the 'sources' matrix by collapsing the `source_samples` by their
    # metadata values.
    csources = collapse_source_data(sample_metadata, feature_table,
                                    source_samples, source_category_column,
                                    'mean')

    # Rarify collapsed source data if requested.
    if source_rarefaction_depth > 0:
        d = (csources.sum(1) >= source_rarefaction_depth)
        if not d.all():
            count_too_shallow = (~d).sum()
            shallowest = csources.sum(1).min()
            raise ValueError(
                ('You requested rarefaction of source samples at '
                 '%s, but there are %s collapsed source samples '
                 'that have less sequences than that. The '
                 'shallowest of these is %s sequences.') %
                (source_rarefaction_depth, count_too_shallow, shallowest))
        else:
            csources = subsample_dataframe(csources, source_rarefaction_depth,
                                           replace=sample_with_replacement)

    # Prepare to rarify sink data if we are not doing LOO. If we are doing
    # loo, we skip the rarefaction, and set sinks to `None`.
    if not loo:
        sinks = feature_table.loc[sink_samples, :]
        if sink_rarefaction_depth > 0:
            d = (sinks.sum(1) >= sink_rarefaction_depth)
            if not d.all():
                count_too_shallow = (~d).sum()
                shallowest = sinks.sum(1).min()
                raise ValueError(
                    ('You requested rarefaction of sink samples '
                     'at %s, but there are %s sink samples that '
                     'have less sequences than that. The '
                     'shallowest of these is %s sequences.') %
                    (sink_rarefaction_depth, count_too_shallow, shallowest))
            else:
                sinks = subsample_dataframe(sinks, sink_rarefaction_depth,
                                            replace=sample_with_replacement)
    else:
        sinks = None

    # If we've been asked to do multiple jobs, we need to spin up a cluster.
    if jobs > 1:
        # Launch the ipcluster and wait for it to come up. NOTE(review):
        # shell=True with an interpolated string; jobs comes from a click
        # int option so injection risk is low, but an argument list with
        # shell=False would be safer.
        subprocess.Popen('ipcluster start -n %s --quiet' % jobs, shell=True)
        time.sleep(cluster_start_delay)
        cluster = Client()
    else:
        cluster = None

    # Run the computations; try/finally ensures the ipcluster we started is
    # shut down even if the sampler raises (previously it leaked on error).
    try:
        mpm, mps, fas = gibbs(csources, sinks, alpha1, alpha2, beta, restarts,
                              draws_per_restart, burnin, delay,
                              cluster=cluster,
                              create_feature_tables=per_sink_feature_assignments)
    finally:
        # If we started a cluster, shut it down.
        if cluster is not None:
            cluster.shutdown(hub=True)

    # Write results.
    mpm.to_csv(os.path.join(output_dir, 'mixing_proportions.txt'), sep='\t')
    mps.to_csv(os.path.join(output_dir, 'mixing_proportions_stds.txt'),
               sep='\t')
    if per_sink_feature_assignments:
        for sink, fa in zip(mpm.index, fas):
            fa.to_csv(os.path.join(output_dir, sink + '.feature_table.txt'),
                      sep='\t')

    # Plot contributions.
    fig, ax = plot_heatmap(mpm)
    fig.savefig(os.path.join(output_dir, 'mixing_proportions.pdf'), dpi=300)