Example #1
def test_parse_sample_metadata(self):
    map_f = io.StringIO("#SampleID\tCol1\tCol2\n01\ta\t1\n00\tb\t2\n")
    observed = parse_sample_metadata(map_f)
    expected = pd.DataFrame([['a', '1'], ['b', '2']],
                            index=pd.Index(['01', '00'], name='#SampleID'),
                            columns=['Col1', 'Col2'])
    pdt.assert_frame_equal(observed, expected)
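The assertions above pin down the parser's contract: tab-delimited input, the first column serving as the index (named '#SampleID'), and every value kept as a string so IDs like '01' retain their leading zeros. A minimal sketch of an implementation satisfying that contract (illustrative only; it assumes pandas is imported as pd):

import pandas as pd

def parse_sample_metadata(map_f):
    # Read the tab-delimited mapping file with every value as a string so
    # sample IDs such as '01' keep their leading zeros.
    df = pd.read_csv(map_f, sep='\t', dtype=str)
    # The first column (e.g. '#SampleID') becomes the sample ID index.
    return df.set_index(df.columns[0])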
Example #2
def gibbs(table_fp: str, mapping_fp: str, output_dir: str,
          loo: bool, jobs: int, alpha1: float, alpha2: float, beta: float,
          source_rarefaction_depth: int, sink_rarefaction_depth: int,
          restarts: int, draws_per_restart: int, burnin: int, delay: int,
          per_sink_feature_assignments: bool, sample_with_replacement: bool,
          source_sink_column: str, source_column_value: str,
          sink_column_value: str, source_category_column: str):
    '''Gibbs sampler for Bayesian estimation of microbial sample sources.

    For details, see the project README file.
    '''
    # Create results directory. Click has already checked if it exists, and
    # failed if so.
    os.mkdir(output_dir)

    # Load the metadata file and feature table.
    with open(mapping_fp) as map_f:
        sample_metadata = parse_sample_metadata(map_f)
    feature_table = biom_to_df(load_table(table_fp))

    # Run the Gibbs sampler via the helper function (also used by the
    # QIIME 2 plugin).
    results = gibbs_helper(feature_table, sample_metadata, loo, jobs, alpha1,
                           alpha2, beta, source_rarefaction_depth,
                           sink_rarefaction_depth, restarts, draws_per_restart,
                           burnin, delay, per_sink_feature_assignments,
                           sample_with_replacement, source_sink_column,
                           source_column_value, sink_column_value,
                           source_category_column)
    # Unpack the results; their shape depends on per_sink_feature_assignments.
    if len(results) == 3:
        mpm, mps, fas = results
        # write the feature tables from fas
        for sink, fa in zip(mpm.columns, fas):
            fa.to_csv(os.path.join(output_dir, sink + '.feature_table.txt'),
                      sep='\t')
    else:
        # Unpack the results without the per-sink feature tables.
        mpm, mps = results

    # Write results.
    mpm.to_csv(os.path.join(output_dir, 'mixing_proportions.txt'), sep='\t')
    mps.to_csv(os.path.join(output_dir, 'mixing_proportions_stds.txt'),
               sep='\t')

    # Plot contributions.
    fig, ax = plot_heatmap(mpm.T)
    fig.savefig(os.path.join(output_dir, 'mixing_proportions.pdf'), dpi=300)
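biom_to_df is not shown in these examples. A plausible sketch, assuming it merely orients the BIOM table as a dense samples-by-features DataFrame (an illustration, not necessarily the project's implementation):

import pandas as pd

def biom_to_df(table):
    # table.matrix_data is a scipy sparse matrix with shape
    # (observations, samples); transpose it to samples-by-features.
    return pd.DataFrame(table.matrix_data.T.toarray(),
                        index=table.ids('sample'),
                        columns=table.ids('observation'))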
Example #3
def gibbs_cli(table_fp, mapping_fp, output_dir, loo, jobs, alpha1, alpha2,
              beta, source_rarefaction_depth, sink_rarefaction_depth, restarts,
              draws_per_restart, burnin, delay, per_sink_feature_assignments,
              sample_with_replacement, source_sink_column, source_column_value,
              sink_column_value, source_category_column, diagnostics, limit):
    '''Gibbs sampler for Bayesian estimation of microbial sample sources.

    For details, see the project README file.
    '''
    # Create results directory. Click has already checked if it exists, and
    # failed if so.
    os.mkdir(output_dir)

    # Load the metadata file and feature table.
    with open(mapping_fp) as map_f:
        sample_metadata = parse_sample_metadata(map_f)
    feature_table = biom_to_df(load_table(table_fp))

    # Do a high-level check on the feature data.
    feature_table = validate_gibbs_input(feature_table)

    # Remove samples not shared by both feature and metadata tables and order
    # rows equivalently.
    sample_metadata, feature_table = \
        intersect_and_sort_samples(sample_metadata, feature_table)

    # Identify source and sink samples.
    source_samples = get_samples(sample_metadata, source_sink_column,
                                 source_column_value)
    sink_samples = get_samples(sample_metadata, source_sink_column,
                               sink_column_value)

    # Without source samples, neither normal operation nor LOO will work,
    # and the sampler is likely to fail with cryptic errors.
    if len(source_samples) == 0:
        raise ValueError(('You passed %s as the `source_sink_column` and %s '
                          'as the `source_column_value`. There are no samples '
                          'which are sources under these values. Please see '
                          'the help documentation and check your mapping '
                          'file.') % (source_sink_column, source_column_value))

    # Prepare the 'sources' matrix by collapsing the `source_samples` by their
    # metadata values.
    csources = collapse_source_data(sample_metadata, feature_table,
                                    source_samples, source_category_column,
                                    'mean')

    # Rarefy the collapsed source data if requested.
    if source_rarefaction_depth > 0:
        d = (csources.sum(1) >= source_rarefaction_depth)
        if not d.all():
            count_too_shallow = (~d).sum()
            shallowest = csources.sum(1).min()
            raise ValueError(
                ('You requested rarefaction of source samples at '
                 '%s, but there are %s collapsed source samples '
                 'that have fewer sequences than that. The '
                 'shallowest of these is %s sequences.') %
                (source_rarefaction_depth, count_too_shallow, shallowest))
        else:
            csources = subsample_dataframe(csources,
                                           source_rarefaction_depth,
                                           replace=sample_with_replacement)

    # Prepare to rarefy the sink data if we are not doing LOO. If we are,
    # skip rarefaction and set sinks to `None`.
    if not loo:
        sinks = feature_table.loc[sink_samples, :]
        if sink_rarefaction_depth > 0:
            d = (sinks.sum(1) >= sink_rarefaction_depth)
            if not d.all():
                count_too_shallow = (~d).sum()
                shallowest = sinks.sum(1).min()
                raise ValueError(
                    ('You requested rarefaction of sink samples '
                     'at %s, but there are %s sink samples that '
                     'have fewer sequences than that. The '
                     'shallowest of these is %s sequences.') %
                    (sink_rarefaction_depth, count_too_shallow, shallowest))
            else:
                sinks = subsample_dataframe(sinks,
                                            sink_rarefaction_depth,
                                            replace=sample_with_replacement)
    else:
        sinks = None

    # Run the computations.
    mpm, mps, fas = gibbs(csources,
                          sinks,
                          alpha1,
                          alpha2,
                          beta,
                          restarts,
                          draws_per_restart,
                          burnin,
                          delay,
                          jobs,
                          create_feature_tables=per_sink_feature_assignments)

    # Write results.
    mpm.to_csv(os.path.join(output_dir, 'mixing_proportions.txt'), sep='\t')
    mps.to_csv(os.path.join(output_dir, 'mixing_proportions_stds.txt'),
               sep='\t')
    if per_sink_feature_assignments:
        for sink, fa in zip(mpm.index, fas):
            fa.to_csv(os.path.join(output_dir, sink + '.feature_table.txt'),
                      sep='\t')

    # Plot contributions.
    fig, ax = plot_heatmap(mpm)
    fig.savefig(os.path.join(output_dir, 'mixing_proportions.pdf'), dpi=300)

    # Optionally write per-sink convergence diagnostics.
    if diagnostics:
        diagnostics_dir = os.path.join(output_dir, 'diagnostics')
        os.mkdir(diagnostics_dir)
        data = np.load('envcounts.npy')
        sink_ids = np.load('sink_ids.npy')
        source_ids = np.load('source_ids.npy')

        source_ids = np.append(source_ids, ['unknown'])
        df = pandas.DataFrame(source_ids)
        for sink_id, array in zip(sink_ids, data):
            differences = []

            for source_index, source in enumerate(source_ids):
                source_array = array[:, source_index]
                # Split the trace for this source into equal segments.
                split_array = np.array_split(source_array, draws_per_restart)
                plt.figure(figsize=(8, 6), dpi=300)
                plt.title(sink_id, fontsize=16)

                chain_averages = []
                for splits in split_array:
                    # Running mean of the segment, scaled by alpha1.
                    data_sum = np.cumsum(splits)
                    n_draws = np.size(data_sum)
                    vector = np.linspace(1, n_draws, n_draws)
                    rolling = np.true_divide(data_sum, vector)

                    scaled = rolling * alpha1
                    line_average = np.round(np.average(scaled), decimals=4)
                    chain_averages.append(line_average)
                    plt.plot(scaled, label=line_average)
                plt.legend()
                plt.ylabel(source, fontsize=16)

                # Flag sink/source pairs whose per-segment averages spread
                # by more than `limit`.
                absolutes = [abs(chain) for chain in chain_averages]
                difference = max(absolutes) - min(absolutes)
                differences.append(difference)

                if difference >= limit:
                    file_name = sink_id + '_' + source + '.png'
                    plt.savefig(os.path.join(diagnostics_dir, file_name))
                plt.close()

            df[sink_id] = differences

        df = df.rename(columns={0: ''})
        df.to_csv(os.path.join(diagnostics_dir, 'table.txt'), sep='\t',
                  index=False)

    # Clean up the temporary arrays written by the sampler.
    os.remove('envcounts.npy')
    os.remove('sink_ids.npy')
    os.remove('source_ids.npy')
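Two helpers used above, get_samples and intersect_and_sort_samples, are also not shown. Minimal sketches of what they might look like, under the assumption that both tables are indexed by sample ID (illustrative only):

def get_samples(sample_metadata, col, value):
    # Sample IDs whose metadata value in `col` equals `value`.
    return sample_metadata.index[sample_metadata[col] == value]

def intersect_and_sort_samples(sample_metadata, feature_table):
    # Keep only samples present in both tables and order both sets of
    # rows identically.
    shared = sample_metadata.index.intersection(feature_table.index)
    if len(shared) == 0:
        raise ValueError('The metadata and feature tables share no '
                         'sample IDs.')
    return sample_metadata.loc[shared], feature_table.loc[shared]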
Example #4
def gibbs_cli(table_fp, mapping_fp, output_dir, loo, jobs, alpha1, alpha2,
              beta, source_rarefaction_depth, sink_rarefaction_depth, restarts,
              draws_per_restart, burnin, delay, cluster_start_delay,
              per_sink_feature_assignments, sample_with_replacement,
              source_sink_column, source_column_value,
              sink_column_value, source_category_column):
    '''Gibbs sampler for Bayesian estimation of microbial sample sources.

    For details, see the project README file.
    '''
    # Create results directory. Click has already checked if it exists, and
    # failed if so.
    os.mkdir(output_dir)

    # Load the metadata file and feature table.
    with open(mapping_fp) as map_f:
        sample_metadata = parse_sample_metadata(map_f)
    feature_table = biom_to_df(load_table(table_fp))

    # Do a high-level check on the feature data.
    feature_table = validate_gibbs_input(feature_table)

    # Remove samples not shared by both feature and metadata tables and order
    # rows equivalently.
    sample_metadata, feature_table = \
        intersect_and_sort_samples(sample_metadata, feature_table)

    # Identify source and sink samples.
    source_samples = get_samples(sample_metadata, source_sink_column,
                                 source_column_value)
    sink_samples = get_samples(sample_metadata, source_sink_column,
                               sink_column_value)

    # Without source samples, neither normal operation nor LOO will work,
    # and the sampler is likely to fail with cryptic errors.
    if len(source_samples) == 0:
        raise ValueError(('You passed %s as the `source_sink_column` and %s '
                          'as the `source_column_value`. There are no samples '
                          'which are sources under these values. Please see '
                          'the help documentation and check your mapping '
                          'file.') % (source_sink_column, source_column_value))

    # Prepare the 'sources' matrix by collapsing the `source_samples` by their
    # metadata values.
    csources = collapse_source_data(sample_metadata, feature_table,
                                    source_samples, source_category_column,
                                    'mean')

    # Rarefy the collapsed source data if requested.
    if source_rarefaction_depth > 0:
        d = (csources.sum(1) >= source_rarefaction_depth)
        if not d.all():
            count_too_shallow = (~d).sum()
            shallowest = csources.sum(1).min()
            raise ValueError(('You requested rarefaction of source samples at '
                              '%s, but there are %s collapsed source samples '
                              'that have fewer sequences than that. The '
                              'shallowest of these is %s sequences.') %
                             (source_rarefaction_depth, count_too_shallow,
                              shallowest))
        else:
            csources = subsample_dataframe(csources, source_rarefaction_depth,
                                           replace=sample_with_replacement)

    # Prepare to rarefy the sink data if we are not doing LOO. If we are,
    # skip rarefaction and set sinks to `None`.
    if not loo:
        sinks = feature_table.loc[sink_samples, :]
        if sink_rarefaction_depth > 0:
            d = (sinks.sum(1) >= sink_rarefaction_depth)
            if not d.all():
                count_too_shallow = (~d).sum()
                shallowest = sinks.sum(1).min()
                raise ValueError(('You requested rarefaction of sink samples '
                                  'at %s, but there are %s sink samples that '
                                  'have fewer sequences than that. The '
                                  'shallowest of these is %s sequences.') %
                                 (sink_rarefaction_depth, count_too_shallow,
                                  shallowest))
            else:
                sinks = subsample_dataframe(sinks, sink_rarefaction_depth,
                                            replace=sample_with_replacement)
    else:
        sinks = None

    # If we've been asked to do multiple jobs, we need to spin up a cluster.
    if jobs > 1:
        # Launch the ipcluster and wait for it to come up.
        subprocess.Popen('ipcluster start -n %s --quiet' % jobs, shell=True)
        time.sleep(cluster_start_delay)
        cluster = Client()
    else:
        cluster = None

    # Run the computations.
    mpm, mps, fas = gibbs(csources, sinks, alpha1, alpha2, beta, restarts,
                          draws_per_restart, burnin, delay, cluster=cluster,
                          create_feature_tables=per_sink_feature_assignments)

    # If we started a cluster, shut it down.
    if jobs > 1:
        cluster.shutdown(hub=True)

    # Write results.
    mpm.to_csv(os.path.join(output_dir, 'mixing_proportions.txt'), sep='\t')
    mps.to_csv(os.path.join(output_dir, 'mixing_proportions_stds.txt'),
               sep='\t')
    if per_sink_feature_assignments:
        for sink, fa in zip(mpm.index, fas):
            fa.to_csv(os.path.join(output_dir, sink + '.feature_table.txt'),
                      sep='\t')

    # Plot contributions.
    fig, ax = plot_heatmap(mpm)
    fig.savefig(os.path.join(output_dir, 'mixing_proportions.pdf'), dpi=300)
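Both rarefaction branches rely on subsample_dataframe. A sketch of one way to rarefy row-wise, assuming integer counts and drawing reads in proportion to their abundance (illustrative, not the project's implementation):

import numpy as np
import pandas as pd

def subsample_dataframe(df, depth, replace=False):
    # Rarefy each row of a counts DataFrame to `depth` sequences by
    # drawing individual reads with or without replacement.
    def _rarefy(row):
        reads = np.repeat(np.arange(row.size), row.astype(int))
        picks = np.random.choice(reads, size=depth, replace=replace)
        return pd.Series(np.bincount(picks, minlength=row.size),
                         index=row.index)
    return df.apply(_rarefy, axis=1)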
Example #5
def gibbs_cli(table_fp, mapping_fp, output_dir, loo, jobs, alpha1, alpha2,
              beta, source_rarefaction_depth, sink_rarefaction_depth, restarts,
              draws_per_restart, burnin, delay, cluster_start_delay,
              per_sink_feature_assignments, sample_with_replacement,
              source_sink_column, source_column_value, sink_column_value,
              source_category_column):
    '''Gibbs sampler for Bayesian estimation of microbial sample sources.

    For details, see the project README file.
    '''
    # Create results directory. Click has already checked if it exists, and
    # failed if so.
    os.mkdir(output_dir)

    # Load the metadata file and feature table.
    with open(mapping_fp) as map_f:
        sample_metadata = parse_sample_metadata(map_f)
    feature_table = biom_to_df(load_table(table_fp))

    # Do a high-level check on the feature data.
    feature_table = validate_gibbs_input(feature_table)

    # Remove samples not shared by both feature and metadata tables and order
    # rows equivalently.
    sample_metadata, feature_table = \
        intersect_and_sort_samples(sample_metadata, feature_table)

    # Identify source and sink samples.
    source_samples = get_samples(sample_metadata, source_sink_column,
                                 source_column_value)
    sink_samples = get_samples(sample_metadata, source_sink_column,
                               sink_column_value)

    # Without source samples, neither normal operation nor LOO will work,
    # and the sampler is likely to fail with cryptic errors.
    if len(source_samples) == 0:
        raise ValueError(('You passed %s as the `source_sink_column` and %s '
                          'as the `source_column_value`. There are no samples '
                          'which are sources under these values. Please see '
                          'the help documentation and check your mapping '
                          'file.') % (source_sink_column, source_column_value))

    # Prepare the 'sources' matrix by collapsing the `source_samples` by their
    # metadata values.
    csources = collapse_source_data(sample_metadata, feature_table,
                                    source_samples, source_category_column,
                                    'mean')

    # Rarefy the collapsed source data if requested.
    if source_rarefaction_depth > 0:
        d = (csources.sum(1) >= source_rarefaction_depth)
        if not d.all():
            count_too_shallow = (~d).sum()
            shallowest = csources.sum(1).min()
            raise ValueError(
                ('You requested rarefaction of source samples at '
                 '%s, but there are %s collapsed source samples '
                 'that have fewer sequences than that. The '
                 'shallowest of these is %s sequences.') %
                (source_rarefaction_depth, count_too_shallow, shallowest))
        else:
            csources = subsample_dataframe(csources,
                                           source_rarefaction_depth,
                                           replace=sample_with_replacement)

    # Prepare to rarefy the sink data if we are not doing LOO. If we are,
    # skip rarefaction and set sinks to `None`.
    if not loo:
        sinks = feature_table.loc[sink_samples, :]
        if sink_rarefaction_depth > 0:
            d = (sinks.sum(1) >= sink_rarefaction_depth)
            if not d.all():
                count_too_shallow = (~d).sum()
                shallowest = sinks.sum(1).min()
                raise ValueError(
                    ('You requested rarefaction of sink samples '
                     'at %s, but there are %s sink samples that '
                     'have fewer sequences than that. The '
                     'shallowest of these is %s sequences.') %
                    (sink_rarefaction_depth, count_too_shallow, shallowest))
            else:
                sinks = subsample_dataframe(sinks,
                                            sink_rarefaction_depth,
                                            replace=sample_with_replacement)
    else:
        sinks = None

    # If we've been asked to do multiple jobs, we need to spin up a cluster.
    if jobs > 1:
        # Launch the ipcluster and wait for it to come up.
        subprocess.Popen('ipcluster start -n %s --quiet' % jobs, shell=True)
        time.sleep(cluster_start_delay)
        cluster = Client()
    else:
        cluster = None

    # Run the computations.
    mpm, mps, fas = gibbs(csources,
                          sinks,
                          alpha1,
                          alpha2,
                          beta,
                          restarts,
                          draws_per_restart,
                          burnin,
                          delay,
                          cluster=cluster,
                          create_feature_tables=per_sink_feature_assignments)

    # If we started a cluster, shut it down.
    if jobs > 1:
        cluster.shutdown(hub=True)

    # Write results.
    mpm.to_csv(os.path.join(output_dir, 'mixing_proportions.txt'), sep='\t')
    mps.to_csv(os.path.join(output_dir, 'mixing_proportions_stds.txt'),
               sep='\t')
    if per_sink_feature_assignments:
        for sink, fa in zip(mpm.index, fas):
            fa.to_csv(os.path.join(output_dir, sink + '.feature_table.txt'),
                      sep='\t')

    # Plot contributions.
    fig, ax = plot_heatmap(mpm)
    fig.savefig(os.path.join(output_dir, 'mixing_proportions.pdf'), dpi=300)
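Every example finishes by saving the figure returned by plot_heatmap. A minimal sketch consistent with that (fig, ax) interface (hypothetical; the project may style its heatmap differently):

import matplotlib.pyplot as plt

def plot_heatmap(mpm):
    # Draw mixing proportions as a heatmap and return (fig, ax) so the
    # caller can save the figure itself.
    fig, ax = plt.subplots(figsize=(8, 6))
    im = ax.imshow(mpm.values, aspect='auto', vmin=0, vmax=1)
    ax.set_xticks(range(mpm.shape[1]))
    ax.set_xticklabels(mpm.columns, rotation=90)
    ax.set_yticks(range(mpm.shape[0]))
    ax.set_yticklabels(mpm.index)
    fig.colorbar(im, ax=ax, label='Mixing proportion')
    fig.tight_layout()
    return fig, ax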