Example #1
    def test_validate_prefix(self):
        httpretty.register_uri(
            httpretty.POST,
            "https://test_server.com/qiita_db/jobs/job-id/step/")
        httpretty.register_uri(
            httpretty.GET,
            "https://test_server.com/qiita_db/prep_template/1/data",
            body='{"data": {"1.S1": {"orig_name": "S1"}, "1.S2": '
                 '{"orig_name": "S2"}, "1.S3": {"orig_name": "S3"}}}')

        fd, biom_fp = mkstemp(suffix=".biom")
        close(fd)
        data = np.asarray([[0, 0, 1], [1, 3, 42]])
        table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'])
        with biom_open(biom_fp, 'w') as f:
            table.to_hdf5(f, "Test")

        self._clean_up_files.append(biom_fp)

        self.parameters['files'] = '{"BIOM": ["%s"]}' % biom_fp

        obs_success, obs_ainfo, obs_error = validate(
            self.qclient, 'job-id', self.parameters, self.out_dir)
        exp_biom_fp = join(self.out_dir, basename(biom_fp))
        self._clean_up_files.append(exp_biom_fp)
        self.assertTrue(obs_success)
        self.assertEqual(obs_ainfo, [[None, 'BIOM', [exp_biom_fp, 'biom']]])
        self.assertEqual(obs_error, "")
        obs_t = load_table(exp_biom_fp)
        self.assertItemsEqual(obs_t.ids(), ["1.S1", "1.S2", "1.S3"])
Example #2
    def test_validate_prefix(self):
        httpretty.register_uri(
            httpretty.POST,
            "https://test_server.com/qiita_db/jobs/job-id/step/")
        httpretty.register_uri(
            httpretty.GET,
            "https://test_server.com/qiita_db/prep_template/1/data",
            body='{"data": {"1.S1": {"orig_name": "S1"}, "1.S2": '
            '{"orig_name": "S2"}, "1.S3": {"orig_name": "S3"}}}')

        fd, biom_fp = mkstemp(suffix=".biom")
        close(fd)
        data = np.asarray([[0, 0, 1], [1, 3, 42]])
        table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'])
        with biom_open(biom_fp, 'w') as f:
            table.to_hdf5(f, "Test")

        self._clean_up_files.append(biom_fp)

        self.parameters['files'] = '{"BIOM": ["%s"]}' % biom_fp

        obs_success, obs_ainfo, obs_error = validate(self.qclient, 'job-id',
                                                     self.parameters,
                                                     self.out_dir)
        exp_biom_fp = join(self.out_dir, basename(biom_fp))
        self._clean_up_files.append(exp_biom_fp)
        self.assertTrue(obs_success)
        self.assertEqual(obs_ainfo, [[None, 'BIOM', [exp_biom_fp, 'biom']]])
        self.assertEqual(obs_error, "")
        obs_t = load_table(exp_biom_fp)
        self.assertItemsEqual(obs_t.ids(), ["1.S1", "1.S2", "1.S3"])
Example #3
    def _create_job_and_biom(self, sample_ids, template=None, analysis=None):
        # Create the BIOM table that needs to be validated
        fd, biom_fp = mkstemp(suffix=".biom")
        close(fd)
        data = np.random.randint(100, size=(2, len(sample_ids)))
        table = Table(data, ['O1', 'O2'], sample_ids)
        with biom_open(biom_fp, 'w') as f:
            table.to_hdf5(f, "Test")
        self._clean_up_files.append(biom_fp)

        # Create a new job
        parameters = {
            'template': template,
            'files': dumps({'biom': [biom_fp]}),
            'artifact_type': 'BIOM',
            'analysis': analysis
        }
        data = {
            'command': dumps(['BIOM type', '2.1.4', 'Validate']),
            'parameters': dumps(parameters),
            'status': 'running'
        }
        res = self.qclient.post('/apitest/processing_job/', data=data)
        job_id = res['job']

        return biom_fp, job_id, parameters
Example #4
    def test_execute_job_error(self):
        # Create a prep template
        prep_info = {'SKB8.640193': {'col': 'val1'},
                     'SKD8.640184': {'col': 'val2'}}
        data = {'prep_info': dumps(prep_info),
                'study': 1,
                'data_type': '16S'}
        template = self.qclient.post(
            '/apitest/prep_template/', data=data)['prep']
        # Create a new validate job
        fd, biom_fp = mkstemp(suffix=".biom")
        close(fd)
        data = np.random.randint(100, size=(2, 2))
        table = Table(data, ['O1', 'O2'], ['S1', 'S2'])
        with biom_open(biom_fp, 'w') as f:
            table.to_hdf5(f, "Test")
        data = {'command': dumps(['BIOM type', '2.1.4', 'Validate']),
                'parameters': dumps(
                    {'files': dumps({'biom': [biom_fp]}),
                     'template': template,
                     'artifact_type': 'BIOM'}),
                'artifact_type': 'BIOM',
                'status': 'queued'}
        job_id = self.qclient.post(
            '/apitest/processing_job/', data=data)['job']

        plugin("https://localhost:21174", job_id, self.out_dir)
        obs = self._wait_job(job_id)
        self.assertEqual(obs, 'error')
Example #5
    def test_faith_pd_invalid_input(self):
        # tests are based on the skbio tests; checks for duplicate ids and
        # negative counts are not included here but should be incorporated

        # tree has duplicated tip ids
        tree = TreeNode.read(
            StringIO('((OTU1:0.1, OTU2:0.2):0.3, (OTU3:0.5, OTU4:0.7):1.1)'
                     'root;'))
        otu_ids = ['OTU%d' % i for i in range(1, 5)]
        u_counts = [1, 1, 0, 0]

        data = np.array([u_counts]).T

        bt = Table(data, otu_ids, ['u'])

        ta = os.path.join(gettempdir(), 'table.biom')
        tr = os.path.join(gettempdir(), 'tree.biom')

        self.files_to_delete.append(ta)
        self.files_to_delete.append(tr)

        with biom_open(ta, 'w') as fhdf5:
            bt.to_hdf5(fhdf5, 'Table for unit testing')
        tree.write(tr)

        self.assertRaises(IOError, faith_pd, 'dne.biom', tr)
        self.assertRaises(IOError, faith_pd, ta, 'dne.tre')
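The comment above notes that checks for duplicate ids and negative counts are not ported from the skbio tests. A minimal sketch of what such pre-flight checks on a biom Table could look like (the helper names are illustrative, not part of the original test suite):

# Hedged sketch: possible input checks before calling faith_pd.
# `has_duplicate_ids` and `has_negative_counts` are illustrative helpers.
import numpy as np
from biom import Table

def has_duplicate_ids(bt):
    obs = bt.ids(axis='observation')
    return len(set(obs)) != len(obs)

def has_negative_counts(bt):
    # matrix_data is a scipy sparse matrix; counts should never be negative
    return (bt.matrix_data < 0).nnz > 0

data = np.array([[1, 1, 0, 0]]).T
bt = Table(data, ['OTU1', 'OTU2', 'OTU3', 'OTU4'], ['u'])
assert not has_duplicate_ids(bt)
assert not has_negative_counts(bt)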
Example #6
def noisify(table_file, metadata_file, sigma, output_file):

    metadata = pd.read_table(metadata_file, index_col=0)
    table = load_table(table_file)
    table = pd.DataFrame(np.array(table.matrix_data.todense()).T,
                         index=table.ids(axis='sample'),
                         columns=table.ids(axis='observation'))
    cov = np.eye(table.shape[1] - 1)
    m_noise = compositional_noise(cov, nsamp=table.shape[0])
    table_ = table.values
    table_ = np.vstack(
        [perturb(table_[i, :], m_noise[i, :]) for i in range(table_.shape[0])])

    # note that this assumes that the column is named `library_size`
    table_ = pd.DataFrame(
        multinomial_sample(table_, depths=metadata['library_size']))
    table_.index = table.index
    table_.columns = list(table.columns)

    metadata['observed'] = np.sum(table_.sum(axis=0) > 0)
    metadata['unobserved'] = np.sum(table_.sum(axis=0) == 0)
    metadata.to_csv(metadata_file, sep='\t')

    # drop zeros -- they are not informative
    table_ = table_.loc[:, table_.sum(axis=0) > 0]
    t = Table(table_.T.values, table_.columns.values, table_.index.values)
    with biom_open(output_file, 'w') as f:
        t.to_hdf5(f, generated_by='moi')
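noisify relies on the common biom.Table <-> pandas.DataFrame round-trip: densify to samples x features for the perturbation, then transpose back when rebuilding the Table. A minimal sketch of just that round-trip, with illustrative names, is:

import numpy as np
import pandas as pd
from biom import Table

# biom stores data as observations x samples, so transpose to samples x features
t = Table(np.array([[1, 2], [3, 4]]), ['O1', 'O2'], ['S1', 'S2'])
df = pd.DataFrame(np.array(t.matrix_data.todense()).T,
                  index=t.ids(axis='sample'),
                  columns=t.ids(axis='observation'))

# ... modify df here ...

# transpose again so rows are observations before rebuilding the Table
t_out = Table(df.T.values, df.columns.values, df.index.values)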
Example #7
def deposit(table, groups, truth, output_table, output_groups, output_truth):
    t = Table(table.T.values, table.columns.values, table.index.values)
    with biom_open(output_table, 'w') as f:
        t.to_hdf5(f, generated_by='moi')
    groups.to_csv(output_groups, sep='\t')
    with open(output_truth, 'w') as f:
        f.write(','.join(truth))
Example #8
def deposit_blocktable(output_dir, abs_table, rel_table, metadata, truth, sample_id):
    choice = 'abcdefghijklmnopqrstuvwxyz'
    output_abstable = "%s/abs_table.%s.biom" % (
        output_dir, sample_id)
    output_reltable = "%s/rel_table.%s.biom" % (
        output_dir, sample_id)
    output_metadata = "%s/metadata.%s.txt" % (
        output_dir, sample_id)
    output_truth = "%s/truth.%s.txt" % (
        output_dir, sample_id)

    abs_t = Table(abs_table.T.values,
                  abs_table.columns.values,
                  abs_table.index.values)
    with biom_open(output_abstable, 'w') as f:
        abs_t.to_hdf5(f, generated_by='moi')

    rel_t = Table(rel_table.T.values,
                  rel_table.columns.values,
                  rel_table.index.values)
    with biom_open(output_reltable, 'w') as f:
        rel_t.to_hdf5(f, generated_by='moi')

    metadata.to_csv(output_metadata, sep='\t')
    truth.to_csv(output_truth, sep='\t')
Example #9
def hashing(unhashed_otu_table_list, unhashed_rep_seqs_list,
            sample_metadata_list):
    otu_df_list = []
    rep_seq_ids = set()
    seqs = []
    # Create OTU table
    for unhashed_otu_table in unhashed_otu_table_list:
        otu_df_list.append(hash_otu_table(unhashed_otu_table))
    otu_df = pd.concat(otu_df_list, join="outer", axis=1)
    otu_df.fillna(0.0, inplace=True)
    otu_table = Table(otu_df.values, list(otu_df.index), list(otu_df.columns))
    # Create rep seqs
    for unhashed_rep_seqs in unhashed_rep_seqs_list:
        seqs.extend(hash_rep_seqs(unhashed_rep_seqs, rep_seq_ids))
    otu_table_ids = set(otu_df.index)
    assert otu_table_ids == rep_seq_ids
    assert len(otu_df.index) == len(rep_seq_ids)
    # Merge sample metadata
    sample_metadata = pd.concat(
        [pd.read_csv(s, sep="\t") for s in sample_metadata_list])
    # Write files
    sample_metadata.to_csv("sample_metadata.tsv", sep="\t", index=False)
    with biom_open("otu_table.biom", "w") as fid:
        otu_table.to_hdf5(fid,
                          "Constructed by micone in dada2/deblur pipeline")
    with open("rep_seqs.fasta", "w") as fid:
        fasta_writer = FastaIO.FastaWriter(fid, wrap=None)
        fasta_writer.write_file(seqs)
Example #10
    def setUp(self):
        # Register the URIs for the QiitaClient
        httpretty.register_uri(
            httpretty.POST,
            "https://test_server.com/qiita_db/authenticate/",
            body='{"access_token": "token", "token_type": "Bearer", '
            '"expires_in": "3600"}')

        self.qclient = QiitaClient('https://test_server.com', 'client_id',
                                   'client_secret')
        # Create a biom table
        fd, self.biom_fp = mkstemp(suffix=".biom")
        close(fd)
        data = np.asarray([[0, 0, 1], [1, 3, 42]])
        table = Table(data, ['O1', 'O2'], ['1.S1', '1.S2', '1.S3'])
        with biom_open(self.biom_fp, 'w') as f:
            table.to_hdf5(f, "Test")
        self.out_dir = mkdtemp()
        self.parameters = {
            'template': 1,
            'files': '{"BIOM": ["%s"]}' % self.biom_fp,
            'artifact_type': 'BIOM'
        }

        self._clean_up_files = [self.biom_fp, self.out_dir]
Example #11
def hash_otu_table(unhashed_otu_table, seqid_hash_dict, output_file):
    table = load_table(unhashed_otu_table)
    df = table.to_dataframe(dense=True)
    seq_ids = [seqid_hash_dict[i] for i in df.index]
    df.index = seq_ids
    new_table = Table(df.values, list(df.index), list(df.columns))
    with biom_open(output_file, "w") as fid:
        new_table.to_hdf5(fid, "Constructed using qiime1 clustering")
Example #12
def deposit_biofilm(table1, table2, metadata, U, V, edges, it, rep,
                    output_dir):
    """ Writes down tables, metadata and feature metadata into files.

    Parameters
    ----------
    table1 : biom.Table
        Biom table of microbes
    table2 : biom.Table
        Biom table of metabolites
    metadata : pd.DataFrame
        Dataframe of sample metadata
    U : np.array
        Microbial latent variables
    V : np.array
        Metabolite latent variables
    edges : array-like
        Ground truth edges for benchmarking
    it : int
        iteration number
    rep : int
        repetition number
    output_dir : str
        output directory
    """
    choice = 'abcdefghijklmnopqrstuvwxyz'
    output_microbes = "%s/table_microbes.%d_%s.biom" % (output_dir, it,
                                                        choice[rep])
    output_metabolites = "%s/table_metabolites.%d_%s.biom" % (output_dir, it,
                                                              choice[rep])
    output_md = "%s/metadata.%d_%s.txt" % (output_dir, it, choice[rep])
    output_U = "%s/U.%d_%s.txt" % (output_dir, it, choice[rep])
    output_V = "%s/V.%d_%s.txt" % (output_dir, it, choice[rep])
    output_B = "%s/edges.%d_%s.txt" % (output_dir, it, choice[rep])
    output_ranks = "%s/ranks.%d_%s.txt" % (output_dir, it, choice[rep])

    idx1 = table1.sum(axis=0) > 0
    idx2 = table2.sum(axis=0) > 0
    table1 = table1.loc[:, idx1]
    table2 = table2.loc[:, idx2]

    table1 = Table(table1.values.T, table1.columns, table1.index)
    table2 = Table(table2.values.T, table2.columns, table2.index)

    with biom_open(output_microbes, 'w') as f:
        table1.to_hdf5(f, generated_by='moi1')
    with biom_open(output_metabolites, 'w') as f:
        table2.to_hdf5(f, generated_by='moi2')

    ranks = (U @ V)

    ranks = ranks[idx1, :]
    ranks = ranks[:, idx2]
    ranks = pd.DataFrame(ranks,
                         index=table1.ids(axis='observation'),
                         columns=table2.ids(axis='observation'))
    ranks.to_csv(output_ranks, sep='\t')
    metadata.to_csv(output_md, sep='\t', index_label='#SampleID')

    # subset the ground-truth edges to the retained microbes
    # (assumes `edges` is a 2-D array whose columns match table1's features)
    edges = np.asarray(edges)[:, idx1]

    np.savetxt(output_U, U)
    np.savetxt(output_V, V)
    np.savetxt(output_B, edges)
Example #13
def hash_otu_table(unhashed_otu_table, output_file):
    table = load_table(unhashed_otu_table)
    df = table.to_dataframe(dense=True)
    seq_ids = list(map(hash_function, df.index))
    df.index = seq_ids
    new_table = Table(df.values, list(df.index), list(df.columns))
    with biom_open(output_file, "w") as fid:
        new_table.to_hdf5(fid, "Constructed by micone in dada2 pipeline")
    return seq_ids
Example #14
def write_biom(table: biom.Table, fp: str):
    """Write a BIOM table to file.

    Parameters
    ----------
    table : biom.Table
        BIOM table to write.
    fp : str
        Output filepath.
    """
    with biom.util.biom_open(fp, 'w') as f:
        table.to_hdf5(f, table.generated_by)
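A hedged round-trip check for write_biom (assuming the function defined above is in scope; the file name and generated_by string are illustrative):

import numpy as np
from biom import Table, load_table

table = Table(np.array([[1, 2], [3, 4]]), ['O1', 'O2'], ['S1', 'S2'],
              generated_by='example')
write_biom(table, 'example.biom')

# reading the file back should preserve the sample ids
roundtrip = load_table('example.biom')
assert sorted(roundtrip.ids()) == ['S1', 'S2']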
Example #15
def deposit_biofilms(output_dir, abs_table1, abs_table2, rel_table1,
                     rel_table2, edges, metadata, sample_id):
    """ Writes down tables and edges into files.

    Parameters
    ----------
    output_dir : str
        output directory
    rel_table1 : biom.Table
        Biom table of relative abundances
    rel_table2 : biom.Table
        Biom table of relative abundances
    abs_table1 : biom.Table
        Biom table of absolute abundances
    abs_table2 : biom.Table
        Biom table of absolute abundances
    edges : list
        Edge list for ground truthing.
    metadata : pd.DataFrame
        Dataframe of sample metadata
    sample_id : str
        sample id
    """
    output_abs_microbes = "%s/table.abs.microbes.%s.biom" % (output_dir,
                                                             sample_id)
    output_abs_metabolites = "%s/table.abs.metabolites.%s.biom" % (output_dir,
                                                                   sample_id)
    output_rel_microbes = "%s/table.rel.microbes.%s.biom" % (output_dir,
                                                             sample_id)
    output_rel_metabolites = "%s/table.rel.metabolites.%s.biom" % (output_dir,
                                                                   sample_id)
    output_md = "%s/metadata.%s.txt" % (output_dir, sample_id)
    output_U = "%s/U.%s.txt" % (output_dir, sample_id)
    output_V = "%s/V.%s.txt" % (output_dir, sample_id)
    output_edges = "%s/edges.%s.txt" % (output_dir, sample_id)
    output_ranks = "%s/ranks.%s.txt" % (output_dir, sample_id)

    # idx1 = table1.sum(axis=0) > 0
    # idx2 = table2.sum(axis=0) > 0
    # table1 = table1.loc[:, idx1]
    # table2 = table2.loc[:, idx2]

    # relative abundances
    table1 = Table(rel_table1.values.T, rel_table1.columns, rel_table1.index)
    table2 = Table(rel_table2.values.T, rel_table2.columns, rel_table2.index)
    with biom_open(output_rel_microbes, 'w') as f:
        table1.to_hdf5(f, generated_by='moi1')
    with biom_open(output_rel_metabolites, 'w') as f:
        table2.to_hdf5(f, generated_by='moi2')

    # absolute abundances
    table1 = Table(abs_table1.values.T, abs_table1.columns, abs_table1.index)
    table2 = Table(abs_table2.values.T, abs_table2.columns, abs_table2.index)
    with biom_open(output_abs_microbes, 'w') as f:
        table1.to_hdf5(f, generated_by='moi1')
    with biom_open(output_abs_metabolites, 'w') as f:
        table2.to_hdf5(f, generated_by='moi2')

    pd.DataFrame(edges).to_csv(output_edges, sep='\t')
    metadata.to_csv(output_md, sep='\t')
Example #16
    def write_table_tree(self, u_counts, otu_ids, sample_ids, tree):
        data = np.array([u_counts]).T

        bt = Table(data, otu_ids, sample_ids)

        ta = os.path.join(gettempdir(), 'table.biom')
        tr = os.path.join(gettempdir(), 'tree.biom')

        self.files_to_delete.append(ta)
        self.files_to_delete.append(tr)

        with biom_open(ta, 'w') as fhdf5:
            bt.to_hdf5(fhdf5, 'Table for unit testing')
        tree.write(tr)

        return ta, tr
Example #17
    def _work(self, u_counts, v_counts, otu_ids, tree, method):
        data = np.array([u_counts, v_counts]).T

        bt = Table(data, otu_ids, ['u', 'v'])

        ta = os.path.join(gettempdir(), 'table.biom')
        tr = os.path.join(gettempdir(), 'tree.biom')

        self.files_to_delete.append(ta)
        self.files_to_delete.append(tr)

        with biom_open(ta, 'w') as fhdf5:
            bt.to_hdf5(fhdf5, 'Table for unit testing')
        tree.write(tr)

        # return value is a distance matrix, get the distance from u->v
        return ssu(ta, tr, method, False, 1.0, False, 1)['u', 'v']
Example #18
def write_biom(table: biom.Table, fp: str):
    """Write a BIOM table to file.

    Parameters
    ----------
    table : biom.Table
        BIOM table to write.
    fp : str
        Output filepath.

    Notes
    -----
    The `generated_by` attribute of the output BIOM table will be like
    "woltka-version".
    """
    with biom.util.biom_open(fp, 'w') as f:
        table.to_hdf5(f, f'{__name__}-{__version__}')
Example #19
def write_outputs(
        o_biom_file: str,
        o_metadata_file: str,
        biom_updated: biom.Table,
        metadata_edit_best: pd.DataFrame,
        dim: bool = False) -> None:
    """
    Write the metadata and the biom table outputs.

    Parameters
    ----------
    o_metadata_file : str
        Path to the output metadata table file.
    o_biom_file : str
        Path to the output biom table file.
    biom_updated : biom.Table
        The biom table without ambiguous samples,
        with a minimum number of reads per sample,
        without duplicated samples per host, and
        with samples renamed as per the AGP system.
    metadata_edit_best : pd.DataFrame
        Corresponding metadata table.
    dim : bool
        Whether to add the number of samples in the
        final biom file name before extension or not.
    """
    if dim:
        o_metadata_file, o_biom_file = get_outputs(
            metadata_edit_best, o_metadata_file, o_biom_file)

    if biom_updated.shape[0]:
        print('Outputs:')
        if o_metadata_file[0] == '/':
            if not isdir(dirname(o_metadata_file)):
                os.makedirs(dirname(o_metadata_file))
        metadata_edit_best.to_csv(o_metadata_file, index=False, sep='\t')
        print(o_metadata_file)

        if not isdir(dirname(o_biom_file)):
            os.makedirs(dirname(o_biom_file))
        with biom_open(o_biom_file, 'w') as f:
            biom_updated.to_hdf5(f, 'Xrbfetch')
        print(o_biom_file)
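The `dim` flag above adds the number of samples to the output file names before the extension via get_outputs, which is not shown in this excerpt. A hedged sketch of what that renaming could look like (this helper is hypothetical, not the real get_outputs):

# Hypothetical sketch of inserting a sample count before the file extension.
from os.path import splitext

def add_dim_to_path(fp, n_samples):
    base, ext = splitext(fp)
    return "%s_%dsamples%s" % (base, n_samples, ext)

# e.g. add_dim_to_path('table.biom', 128) -> 'table_128samples.biom'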
Example #20
    def test_execute_job_error(self):
        # Create a prep template
        prep_info = {
            'SKB8.640193': {
                'col': 'val1'
            },
            'SKD8.640184': {
                'col': 'val2'
            }
        }
        data = {'prep_info': dumps(prep_info), 'study': 1, 'data_type': '16S'}
        template = self.qclient.post('/apitest/prep_template/',
                                     data=data)['prep']
        # Create a new validate job
        fd, biom_fp = mkstemp(suffix=".biom")
        close(fd)
        data = np.random.randint(100, size=(2, 2))
        table = Table(data, ['O1', 'O2'], ['S1', 'S2'])
        with biom_open(biom_fp, 'w') as f:
            table.to_hdf5(f, "Test")
        data = {'command': dumps(['BIOM type', '2.1.4 - Qiime2', 'Validate']),
                'parameters': dumps(
                    {'files': dumps({'biom': [biom_fp]}),
                     'template': template,
                     'artifact_type': 'BIOM'}),
                'artifact_type': 'BIOM',
                'status': 'queued'}
        job_id = self.qclient.post('/apitest/processing_job/',
                                   data=data)['job']

        plugin("https://localhost:8383", job_id, self.out_dir)
        obs = self._wait_job(job_id)

        self.assertEqual(obs, 'error')
Example #21
def main(args):
    os.mkdir(args.output_dir)
    np.random.seed(args.seed)
    sims = multinomial_bioms(
        k=args.latent_dim, D=args.input_dim,
        N=args.samples, M=args.depth)
    Y = sims['Y']
    parts = Y.shape[0] // 10
    samp_ids = list(map(str, range(Y.shape[0])))
    obs_ids = list(map(str, range(Y.shape[1])))
    train = Table(Y[:parts * 8].T, obs_ids, samp_ids[:parts * 8])
    test = Table(Y[parts * 8 : parts * 9].T,
                 obs_ids, samp_ids[parts * 8 : parts * 9])
    valid = Table(Y[parts * 9:].T, obs_ids, samp_ids[parts * 9:])
    output_dir = args.output_dir
    with biom_open(f'{output_dir}/train.biom', 'w') as f:
        train.to_hdf5(f, 'train')
    with biom_open(f'{output_dir}/test.biom', 'w') as f:
        test.to_hdf5(f, 'test')
    with biom_open(f'{output_dir}/valid.biom', 'w') as f:
        valid.to_hdf5(f, 'valid')
    tree = sims['tree']
    tree.write(f'{output_dir}/basis.nwk')
    np.savetxt(f'{output_dir}/eigvals.txt', sims['eigs'])
    np.savetxt(f'{output_dir}/eigvecs.txt', sims['eigvectors'])
    np.savetxt(f'{output_dir}/W.txt', sims['W'])
Example #22
    def setUp(self):
        np.random.seed(0)
        torch.manual_seed(0)
        self.k, self.D, self.N, self.M, self.C = 10, 50, 500, 100000, 3
        self.sims = multinomial_batch_bioms(k=self.k,
                                            D=self.D,
                                            N=self.N,
                                            M=self.M,
                                            C=self.C)
        Y = self.sims['Y']
        parts = Y.shape[0] // 10
        samp_ids = list(map(str, range(Y.shape[0])))
        obs_ids = list(map(str, range(Y.shape[1])))
        train = Table(Y[:parts * 8].T, obs_ids, samp_ids[:parts * 8])
        test = Table(Y[parts * 8:parts * 9].T, obs_ids,
                     samp_ids[parts * 8:parts * 9])
        valid = Table(Y[parts * 9:].T, obs_ids, samp_ids[parts * 9:])
        with biom_open('train.biom', 'w') as f:
            train.to_hdf5(f, 'train')
        with biom_open('test.biom', 'w') as f:
            test.to_hdf5(f, 'test')
        with biom_open('valid.biom', 'w') as f:
            valid.to_hdf5(f, 'valid')

        md = pd.DataFrame({'batch_category': self.sims['batch_idx']},
                          index=samp_ids)
        md.index.name = 'sampleid'
        md.to_csv('metadata.txt', sep='\t')
        batch_priors = pd.Series(self.sims['alphaILR'])
        batch_priors.to_csv('batch_priors.txt', sep='\t')
        self.sims['tree'].write('basis.nwk')
Example #23
    def _create_job_and_biom(self, sample_ids, template=None, analysis=None):
        # Create the BIOM table that needs to be validated
        fd, biom_fp = mkstemp(suffix=".biom")
        close(fd)
        data = np.random.randint(100, size=(2, len(sample_ids)))
        table = Table(data, ['O1', 'O2'], sample_ids)
        with biom_open(biom_fp, 'w') as f:
            table.to_hdf5(f, "Test")
        self._clean_up_files.append(biom_fp)

        # Create a new job
        parameters = {'template': template,
                      'files': dumps({'biom': [biom_fp]}),
                      'artifact_type': 'BIOM',
                      'analysis': analysis}
        data = {'command': dumps(['BIOM type', '2.1.4', 'Validate']),
                'parameters': dumps(parameters),
                'status': 'running'}
        res = self.qclient.post('/apitest/processing_job/', data=data)
        job_id = res['job']

        return biom_fp, job_id, parameters
Example #24
    def setUp(self):
        # Register the URIs for the QiitaClient
        httpretty.register_uri(
            httpretty.POST,
            "https://test_server.com/qiita_db/authenticate/",
            body='{"access_token": "token", "token_type": "Bearer", '
                 '"expires_in": "3600"}')

        self.qclient = QiitaClient('https://test_server.com', 'client_id',
                                   'client_secret')
        # Create a biom table
        fd, self.biom_fp = mkstemp(suffix=".biom")
        close(fd)
        data = np.asarray([[0, 0, 1], [1, 3, 42]])
        table = Table(data, ['O1', 'O2'], ['1.S1', '1.S2', '1.S3'])
        with biom_open(self.biom_fp, 'w') as f:
            table.to_hdf5(f, "Test")
        self.out_dir = mkdtemp()
        self.artifact_id = 4
        self.parameters = {'input_data': self.artifact_id}

        self._clean_up_files = [self.biom_fp, self.out_dir]
Example #25
    def test__qiime2_rclr(self):
        """Tests q2-rclr matches standalone rclr."""

        # make mock table to write
        samps_ids = ['s%i' % i for i in range(self.cdata.shape[0])]
        feats_ids = ['f%i' % i for i in range(self.cdata.shape[1])]
        table_test = Table(self.cdata.T, feats_ids, samps_ids)
        # write table
        in_ = get_data_path('test.biom', subfolder='data')
        out_path = os_path_sep.join(in_.split(os_path_sep)[:-1])
        test_path = os.path.join(out_path, 'rclr-test.biom')
        with biom_open(test_path, 'w') as wf:
            table_test.to_hdf5(wf, "test")
        # run standalone
        runner = CliRunner()
        result = runner.invoke(sdc.commands['rclr'],
                               ['--in-biom', test_path,
                                '--output-dir', out_path])
        out_table = get_data_path('rclr-table.biom',
                                  subfolder='data')
        res_table = load_table(out_table)
        standalone_mat = res_table.matrix_data.toarray().T
        # check that exit code was 0 (indicating success)
        try:
            self.assertEqual(0, result.exit_code)
        except AssertionError:
            ex = result.exception
            error = Exception('Command failed with non-zero exit code')
            raise error.with_traceback(ex.__traceback__)
        # run QIIME2
        q2_table_test = Artifact.import_data("FeatureTable[Frequency]",
                                             table_test)
        q2_res = rclr_transformation(q2_table_test).rclr_table.view(Table)
        q2_res_mat = q2_res.matrix_data.toarray().T
        # check same and check both correct
        npt.assert_allclose(standalone_mat, q2_res_mat)
        npt.assert_allclose(standalone_mat, self.true)
        npt.assert_allclose(q2_res_mat, self.true)
Example #26
 def test_standalone_rclr(self):
     """Test the standalone rlcr."""
     # make mock table to write
     samps_ids = ['s%i' % i for i in range(self.cdata.shape[0])]
     feats_ids = ['f%i' % i for i in range(self.cdata.shape[1])]
     table_test = Table(self.cdata.T, feats_ids, samps_ids)
     # write table
     in_ = get_data_path('test.biom', subfolder='rpca_data')
     out_path = os_path_sep.join(in_.split(os_path_sep)[:-1])
     test_path = os.path.join(out_path, 'rclr-test.biom')
     with biom_open(test_path, 'w') as wf:
         table_test.to_hdf5(wf, "test")
     runner = CliRunner()
     result = runner.invoke(sdc.commands['rclr'],
                            ['--in-biom', test_path,
                             '--output-dir', out_path])
     out_table = get_data_path('rclr-table.biom',
                               subfolder='rpca_data')
     res_table = load_table(out_table)
     test_cmat = res_table.matrix_data.toarray().T
     npt.assert_allclose(test_cmat, self.true)
     # Lastly, check that exit code was 0 (indicating success)
     CliTestCase().assertExitCode(0, result)
Example #27
 def setUp(self):
     np.random.seed(1)
     torch.manual_seed(1)
     self.k, self.D, self.N, self.M = 10, 50, 500, 100000
     self.sims = multinomial_bioms(k=self.k, D=self.D, N=self.N, M=self.M)
     Y = self.sims['Y']
     parts = Y.shape[0] // 10
     samp_ids = list(map(str, range(Y.shape[0])))
     obs_ids = list(map(str, range(Y.shape[1])))
     train = Table(Y[:parts * 8].T, obs_ids, samp_ids[:parts * 8])
     test = Table(Y[parts * 8:parts * 9].T, obs_ids,
                  samp_ids[parts * 8:parts * 9])
     valid = Table(Y[parts * 9:].T, obs_ids, samp_ids[parts * 9:])
     with biom_open('train.biom', 'w') as f:
         train.to_hdf5(f, 'train')
     with biom_open('test.biom', 'w') as f:
         test.to_hdf5(f, 'test')
     with biom_open('valid.biom', 'w') as f:
         valid.to_hdf5(f, 'valid')
     self.sims['tree'].write('basis.nwk')
Example #28
def save_bioms(args, sims):
    output_dir = args.output_dir
    Y = sims['Y']
    parts = Y.shape[0] // 10
    samp_ids = list(map(str, range(Y.shape[0])))
    obs_ids = list(map(str, range(Y.shape[1])))
    train = Table(Y[:parts * 8].T, obs_ids, samp_ids[:parts * 8])
    test = Table(Y[parts * 8:parts * 9].T, obs_ids,
                 samp_ids[parts * 8:parts * 9])
    valid = Table(Y[parts * 9:].T, obs_ids, samp_ids[parts * 9:])
    with biom_open(f'{output_dir}/train.biom', 'w') as f:
        train.to_hdf5(f, 'train')
    with biom_open(f'{output_dir}/test.biom', 'w') as f:
        test.to_hdf5(f, 'test')
    with biom_open(f'{output_dir}/valid.biom', 'w') as f:
        valid.to_hdf5(f, 'valid')
    tree = sims['tree']
    tree.write(f'{output_dir}/basis.nwk')
    np.savetxt(f'{output_dir}/eigvals.txt', sims['eigs'])
    np.savetxt(f'{output_dir}/eigvecs.txt', sims['eigvectors'])
    np.savetxt(f'{output_dir}/W.txt', sims['W'])
Example #29
def deposit(output_dir, table1, table2, metadata, U, V, B, it, rep):
    """ Writes down tables, metadata and feature metadata into files.

    Parameters
    ----------
    output_dir : str
        output directory
    table1 : biom.Table
        Biom table
    table2 : biom.Table
        Biom table
    metadata : pd.DataFrame
        Dataframe of sample metadata
    U : np.array
        Microbial latent variables
    V : np.array
        Metabolite latent variables
    B : np.array
        Ground truth edge matrix.
    it : int
        iteration number
    rep : int
        repetition number
    """
    choice = 'abcdefghijklmnopqrstuvwxyz'
    output_microbes = "%s/table_microbes.%d_%s.biom" % (
        output_dir, it, choice[rep])
    output_metabolites = "%s/table_metabolites.%d_%s.biom" % (
        output_dir, it, choice[rep])
    output_md = "%s/metadata.%d_%s.txt" % (
        output_dir, it, choice[rep])
    output_U = "%s/U.%d_%s.txt" % (
        output_dir, it, choice[rep])
    output_V = "%s/V.%d_%s.txt" % (
        output_dir, it, choice[rep])
    output_B = "%s/B.%d_%s.txt" % (
        output_dir, it, choice[rep])
    output_ranks = "%s/ranks.%d_%s.txt" % (
        output_dir, it, choice[rep])

    idx1 = table1.sum(axis=0) > 0
    idx2 = table2.sum(axis=0) > 0
    table1 = table1.loc[:, idx1]
    table2 = table2.loc[:, idx2]

    table1 = Table(table1.values.T, table1.columns, table1.index)
    table2 = Table(table2.values.T, table2.columns, table2.index)

    with biom_open(output_microbes, 'w') as f:
        table1.to_hdf5(f, generated_by='moi1')
    with biom_open(output_metabolites, 'w') as f:
        table2.to_hdf5(f, generated_by='moi2')

    ranks = clr(softmax(np.hstack(
        (np.zeros((U.shape[0], 1)), U @ V))))
    ranks = ranks[idx1, :]
    ranks = ranks[:, idx2]
    ranks = pd.DataFrame(
        ranks, index=table1.ids(axis='observation'),
        columns=table2.ids(axis='observation'))
    ranks.to_csv(output_ranks, sep='\t')
    metadata.to_csv(output_md, sep='\t', index_label='#SampleID')

    np.savetxt(output_B, B)
    np.savetxt(output_U, U)
    np.savetxt(output_V, V)
Example #30
def make_biom(seq_table, output_file):
    new_table = Table(seq_table.values, list(seq_table.index),
                      list(seq_table.columns))
    with biom_open(output_file, "w") as fid:
        new_table.to_hdf5(fid, "Constructucted by micone in dada2 pipeline")
Example #31
def custom_tree_pipeline(
        table: biom.Table,
        tree: skbio.TreeNode,
        threads: int = 1,
        hsp_method: str = "mp",
        max_nsti: float = 2.0) -> (biom.Table, biom.Table, biom.Table):

    # Run pipeline in temporary directory so that files are not saved locally.
    with TemporaryDirectory() as temp_dir:

        # Need to write out BIOM table and newick tree to be used in pipeline.

        # Write out biom table:
        biom_infile = path.join(temp_dir, "intable.biom")
        with biom.util.biom_open(biom_infile, 'w') as out_biom:
            table.to_hdf5(h5grp=out_biom,
                          generated_by="PICRUSt2 QIIME2 Plugin")

        # Write out newick tree.
        newick_infile = path.join(temp_dir, "placed_seqs.tre")
        tree.write(newick_infile, format="newick")

        picrust2_out = path.join(temp_dir, "picrust2_out")

        print("Running the below commands:", file=sys.stderr)

        # Run hidden-state prediction step (on 16S, EC, and KO tables
        # separately).
        hsp_out_16S = path.join(picrust2_out, "16S_predicted.tsv.gz")
        system_call_check("hsp.py -i 16S " + " -t " + newick_infile +
                          " -p 1 " + " -n " + "-o " + hsp_out_16S + " -m " +
                          hsp_method,
                          print_out=True)

        hsp_out_EC = path.join(picrust2_out, "EC_predicted.tsv.gz")
        system_call_check("hsp.py -i EC " + " -t " + newick_infile + " -p " +
                          str(threads) + " -o " + hsp_out_EC + " -m " +
                          hsp_method,
                          print_out=True)

        hsp_out_KO = path.join(picrust2_out, "KO_predicted.tsv.gz")
        system_call_check("hsp.py -i KO " + " -t " + newick_infile + " -p " +
                          str(threads) + " -o " + hsp_out_KO + " -m " +
                          hsp_method,
                          print_out=True)

        # Run metagenome pipeline step.
        EC_metagenome_out = path.join(picrust2_out, "EC_metagenome_out")
        system_call_check("metagenome_pipeline.py -i " + biom_infile + " -m " +
                          hsp_out_16S + " -f " + hsp_out_EC + " -o " +
                          EC_metagenome_out + " --max_nsti " + str(max_nsti),
                          print_out=True)

        KO_metagenome_out = path.join(picrust2_out, "KO_metagenome_out")
        system_call_check("metagenome_pipeline.py -i " + biom_infile + " -m " +
                          hsp_out_16S + " -f " + hsp_out_KO + " -o " +
                          KO_metagenome_out + " --max_nsti " + str(max_nsti),
                          print_out=True)

        EC_out = path.join(EC_metagenome_out, "pred_metagenome_unstrat.tsv.gz")
        KO_out = path.join(KO_metagenome_out, "pred_metagenome_unstrat.tsv.gz")

        # Run pathway inference step.
        pathways_out = path.join(picrust2_out, "pathways_out")
        pathabun_out = path.join(pathways_out, "path_abun_unstrat.tsv.gz")
        system_call_check("pathway_pipeline.py -i " + EC_out + " -o " +
                          pathways_out + " -p " + str(threads),
                          print_out=True)

        # Read in output unstratified metagenome tables and return as BIOM
        # objects.
        ko_biom = biom.load_table(KO_out)
        ec_biom = biom.load_table(EC_out)
        pathabun_biom = biom.load_table(pathabun_out)

        return ko_biom, ec_biom, pathabun_biom
Example #32
File: 54.py Project: tanaes/qiita
def create_non_rarefied_biom_artifact(analysis, biom_data, rarefied_table):
    """Creates the initial non-rarefied BIOM artifact of the analysis

    Parameters
    ----------
    analysis : dict
        Dictionary with the analysis information
    biom_data : dict
        Dictionary with the biom file information
    rarefied_table : biom.Table
        The rarefied BIOM table

    Returns
    -------
    int
        The id of the new artifact
    """
    # The non-rarefied biom artifact is the initial biom table of the analysis.
    # This table does not currently exist anywhere, so we need to actually
    # create the BIOM file. To create this BIOM file we need: (1) the samples
    # and artifacts they come from and (2) whether the samples were
    # renamed or not. (1) is in the database, but we need to infer (2) from
    # the existing rarefied BIOM table. Fun, fun...

    with TRN:
        # Get the samples included in the BIOM table grouped by artifact id
        # Note that the analysis contains a BIOM table per data type included
        # in it, and the table analysis_sample does not differentiate between
        # datatypes, so we need to check the data type in the artifact table
        sql = """SELECT artifact_id, array_agg(sample_id)
                 FROM qiita.analysis_sample
                    JOIN qiita.artifact USING (artifact_id)
                 WHERE analysis_id = %s AND data_type_id = %s
                 GROUP BY artifact_id"""
        TRN.add(sql, [analysis['analysis_id'], biom_data['data_type_id']])
        samples_by_artifact = TRN.execute_fetchindex()

        # Create an empty BIOM table to be the new master table
        new_table = Table([], [], [])
        ids_map = {}
        for a_id, samples in samples_by_artifact:
            # Get the filepath of the BIOM table from the artifact
            artifact = Artifact(a_id)
            biom_fp = None
            for _, fp, fp_type in artifact.filepaths:
                if fp_type == 'biom':
                    biom_fp = fp
            # Note that we are sure that the biom table exists, so there is
            # no need to check if biom_fp is undefined
            biom_table = load_table(biom_fp)
            samples = set(samples).intersection(biom_table.ids())
            biom_table.filter(samples, axis='sample', inplace=True)
            # we need to check if the table has samples left before merging
            if biom_table.shape[0] != 0 and biom_table.shape[1] != 0:
                new_table = new_table.merge(biom_table)
                ids_map.update(
                    {sid: "%d.%s" % (a_id, sid)
                     for sid in biom_table.ids()})

        # Check if we need to rename the sample ids in the biom table
        new_table_ids = set(new_table.ids())
        if not new_table_ids.issuperset(rarefied_table.ids()):
            # We need to rename the sample ids
            new_table.update_ids(ids_map, 'sample', True, True)

        sql = """INSERT INTO qiita.artifact
                    (generated_timestamp, data_type_id, visibility_id,
                     artifact_type_id, submitted_to_vamps)
            VALUES (%s, %s, %s, %s, %s)
            RETURNING artifact_id"""
        # Magic number 4 -> visibility sandbox
        # Magic number 7 -> biom artifact type
        TRN.add(
            sql,
            [analysis['timestamp'], biom_data['data_type_id'], 4, 7, False])
        artifact_id = TRN.execute_fetchlast()

        # Associate the artifact with the analysis
        sql = """INSERT INTO qiita.analysis_artifact
                    (analysis_id, artifact_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [analysis['analysis_id'], artifact_id])
        # Link the artifact with its file
        dd_id, mp = get_mountpoint('BIOM')[0]
        dir_fp = join(get_db_files_base_dir(), mp, str(artifact_id))
        if not exists(dir_fp):
            makedirs(dir_fp)
        new_table_fp = join(dir_fp, "biom_table.biom")
        with biom_open(new_table_fp, 'w') as f:
            new_table.to_hdf5(f, "Generated by Qiita")

        sql = """INSERT INTO qiita.filepath
                    (filepath, filepath_type_id, checksum,
                     checksum_algorithm_id, data_directory_id)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING filepath_id"""
        # Magic number 7 -> filepath_type_id = 'biom'
        # Magic number 1 -> the checksum algorithm id
        TRN.add(sql, [
            basename(new_table_fp), 7,
            compute_checksum(new_table_fp), 1, dd_id
        ])
        fp_id = TRN.execute_fetchlast()
        sql = """INSERT INTO qiita.artifact_filepath
                    (artifact_id, filepath_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, fp_id])
        TRN.execute()

    return artifact_id
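The merge-and-rename block above leans on biom's Table.merge and Table.update_ids. A toy sketch of that pattern with made-up ids (the artifact id 4 is illustrative):

import numpy as np
from biom import Table

t1 = Table(np.array([[1, 0], [0, 2]]), ['O1', 'O2'], ['S1', 'S2'])
t2 = Table(np.array([[3], [4]]), ['O1', 'O2'], ['S3'])

# start from an empty master table and merge artifact tables into it
merged = Table([], [], []).merge(t1).merge(t2)

# prefix sample ids with the artifact id, as the Qiita code above does
ids_map = {sid: "4.%s" % sid for sid in merged.ids()}
merged.update_ids(ids_map, 'sample', True, True)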
Example #33
File: 54.py Project: tkosciol/qiita
def create_non_rarefied_biom_artifact(analysis, biom_data, rarefied_table):
    """Creates the initial non-rarefied BIOM artifact of the analysis

    Parameters
    ----------
    analysis : dict
        Dictionary with the analysis information
    biom_data : dict
        Dictionary with the biom file information
    rarefied_table : biom.Table
        The rarefied BIOM table

    Returns
    -------
    int
        The id of the new artifact
    """
    # The non-rarefied biom artifact is the initial biom table of the analysis.
    # This table does not currently exist anywhere, so we need to actually
    # create the BIOM file. To create this BIOM file we need: (1) the samples
    # and artifacts they come from and (2) whether the samples were
    # renamed or not. (1) is in the database, but we need to infer (2) from
    # the existing rarefied BIOM table. Fun, fun...

    with TRN:
        # Get the samples included in the BIOM table grouped by artifact id
        # Note that the analysis contains a BIOM table per data type included
        # in it, and the table analysis_sample does not differentiate between
        # datatypes, so we need to check the data type in the artifact table
        sql = """SELECT artifact_id, array_agg(sample_id)
                 FROM qiita.analysis_sample
                    JOIN qiita.artifact USING (artifact_id)
                 WHERE analysis_id = %s AND data_type_id = %s
                 GROUP BY artifact_id"""
        TRN.add(sql, [analysis['analysis_id'], biom_data['data_type_id']])
        samples_by_artifact = TRN.execute_fetchindex()

        # Create an empty BIOM table to be the new master table
        new_table = Table([], [], [])
        ids_map = {}
        for a_id, samples in samples_by_artifact:
            # Get the filepath of the BIOM table from the artifact
            artifact = Artifact(a_id)
            biom_fp = None
            for _, fp, fp_type in artifact.filepaths:
                if fp_type == 'biom':
                    biom_fp = fp
            # Note that we are sure that the biom table exists, so there is
            # no need to check if biom_fp is undefined
            biom_table = load_table(biom_fp)
            samples = set(samples).intersection(biom_table.ids())
            biom_table.filter(samples, axis='sample', inplace=True)
            # we need to check if the table has samples left before merging
            if biom_table.shape[0] != 0 and biom_table.shape[1] != 0:
                new_table = new_table.merge(biom_table)
                ids_map.update({sid: "%d.%s" % (a_id, sid)
                                for sid in biom_table.ids()})

        # Check if we need to rename the sample ids in the biom table
        new_table_ids = set(new_table.ids())
        if not new_table_ids.issuperset(rarefied_table.ids()):
            # We need to rename the sample ids
            new_table.update_ids(ids_map, 'sample', True, True)

        sql = """INSERT INTO qiita.artifact
                    (generated_timestamp, data_type_id, visibility_id,
                     artifact_type_id, submitted_to_vamps)
            VALUES (%s, %s, %s, %s, %s)
            RETURNING artifact_id"""
        # Magic number 4 -> visibility sandbox
        # Magic number 7 -> biom artifact type
        TRN.add(sql, [analysis['timestamp'], biom_data['data_type_id'],
                      4, 7, False])
        artifact_id = TRN.execute_fetchlast()

        # Associate the artifact with the analysis
        sql = """INSERT INTO qiita.analysis_artifact
                    (analysis_id, artifact_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [analysis['analysis_id'], artifact_id])
        # Link the artifact with its file
        dd_id, mp = get_mountpoint('BIOM')[0]
        dir_fp = join(get_db_files_base_dir(), mp, str(artifact_id))
        if not exists(dir_fp):
            makedirs(dir_fp)
        new_table_fp = join(dir_fp, "biom_table.biom")
        with biom_open(new_table_fp, 'w') as f:
            new_table.to_hdf5(f, "Generated by Qiita")

        sql = """INSERT INTO qiita.filepath
                    (filepath, filepath_type_id, checksum,
                     checksum_algorithm_id, data_directory_id)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING filepath_id"""
        # Magic number 7 -> filepath_type_id = 'biom'
        # Magic number 1 -> the checksum algorithm id
        TRN.add(sql, [basename(new_table_fp), 7,
                      compute_checksum(new_table_fp), 1, dd_id])
        fp_id = TRN.execute_fetchlast()
        sql = """INSERT INTO qiita.artifact_filepath
                    (artifact_id, filepath_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, fp_id])
        TRN.execute()

    return artifact_id
Example #34
def full_pipeline(
        table: biom.Table,
        seq: pd.Series,
        threads: int = 1,
        hsp_method: str = "mp",
        max_nsti: float = 2.0) -> (biom.Table, biom.Table, biom.Table):

    # Write out BIOM table and FASTA to be used in pipeline.
    with TemporaryDirectory() as temp_dir:

        # Write out BIOM table:
        biom_infile = path.join(temp_dir, "intable.biom")
        with biom.util.biom_open(biom_infile, 'w') as out_biom:
            table.to_hdf5(h5grp=out_biom,
                          generated_by="PICRUSt2 QIIME2 Plugin")

        # Write out Pandas series as FASTA:
        seq_outfile = path.join(temp_dir, "seqs.fna")

        with open(seq_outfile, "w") as outfile_fh:
            for seqname, sequence in seq.iteritems():
                print(">" + str(seqname) + "\n" + str(sequence),
                      file=outfile_fh)

        picrust2_out = path.join(temp_dir, "picrust2_out")

        func_outputs, pathway_outputs = picrust2.pipeline.full_pipeline(
            study_fasta=seq_outfile,
            input_table=biom_infile,
            output_folder=picrust2_out,
            processes=threads,
            ref_dir=default_ref_dir,
            in_traits="EC,KO",
            custom_trait_tables=None,
            marker_gene_table=default_tables["16S"],
            pathway_map=default_pathway_map,
            rxn_func="EC",
            no_pathways=False,
            regroup_map=default_regroup_map,
            no_regroup=False,
            stratified=False,
            max_nsti=max_nsti,
            min_reads=1,
            min_samples=1,
            hsp_method=hsp_method,
            skip_nsti=False,
            skip_minpath=False,
            no_gap_fill=False,
            coverage=False,
            per_sequence_contrib=False,
            wide_table=False,
            skip_norm=False,
            remove_intermediate=False,
            verbose=True)

        # Convert the returned unstratified tables to BIOM tables.
        # Note that the 0-index in the func table returned objects corresponds
        # to the path to the unstratified table.
        ko_biom = biom.load_table(func_outputs["KO"][0])
        ec_biom = biom.load_table(func_outputs["EC"][0])
        pathabun_biom = biom.load_table(pathway_outputs["unstrat_abun"])

        return ko_biom, ec_biom, pathabun_biom
Example #35
def deblur(qclient, job_id, parameters, out_dir):
    """Run deblur with the given parameters

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run deblur
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job

    Notes
    -----
    The code will check if the artifact has a preprocessed_demux element; if
    not, it will use the preprocessed_fastq. We prefer to work with the
    preprocessed_demux as running time is greatly improved.
    """
    out_dir = join(out_dir, 'deblur_out')
    # Step 1: get the rest of the information needed to run deblur
    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    artifact_id = parameters['Demultiplexed sequences']
    # removing input from parameters so it's not part of the final command
    del parameters['Demultiplexed sequences']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Step 2 generating command deblur
    if 'preprocessed_demux' in fps:
        qclient.update_job_step(job_id, "Step 2 of 4: Generating per sample "
                                "from demux (1/2)")

        if not exists(out_dir):
            mkdir(out_dir)
        split_out_dir = join(out_dir, 'split')
        if not exists(split_out_dir):
            mkdir(split_out_dir)

        # using the same number of parallel jobs as defined by the command
        n_jobs = int(parameters['Jobs to start'])
        # [0] because there should be only 1 file
        to_per_sample_files(fps['preprocessed_demux'][0],
                            out_dir=split_out_dir, n_jobs=n_jobs)

        qclient.update_job_step(job_id, "Step 2 of 4: Generating per sample "
                                "from demux (2/2)")
        out_dir = join(out_dir, 'deblured')
        cmd = generate_deblur_workflow_commands([split_out_dir],
                                                out_dir, parameters)
    else:
        qclient.update_job_step(job_id, "Step 2 of 4: Generating deblur "
                                "command")
        cmd = generate_deblur_workflow_commands(fps['preprocessed_fastq'],
                                                out_dir, parameters)

    # Step 3 execute deblur
    qclient.update_job_step(job_id, "Step 3 of 4: Executing deblur job")
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error running deblur:\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        return False, None, error_msg

    # Generating artifact
    pb = partial(join, out_dir)

    # Generate the filepaths
    final_biom = pb('all.biom')
    final_seqs = pb('all.seqs.fa')
    final_biom_hit = pb('reference-hit.biom')
    final_seqs_hit = pb('reference-hit.seqs.fa')

    if not exists(final_biom_hit):
        # We need to send something to Qiita that is a valid BIOM, so we
        # create an empty table
        t = Table([], [], [])
        with biom_open(final_biom_hit, 'w') as f:
            t.to_hdf5(f, 'qp-deblur generated')

    if not exists(final_seqs_hit):
        # Same as before, create an empty sequence file so we can send it
        with open(final_seqs_hit, 'w') as f:
            f.write("")

    # Step 4, communicate with archive to check and generate placements
    qclient.update_job_step(job_id, "Step 4 of 4 (1/4): Retrieving "
                            "observations information")
    features = list(load_table(final_biom_hit).ids(axis='observation'))

    fp_phylogeny = None
    if features:
        observations = qclient.post(
            "/qiita_db/archive/observations/", data={'job_id': job_id,
                                                     'features': features})
        novel_fragments = list(set(features) - set(observations.keys()))

        qclient.update_job_step(job_id, "Step 4 of 4 (2/4): Generating %d new "
                                "placements" % len(novel_fragments))

        # Once we support alternative reference phylogenies for SEPP in the
        # future, we need to translate the reference name here into
        # filepaths pointing to the correct reference alignment and
        # reference tree. If left 'None' the Greengenes 13.8 reference
        # shipped with the fragment-insertion conda package will be used.
        fp_reference_alignment = None
        fp_reference_phylogeny = None
        fp_reference_template = None
        fp_reference_rename = None
        if 'Reference phylogeny for SEPP' in parameters:
            if parameters['Reference phylogeny for SEPP'] == 'tiny':
                fp_reference_alignment = qp_deblur.get_data(join(
                    'sepp', 'reference_alignment_tiny.fasta'))
                fp_reference_phylogeny = qp_deblur.get_data(join(
                    'sepp', 'reference_phylogeny_tiny.nwk'))
                fp_reference_template = qp_deblur.get_data(join(
                    'sepp', 'tmpl_tiny_placement.json'))
                fp_reference_rename = qp_deblur.get_data(join(
                    'sepp', 'tmpl_tiny_rename-json.py'))
        try:
            new_placements = generate_sepp_placements(
                novel_fragments, out_dir, parameters['Threads per sample'],
                reference_alignment=fp_reference_alignment,
                reference_phylogeny=fp_reference_phylogeny)
        except ValueError as e:
            return False, None, str(e)

        qclient.update_job_step(job_id, "Step 4 of 4 (3/4): Archiving %d "
                                "new placements" % len(novel_fragments))
        # values needs to be json strings as well
        for fragment in new_placements.keys():
            new_placements[fragment] = json.dumps(new_placements[fragment])

        # fragments that get rejected by a SEPP run don't show up in
        # the placement file; however, being rejected is valuable
        # information and should be stored in the archive as well.
        # Thus, we avoid re-computation for rejected fragments in the
        # future.
        for fragment in novel_fragments:
            if fragment not in new_placements:
                new_placements[fragment] = ""
        if len(new_placements.keys()) > 0:
            qclient.patch(url="/qiita_db/archive/observations/", op="add",
                          path=job_id, value=json.dumps(new_placements))

        # retrieve all fragments and create the actual tree
        qclient.update_job_step(job_id, "Step 4 of 4 (4/4): Composing "
                                "phylogenetic insertion tree")
        placements = qclient.post(
            "/qiita_db/archive/observations/", data={'job_id': job_id,
                                                     'features': features})
        # remove fragments that have been rejected by SEPP, i.e. whose
        # placement is the empty string, and
        # convert all other placements from string to json
        placements = {frag: json.loads(placements[frag])
                      for frag, plc
                      in placements.items()
                      if plc != ''}
        try:
            fp_phylogeny = generate_insertion_trees(
                placements, out_dir,
                reference_template=fp_reference_template,
                reference_rename=fp_reference_rename)
        except ValueError as e:
            return False, None, str(e)
    else:
        new_placements = None

    ainfo = [ArtifactInfo('deblur final table', 'BIOM',
                          [(final_biom, 'biom'),
                           (final_seqs, 'preprocessed_fasta')])]
    if fp_phylogeny is not None:
        ainfo.append(ArtifactInfo('deblur reference hit table', 'BIOM',
                     [(final_biom_hit, 'biom'),
                      (final_seqs_hit, 'preprocessed_fasta'),
                      (fp_phylogeny, 'plain_text')], new_placements))

    return True, ainfo, ""