Example #1
def deposit_biofilms(output_dir, abs_table1, abs_table2, rel_table1,
                     rel_table2, edges, metadata, sample_id):
    """ Writes down tables and edges into files.

    Parameters
    ----------
    output_dir : str
        output directory
    rel_table1 : biom.Table
        Biom table of relative abundances
    rel_table2 : biom.Table
        Biom table of relative abundances
    abs_table1 : biom.Table
        Biom table of absolute abundances
    abs_table2 : biom.Table
        Biom table of absolute abundances
    edges : list
        Edge list for ground truthing.
    metadata : pd.DataFrame
        Dataframe of sample metadata
    sample_id : str
        sample id
    """
    output_abs_microbes = "%s/table.abs.microbes.%s.biom" % (output_dir,
                                                             sample_id)
    output_abs_metabolites = "%s/table.abs.metabolites.%s.biom" % (output_dir,
                                                                   sample_id)
    output_rel_microbes = "%s/table.rel.microbes.%s.biom" % (output_dir,
                                                             sample_id)
    output_rel_metabolites = "%s/table.rel.metabolites.%s.biom" % (output_dir,
                                                                   sample_id)
    output_md = "%s/metadata.%s.txt" % (output_dir, sample_id)
    output_U = "%s/U.%s.txt" % (output_dir, sample_id)
    output_V = "%s/V.%s.txt" % (output_dir, sample_id)
    output_edges = "%s/edges.%s.txt" % (output_dir, sample_id)
    output_ranks = "%s/ranks.%s.txt" % (output_dir, sample_id)

    # idx1 = table1.sum(axis=0) > 0
    # idx2 = table2.sum(axis=0) > 0
    # table1 = table1.loc[:, idx1]
    # table2 = table2.loc[:, idx2]

    # relative abundances
    table1 = Table(rel_table1.values.T, rel_table1.columns, rel_table1.index)
    table2 = Table(rel_table2.values.T, rel_table2.columns, rel_table2.index)
    with biom_open(output_rel_microbes, 'w') as f:
        table1.to_hdf5(f, generated_by='moi1')
    with biom_open(output_rel_metabolites, 'w') as f:
        table2.to_hdf5(f, generated_by='moi2')

    # absolute abundances
    table1 = Table(abs_table1.values.T, abs_table1.columns, abs_table1.index)
    table2 = Table(abs_table2.values.T, abs_table2.columns, abs_table2.index)
    with biom_open(output_abs_microbes, 'w') as f:
        table1.to_hdf5(f, generated_by='moi1')
    with biom_open(output_abs_metabolites, 'w') as f:
        table2.to_hdf5(f, generated_by='moi2')

    pd.DataFrame(edges).to_csv(output_edges, sep='\t')
    metadata.to_csv(output_md, sep='\t')
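A minimal usage sketch for deposit_biofilms, assuming the abundance tables are pandas DataFrames with samples as rows and features as columns; the directory name, toy values and IDs below are made up for illustration:

import os
import pandas as pd

abs_microbes = pd.DataFrame([[10, 0], [3, 7]], index=['s1', 's2'], columns=['otu1', 'otu2'])
abs_metabolites = pd.DataFrame([[1.0, 2.0], [0.5, 4.0]], index=['s1', 's2'], columns=['m1', 'm2'])
rel_microbes = abs_microbes.div(abs_microbes.sum(axis=1), axis=0)
rel_metabolites = abs_metabolites.div(abs_metabolites.sum(axis=1), axis=0)
metadata = pd.DataFrame({'group': ['a', 'b']}, index=['s1', 's2'])
edges = [('otu1', 'm1'), ('otu2', 'm2')]

os.makedirs('sim_out', exist_ok=True)
deposit_biofilms('sim_out', abs_microbes, abs_metabolites,
                 rel_microbes, rel_metabolites, edges, metadata, sample_id='0')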
Example #2
def deposit_blocktable(output_dir, abs_table, rel_table, metadata, truth, sample_id):
    output_abstable = "%s/abs_table.%s.biom" % (
        output_dir, sample_id)
    output_reltable = "%s/rel_table.%s.biom" % (
        output_dir, sample_id)
    output_metadata = "%s/metadata.%s.txt" % (
        output_dir, sample_id)
    output_truth = "%s/truth.%s.txt" % (
        output_dir, sample_id)

    abs_t = Table(abs_table.T.values,
                  abs_table.columns.values,
                  abs_table.index.values)
    with biom_open(output_abstable, 'w') as f:
        abs_t.to_hdf5(f, generated_by='moi')

    rel_t = Table(rel_table.T.values,
                  rel_table.columns.values,
                  rel_table.index.values)
    with biom_open(output_reltable, 'w') as f:
        rel_t.to_hdf5(f, generated_by='moi')

    metadata.to_csv(output_metadata, sep='\t')
    truth.to_csv(output_truth, sep='\t')
Example #3
    def setUp(self):
        np.random.seed(0)
        torch.manual_seed(0)
        self.k, self.D, self.N, self.M, self.C = 10, 50, 500, 100000, 3
        self.sims = multinomial_batch_bioms(k=self.k,
                                            D=self.D,
                                            N=self.N,
                                            M=self.M,
                                            C=self.C)
        Y = self.sims['Y']
        parts = Y.shape[0] // 10
        samp_ids = list(map(str, range(Y.shape[0])))
        obs_ids = list(map(str, range(Y.shape[1])))
        train = Table(Y[:parts * 8].T, obs_ids, samp_ids[:parts * 8])
        test = Table(Y[parts * 8:parts * 9].T, obs_ids,
                     samp_ids[parts * 8:parts * 9])
        valid = Table(Y[parts * 9:].T, obs_ids, samp_ids[parts * 9:])
        with biom_open('train.biom', 'w') as f:
            train.to_hdf5(f, 'train')
        with biom_open('test.biom', 'w') as f:
            test.to_hdf5(f, 'test')
        with biom_open('valid.biom', 'w') as f:
            valid.to_hdf5(f, 'valid')

        md = pd.DataFrame({'batch_category': self.sims['batch_idx']},
                          index=samp_ids)
        md.index.name = 'sampleid'
        md.to_csv('metadata.txt', sep='\t')
        batch_priors = pd.Series(self.sims['alphaILR'])
        batch_priors.to_csv('batch_priors.txt', sep='\t')
        self.sims['tree'].write('basis.nwk')
Example #4
def main(args):
    os.mkdir(args.output_dir)
    np.random.seed(args.seed)
    sims = multinomial_bioms(
        k=args.latent_dim, D=args.input_dim,
        N=args.samples, M=args.depth)
    Y = sims['Y']
    parts = Y.shape[0] // 10
    samp_ids = list(map(str, range(Y.shape[0])))
    obs_ids = list(map(str, range(Y.shape[1])))
    train = Table(Y[:parts * 8].T, obs_ids, samp_ids[:parts * 8])
    test = Table(Y[parts * 8 : parts * 9].T,
                 obs_ids, samp_ids[parts * 8 : parts * 9])
    valid = Table(Y[parts * 9:].T, obs_ids, samp_ids[parts * 9:])
    output_dir = args.output_dir
    with biom_open(f'{output_dir}/train.biom', 'w') as f:
        train.to_hdf5(f, 'train')
    with biom_open(f'{output_dir}/test.biom', 'w') as f:
        test.to_hdf5(f, 'test')
    with biom_open(f'{output_dir}/valid.biom', 'w') as f:
        valid.to_hdf5(f, 'valid')
    tree = sims['tree']
    tree.write(f'{output_dir}/basis.nwk')
    np.savetxt(f'{output_dir}/eigvals.txt', sims['eigs'])
    np.savetxt(f'{output_dir}/eigvecs.txt', sims['eigvectors'])
    np.savetxt(f'{output_dir}/W.txt', sims['W'])
Example #5
def biom_data_from_vcfs(vcfs, min_position=0, max_position=inf):
    oids = {}
    ordered_oids = []
    sids = {}
    ordered_sids = []
    data = {}
    master_oids = set([])
    for vcf in vcfs:
        working_oids = set([])
        vcf = biom_open(vcf)
        for line in vcf:
            fields = line.strip().split('\t')
            if fields[0] == '#CHROM':
                pass
            elif fields[0].startswith("#"):
                pass
            else:
                chrom = fields[0]
                pos = int(fields[1])
                oid = '%s.%d' % (chrom, pos)
                working_oids.add(oid)
        if len(master_oids) == 0:
            master_oids = working_oids
        else:
            master_oids = set.intersection(master_oids, working_oids)
#            master_oids = master_oids | working_oids
        vcf.close()
    for vcf in vcfs:
        vcf = biom_open(vcf)
        for line in vcf:
            fields = line.strip().split('\t')
            if fields[0] == '#CHROM':
                # this will differ for human data (when multiple genomes per vcf):
                sid = fields[9]
                try:
                    sid_index = sids[sid]
                except KeyError:
                    ordered_sids.append(sid)
                    sid_index = len(ordered_sids) - 1
                    sids[sid] = sid_index
            elif fields[0].startswith("#"):
                pass
            else:
                chrom = fields[0]
                pos = int(fields[1])
                oid = '%s.%d' % (chrom, pos)
                if fields[4] != '.' and \
                   min_position <= pos <= max_position and \
                   oid in master_oids:
                    try:
                        oid_index = oids[oid]
                    except KeyError:
                        ordered_oids.append(oid)
                        oid_index = len(ordered_oids) - 1
                        oids[oid] = oid_index
                    # this will differ for non-haploid data:
                    data[(oid_index, sid_index)] = 1

    return data, ordered_oids, ordered_sids
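The function above returns a {(observation_index, sample_index): count} dictionary plus the ordered observation and sample IDs, which maps directly onto a sparse matrix. A sketch of turning that output into a biom.Table, assuming the VCF paths are placeholders and scipy is available:

from biom import Table
from scipy.sparse import coo_matrix

data, oids, sids = biom_data_from_vcfs(['sample1.vcf', 'sample2.vcf'])
rows, cols, vals = zip(*((r, c, v) for (r, c), v in data.items()))
matrix = coo_matrix((vals, (rows, cols)), shape=(len(oids), len(sids)))
table = Table(matrix, oids, sids)  # observations (variant positions) x samples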
Example #6
    def test_biom_open_gz(self):
        with biom_open(get_data_path('test.json.gz')) as f:
            self.assertTrue(isinstance(f, gzip.GzipFile))

        with biom_open(get_data_path('test_writing.json.gz'), 'w') as f:
            self.assertTrue(isinstance(f, gzip.GzipFile))

        remove(get_data_path('test_writing.json.gz'))
Example #7
    def test_biom_open_hdf5(self):
        with biom_open(get_data_path('test.biom')) as f:
            self.assertTrue(isinstance(f, h5py.File))

        with biom_open(get_data_path('test_writing.biom'), 'w') as f:
            self.assertTrue(isinstance(f, h5py.File))

        remove(get_data_path('test_writing.biom'))
Example #10
def deposit_biofilm(table1, table2, metadata, U, V, edges, it, rep,
                    output_dir):
    """ Writes down tables, metadata and feature metadata into files.

    Parameters
    ----------
    table : biom.Table
        Biom table
    metadata : pd.DataFrame
        Dataframe of sample metadata
    feature_metadata : pd.DataFrame
        Dataframe of features metadata
    it : int
        iteration number
    rep : int
        repetition number
    output_dir : str
        output directory
    """
    choice = 'abcdefghijklmnopqrstuvwxyz'
    output_microbes = "%s/table_microbes.%d_%s.biom" % (output_dir, it,
                                                        choice[rep])
    output_metabolites = "%s/table_metabolites.%d_%s.biom" % (output_dir, it,
                                                              choice[rep])
    output_md = "%s/metadata.%d_%s.txt" % (output_dir, it, choice[rep])
    output_U = "%s/U.%d_%s.txt" % (output_dir, it, choice[rep])
    output_V = "%s/V.%d_%s.txt" % (output_dir, it, choice[rep])
    output_B = "%s/edges.%d_%s.txt" % (output_dir, it, choice[rep])
    output_ranks = "%s/ranks.%d_%s.txt" % (output_dir, it, choice[rep])

    idx1 = table1.sum(axis=0) > 0
    idx2 = table2.sum(axis=0) > 0
    table1 = table1.loc[:, idx1]
    table2 = table2.loc[:, idx2]

    table1 = Table(table1.values.T, table1.columns, table1.index)
    table2 = Table(table2.values.T, table2.columns, table2.index)

    with biom_open(output_microbes, 'w') as f:
        table1.to_hdf5(f, generated_by='moi1')
    with biom_open(output_metabolites, 'w') as f:
        table2.to_hdf5(f, generated_by='moi2')

    ranks = (U @ V)

    ranks = ranks[idx1, :]
    ranks = ranks[:, idx2]
    ranks = pd.DataFrame(ranks,
                         index=table1.ids(axis='observation'),
                         columns=table2.ids(axis='observation'))
    ranks.to_csv(output_ranks, sep='\t')
    metadata.to_csv(output_md, sep='\t', index_label='#SampleID')

    # assumes `edges` is an array with microbes along its columns
    B = edges[:, idx1]

    np.savetxt(output_U, U)
    np.savetxt(output_V, V)
    np.savetxt(output_B, B)
Example #11
    def test_delete_analysis(self):
        # adding extra filepaths to make sure the delete works as expected, we
        # basically want 8 -> 9 -> 10 -> 12 -> 14
        #                       -> 11 -> 13
        fd, fp10 = mkstemp(suffix='_table.biom')
        close(fd)
        fd, fp11 = mkstemp(suffix='_table.biom')
        close(fd)
        fd, fp12 = mkstemp(suffix='_table.biom')
        close(fd)
        fd, fp13 = mkstemp(suffix='_table.biom')
        close(fd)
        fd, fp14 = mkstemp(suffix='_table.biom')
        close(fd)
        with biom_open(fp10, 'w') as f:
            et.to_hdf5(f, "test")
        with biom_open(fp11, 'w') as f:
            et.to_hdf5(f, "test")
        with biom_open(fp12, 'w') as f:
            et.to_hdf5(f, "test")
        with biom_open(fp13, 'w') as f:
            et.to_hdf5(f, "test")
        with biom_open(fp14, 'w') as f:
            et.to_hdf5(f, "test")
        self._clean_up_files.extend([fp10, fp11, fp12, fp13, fp14])

        # copying some processing parameters
        a9 = Artifact(9)
        pp = a9.processing_parameters

        # 7: BIOM
        a10 = Artifact.create([(fp10, 7)],
                              "BIOM",
                              parents=[a9],
                              processing_parameters=pp)
        a11 = Artifact.create([(fp11, 7)],
                              "BIOM",
                              parents=[a9],
                              processing_parameters=pp)
        a12 = Artifact.create([(fp12, 7)],
                              "BIOM",
                              parents=[a10],
                              processing_parameters=pp)
        Artifact.create([(fp13, 7)],
                        "BIOM",
                        parents=[a11],
                        processing_parameters=pp)
        Artifact.create([(fp14, 7)],
                        "BIOM",
                        parents=[a12],
                        processing_parameters=pp)

        job = self._create_job('delete_analysis', {'analysis_id': 1})
        private_task(job.id)
        self.assertEqual(job.status, 'success')
        with self.assertRaises(QiitaDBUnknownIDError):
            Analysis(1)
Example #12
def test_between_correls(args, tmpdir):
    table1 = simulate_correls()
    table2 = simulate_correls()
    loc = tmpdir.mkdir("with_correls_test")
    with biom_open(str(loc.join("table1.biom")), 'w') as f:
        table1.to_hdf5(f, 'madebyme')
    with biom_open(str(loc.join("table2.biom")), 'w') as f:
        table2.to_hdf5(f, 'madebyme')
    os.chdir(str(loc))
    between_correls(args)
    files = os.listdir(str(loc)+'/out_dir')
    assert "correls.txt" in files
    assert "crossnet.gml" in files
Example #13
def write_biom_and_meta_data(orig_biom, orig_pd, augm_biom, augm_pd, out_dir,
                             biom_fp, meta_fp):
    with biom_open(out_dir + '/' + os.path.basename(biom_fp), 'w') as f:
        orig_biom.to_hdf5(f, "original biom table")
    with biom_open(out_dir + '/augmented_data.biom', 'w') as f:
        augm_biom.to_hdf5(f, "augmented biom table")
    if meta_fp is not None:
        orig_pd.to_csv(out_dir + '/' + os.path.basename(meta_fp),
                       sep='\t',
                       header=['#SampleID', 'label'])
        augm_pd.to_csv(out_dir + '/augmented_meta_data.csv',
                       sep='\t',
                       header=['#SampleID', 'label'])
Example #14
def split_dataset(input_biom, input_metadata, split_ratio, output_dir):
    table = load_table(input_biom)
    metadata = pd.read_table(input_metadata, index_col=0)
    metadata.columns = [x.replace('-', '_') for x in metadata.columns]

    metadata_filter = lambda val, id_, md: id_ in metadata.index
    table = table.filter(metadata_filter, axis='sample')
    metadata = metadata.loc[table.ids(axis='sample')]

    sample_ids = metadata.index
    D, N = table.shape
    samples = pd.Series(np.arange(N), index=sample_ids)
    train_size = int(N * split_ratio)
    test_size = N - train_size

    test_samples = set(np.random.choice(sample_ids, size=test_size))

    test_idx = np.array([(x in test_samples) for x in metadata.index])
    train_idx = ~test_idx
    f = lambda id_, md: id_ in test_samples
    gen = table.partition(f)

    _, train_table = next(gen)
    _, test_table = next(gen)

    train_metadata = metadata.iloc[train_idx]
    test_metadata = metadata.iloc[test_idx]

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    test_metadata_path = os.path.join(
        output_dir, 'test_' + os.path.basename(input_metadata))
    train_metadata_path = os.path.join(
        output_dir, 'train_' + os.path.basename(input_metadata))

    test_biom_path = os.path.join(output_dir,
                                  'test_' + os.path.basename(input_biom))
    train_biom_path = os.path.join(output_dir,
                                   'train_' + os.path.basename(input_biom))

    print(train_metadata_path)
    train_metadata.to_csv(train_metadata_path, sep='\t')
    test_metadata.to_csv(test_metadata_path, sep='\t')

    with biom_open(train_biom_path, 'w') as f:
        train_table.to_hdf5(f, "train")

    with biom_open(test_biom_path, 'w') as f:
        test_table.to_hdf5(f, "test")
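A sketch of driving split_dataset (paths are placeholders, and the same imports as the function above are assumed: numpy as np, biom's load_table). split_ratio is the fraction of samples assigned to the training set, the outputs land in output_dir with train_/test_ prefixes, and the test samples are drawn with an unseeded np.random.choice, so seeding first makes the split reproducible:

np.random.seed(42)
split_dataset('feature-table.biom', 'sample-metadata.tsv',
              split_ratio=0.8, output_dir='split_out')
train_table = load_table('split_out/train_feature-table.biom')
test_table = load_table('split_out/test_feature-table.biom')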
Example #15
def main():
    args = parser.parse_args()
    n = args.n
    input_fp = args.input_fp
    output_dir = args.output_dir

    biom_table = load_table(input_fp)

    obs_ids = biom_table.ids(axis='observation')

    print "{0} total ids\n".format(len(obs_ids))

    chunk_size = int(len(obs_ids) / n)

    last_id = -1

    for chunk in range(1, n):

        begin_id = last_id + 1
        end_id = chunk * chunk_size
        print "chunk: {0} begin: {1} end: {2}\n".format(
            chunk, begin_id, end_id)

        sub_ids = obs_ids[begin_id:end_id]

        sub_table = biom_table.filter(lambda val, id_, md: id_ in sub_ids,
                                      axis='observation',
                                      invert=False,
                                      inplace=False)
        with biom_open(join(output_dir, 'chunk{0}.biom'.format(chunk)),
                       'w') as out_f:
            sub_table.to_hdf5(out_f, "split_biom.py")

        last_id = end_id

    begin_id = last_id + 1
    chunk += 1

    print "chunk: {0} begin: {1} end: {2}\n".format(chunk, begin_id,
                                                    len(obs_ids))

    sub_ids = obs_ids[last_id + 1:]

    sub_table = biom_table.filter(lambda val, id_, md: id_ in sub_ids,
                                  axis='observation',
                                  invert=False,
                                  inplace=False)
    with biom_open(join(output_dir, 'chunk{0}.biom'.format(n)), 'w') as out_f:
        sub_table.to_hdf5(out_f, "split_biom.py")
Example #16
    def test_write_biom(self):
        with tempfile.NamedTemporaryFile(suffix='biom') as biom:
            with biom_open(biom.name, 'w') as f:
                s = Stats_And_Summary()
                s.write_biom(('sample1', 'sample2'), [{
                    'readname': ['ab', 'c'],
                    'readnameE': ['ab', 'd']
                }, {
                    'readname2': ['ab', 'c']
                }], f)

            with tempfile.NamedTemporaryFile(suffix='csv') as biom_out:
                os.remove(
                    biom_out.name)  #delete because otherwise biom complains
                subprocess.check_call(
                    "biom convert -i %s -o %s --table-type 'OTU table' --to-tsv --header-key taxonomy"
                    % (biom.name, biom_out.name),
                    shell=True)
                observed = open(biom_out.name).read()
                self.assertTrue(observed in ('''# Constructed from biom file
#OTU ID\tsample1\tsample2\ttaxonomy
1\t1.0\t0.0\tab; d
2\t1.0\t1.0\tab; c''', '''# Constructed from biom file
#OTU ID\tsample1\tsample2\ttaxonomy
1\t1.0\t1.0\tab; c
2\t1.0\t0.0\tab; d'''),
                                msg=observed)
Example #17
def biom_artifact_output_translator(artifact):
    biom_table = artifact.data
    fd, temp_file_name = mkstemp(suffix=".biom")
    close(fd)
    with biom_open(temp_file_name, 'w') as f:
        biom_table.to_hdf5(f, "QIITA-QIIME 2 plugin")
    return temp_file_name, 'biom'
Example #18
    def test_validate_prefix(self):
        httpretty.register_uri(
            httpretty.POST,
            "https://test_server.com/qiita_db/jobs/job-id/step/")
        httpretty.register_uri(
            httpretty.GET,
            "https://test_server.com/qiita_db/prep_template/1/data",
            body='{"data": {"1.S1": {"orig_name": "S1"}, "1.S2": '
                 '{"orig_name": "S2"}, "1.S3": {"orig_name": "S3"}}}')

        fd, biom_fp = mkstemp(suffix=".biom")
        close(fd)
        data = np.asarray([[0, 0, 1], [1, 3, 42]])
        table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'])
        with biom_open(biom_fp, 'w') as f:
            table.to_hdf5(f, "Test")

        self._clean_up_files.append(biom_fp)

        self.parameters['files'] = '{"BIOM": ["%s"]}' % biom_fp

        obs_success, obs_ainfo, obs_error = validate(
            self.qclient, 'job-id', self.parameters, self.out_dir)
        exp_biom_fp = join(self.out_dir, basename(biom_fp))
        self._clean_up_files.append(exp_biom_fp)
        self.assertTrue(obs_success)
        self.assertEqual(obs_ainfo, [[None, 'BIOM', [exp_biom_fp, 'biom']]])
        self.assertEqual(obs_error, "")
        obs_t = load_table(exp_biom_fp)
        self.assertItemsEqual(obs_t.ids(), ["1.S1", "1.S2", "1.S3"])
Example #19
def hashing(unhashed_otu_table_list, unhashed_rep_seqs_list,
            sample_metadata_list):
    otu_df_list = []
    rep_seq_ids = set()
    seqs = []
    # Create OTU table
    for unhashed_otu_table in unhashed_otu_table_list:
        otu_df_list.append(hash_otu_table(unhashed_otu_table))
    otu_df = pd.concat(otu_df_list, join="outer", axis=1)
    otu_df.fillna(0.0, inplace=True)
    otu_table = Table(otu_df.values, list(otu_df.index), list(otu_df.columns))
    # Create rep seqs
    for unhashed_rep_seqs in unhashed_rep_seqs_list:
        seqs.extend(hash_rep_seqs(unhashed_rep_seqs, rep_seq_ids))
    otu_table_ids = set(otu_df.index)
    assert otu_table_ids == rep_seq_ids
    assert len(otu_df.index) == len(rep_seq_ids)
    # Merge sample metadata
    sample_metadata = pd.concat(
        [pd.read_csv(s, sep="\t") for s in sample_metadata_list])
    # Write files
    sample_metadata.to_csv("sample_metadata.tsv", sep="\t", index=False)
    with biom_open("otu_table.biom", "w") as fid:
        otu_table.to_hdf5(fid,
                          "Constructed by micone in dada2/deblur pipeline")
    with open("rep_seqs.fasta", "w") as fid:
        fasta_writer = FastaIO.FastaWriter(fid, wrap=None)
        fasta_writer.write_file(seqs)
Example #20
def load_category_files(category_files):
    """Loads the category tables as biom files

    INPUTS:
        category_files -- a dictionary that associates the mapping category
                    (key) with the file path to the otu_table summarizing that
                    category

    OUTPUTS:
        category_tables -- a dictionary that associates the mapping category
                    with the summarized otu table for the category.
    """

    category_tables = {}
    watch_count = 0
    watch_list = []

    for (category, category_file) in category_files.iteritems():
        if isfile(category_file):
            with biom_open(category_file, 'U') as fp:
                cat_table = parse_biom_table(fp)
            category_tables[category] = cat_table
        else:
            watch_list.append('The summarized OTU table file cannot be found '
                              'for %s. \n%s is not in the file path.'
                              % (category, category_file))
            watch_count = watch_count + 1

    if watch_count > 0:
        print 'The following category files could not be found: \n%s' \
            % '\n'.join(watch_list)
    if watch_count == len(category_files):
        raise ValueError('No files could be found for any of the supplied '
                         'categories. \n%s' % '\n'.join(watch_list))

    return category_tables
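A short usage sketch for load_category_files; the category names and paths are hypothetical, and each path is expected to point at a category-summarized OTU table in BIOM format:

category_files = {'Treatment': '/path/to/Treatment_otu_table.biom',
                  'BodySite': '/path/to/BodySite_otu_table.biom'}
category_tables = load_category_files(category_files)
treatment_table = category_tables['Treatment']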
Example #21
def main(argv):
    parser=argparse.ArgumentParser(description=
        'Select Gammaproteobacteria (or other group) contamination candidates')
    parser.add_argument('-i','--biom',help='biom file of the experiment')
    parser.add_argument('-o','--output',help='output file name')
    parser.add_argument('-c','--classpos',
                        help='class of taxonomy name (0-kingdom, 1-phylum, etc.)',
                        default=2,type=int)
    parser.add_argument('-t','--taxonomy',
                        help='taxonomy name (including c__ or equivalent)',
                        default='c__Gammaproteobacteria')
    parser.add_argument('-l','--level',help=
                        'minimal cumulative level to filter (0 to get all)',
                        default='0.03',type=float)
    
    args=parser.parse_args(argv)

    # load the biom table
    biom_table = parse_biom_table(biom_open(args.biom,'U'))
    # find the high freq. OTUs
    result=get_high_freq_otus(biom_table,args.classpos,args.taxonomy,args.level)
    
    # and write them to the file
    with open(args.output,'w') as snames:
        for cstr in result:
            snames.write(cstr+'\n')
Example #22
    def run(self, **kwargs):
        is_json = kwargs['is_json']

        if kwargs['format_version'] in [None, 'None']:
            if is_json:
                kwargs['format_version'] = '1.0.0'
            else:
                kwargs['format_version'] = '2.0.0'

        # this is not pyqi-appropriate, but how we parse this thing is
        # dependent on runtime options :(
        with biom_open(kwargs['table']) as f:
            if is_json:
                kwargs['table'] = json.load(f)
                return self._validate_json(**kwargs)
            elif HAVE_H5PY:
                import h5py
                kwargs['table'] = f

                if not isinstance(f, h5py.File):
                    print("Attempting to validate an HDF5 BIOM table, but the "
                          "table does not appear to be in HDF5 format!")
                    sys.exit(1)
                return self._validate_hdf5(**kwargs)
            else:
                raise IOError("h5py is not installed, can only validate JSON "
                              "tables")
Example #23
def load_hdf5_or_json(fp):
    """Return a parsed JSON object or an HDF5 object"""
    with biom_open(fp) as f:
        if hasattr(f, 'seek'):
            return json.load(f)
        else:
            return f
Example #24
    def _get_distance_matrix(self, X):
        """
        computes UniFrac distances with the fitted samples

        Parameters
        ----------
        X : biom.Table
            new samples

        Returns
        -------
        dm : DistanceMatrix
            distances from old samples to new samples

        """
        # TODO one problem with this approach is that
        #  if any samples in X overlap self.table, the counts will
        #  be doubled
        merged_table = self.table.merge(X)
        with tempfile.NamedTemporaryFile() as f:
            with biom_open(f.name, 'w') as b:
                merged_table.to_hdf5(b, "merged")

            dm = ssu(
                f.name,
                self.tree_path,
                unifrac_method=self.unifrac_method,
                variance_adjust=False,
                alpha=1.0,
                bypass_tips=False,
                threads=1,
            )
        return dm
Example #25
def build_OTU_table_biom(OTU_table_classic, OTU_table_biom, dataset_ID):
    # Builds a BIOM-format OTU table from an OTU table in classic dense format
    # (sample IDs in the first row, OTU IDs in the first column). For some
    # reason, the 'biom convert' command fails to recognize some OTU tables,
    # and therefore the classic2biom method (above) fails. Look into this
    # sometime...
    with open(OTU_table_classic, 'r') as fidin:
        otu_table_data = fidin.readlines()
        firstrow = otu_table_data[0].split('\t')
        sample_labels = firstrow[1:]
        sample_labels[len(sample_labels) -
                      1] = sample_labels[len(sample_labels) - 1].rstrip('\n')
        OTU_labels = [
            otu_table_data[i].split('\t')[0]
            for i in range(1, len(otu_table_data))
        ]
        nOTUs = len(OTU_labels)
        nSamples = len(sample_labels)
        # Load OTU table row major order
        OTU_table_data = np.zeros((nOTUs, nSamples))
        for i in range(1, nOTUs + 1):
            OTU_table_data[i - 1, :] = otu_table_data[i].split('\t')[1:]
        # Write in BIOM format
        t = Table(OTU_table_data,
                  OTU_labels,
                  sample_labels,
                  observation_metadata=None,
                  sample_metadata=None,
                  table_id=dataset_ID)
        with biom_open(OTU_table_biom, 'w') as f:
            t.to_hdf5(f, "Generated by processing layer", compress=False)
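For reference, a sketch of the classic dense input the converter above expects (tab-separated, sample IDs in the first row, OTU IDs in the first column); the file names and counts are made up:

classic_fp = 'otu_table_classic.txt'
with open(classic_fp, 'w') as f:
    f.write('#OTU ID\tS1\tS2\n'
            'OTU_1\t5\t0\n'
            'OTU_2\t1\t3\n')
build_OTU_table_biom(classic_fp, 'otu_table.biom', 'demo_dataset')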
Example #26
    def setUp(self):
        # Register the URIs for the QiitaClient
        httpretty.register_uri(
            httpretty.POST,
            "https://test_server.com/qiita_db/authenticate/",
            body='{"access_token": "token", "token_type": "Bearer", '
            '"expires_in": "3600"}')

        self.qclient = QiitaClient('https://test_server.com', 'client_id',
                                   'client_secret')
        # Create a biom table
        fd, self.biom_fp = mkstemp(suffix=".biom")
        close(fd)
        data = np.asarray([[0, 0, 1], [1, 3, 42]])
        table = Table(data, ['O1', 'O2'], ['1.S1', '1.S2', '1.S3'])
        with biom_open(self.biom_fp, 'w') as f:
            table.to_hdf5(f, "Test")
        self.out_dir = mkdtemp()
        self.parameters = {
            'template': 1,
            'files': '{"BIOM": ["%s"]}' % self.biom_fp,
            'artifact_type': 'BIOM'
        }

        self._clean_up_files = [self.biom_fp, self.out_dir]
Example #27
def write_biom_table(biom_table, biom_table_fp, compress=True,
                     write_hdf5=HAVE_H5PY, format_fs=None):
    """Writes a BIOM table to the specified filepath

    Parameters
    ----------
    biom_table : biom.Table
        The table object to write out
    biom_table_fp : str
        The path to the output file
    compress : bool, optional
        Defaults to ``True``. If True, built-in compression on the output HDF5
        file will be enabled. This option is only relevant if ``write_hdf5`` is
        ``True``.
    write_hdf5 : bool, optional
        Defaults to ``True`` if H5PY is installed and to ``False`` if H5PY is
        not installed. If ``True`` the output biom table will be written as an
        HDF5 binary file, otherwise it will be a JSON string.
    format_fs : dict, optional
        Formatting functions to be passed to `Table.to_hdf5`

    Notes
    -----
    This code was adapted from QIIME 1.9
    """
    generated_by = "PICRUSt " + __version__

    if write_hdf5:
        with biom_open(biom_table_fp, 'w') as biom_file:
            biom_table.to_hdf5(biom_file, generated_by, compress,
                               format_fs=format_fs)
    else:
        with open(biom_table_fp, 'w') as biom_file:
            biom_table.to_json(generated_by, biom_file)
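A usage sketch for write_biom_table, assuming a small in-memory table; the __version__ and HAVE_H5PY globals come from the surrounding module:

import numpy as np
from biom import Table

toy = Table(np.array([[1, 2], [0, 3]]), ['K00001', 'K00002'], ['sampleA', 'sampleB'])
write_biom_table(toy, 'predictions.biom')                    # HDF5 output if h5py is installed
write_biom_table(toy, 'predictions.json', write_hdf5=False)  # force JSON output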
Example #28
    def _create_job_and_biom(self, sample_ids, template=None, analysis=None):
        # Create the BIOM table that needs to be valdiated
        fd, biom_fp = mkstemp(suffix=".biom")
        close(fd)
        data = np.random.randint(100, size=(2, len(sample_ids)))
        table = Table(data, ['O1', 'O2'], sample_ids)
        with biom_open(biom_fp, 'w') as f:
            table.to_hdf5(f, "Test")
        self._clean_up_files.append(biom_fp)

        # Create a new job
        parameters = {
            'template': template,
            'files': dumps({'biom': [biom_fp]}),
            'artifact_type': 'BIOM',
            'analysis': analysis
        }
        data = {
            'command': dumps(['BIOM type', '2.1.4', 'Validate']),
            'parameters': dumps(parameters),
            'status': 'running'
        }
        res = self.qclient.post('/apitest/processing_job/', data=data)
        job_id = res['job']

        return biom_fp, job_id, parameters
Example #30
    def test_execute_job_error(self):
        # Create a prep template
        prep_info = {'SKB8.640193': {'col': 'val1'},
                     'SKD8.640184': {'col': 'val2'}}
        data = {'prep_info': dumps(prep_info),
                'study': 1,
                'data_type': '16S'}
        template = self.qclient.post(
            '/apitest/prep_template/', data=data)['prep']
        # Create a new validate job
        fd, biom_fp = mkstemp(suffix=".biom")
        close(fd)
        data = np.random.randint(100, size=(2, 2))
        table = Table(data, ['O1', 'O2'], ['S1', 'S2'])
        with biom_open(biom_fp, 'w') as f:
            table.to_hdf5(f, "Test")
        data = {'command': dumps(['BIOM type', '2.1.4', 'Validate']),
                'parameters': dumps(
                    {'files': dumps({'biom': [biom_fp]}),
                     'template': template,
                     'artifact_type': 'BIOM'}),
                'artifact_type': 'BIOM',
                'status': 'queued'}
        job_id = self.qclient.post(
            '/apitest/processing_job/', data=data)['job']

        plugin("https://localhost:21174", job_id, self.out_dir)
        obs = self._wait_job(job_id)
        self.assertEqual(obs, 'error')
Example #32
    def run(self, **kwargs):
        is_json = not is_hdf5_file(kwargs['table'])

        if kwargs['format_version'] in [None, 'None']:
            if is_json:
                kwargs['format_version'] = '1.0.0'
            else:
                kwargs['format_version'] = '2.1'
        else:
            if is_json:
                raise ValueError("Only format 1.0.0 is valid for JSON")

            fmt_ver = [int(v) for v in kwargs['format_version'].split('.')]
            if tuple(fmt_ver) not in self.HDF5FormatVersions:
                raise ValueError("Unrecognized format version: %s" %
                                 kwargs['format_version'])

        with biom_open(kwargs['table']) as f:
            if is_json:
                kwargs['table'] = json.load(f)
                return self._validate_json(**kwargs)
            elif HAVE_H5PY:
                import h5py
                kwargs['table'] = f

                if not isinstance(f, h5py.File):
                    print("Attempting to validate an HDF5 BIOM table, but the "
                          "table does not appear to be in HDF5 format!")
                    sys.exit(1)
                return self._validate_hdf5(**kwargs)
            else:
                raise IOError("h5py is not installed, can only validate JSON "
                              "tables")
Example #33
def write_biom_table(biom_table, biom_table_fp, compress=True,
                     write_hdf5=HAVE_H5PY, format_fs=None):
    """Writes a BIOM table to the specified filepath

    Parameters
    ----------
    biom_table : biom.Table
        The table object to write out
    biom_table_fp : str
        The path to the output file
    compress : bool, optional
        Defaults to ``True``. If True, built-in compression on the output HDF5
        file will be enabled. This option is only relevant if ``write_hdf5`` is
        ``True``.
    write_hdf5 : bool, optional
        Defaults to ``True`` if H5PY is installed and to ``False`` if H5PY is
        not installed. If ``True`` the output biom table will be written as an
        HDF5 binary file, otherwise it will be a JSON string.
    format_fs : dict, optional
        Formatting functions to be passed to `Table.to_hdf5`

    Notes
    -----
    This code was adapted from QIIME 1.9
    """
    generated_by = "Microbiome Helper"

    if write_hdf5:
        with biom_open(biom_table_fp, 'w') as biom_file:
            biom_table.to_hdf5(biom_file, generated_by, compress,
                               format_fs=format_fs)
    else:
        with open(biom_table_fp, 'w') as biom_file:
            biom_table.to_json(generated_by, biom_file)
Example #34
def load_table(f):
    r"""Load a `Table` from a path

    Parameters
    ----------
    f : str

    Returns
    -------
    Table

    Raises
    ------
    IOError
        If the path does not exist
    TypeError
        If the data in the path does not appear to be a BIOM table

    Examples
    --------
    Parse a table from a path. BIOM will attempt to determine if the file
    is either in TSV, HDF5, JSON, gzip'd JSON or gzip'd TSV and parse
    accordingly:

    >>> from biom import load_table
    >>> table = load_table('path/to/table.biom') # doctest: +SKIP

    """
    from biom.util import biom_open
    with biom_open(f) as fp:
        try:
            table = parse_table(fp)
        except (IndexError, TypeError):
            raise TypeError("%s does not appear to be a BIOM file!" % f)
    return table
Example #35
    def test_validate_prefix(self):
        httpretty.register_uri(
            httpretty.POST,
            "https://test_server.com/qiita_db/jobs/job-id/step/")
        httpretty.register_uri(
            httpretty.GET,
            "https://test_server.com/qiita_db/prep_template/1/data",
            body='{"data": {"1.S1": {"orig_name": "S1"}, "1.S2": '
            '{"orig_name": "S2"}, "1.S3": {"orig_name": "S3"}}}')

        fd, biom_fp = mkstemp(suffix=".biom")
        close(fd)
        data = np.asarray([[0, 0, 1], [1, 3, 42]])
        table = Table(data, ['O1', 'O2'], ['S1', 'S2', 'S3'])
        with biom_open(biom_fp, 'w') as f:
            table.to_hdf5(f, "Test")

        self._clean_up_files.append(biom_fp)

        self.parameters['files'] = '{"BIOM": ["%s"]}' % biom_fp

        obs_success, obs_ainfo, obs_error = validate(self.qclient, 'job-id',
                                                     self.parameters,
                                                     self.out_dir)
        exp_biom_fp = join(self.out_dir, basename(biom_fp))
        self._clean_up_files.append(exp_biom_fp)
        self.assertTrue(obs_success)
        self.assertEqual(obs_ainfo, [[None, 'BIOM', [exp_biom_fp, 'biom']]])
        self.assertEqual(obs_error, "")
        obs_t = load_table(exp_biom_fp)
        self.assertItemsEqual(obs_t.ids(), ["1.S1", "1.S2", "1.S3"])
Example #36
def main():
    args = prog_options()

    try:
        biomf = biom.load_table(args.in_biomf)
    except IOError as ioe:
        sys.exit("Error with input BIOM format file: {}".format(ioe))
    else:
        biomf_pa = biomf.pa(
            inplace=False)  # convert to presence/absence BIOM table
        obs_ids = biomf_pa.ids("observation")

    try:
        mheader, mdata = parse_map_file(args.map_fnh)
    except IOError as ioe:
        sys.exit("Error with input mapping file: {}".format(ioe))
    else:
        if args.group_by:
            sid_cat = gather_categories(mdata, mheader, [args.group_by])
        else:
            sid_cat = gather_categories(mdata, mheader)

    # calculate core
    core_calc = {k: set() for k in sid_cat.keys()}
    for idx in obs_ids:
        for cat, val in sid_cat.iteritems():
            obs_count = 0
            num_of_samples = len(val.sids)
            for sid in val.sids:
                try:
                    assert biomf_pa.get_value_by_ids(idx, sid) == 1
                except AssertionError:
                    continue
                else:
                    obs_count += 1
            try:
                assert obs_count > round(args.core_pct * num_of_samples)
            except AssertionError:
                continue
            else:
                core_calc[cat].add(idx)

    # Check if output directory exists, if not, create it
    try:
        assert os.path.exists(os.path.abspath(args.out_fnh)) is True
    except AssertionError:
        os.makedirs(os.path.abspath(args.out_fnh))
    finally:
        for k, v in core_calc.iteritems():
            print("{0} core IDs in {1}".format(len(v), k))
            idx_filename = os.path.join(os.path.abspath(args.out_fnh),
                                        k + "_80_pct_core_ids.txt")
            with open(idx_filename, "w") as of:
                of.write("{0}".format("\n".join(sorted(v))))
            filtered_biomf = biomf.filter(v, axis="observation", inplace=False)
            if args.biom_out:
                biom_filename = os.path.join(os.path.abspath(args.out_fnh),
                                             k + "_80_pct_core.biom")
                with biom_open(biom_filename, "w") as f:
                    filtered_biomf.to_hdf5(f, "CORE BIOM")
Example #37
def deposit(table, groups, truth, output_table, output_groups, output_truth):
    t = Table(table.T.values, table.columns.values, table.index.values)
    with biom_open(output_table, 'w') as f:
        t.to_hdf5(f, generated_by='moi')
    groups.to_csv(output_groups, sep='\t')
    with open(output_truth, 'w') as f:
        f.write(','.join(truth))
Example #38
def noisify(table_file, metadata_file, sigma, output_file):

    metadata = pd.read_table(metadata_file, index_col=0)
    table = load_table(table_file)
    table = pd.DataFrame(np.array(table.matrix_data.todense()).T,
                         index=table.ids(axis='sample'),
                         columns=table.ids(axis='observation'))
    cov = np.eye(table.shape[1] - 1)
    m_noise = compositional_noise(cov, nsamp=table.shape[0])
    table_ = table.values
    table_ = np.vstack(
        [perturb(table_[i, :], m_noise[i, :]) for i in range(table_.shape[0])])

    # note that this assumes that the column is named `library_size`
    table_ = pd.DataFrame(
        multinomial_sample(table_, depths=metadata['library_size']))
    table_.index = table.index
    table_.columns = list(table.columns)

    metadata['observed'] = np.sum(table_.sum(axis=0) > 0)
    metadata['unobserved'] = np.sum(table_.sum(axis=0) == 0)
    metadata.to_csv(metadata_file, sep='\t')

    # drop zeros -- they are not informative
    table_ = table_.loc[:, table_.sum(axis=0) > 0]
    t = Table(table_.T.values, table_.columns.values, table_.index.values)
    with biom_open(output_file, 'w') as f:
        t.to_hdf5(f, generated_by='moi')
Example #39
def main(argv):
    parser = argparse.ArgumentParser(
        description=
        'Select Gammaproteobacteria (or other group) contamination candidates')
    parser.add_argument('-i', '--biom', help='biom file of the experiment')
    parser.add_argument('-o', '--output', help='output file name')
    parser.add_argument('-c',
                        '--classpos',
                        help='class of taxonomy name (0-kingdom, 1-phylum, etc.)',
                        default=2,
                        type=int)
    parser.add_argument('-t',
                        '--taxonomy',
                        help='taxonomy name (including c__ or equivalent)',
                        default='c__Gammaproteobacteria')
    parser.add_argument(
        '-l',
        '--level',
        help='minimal cumulative level to filter (0 to get all)',
        default='0.03',
        type=float)

    args = parser.parse_args(argv)

    # load the biom table
    biom_table = parse_biom_table(biom_open(args.biom, 'U'))
    # find the high freq. OTUs
    result = get_high_freq_otus(biom_table, args.classpos, args.taxonomy,
                                args.level)

    # and write them to the file
    with open(args.output, 'w') as snames:
        for cstr in result:
            snames.write(cstr + '\n')
Example #40
def load_table(f):
    r"""Load a `Table` from a path

    Parameters
    ----------
    f : str

    Returns
    -------
    Table

    Raises
    ------
    IOError
        If the path does not exist
    TypeError
        If the data in the path does not appear to be a BIOM table

    Examples
    --------
    Parse a table from a path. BIOM will attempt to determine if the file
    is either in TSV, HDF5, JSON, gzip'd JSON or gzip'd TSV and parse
    accordingly:

    >>> from biom import load_table
    >>> table = load_table('path/to/table.biom') # doctest: +SKIP

    """
    with biom_open(f) as fp:
        try:
            table = parse_biom_table(fp)
        except (IndexError, TypeError):
            raise TypeError("%s does not appear to be a BIOM file!" % f)
    return table
Example #41
    def test_faith_pd_invalid_input(self):
        # tests are based of skbio tests, checking for duplicate ids,
        # negative counts are not included but should be incorporated

        # tree has duplicated tip ids
        tree = TreeNode.read(
            StringIO('((OTU1:0.1, OTU2:0.2):0.3, (OTU3:0.5, OTU4:0.7):1.1)'
                     'root;'))
        otu_ids = ['OTU%d' % i for i in range(1, 5)]
        u_counts = [1, 1, 0, 0]

        data = np.array([u_counts]).T

        bt = Table(data, otu_ids, ['u'])

        ta = os.path.join(gettempdir(), 'table.biom')
        tr = os.path.join(gettempdir(), 'tree.biom')

        self.files_to_delete.append(ta)
        self.files_to_delete.append(tr)

        with biom_open(ta, 'w') as fhdf5:
            bt.to_hdf5(fhdf5, 'Table for unit testing')
        tree.write(tr)

        self.assertRaises(IOError, faith_pd, 'dne.biom', tr)
        self.assertRaises(IOError, faith_pd, ta, 'dne.tre')
Example #42
    def run(self, **kwargs):
        is_json = not is_hdf5_file(kwargs['table'])

        if kwargs['format_version'] in [None, 'None']:
            if is_json:
                kwargs['format_version'] = '1.0.0'
            else:
                kwargs['format_version'] = '2.1'
        elif is_json:
            if kwargs['format_version'] != "1.0.0":
                raise ValueError("Only format 1.0.0 is valid for JSON")
        else:
            fmt_ver = [int(v) for v in kwargs['format_version'].split('.')]
            if tuple(fmt_ver) not in self.HDF5FormatVersions:
                raise ValueError("Unrecognized format version: %s" %
                                 kwargs['format_version'])

        with biom_open(kwargs['table']) as f:
            if is_json:
                kwargs['table'] = json.load(f)
                return self._validate_json(**kwargs)
            elif HAVE_H5PY:
                import h5py
                kwargs['table'] = f

                if not isinstance(f, h5py.File):
                    print("Attempting to validate an HDF5 BIOM table, but the "
                          "table does not appear to be in HDF5 format!")
                    sys.exit(1)
                return self._validate_hdf5(**kwargs)
            else:
                raise IOError("h5py is not installed, can only validate JSON "
                              "tables")
Example #43
    def run(self, **kwargs):
        json_table_str = kwargs['json_table_str']
        hdf5_biom = kwargs['hdf5_table']
        axis = kwargs['axis']
        ids = kwargs['ids']

        if axis not in self.Axes:
            raise CommandError("Invalid axis '%s'. Must be either %s." % (
                axis,
                ' or '.join(map(lambda e: "'%s'" % e, self.Axes))))

        if hdf5_biom is None and json_table_str is None:
            raise CommandError("Must specify an input table")
        elif hdf5_biom is not None and json_table_str is not None:
            raise CommandError("Can only specify one input table")

        if json_table_str is not None:
            idxs, new_axis_md = get_axis_indices(json_table_str, ids, axis)
            new_data = direct_slice_data(json_table_str, idxs, axis)

            # multiple walks over the string. bad form, but easy right now
            # ...should add a yield_and_ignore parser or something.
            def subset_generator():
                yield "{"
                yield direct_parse_key(json_table_str, "id")
                yield ","
                yield direct_parse_key(json_table_str, "format")
                yield ","
                yield direct_parse_key(json_table_str, "format_url")
                yield ","
                yield direct_parse_key(json_table_str, "type")
                yield ","
                yield direct_parse_key(json_table_str, "generated_by")
                yield ","
                yield direct_parse_key(json_table_str, "date")
                yield ","
                yield direct_parse_key(json_table_str, "matrix_type")
                yield ","
                yield direct_parse_key(json_table_str, "matrix_element_type")
                yield ","
                yield new_data
                yield ","
                yield new_axis_md
                yield ","

                if axis == "observation":
                    yield direct_parse_key(json_table_str, "columns")
                else:
                    yield direct_parse_key(json_table_str, "rows")
                yield "}"

            format_ = 'json'
            table = subset_generator()
        else:
            with biom_open(hdf5_biom) as f:
                table = Table.from_hdf5(f, ids=ids, axis=axis)
            format_ = 'hdf5'

        return {'subsetted_table': (table, format_)}
Example #44
def merge_biom_tables(master_fp, additional_fp):
    """
    :param master_fp: str
    :param additional_fp: str
    :return: None
    """
    master = load_table(master_fp)
    master = master.merge(load_table(additional_fp))

    with biom_open(master_fp, 'w') as biom_file:
        master.to_hdf5(biom_file, "amquery", True)
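A minimal sketch of merge_biom_tables with two toy tables (paths and values are made up); biom's Table.merge defaults to a union over both axes, so master.biom ends up holding all samples and observations, with values summed where they overlap:

import numpy as np
from biom import Table, load_table
from biom.util import biom_open

t1 = Table(np.array([[1, 0], [2, 3]]), ['O1', 'O2'], ['S1', 'S2'])
t2 = Table(np.array([[4], [5]]), ['O1', 'O3'], ['S3'])
with biom_open('master.biom', 'w') as f:
    t1.to_hdf5(f, "demo")
with biom_open('extra.biom', 'w') as f:
    t2.to_hdf5(f, "demo")
merge_biom_tables('master.biom', 'extra.biom')
merged = load_table('master.biom')  # 3 observations x 3 samples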
Example #45
def _subset_table(hdf5_biom, json_table_str, axis, ids):
    if axis not in ['sample', 'observation']:
        raise ValueError("Invalid axis '%s'. Must be either 'sample' or "
                         "'observation'." % axis)

    if hdf5_biom is None and json_table_str is None:
        raise ValueError("Must specify an input table")
    elif hdf5_biom is not None and json_table_str is not None:
        raise ValueError("Can only specify one input table")

    if json_table_str is not None:
        idxs, new_axis_md = get_axis_indices(json_table_str, ids, axis)
        new_data = direct_slice_data(json_table_str, idxs, axis)

        # multiple walks over the string. bad form, but easy right now
        # ...should add a yield_and_ignore parser or something.
        def subset_generator():
            yield "{"
            yield direct_parse_key(json_table_str, "id")
            yield ","
            yield direct_parse_key(json_table_str, "format")
            yield ","
            yield direct_parse_key(json_table_str, "format_url")
            yield ","
            yield direct_parse_key(json_table_str, "type")
            yield ","
            yield direct_parse_key(json_table_str, "generated_by")
            yield ","
            yield direct_parse_key(json_table_str, "date")
            yield ","
            yield direct_parse_key(json_table_str, "matrix_type")
            yield ","
            yield direct_parse_key(json_table_str, "matrix_element_type")
            yield ","
            yield new_data
            yield ","
            yield new_axis_md
            yield ","

            if axis == "observation":
                yield direct_parse_key(json_table_str, "columns")
            else:
                yield direct_parse_key(json_table_str, "rows")
            yield "}"

        format_ = 'json'
        table = subset_generator()
    else:
        with biom_open(hdf5_biom) as f:
            table = Table.from_hdf5(f, ids=ids, axis=axis)
        format_ = 'hdf5'

    return table, format_
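A sketch of calling _subset_table on an HDF5 table (path and IDs are placeholders); the JSON branch instead takes the raw JSON string and yields the subsetted document as string fragments:

table, fmt = _subset_table(hdf5_biom='big_table.biom', json_table_str=None,
                           axis='sample', ids=['S1', 'S2', 'S3'])
assert fmt == 'hdf5'  # table is a biom.Table restricted to the requested samples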
Example #46
def write_biom_table(table, biom_output_fp):
    """Write BIOM table to file.

    Parameters
    ----------
    table: biom.Table
        an instance of a BIOM table
    biom_output_fp: str
        filepath to output BIOM table
    """
    with biom_open(biom_output_fp, 'w') as f:
        table.to_hdf5(h5grp=f, generated_by="tcga-kraken-translate")
Example #47
def main():
    args = parser.parse_args()
    n = args.n
    input_fp = args.input_fp


    biom_table = load_table(input_fp)

    obs_ids = biom_table.ids(axis='observation')

    print "{0} total ids\n".format(len(obs_ids))
    
    chunk_size = int(len(obs_ids)/n)

    last_id = -1

    for chunk in range(1,n):

        begin_id = last_id + 1
        end_id = chunk * chunk_size
        print "chunk: {0} begin: {1} end: {2}\n".format(chunk, begin_id, end_id)

        sub_ids = obs_ids[begin_id : end_id]

        sub_table = biom_table.filter(lambda val, id_, md: id_ in sub_ids, axis='observation', invert=False, inplace=False)
        with biom_open('chunk{0}.biom'.format(chunk), 'w') as out_f:
            sub_table.to_hdf5(out_f, "split_biom.py")

        last_id = end_id

    begin_id = last_id + 1
    chunk += 1

    print "chunk: {0} begin: {1} end: {2}\n".format(chunk, begin_id, len(obs_ids))

    sub_ids = obs_ids[last_id + 1 : ]

    sub_table = biom_table.filter(lambda val, id_, md: id_ in sub_ids, axis='observation', invert=False, inplace=False)
    with biom_open('chunk{0}.biom'.format(n), 'w') as out_f:
        sub_table.to_hdf5(out_f, "split_biom.py")
Example #48
    def _build_biom_tables(self, samples, rarefaction_depth):
        """Build tables and add them to the analysis"""
        with qdb.sql_connection.TRN:
            # filter and combine all study BIOM tables needed for
            # each data type
            new_tables = {dt: None for dt in self.data_types}
            base_fp = qdb.util.get_work_base_dir()
            for a_id, samps in viewitems(samples):
                # one biom table attached to each artifact object
                artifact = qdb.artifact.Artifact(a_id)
                table_fp = None
                for _, fp, fp_type in artifact.filepaths:
                    if fp_type == 'biom':
                        table_fp = fp
                        break
                if not table_fp:
                    raise RuntimeError(
                        "Artifact %s do not have a biom table associated"
                        % a_id)
                table = load_table(table_fp)
                # HACKY WORKAROUND FOR DEMO. Issue # 246
                # make sure samples not in biom table are not filtered for
                table_samps = set(table.ids())
                filter_samps = table_samps.intersection(samps)
                # add the metadata column for study the samples come from
                study_meta = {'Study': artifact.study.title,
                              'Processed_id': artifact.id}
                samples_meta = {sid: study_meta for sid in filter_samps}
                # filter for just the wanted samples and merge into new table
                # this if/else setup avoids needing a blank table to
                # start merges
                table.filter(filter_samps, axis='sample', inplace=True)
                table.add_metadata(samples_meta, axis='sample')
                data_type = artifact.data_type
                if new_tables[data_type] is None:
                    new_tables[data_type] = table
                else:
                    new_tables[data_type] = new_tables[data_type].merge(table)

            # add the new tables to the analysis
            _, base_fp = qdb.util.get_mountpoint(self._table)[0]
            for dt, biom_table in viewitems(new_tables):
                # rarefy, if specified
                if rarefaction_depth is not None:
                    biom_table = biom_table.subsample(rarefaction_depth)
                # write out the file
                biom_fp = join(base_fp, "%d_analysis_%s.biom" % (self._id, dt))
                with biom_open(biom_fp, 'w') as f:
                    biom_table.to_hdf5(f, "Analysis %s Datatype %s" %
                                       (self._id, dt))
                self._add_file("%d_analysis_%s.biom" % (self._id, dt),
                               "biom", data_type=dt)
Example #49
0
    def setUp(self):
        # Generate some files for a root artifact
        fd, self.fp1 = mkstemp(suffix='_seqs.fastq')
        close(fd)
        with open(self.fp1, 'w') as f:
            f.write("@HWI-ST753:189:D1385ACXX:1:1101:1214:1906 1:N:0:\n"
                    "NACGTAGGGTGCAAGCGTTGTCCGGAATNA\n"
                    "+\n"
                    "#1=DDFFFHHHHHJJJJJJJJJJJJGII#0\n")

        fd, self.fp2 = mkstemp(suffix='_barcodes.fastq')
        close(fd)
        with open(self.fp2, 'w') as f:
            f.write("@HWI-ST753:189:D1385ACXX:1:1101:1214:1906 2:N:0:\n"
                    "NNNCNNNNNNNNN\n"
                    "+\n"
                    "#############\n")
        self.filepaths_root = [(self.fp1, 1), (self.fp2, 3)]

        # Generate some files for a processed artifact
        fd, self.fp3 = mkstemp(suffix='_seqs.fna')
        close(fd)
        with open(self.fp3, 'w') as f:
            f.write(">1.sid_r4_0 M02034:17:000000000-A5U18:1:1101:15370:1394 "
                    "1:N:0:1 orig_bc=CATGAGCT new_bc=CATGAGCT bc_diffs=0\n"
                    "GTGTGCCAGCAGCCGCGGTAATACGTAGGG\n")
        self.filepaths_processed = [(self.fp3, 4)]

        # Generate a file for a BIOM artifact
        fd, self.fp4 = mkstemp(suffix='_table.biom')
        close(fd)
        with biom_open(self.fp4, 'w') as f:
            # `et` is an example biom Table defined elsewhere in the test module
            et.to_hdf5(f, "test")
        self.filepaths_biom = [(self.fp4, 7)]

        # Create a new prep template
        metadata_dict = {
            'SKB8.640193': {'center_name': 'ANL',
                            'primer': 'GTGCCAGCMGCCGCGGTAA',
                            'barcode': 'GTCCGCAAGTTA',
                            'run_prefix': "s_G1_L001_sequences",
                            'platform': 'ILLUMINA',
                            'instrument_model': 'Illumina MiSeq',
                            'library_construction_protocol': 'AAAA',
                            'experiment_design_description': 'BBBB'}}
        metadata = pd.DataFrame.from_dict(metadata_dict, orient='index',
                                          dtype=str)
        self.prep_template = \
            qdb.metadata_template.prep_template.PrepTemplate.create(
                metadata, qdb.study.Study(1), "16S")

        self._clean_up_files = [self.fp1, self.fp2, self.fp3, self.fp4]
Example #50
0
def load_biom_table_with_file_contents(biom_fp):
    """Return a BIOM table and the original open filehandle as a tuple.

    Useful when additional computation needs to be performed on the file
    contents, such as an MD5 sum.

    WARNING: this function does not close the open filehandle that it returns.
    Users of this function are responsible for closing the filehandle when done
    using it!
    """
    biom_f = biom_open(biom_fp, 'U')
    table = parse_biom_table(biom_f)
    biom_f.seek(0)
    return table, biom_f
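A hedged usage sketch (the path is a placeholder, and a JSON-format BIOM file is assumed since the handle is read as text): compute an MD5 of the raw contents, then close the handle, which the helper leaves to the caller:

import hashlib

table, biom_f = load_biom_table_with_file_contents('otu_table.biom')
try:
    md5sum = hashlib.md5(biom_f.read()).hexdigest()
finally:
    biom_f.close()  # the caller, not the helper, owns the filehandle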
Example #51
0
    def run(self, **kwargs):
        is_json = kwargs['is_json']

        # this is not pyqi-appropriate, but how we parse this thing is
        # dependent on runtime options :(
        with biom_open(kwargs['table']) as f:
            if is_json:
                kwargs['table'] = json.load(f)
                return self._validate_json(**kwargs)
            elif HAVE_H5PY:
                kwargs['table'] = f
                return self._validate_hdf5(**kwargs)
            else:
                raise IOError("h5py is not installed, can only validate JSON "
                              "tables")
Example #52
0
    def test_rarefy_to_files(self):
        """rarefy_to_files should write valid files

        """
        maker = RarefactionMaker(self.otu_table_fp, 0, 1, 1, 1)
        maker.rarefy_to_files(
            self.rare_dir,
            include_full=True,
            include_lineages=False)

        fname = os.path.join(self.rare_dir, "rarefaction_1_0.biom")
        with biom_open(fname, 'U') as biom_file:
            otu_table = Table.from_hdf5(biom_file)

        self.assertItemsEqual(
            otu_table.sample_ids,
            self.otu_table.sample_ids[:2])
Example #53
0
    def _build_biom_tables(self, samples, rarefaction_depth,
                           conn_handler=None):
        """Build tables and add them to the analysis"""
        # filter and combine all study BIOM tables needed for each data type
        new_tables = {dt: None for dt in self.data_types}
        base_fp = get_work_base_dir()
        for pid, samps in viewitems(samples):
            # one biom table attached to each processed data object
            proc_data = ProcessedData(pid)
            proc_data_fp = proc_data.get_filepaths()[0][0]
            table_fp = join(base_fp, proc_data_fp)
            table = load_table(table_fp)
            # HACKY WORKAROUND FOR DEMO. Issue # 246
            # make sure samples not in biom table are not filtered for
            table_samps = set(table.ids())
            filter_samps = table_samps.intersection(samps)
            # add the metadata column for study the samples come from
            study_meta = {'Study': Study(proc_data.study).title,
                          'Processed_id': proc_data.id}
            samples_meta = {sid: study_meta for sid in filter_samps}
            # filter for just the wanted samples and merge into new table
            # this if/else setup avoids needing a blank table to start merges
            table.filter(filter_samps, axis='sample', inplace=True)
            table.add_metadata(samples_meta, axis='sample')
            data_type = proc_data.data_type()
            if new_tables[data_type] is None:
                new_tables[data_type] = table
            else:
                new_tables[data_type] = new_tables[data_type].merge(table)

        # add the new tables to the analysis
        conn_handler = conn_handler if conn_handler is not None \
            else SQLConnectionHandler()
        base_fp = get_db_files_base_dir(conn_handler)
        for dt, biom_table in viewitems(new_tables):
            # rarefy, if specified
            if rarefaction_depth is not None:
                biom_table = biom_table.subsample(rarefaction_depth)
            # write out the file
            biom_fp = join(base_fp, "analysis", "%d_analysis_%s.biom" %
                           (self._id, dt))
            with biom_open(biom_fp, 'w') as f:
                biom_table.to_hdf5(f, "Analysis %s Datatype %s" %
                                   (self._id, dt))
            self._add_file("%d_analysis_%s.biom" % (self._id, dt),
                           "biom", data_type=dt, conn_handler=conn_handler)
Example #54
0
    def test_json_to_hdf5_collapsed_metadata(self):
        """Correctly converts json to HDF5 changing the observation metadata"""
        with biom_open(self.json_collapsed_obs) as f:
            obs = self.cmd(table=parse_biom_table(f), to_hdf5=True,
                           collapsed_observations=True)
        self.assertEqual(obs.keys(), ['table'])
        exp = Table(np.array([[2., 1., 1., 0., 0., 1.],
                              [0., 0., 1., 4., 0., 2.],
                              [5., 1., 0., 2., 3., 1.],
                              [0., 1., 2., 0., 0., 0.]]),
                    observation_ids=['p__Firmicutes', 'p__Euryarchaeota',
                                     'p__Cyanobacteria', 'p__Proteobacteria'],
                    sample_ids=['Sample1', 'Sample2', 'Sample3',
                                'Sample4', 'Sample5', 'Sample6'],
                    observation_metadata=[
                        {'collapsed_ids': ['GG_OTU_4']},
                        {'collapsed_ids': ['GG_OTU_3']},
                        {'collapsed_ids': ['GG_OTU_2']},
                        {'collapsed_ids': ['GG_OTU_1', 'GG_OTU_5']}],
                    sample_metadata=[
                        {'LinkerPrimerSequence': 'CATGCTGCCTCCCGTAGGAGT',
                         'BarcodeSequence': 'CGCTTATCGAGA',
                         'Description': 'human gut',
                         'BODY_SITE': 'gut'},
                        {'LinkerPrimerSequence': 'CATGCTGCCTCCCGTAGGAGT',
                         'BarcodeSequence': 'CATACCAGTAGC',
                         'Description': 'human gut',
                         'BODY_SITE': 'gut'},
                        {'LinkerPrimerSequence': 'CATGCTGCCTCCCGTAGGAGT',
                         'BarcodeSequence': 'CTCTCTACCTGT',
                         'Description': 'human gut',
                         'BODY_SITE': 'gut'},
                        {'LinkerPrimerSequence': 'CATGCTGCCTCCCGTAGGAGT',
                         'BarcodeSequence': 'CTCTCGGCCTGT',
                         'Description': 'human skin',
                         'BODY_SITE': 'skin'},
                        {'LinkerPrimerSequence': 'CATGCTGCCTCCCGTAGGAGT',
                         'BarcodeSequence': 'CTCTCTACCAAT',
                         'Description': 'human skin',
                         'BODY_SITE': 'skin'},
                        {'LinkerPrimerSequence': 'CATGCTGCCTCCCGTAGGAGT',
                         'BarcodeSequence': 'CTAACTACCAAT',
                         'Description': 'human skin',
                         'BODY_SITE': 'skin'}],
                    type='OTU table')
        self.assertEqual(obs['table'][0], exp)
Example #55
0
def rarefy(qclient, job_id, parameters, out_dir):
    """rarefy a table

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to rarefy
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job
    """
    out_dir = join(out_dir, 'rarefy')

    qclient.update_job_step(job_id, "Step 1 of 2: Collecting information")
    artifact_id = int(parameters['BIOM table'])
    rarefy_level = int(parameters['Sampling depth'])
    artifact_info = qclient.get("/qiita_db/artifacts/%d/" % artifact_id)

    # get just the biom file; [0] because there should be only one
    to_rarefy = artifact_info['files']['biom'][0]
    qclient.update_job_step(job_id, "Step 2 of 2: Rarefying")
    b = load_table(to_rarefy)

    if not exists(out_dir):
        mkdir(out_dir)

    rarefied = b.subsample(rarefy_level)
    if rarefied.sum() == 0:
        return False, None, "Rarefaction level too high %d" % rarefy_level

    rarefied_fp = join(out_dir, 'rarefied.biom')
    with biom_open(rarefied_fp, 'w') as bf:
        rarefied.to_hdf5(bf, "Qiita's Qiime2 plugin")

    ainfo = [ArtifactInfo('Rarefied table', 'BIOM', [(rarefied_fp, 'biom')])]

    return True, ainfo, ""
Example #56
0
def build_OTU_table_biom(OTU_table_classic, OTU_table_biom, dataset_ID):
    # Builds a BIOM-format OTU table from an OTU table in classic dense format
    # (sample IDs in the first row, OTU IDs in the first column). For some
    # reason, the 'biom convert' command fails to recognize some OTU tables,
    # and therefore the classic2biom method (above) fails.
    with open(OTU_table_classic,'r') as fidin:
        otu_table_data = fidin.readlines()
        firstrow = otu_table_data[0].split('\t')
        sample_labels = firstrow[1:]
        sample_labels[len(sample_labels)-1] = sample_labels[len(sample_labels)-1].rstrip('\n')
        OTU_labels = [otu_table_data[i].split('\t')[0] for i in range(1,len(otu_table_data))]
        nOTUs = len(OTU_labels)
        nSamples = len(sample_labels)
        # Load OTU table row major order
        OTU_table_data = np.zeros((nOTUs, nSamples))
        for i in range(1,nOTUs+1):
            OTU_table_data[i-1,:] = otu_table_data[i].split('\t')[1:]
        # Write in BIOM format
        t = Table(OTU_table_data, OTU_labels, sample_labels, observ_metadata=None, sample_metadata=None, table_id=dataset_ID)
        with biom_open(OTU_table_biom, 'w') as f:
            t.to_hdf5(f, "Generated by processing layer", compress=False)
Example #57
0
    def test_json_to_hdf5_collapsed_samples(self):
        """Correctly converts json to HDF5 changing the sample metadata"""
        with biom_open(self.json_collapsed_samples) as f:
            obs = self.cmd(table=parse_biom_table(f), to_hdf5=True,
                           collapsed_samples=True)
        self.assertEqual(obs.keys(), ['table'])
        exp = Table(np.array([[0., 1.], [6., 6.], [6., 1.],
                              [1., 4.], [0., 2.]]),
                    observation_ids=['GG_OTU_1', 'GG_OTU_2', 'GG_OTU_3',
                                     'GG_OTU_4', 'GG_OTU_5'],
                    sample_ids=['skin', 'gut'],
                    observation_metadata=[
                        {'taxonomy': ['k__Bacteria', 'p__Proteobacteria',
                                      'c__Gammaproteobacteria',
                                      'o__Enterobacteriales',
                                      'f__Enterobacteriaceae',
                                      'g__Escherichia', 's__']},
                        {'taxonomy': ['k__Bacteria', 'p__Cyanobacteria',
                                      'c__Nostocophycideae', 'o__Nostocales',
                                      'f__Nostocaceae', 'g__Dolichospermum',
                                      's__']},
                        {'taxonomy': ['k__Archaea', 'p__Euryarchaeota',
                                      'c__Methanomicrobia',
                                      'o__Methanosarcinales',
                                      'f__Methanosarcinaceae',
                                      'g__Methanosarcina', 's__']},
                        {'taxonomy': ['k__Bacteria', 'p__Firmicutes',
                                      'c__Clostridia', 'o__Halanaerobiales',
                                      'f__Halanaerobiaceae',
                                      'g__Halanaerobium',
                                      's__Halanaerobiumsaccharolyticum']},
                        {'taxonomy': ['k__Bacteria', 'p__Proteobacteria',
                                      'c__Gammaproteobacteria',
                                      'o__Enterobacteriales',
                                      'f__Enterobacteriaceae',
                                      'g__Escherichia', 's__']}],
                    sample_metadata=[
                        {'collapsed_ids': ['Sample5', 'Sample4', 'Sample6']},
                        {'collapsed_ids': ['Sample1', 'Sample3', 'Sample2']}],
                    type='OTU table')
        self.assertEqual(obs['table'][0], exp)
Example #58
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    mapping_fp = opts.mapping_fp
    mapping_category = opts.mapping_category
    otu_table_fp = opts.otu_table_fp
    output_fp = opts.output_fp
    normalize = opts.normalize

    # define a function that returns the bin a sample should be placed into
    bin_function = lambda id_, sample_metadata:\
        sample_metadata[mapping_category]
    # parse the sample metadata and add it to the OTU table (we assume that
    # sample metadata is not already present in the table)
    mapping, headers, comments = parse_mapping_file(open(mapping_fp, 'U'))

    # added in ability to combine metadata columns and summarize based on the
    # new combined category
    if '&&' in mapping_category:
        new_mapping = []
        new_mapping.append(headers)
        for i in range(len(mapping)):
            new_mapping.append(mapping[i])
        # Create an array using multiple columns from mapping file
        combinecolorby = mapping_category.split('&&')
        mapping = combine_map_label_cols(combinecolorby, new_mapping)

    sample_metadata = mapping_file_to_dict(mapping, headers)
    with biom_open(otu_table_fp, 'U') as biom_file:
        table = parse_biom_table(biom_file)
    table.add_metadata(sample_metadata)
    # create a new OTU table where samples are binned based on their return
    # value from bin_function
    result = table.collapse(bin_function, norm=False, min_group_size=1,
                            axis='sample')

    # normalize the result if requested by the user
    if normalize:
        result.norm(axis='sample', inplace=True)

    # write a new BIOM file
    write_biom_table(result, output_fp)
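The same collapse-by-category idea in isolation, as a minimal hedged sketch (the file path, sample ids, and metadata are made up):

from biom import load_table

table = load_table('otu_table.biom')
table.add_metadata({'S1': {'BODY_SITE': 'gut'},
                    'S2': {'BODY_SITE': 'gut'},
                    'S3': {'BODY_SITE': 'skin'}}, axis='sample')
# samples sharing a BODY_SITE value are summed into one collapsed sample
binned = table.collapse(lambda id_, md: md['BODY_SITE'],
                        norm=False, min_group_size=1, axis='sample')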
Example #59
0
    def test_write_biom(self):
        with tempfile.NamedTemporaryFile(suffix='biom') as biom:
            with biom_open(biom.name, 'w') as f:
                s = Stats_And_Summary()
                s.write_biom(('sample1', 'sample2'),
                             [{'readname': ['ab', 'c'], 'readnameE': ['ab', 'd']},
                              {'readname2': ['ab', 'c']}],
                             f)

            with tempfile.NamedTemporaryFile(suffix='csv') as biom_out:
                os.remove(biom_out.name)  # delete because otherwise biom complains
                subprocess.check_call(
                    "biom convert -i %s -o %s --table-type 'OTU table' "
                    "--to-tsv --header-key taxonomy" %
                    (biom.name, biom_out.name), shell=True)
                observed = open(biom_out.name).read()
                self.assertEqual('''# Constructed from biom file
#OTU ID\tsample1\tsample2\ttaxonomy
1\t1.0\t0.0\tab; d
2\t1.0\t1.0\tab; c''', observed)