def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    if isdir(opts.otu_table_fp):
        ret_code = create_dir(opts.output_fp, fail_on_exist=False)
        # run on each file in dir
        for fp in glob(opts.otu_table_fp + '/*biom'):
            parent_dir_name, file_name = split(fp)
            basename, extension = splitext(file_name)
            out_fp = opts.output_fp + "/" + basename + "_shared_OTUs.txt"

            with open(out_fp, 'w') as out_fh:
                out_fh.write(calc_shared_phylotypes(load_table(fp),
                                                    opts.reference_sample))
    else:
        # run in single file mode
        try:
            out_fh = open(opts.output_fp, "w")
        except IOError as message:
            exit(("Can't open output file %s for writing. Check the "
                  "permissions or existing directory with identical "
                  "name.\n%s") % (opts.output_fp, message))
        out_fh.write(calc_shared_phylotypes(load_table(opts.otu_table_fp),
                                            opts.reference_sample))
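
calc_shared_phylotypes here comes from QIIME and returns a tab-delimited matrix of shared-OTU counts. As a hedged sketch of the underlying idea (illustrative only, not the actual QIIME implementation; shared_phylotypes_sketch is a made-up name):

import numpy as np
from biom import Table

def shared_phylotypes_sketch(table):
    """Count, for every pair of samples, the OTUs observed in both."""
    ids = table.ids()
    # record the set of OTU indices with non-zero counts per sample
    presence = {}
    for values, sample_id, _ in table.iter(axis='sample'):
        presence[sample_id] = set(np.flatnonzero(values))
    lines = ['\t' + '\t'.join(ids)]
    for i in ids:
        row = [str(len(presence[i] & presence[j])) for j in ids]
        lines.append('%s\t%s' % (i, '\t'.join(row)))
    return '\n'.join(lines) + '\n'

# tiny in-memory example
print(shared_phylotypes_sketch(Table(np.array([[1, 0], [3, 2]]),
                                     ['O1', 'O2'], ['S1', 'S2'])))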
    def test_normalize_table_CSS(self):
        """OTU table IDs should be the same before and after CSS normalization
        """
        q = load_table(self.tmp_otu_fp)
        self.assertItemsEqual(
            q.ids(),
            load_table(self.tmp_otu_fp_out_CSS).ids())

        # test that taxonomy is carried over to the CSS output; DESeq gives
        # negative values, so no taxonomy is added there
        self.assertItemsEqual(
            q.metadata(axis='observation'),
            load_table(self.tmp_otu_fp_out_CSS).metadata(axis='observation'))

        """catch any R/metagenomeSeq version changes by testing output against current version
        """
        z = load_table(self.tmp_otu_fp_out_CSS)
        OTU_1848 = [val[28] for (val, otu_id, meta) in z.iter(axis='sample')]
        OTU_1848_CSS = [13.873, 14.185, 13.532, 12.824, 14.666, 14.257, 14.416, 14.993, 13.882, 13.453, 14.84, 14.435, 8.8397, 9.8069, 8.0537, 10.571, 8.3851, 8.27, 6.6582, 8.8221, 11.136, 10.928, 11.419, 9.1489, 9.6962, 14.257, 13.901, 13.288, 13.162, 13.84, 12.759, 13.796, 14.489, 15.433, 13.804, 14.298, 13.484, 14.101]
        assert_almost_equal(OTU_1848, OTU_1848_CSS, decimal=3, 
                            err_msg='possible CSS method change, or version change')

        OTU_88 = [val[0] for (val, otu_id, meta) in z.iter(axis='sample')]
        OTU_88_CSS = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.745, 6.8905, 6.3399, 0.0, 6.8628, 7.0697, 6.6114, 0.0, 0.0, 5.745, 7.3895, 6.4382, 8.7468, 6.6582, 6.1785, 6.398, 6.8088, 7.1685, 6.8088, 0.0, 0.0, 0.0, 7.1685, 0.0, 7.1685]
        assert_almost_equal(OTU_88, OTU_88_CSS, decimal=2, 
                            err_msg='possible CSS method change, or version change')
def compute_mock_results(result_tables,
                         expected_table_lookup,
                         taxonomy_level=6):
    """ Compute precision, recall, and f-measure for result_tables at taxonomy_level

        result_tables: 2d list of tables to be compared to expected tables,
         where the data in the inner list is:
          [dataset_id, reference_database_id, method_id,
           parameter_combination_id, table_fp]
        expected_table_lookup: 2d dict of dataset_id, reference_db_id to BIOM
         table filepath, for the expected result tables
        taxonomy_level: taxonomic level at which to compute results

    """
    results = []
    for dataset_id, reference_id, method_id, params, actual_table_fp in result_tables:
        # parse the expected table (expected tables are assumed to be
        # pre-collapsed, by default at level 6 taxonomy)
        try:
            expected_table_fp = expected_table_lookup[dataset_id][reference_id]
        except KeyError:
            raise KeyError("Can't find expected table for (%s, %s)." %
                           (dataset_id, reference_id))

        try:
            expected_table = load_table(expected_table_fp)
        except ValueError:
            raise ValueError("Couldn't parse BIOM table: %s" %
                             expected_table_fp)

        # parse the actual table and collapse it at the specified taxonomic
        # level
        try:
            actual_table = load_table(actual_table_fp)
        except ValueError:
            raise ValueError("Couldn't parse BIOM table: %s" % actual_table_fp)
        collapse_by_taxonomy = get_taxonomy_collapser(taxonomy_level)
        actual_table = actual_table.collapse(collapse_by_taxonomy, axis='observation', min_group_size=1)

        # compute precision, recall, and f-measure
        try:
            p, r, f = compute_prf(actual_table,
                                  expected_table)
        except ZeroDivisionError:
            p, r, f = -1., -1., -1.

        # compute pearson and spearman
        actual_vector, expected_vector = get_actual_and_expected_vectors(actual_table,
                                                                         expected_table)

        pearson_r, pearson_p = pearsonr(actual_vector, expected_vector)
        spearman_r, spearman_p = spearmanr(actual_vector, expected_vector)

        results.append((dataset_id, reference_id, method_id, params, p, r, f,
                        pearson_r, pearson_p, spearman_r, spearman_p))

    return pd.DataFrame(results, columns=["Dataset", "Reference", "Method",
                                           "Parameters", "Precision", "Recall",
                                           "F-measure", "Pearson r", "Pearson p",
                                           "Spearman r", "Spearman p"])
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_fps = opts.input_fps

    master = load_table(input_fps[0])

    for input_fp in input_fps[1:]:
        master = master.merge(load_table(input_fp))

    write_biom_table(master, opts.output_fp)
Example #5
    def test_biom_v210_format_to_biom_v100_format(self):
        input, obs = self.transform_format(BIOMV210Format, BIOMV100Format,
                                           filename='feature-table_v210.biom')
        exp = biom.load_table(str(input))
        obs = biom.load_table(str(obs))

        self.assertEqual(obs.ids(axis='observation').all(),
                         exp.ids(axis='observation').all())
        self.assertEqual(obs.ids(axis='sample').all(),
                         exp.ids(axis='sample').all())
Example #6
def merge_biom_tables(master_fp, additional_fp):
    """Merge an additional BIOM table into the master table on disk.

    :param master_fp: str, path to the master BIOM table (overwritten in place)
    :param additional_fp: str, path to the BIOM table to merge in
    :return: None
    """
    master = load_table(master_fp)
    master = master.merge(load_table(additional_fp))

    with biom_open(master_fp, 'w') as biom_file:
        master.to_hdf5(biom_file, "amquery", True)
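
Because the merged result is written back to master_fp, repeated calls accumulate tables into one file. Usage sketch (paths hypothetical):

merge_biom_tables('master.biom', 'run_2.biom')
merge_biom_tables('master.biom', 'run_3.biom')  # master.biom now holds all three runs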
Example #7
    def test_run_pick_closed_reference_otus_sortmerna(self):
        """run_pick_closed_reference_otus generates expected results
           using sortmerna
        """

        self.params['pick_otus']['otu_picking_method'] = "sortmerna"

        run_pick_closed_reference_otus(
            self.test_data['seqs'][0],
            self.test_data['refseqs'][0],
            self.test_out,
            self.test_data['refseqs_tax'][0],
            call_commands_serially,
            self.params,
            self.qiime_config,
            parallel=False,
            status_update_callback=no_status_updates)

        input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
        otu_map_fp = join(self.test_out, 'sortmerna_picked_otus',
                          '%s_otus.txt' % input_file_basename)
        otu_table_fp = join(self.test_out, 'otu_table.biom')
        otu_table = load_table(otu_table_fp)
        expected_sample_ids = ['f1', 'f2', 'f3', 'f4', 'p1', 'p2', 't1', 't2']
        self.assertItemsEqual(otu_table.ids(), expected_sample_ids)

        # Number of OTUs matches manually confirmed result
        otu_map_lines = list(open(otu_map_fp))
        num_otus = len(otu_map_lines)
        otu_map_otu_ids = [o.split()[0] for o in otu_map_lines]
        self.assertEqual(num_otus, 3)

        # otu ids are as expected
        self.assertItemsEqual(otu_table.ids(axis='observation'),
                              otu_map_otu_ids)

        # expected number of sequences in OTU table
        number_seqs_in_otu_table = sum([v.sum()
                                       for v in otu_table.iter_data()])
        self.assertEqual(number_seqs_in_otu_table, 117)

        # One tax assignment per otu
        self.assertEqual(len(otu_table.metadata(axis='observation')), 3)

        # Check that the log file is created and has size > 0
        log_fp = glob(join(self.test_out, 'log*.txt'))[0]
        self.assertTrue(getsize(log_fp) > 0)
Example #8
    def test_biom_table_to_biom_v210_format(self):
        filepath = self.get_data_path('feature-table_v210.biom')
        transformer = self.get_transformer(biom.Table, BIOMV210Format)
        input = biom.load_table(filepath)

        obs = transformer(input)
        obs = biom.load_table(str(obs))

        exp = input
        self.assertEqual(obs.ids(axis='observation').all(),
                         exp.ids(axis='observation').all())
        self.assertEqual(obs.ids(axis='sample').all(),
                         exp.ids(axis='sample').all())
Example #9
    def dropped_samples(self):
        """The samples that were selected but dropped in processing

        Returns
        -------
        dict of sets
            Format is {artifact_id: {sample_id, sample_id, ...}, ...}
        """
        with qdb.sql_connection.TRN:
            bioms = self.biom_tables
            if not bioms:
                return {}

            # get all samples selected for the analysis, converting lists to
            # sets for fast membership tests; this keeps overhead low
            # for large analyses
            all_samples = {k: set(v) for k, v in viewitems(self.samples)}

            for biom, filepath in viewitems(bioms):
                table = load_table(filepath)
                ids = set(table.ids())
                for k in all_samples:
                    all_samples[k] = all_samples[k] - ids

            # what's left are unprocessed samples, so return
            return all_samples
def multiple_file_DA_fitZIG(input_dir, output_dir, mapping_fp, mapping_category, subcategory_1, subcategory_2):
    """perform metagenomeSeq's Zero Inflated Gaussian (ZIG) OTU differential abundance test on a directory of raw abundance OTU matrices
    """
    if not exists(output_dir):
        makedirs(output_dir)
    file_names = [fname for fname in listdir(input_dir)
                  if not (fname.startswith('.') or
                          isdir(join(input_dir, fname)))]

    for fname in file_names:
        base_fname, ext = splitext(fname)
        original_fname = base_fname+'.biom'
        hdf5_infile = join(input_dir, original_fname)
        tmp_bt = load_table(hdf5_infile) 
        tmp_pmf, _ = parse_mapping_file_to_dict(mapping_fp)
        check_mapping_file_category(tmp_bt, mapping_fp, mapping_category, subcategory_1, subcategory_2)
        tmp_bt.add_metadata(tmp_pmf, 'sample')
        # make a temporary JSON BIOM version; R currently cannot read HDF5 BIOM
        outfile = join(output_dir, 'fitZIG_DA_'+base_fname+'.txt')

        with tempfile.NamedTemporaryFile(dir=get_qiime_temp_dir(),
                                         prefix='QIIME-differential-abundance-temp-table-',
                                         suffix='.biom') as temp_fh:
            temp_fh.write(tmp_bt.to_json('forR'))
            temp_fh.flush()
            run_fitZIG(temp_fh.name, outfile, mapping_category, subcategory_1, subcategory_2) 
def get_relative_abundance(biomfile):
    """
    Return arcsine transformed relative abundance from a BIOM format file.

    :type biomfile: BIOM format file
    :param biomfile: BIOM format file used to obtain relative abundances for each OTU in
                     a SampleID, which are used as node sizes in network plots.

    :type return: Dictionary of dictionaries.
    :return: Dictionary keyed on SampleID whose value is a dictionary keyed
             on OTU Name whose value is the arcsine-transformed relative
             abundance value for that SampleID-OTU Name pair.
    """
    biomf = biom.load_table(biomfile)
    norm_biomf = biomf.norm(inplace=False)
    rel_abd = {}
    for sid in norm_biomf.ids():
        rel_abd[sid] = {}
        for otuid in norm_biomf.ids("observation"):
            otuname = oc.otu_name(norm_biomf.metadata(otuid, axis="observation")["taxonomy"])
            otuname = " ".join(otuname.split("_"))
            abd = norm_biomf.get_value_by_ids(otuid, sid)
            rel_abd[sid][otuname] = abd
    ast_rel_abd = bc.arcsine_sqrt_transform(rel_abd)
    return ast_rel_abd
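
bc.arcsine_sqrt_transform is an external helper not shown in this listing. A minimal sketch of what such a transform plausibly does with the nested {SampleID: {OTU name: proportion}} dict built above, applying asin(sqrt(p)) to each proportion:

import math

def arcsine_sqrt_transform_sketch(rel_abd):
    """Apply the arcsine square-root transform to nested proportions."""
    return {sid: {otu: math.asin(math.sqrt(p)) for otu, p in otus.items()}
            for sid, otus in rel_abd.items()}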
Example #12
    def test_rarefy(self):
        params = {'Sampling depth': 2, 'BIOM table': 5}
        data = {'user': '******',
                'command': dumps(['qiime2', qiime2_version,
                                  'Rarefy features']),
                'status': 'running',
                'parameters': dumps(params)}

        jid = self.qclient.post('/apitest/processing_job/', data=data)['job']

        out_dir = mkdtemp()
        self._clean_up_files.append(out_dir)

        success, ainfo, msg = rarefy(self.qclient, jid, params, out_dir)
        self.assertTrue(success)
        self.assertEqual(msg, '')
        self.assertEqual(ainfo[0].files,
                         [(join(out_dir, 'rarefy', 'rarefied.biom'), 'biom')])
        self.assertEqual(ainfo[0].output_name, 'Rarefied table')

        # test that the table is actually rarefied; ainfo[0] because there is
        # only one element, and files[0][0] because from that element we want
        # the first element of the first tuple
        rb = load_table(ainfo[0].files[0][0])
        # 2 * 7 because we rarefied at 2 sequences per sample and we have
        # 7 samples
        self.assertEqual(rb.sum(), 2 * 7)
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    if opts.level <= 0:
        option_parser.error("level must be greater than zero!")

    collapse_f = make_collapse_f(opts.metadata_category, opts.level,
                                 opts.ignore)
    table = load_table(opts.input_fp)

    if h5py.is_hdf5(opts.input_fp):
        # metadata are not deserializing correctly. Duct tape it.
        update_d = {}
        for i, md in zip(table.ids(axis='observation'),
                         table.metadata(axis='observation')):
            update_d[i] = {k: json.loads(v[0]) for k, v in md.items()}
        table.add_metadata(update_d, axis='observation')

    result = table.collapse(collapse_f, axis='observation', one_to_many=True,
                            norm=False,
                            one_to_many_md_key=opts.metadata_category)

    if opts.format_tab_delimited:
        with open(opts.output_fp, 'w') as f:
            f.write(result.to_tsv(header_key=opts.metadata_category,
                                  header_value=opts.metadata_category,
                                  metadata_formatter=lambda s: '; '.join(s)))
    else:
        format_fs = {opts.metadata_category: vlen_list_of_str_formatter}
        write_biom_table(result, opts.output_fp, format_fs=format_fs)
    def run(self, **kwargs):
        biom_table = kwargs['biom_table']
        axis = kwargs['axis']
        relative_abund = kwargs['relative_abund']
        p_a = kwargs['presence_absence']

        if axis not in self.Axes:
            raise CommandError("Invalid axis '%s'. Must be either %s." % (
                axis,
                ' or '.join(map(lambda e: "'%s'" % e, self.Axes))))

        if biom_table is None:
            raise CommandError("Must specify an input table")

        if relative_abund is False and p_a is False:
            raise CommandError("Must specifiy a normalization type")
        elif relative_abund is True and p_a is True:
            raise CommandError("Must specify only one normalization type")

        table = load_table(biom_table)

        if relative_abund is True:
            table.norm(axis=axis)
        else:
            table.pa()

        if HAVE_H5PY:
            return {'table': (table, 'hdf5')}
        else:
            return {'table': (table, 'json')}
Example #15
    def getResult(self, data_path, tree_path=None):
        """Returns distance matrix from (indcidence matrix and optionally tree).

        Parameters:

        data_path: path to data file, matrix (samples = cols, taxa = rows)
        in tab-delimited text format

        tree_path: path or object.
        If the method is phylogenetic, tree_path must be supplied.
        If a path, it points to a Newick-format tree file whose taxon ids
        match the taxon ids in the input data file.

        returns 2d dist matrix, list of sample names ordered as in dist mtx
        """
        # if it's a phylogenetic metric, read the tree
        if self.IsPhylogenetic:
            tree = self.getTree(tree_path)
        else:
            tree = None

        otu_table = load_table(data_path)
        otumtx = asarray([v for v in otu_table.iter_data(axis='sample')])

        # get the 2d dist matrix from beta diversity analysis
        if self.IsPhylogenetic:
            return (self.Metric(otumtx, otu_table.ids(axis='observation'),
                                tree, otu_table.ids()),
                    list(otu_table.ids()))
        else:
            return self.Metric(otumtx), list(otu_table.ids())
Example #16
def _read_biom(fp, transpose=True, sparse=True):
    """Read in a biom table file.

    Parameters
    ----------
    fp : str
        file path to the biom table
    transpose : bool
        Whether to transpose the table. The OTU table has samples in
        columns, while sklearn and other packages expect samples in
        rows, so the data table should usually be transposed.
    sparse : bool
        If True, return the data as a scipy.sparse.csr_matrix; otherwise
        return a dense numpy array.

    Returns
    -------
    tuple
        (sample_ids, observation_ids, data, feature_metadata)
    """
    logger.debug("loading biom table %s" % fp)
    table = biom.load_table(fp)
    sid = table.ids(axis="sample")
    oid = table.ids(axis="observation")
    logger.debug("loaded %d samples, %d observations" % (len(sid), len(oid)))
    if sparse:
        logger.debug("storing as sparse matrix")
        data = scipy.sparse.csr_matrix(table.matrix_data)
    else:
        logger.debug("storing as dense matrix")
        data = table.matrix_data.toarray()

    feature_md = _get_md_from_biom(table)

    if transpose:
        logger.debug("transposing table")
        data = data.transpose()

    return sid, oid, data, feature_md
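
Usage sketch (file path hypothetical). With transpose=True the returned matrix is samples-by-observations, the orientation sklearn expects:

sample_ids, obs_ids, data, feature_md = _read_biom('table.biom',
                                                   transpose=True,
                                                   sparse=True)
# data.shape == (len(sample_ids), len(obs_ids)) after the transpose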
Example #17
def main():
    if len(sys.argv) != 4:
        sys.exit(
            'Usage: %s <input BIOM file> <output filepath> '
            '<alpha diversity metric>\n'
            'Example: %s table.biom results.txt pielou_e' %
            (sys.argv[0], sys.argv[0]))

    biom_fp, output_fp, metric = sys.argv[1:]

    status('Loading BIOM table...')
    table = biom.load_table(biom_fp)
    sample_ids = table.ids('sample')

    status('Obtaining dense array from BIOM table (if you run out of memory, '
           'email Jai)...')
    table_data = table.transpose().matrix_data.toarray().astype(
        int, casting='unsafe')

    status('Computing alpha diversity for each sample (metric=%s)...' % metric)
    results = skbio.diversity.alpha_diversity(metric, table_data,
                                              ids=sample_ids)

    with open(output_fp, 'w') as output_fh:
        output_fh.write('\t%s\n' % metric)
        results.to_csv(output_fh, sep='\t', index=True, decimal='.',
                       na_rep='nan')
    status('Results are in %s' % output_fp)
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    output_f = open(opts.output_distance_matrix, 'w')
    if opts.otu_table_fp:
        otu_table = load_table(opts.otu_table_fp)
        samples_to_keep = otu_table.ids()
        # samples_to_keep = \
        # sample_ids_from_otu_table(open(opts.otu_table_fp,'U'))
    elif opts.sample_id_fp:
        samples_to_keep = \
            get_seqs_to_keep_lookup_from_seq_id_file(
                open(opts.sample_id_fp, 'U'))
    elif opts.mapping_fp and opts.valid_states:
        try:
            samples_to_keep = sample_ids_from_metadata_description(
                open(opts.mapping_fp, 'U'), opts.valid_states)
        except ValueError as e:
            option_parser.error(e.message)
    else:
        option_parser.error('must pass either --sample_id_fp, -t, or -m and '
                            '-s')
    # note that negate gets a little weird here. The function we're calling
    # removes the specified samples from the distance matrix, but the other
    # QIIME filter scripts keep the specified samples. So, the interface of
    # this script is designed to keep the specified samples, and therefore
    # negate=True is passed to filter_samples_from_distance_matrix by default.
    d = filter_samples_from_distance_matrix(
        parse_distmat(
            open(opts.input_distance_matrix, 'U')),
        samples_to_keep,
        negate=not opts.negate)
    output_f.write(d)
    output_f.close()
Example #19
    def dropped_samples(self):
        """The samples that were selected but dropped in processing

        Returns
        -------
        dict of sets
            Format is {processed_data_id: {sample_id, sample_id, ...}, ...}
        """
        bioms = self.biom_tables
        if not bioms:
            return {}

        # get all samples selected for the analysis, converting lists to sets
        # for fast membership tests; this keeps overhead low for large analyses
        all_samples = {k: set(v) for k, v in viewitems(self.samples)}

        for biom, filepath in viewitems(bioms):
            table = load_table(filepath)
            # remove the samples from the sets as they are found in the table
            proc_data_ids = set(sample['Processed_id']
                                for sample in table.metadata())
            ids = set(table.ids())
            for proc_data_id in proc_data_ids:
                all_samples[proc_data_id] = all_samples[proc_data_id] - ids

        # what's left are unprocessed samples, so return
        return all_samples
def main():
    args = parser.parse_args()

    input_biom_fp = args.input_biom_fp
    output_biom_fp = args.output_biom_fp
    merged_fastq_fp = args.merged_fastq_fp
    
    deblur_biom = load_table(input_biom_fp)

    if output_biom_fp is None:
        output_biom_fp = os.path.splitext(input_biom_fp)[0] + '.merged.biom'

    with open(merged_fastq_fp) as fq:
        
        merged_fastq = readfq(fq)

        # read each of the fastqs, make a dict of label:merged read
        merge_dict = get_merged_dict(merged_fastq)

        # filter biom to just the keys of dict
        deblur_biom = deblur_biom.filter(lambda val, id_, md: id_ in merge_dict,
                                         axis='observation')

        output_biom = collapse_biom_observations(deblur_biom, merge_dict)

        with biom_open(output_biom_fp, 'w') as f:
            output_biom.to_hdf5(f, 'deblur_relabel_merged.py')
    def test_build_biom_tables(self):
        new_id = qdb.util.get_count('qiita.filepath') + 1
        grouped_samples = {'18S.1.3': [(
            4, ['1.SKB8.640193', '1.SKD8.640184', '1.SKB7.640196'])]}
        self.analysis._build_biom_tables(grouped_samples, 100)
        obs = self.analysis.biom_tables
        self.assertEqual(obs, {'18S': self.biom_fp})

        table = load_table(self.biom_fp)
        obs = set(table.ids(axis='sample'))
        exp = {'1.SKB8.640193', '1.SKD8.640184', '1.SKB7.640196'}
        self.assertEqual(obs, exp)

        obs = table.metadata('1.SKB8.640193')
        exp = {'study':
               'Identification of the Microbiomes for Cannabis Soils',
               'artifact_ids': '4',
               'reference_id': '1',
               'command_id': '3'}
        self.assertEqual(obs, exp)

        sql = """SELECT EXISTS(SELECT * FROM qiita.filepath
                 WHERE filepath_id=%s)"""
        obs = self.conn_handler.execute_fetchone(sql, (new_id,))[0]
        self.assertTrue(obs)

        sql = """SELECT * FROM qiita.analysis_filepath
                 WHERE analysis_id=%s ORDER BY filepath_id"""
        obs = self.conn_handler.execute_fetchall(sql, (self.analysis.id,))
        exp = [[1L, 15L, 2L], [1L, 16L, None], [1L, new_id, 2L]]
        self.assertEqual(obs, exp)
def multiple_file_DA_DESeq2(input_dir, output_dir, mapping_fp, mapping_category, subcategory_1, subcategory_2, DESeq2_diagnostic_plots):
    """perform DESeq2 negative binomial Wald differential abundance test on a directory of raw abundance OTU matrices
    """
    if not exists(output_dir):
        makedirs(output_dir)
    file_names = [fname for fname in listdir(input_dir)
                  if not (fname.startswith('.') or
                          isdir(join(input_dir, fname)))]

    for fname in file_names:
        base_fname, ext = splitext(fname)
        original_fname = base_fname+'.biom'
        hdf5_infile = join(input_dir, original_fname)
        tmp_bt = load_table(hdf5_infile)
        tmp_pmf, _ = parse_mapping_file_to_dict(mapping_fp)
        check_mapping_file_category(tmp_bt, mapping_fp, mapping_category, subcategory_1, subcategory_2)
        tmp_bt.add_metadata(tmp_pmf, 'sample')
        outfile = join(output_dir, 'DESeq2_DA_'+base_fname+'.txt') 
        outfile_diagnostic = join(output_dir, 'DESeq2_diagnostic_plots_'+base_fname+'.pdf') 

        with tempfile.NamedTemporaryFile(dir=get_qiime_temp_dir(),
                                         prefix='QIIME-differential-abundance-temp-table-',
                                         suffix='.biom') as temp_fh:
            temp_fh.write(tmp_bt.to_json('forR'))
            temp_fh.flush()
            run_DESeq2(temp_fh.name, outfile, mapping_category, subcategory_1, subcategory_2, DESeq2_diagnostic_plots, outfile_diagnostic) 
def main(table_in, table_out, pathways, to_classic):
    # setup
    table = load_table(table_in)
    pathway_dict = get_pathway2kos()

    # get set of kos from pathways
    pathways_kos = set()
    for pathway in pathways:
        pathways_kos = pathways_kos | pathway_dict[pathway.strip()[-5:]]

    # get selected kos
    kos_to_keep = set(table.ids('observation')) & \
        pathways_kos
    if len(kos_to_keep) == 0:
        raise EmptySetERROR('Intersection created empty set')
    obs_ids = np.array(list(kos_to_keep))
    data = np.empty([len(obs_ids), len(table.ids('sample'))])
    for i, obs in enumerate(obs_ids):
        data[i] = table.data(obs, 'observation')

    # output
    new_table = Table(data, obs_ids, table.ids('sample'), type="OTU table")
    if to_classic:
        # write a tab-delimited classic table
        with open(table_out, 'w') as f:
            f.write(new_table.to_tsv())
    else:
        # write a json biom table
        with open(table_out, 'w') as f:
            new_table.to_json("filter_KOs_by_pathway.py", f)
Example #24
def sufficient_sequence_counts(opts, sample_ids):
    """Errors if the sequence counts post filtering are < 1000

    Parameters
    ----------
    opts : dict
        A dict of relevant opts.

    sample_ids : Iterable of str
        A list of sample IDs of interest

    Returns
    -------
    dict
        A dict containing each sample ID and any errors observed or None if
        no error was observed for the sample. {str: str or None}
    """
    results = {}
    table = biom.load_table(opts['otus']['100nt']['ag-biom'])

    minimum_depth = opts['rarefaction-depth']

    for id_ in sample_ids:
        results[id_] = None

        if table.exists(id_):
            counts = table.data(id_).sum()
            if counts < minimum_depth:
                results[id_] = '%d seqs after filtering for blooms' % counts
        else:
            results[id_] = '0 seqs after filtering for blooms'

    return results
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    output_table_fp = opts.output_otu_table_fp
    metadata_field = opts.metadata_field
    positive_taxa = opts.positive_taxa
    negative_taxa = opts.negative_taxa

    input_table = load_table(opts.input_otu_table_fp)

    if positive_taxa is not None:
        positive_taxa = positive_taxa.split(',')

    if negative_taxa is not None:
        negative_taxa = negative_taxa.split(',')

    filter_fn = get_otu_ids_from_taxonomy_f(positive_taxa, negative_taxa,
                                            metadata_field)
    input_table.filter(filter_fn, axis='observation')

    try:
        write_biom_table(input_table, output_table_fp)
    except EmptyBIOMTableError:
        option_parser.error(
            "Filtering resulted in an empty BIOM table. "
            "This indicates that no OTUs remained after filtering.")
def main():
    args = parser.parse_args()

    input_biom = args.input_biom
    output_fp = args.output_fp
    read1_q = args.read1_q
    read2_q = args.read2_q
    read1_trim = args.read1_trim
    read2_trim = args.read2_trim
    orientation = args.orientation
    test = args.test
    
    if test:
        run_unittests()
        return(0)

    deblur_biom = load_table(input_biom)

    if output_fp is None:
        output_fp = os.path.splitext(input_biom)[0]

    r1_fp = output_fp + '.R1.fastq'
    r2_fp = output_fp + '.R2.fastq'

    joined_seqs = deblur_biom.ids(axis='observation')
    split_seqs = uncat_seqs_to_fastq(joined_seqs, read1_q, read2_q, read1_trim, read2_trim, orientation=orientation)

    with open(r1_fp, 'w') as r1_f, open(r2_fp, 'w') as r2_f:
        i = 0
        for r1, r2 in split_seqs:
            i += 1
            r1_f.write(r1)
            r2_f.write(r2)

    print('Split {0} records'.format(i), file=sys.stderr)
Example #27
def _format_nodes(fp, table):
    """Format the nodes file

    Parameters
    ----------
    fp : open file
        Where to write the results to
    table : file path
        A file path to the BIOM table
    """
    header = ['Feature1', 'Fsum']

    table = biom.load_table(table)
    _, _, tmp_md = next(table.iter(axis='observation'))
    header.extend(sorted(tmp_md.keys()))

    fp.write("\t".join(header))
    fp.write('\n')

    for values, id_, md in table.iter(axis='observation'):
        line = [str(id_), str(values.sum())]
        for key in sorted(md.keys()):
            md_value = md[key]
            if isinstance(md_value, (list, tuple, set)):
                line.append(" ".join([str(v) for v in md_value]))
            else:
                line.append(str(md_value))
        fp.write('\t'.join(line))
        fp.write('\n')
Example #28
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    output_dir = opts.output_dir
    create_dir(output_dir)

    otu_table_fp = opts.otu_table
    otu_table = load_table(otu_table_fp)

    tree_fh = open(opts.tree_file, "U")
    tree = DndParser(tree_fh)
    tree_fh.close()

    mapping_fp = opts.mapping_fp
    if mapping_fp:
        mapping_f = open(mapping_fp, "U")
        input_map_basename = splitext(split(mapping_fp)[1])[0]
    else:
        mapping_f = None
        input_map_basename = None

    input_table_basename = splitext(split(otu_table_fp)[1])[0]

    simsam_range_to_files(
        otu_table,
        tree,
        simulated_sample_sizes=map(int, opts.num.split(",")),
        dissimilarities=map(float, opts.dissim.split(",")),
        output_dir=output_dir,
        mapping_f=mapping_f,
        output_table_basename=input_table_basename,
        output_map_basename=input_map_basename,
    )
Example #29
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table_fp = opts.otu_table_fp
    mapping_fp = opts.mapping_fp
    mapping_field = opts.mapping_field
    output_dir = opts.output_dir
    # column_rename_ids = opts.column_rename_ids
    # include_repeat_cols = opts.include_repeat_cols

    create_dir(output_dir)

    # split mapping file
    mapping_f = open(mapping_fp, 'U')
    for fp_str, sub_mapping_s in split_mapping_file_on_field(mapping_f, mapping_field):
        mapping_output_fp = join(output_dir, 'mapping_%s.txt' % fp_str)
        open(mapping_output_fp, 'w').write(sub_mapping_s)

    # split otu table
    otu_table_base_name = splitext(split(otu_table_fp)[1])[0]
    mapping_f = open(mapping_fp, 'U')

    otu_table = load_table(otu_table_fp)

    try:
        for fp_str, sub_otu_table_s in split_otu_table_on_sample_metadata(
                otu_table,
                mapping_f,
                mapping_field):
            otu_table_output_fp = join(output_dir, '%s_%s.biom' % (
                otu_table_base_name, fp_str))

            write_biom_table(sub_otu_table_s, otu_table_output_fp)
    except OTUTableSplitError as e:
        option_parser.error(e)
Example #30
 def test_ctf(self):
     """Tests the basic validity of the actual ctf() method's outputs."""
     self.biom_table = load_table(self.in_table)
     self.meta_table = read_csv(self.in_meta, sep='\t', index_col=0)
     ordtst, disttst, stst, ftst = ctf(table=self.biom_table,
                                       sample_metadata=self.meta_table,
                                       individual_id_column=self.subj,
                                       state_column=self.state)
     # Validate types of the ctf outputs
     self.assertIsInstance(ordtst, OrdinationResults)
     self.assertIsInstance(disttst, DistanceMatrix)
     self.assertIsInstance(stst, pd.DataFrame)
     self.assertIsInstance(ftst, pd.DataFrame)
     # Ensure that no NaNs are in the OrdinationResults
     # NOTE that we have to use the DataFrame .any() functions instead of
     # python's built-in any() functions -- see #29 for details on this
     self.assertFalse(np.isnan(ordtst.features).any(axis=None))
     self.assertFalse(np.isnan(ordtst.samples).any(axis=None))
Example #31
def get_otu_color(taxonomy_file,phylum_colors,feature_table):
    otu_lineage = {}
    taxon = pd.read_csv(taxonomy_file, sep='\t')
    taxon = taxon.set_index(taxon['Feature ID'])
    otutable = biom.load_table(feature_table)
    otutable = otutable.to_dataframe()
    for ele in otutable.index:
        lineage = taxon.loc[ele]['Taxon']
        lineage = lineage.split(';')
        try:
            otu_lineage[ele] = lineage[1]
        except IndexError:
            otu_lineage[ele] = lineage[0]
    otu_color={}
    for key in otu_lineage:
        phylum = otu_lineage[key]
        otu_color[key] = phylum_colors[phylum] 
    return otu_color
Example #32
def load_qiime_feature_table(artifact):
    """Load a feature table from a Qiime 2 artifact."""
    try:
        import biom
    except ImportError:
        raise ImportError(
            "Reading Qiime 2 FeatureTables requires the `biom-format` "
            "package. You can install it with:\n pip install numpy Cython\n"
            "pip install biom-format")
    meta = metadata(artifact)
    if not meta["type"].startswith("FeatureTable["):
        raise ValueError("%s is not a Qiime 2 FeatureTable :(" % artifact)
    uuid = meta["uuid"]
    with ZipFile(artifact) as zf, TemporaryDirectory(prefix="micom_") as td:
        zf.extract(uuid + "/data/feature-table.biom", str(td))
        table = biom.load_table(
            path.join(str(td), uuid, "data", "feature-table.biom"))
    return table
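
Usage sketch (artifact path hypothetical); note that biom.Table reports its shape as (observations, samples):

table = load_qiime_feature_table('feature-table.qza')
n_observations, n_samples = table.shape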
Example #33
 def check_networks(self, msg):
     # define how files should be checked for, it is important that import functions work!
     if 'network' in msg:
         if msg['network'] is not None:
             filelist = deepcopy(msg['network'])
             for file in filelist:
                 network = nx.read_weighted_edgelist(file)
                 self.checks += "Loaded network from " + file + ". \n\n"
                 nodes = len(network.nodes)
                 edges = len(network.edges)
                 self.checks += "This network has " + str(nodes) + \
                                " nodes and " + str(edges) + " edges. \n\n"
                 weight = nx.get_edge_attributes(network, 'weight')
                 if len(weight) > 0:
                     self.checks += 'This is a weighted network. \n\n'
                 else:
                     self.checks += 'This is an unweighted network. \n\n'
                 allbioms = list()
                 for level in msg['procbioms']:
                     for biom in msg['procbioms'][level]:
                         allbioms.append(msg['procbioms'][level][biom])
                 match = 0
                 taxa = None
                 for biomfile in allbioms:
                     try:
                         biomtab = load_table(biomfile)
                         taxa = biomtab.ids(axis='observation')
                     except TypeError:
                         wx.LogError(
                             "Could not access source BIOM file '%s'." %
                             file)
                         logger.error("Could not access source BIOM file. ",
                                      exc_info=True)
                      if taxa is not None and len(taxa) > 1:
                         nodes = list(network.nodes)
                         if all(elem in taxa for elem in nodes):
                             match += 1
                             self.checks += 'Node identifiers in ' + biomfile + \
                                            ' matched node identifiers in ' + file + '. \n\n'
                 if match == 0:
                     wx.LogError("No BIOM file matched network nodes!")
                     logger.error("No BIOM file matched network nodes!. ",
                                  exc_info=True)
         self.review.SetValue(self.checks)
Example #34
def alpha_diversity_pre(otu_table, metric, tree=None):
    df = biom.load_table(otu_table).to_dataframe()
    result = ''
    if metric == 'faith_pd':
        tree = skbio.TreeNode.read(tree)
        result = skbio.diversity.alpha_diversity(counts=df.T.values,
                                                 ids=df.columns,
                                                 metric='faith_pd',
                                                 tree=tree,
                                                 otu_ids=df.index)
    elif metric == 'ace':
        result = skbio.diversity.alpha_diversity(
            counts=df.T.values.astype(int), ids=df.columns, metric=metric)
    else:
        result = skbio.diversity.alpha_diversity(counts=df.T.values,
                                                 ids=df.columns,
                                                 metric=metric)
    result = pd.DataFrame(result, columns=['alpha_div'])
    return result
Example #35
    def test_build_biom_tables_duplicated_samples_not_merge(self):
        analysis = self._create_analyses_with_samples()
        grouped_samples = {
            '18S || algorithm':
            [(4, ['1.SKB8.640193', '1.SKD8.640184', '1.SKB7.640196']),
             (5, ['1.SKB8.640193', '1.SKD8.640184', '1.SKB7.640196'])]
        }
        obs_bioms = analysis._build_biom_tables(grouped_samples, True)
        obs = [(a, basename(b)) for a, b in obs_bioms]
        biom_fp = ("%s_analysis_18S_algorithm.biom" % analysis.id)
        self.assertEqual(obs, [('18S', biom_fp)])

        table = load_table(obs_bioms[0][1])
        obs = set(table.ids(axis='sample'))
        exp = {
            '4.1.SKD8.640184', '4.1.SKB7.640196', '4.1.SKB8.640193',
            '5.1.SKB8.640193', '5.1.SKB7.640196', '5.1.SKD8.640184'
        }
        self.assertItemsEqual(obs, exp)
Example #36
def run_poisson_cat(
    table: str,
    metadata: str,
    category: str,
    reference_category: str,
    output_path: str,
    filter_category_value: str,
) -> None:

    # table_df = load_table(table).to_dataframe()
    loaded_table = load_table(table)
    metadata_df = pd.read_csv(metadata, index_col=0, sep="\t")
    unique_cats = metadata_df[category].unique()
    if filter_category_value is not None and unique_cats.shape[0] > 2:
        if filter_category_value in unique_cats:
            # Based on https://stackoverflow.com/a/18173074/1073
            print("Number of samples pre-filtering: {}".format(
                metadata_df.shape[0]))
            # filtered_row_idxs = metadata_df[
            #     metadata_df[category] == filter_category_value
            # ].index
            metadata_df = metadata_df[
                metadata_df[category] != filter_category_value]
            loaded_table.filter(metadata_df.index)
            print("Number of samples post-filtering: {}".format(
                metadata_df.shape[0]))
            print("Number of features pre-filtering those with 0 counts: {}".
                  format(loaded_table.shape[0]))
            # remove features in table with 0 counts (to eliminate any features
            # that were only present in now-filtered-out samples)
            loaded_table.remove_empty(axis="observation")
            print("Number of features post-filtering those with 0 counts: {}".
                  format(loaded_table.shape[0]))
            # TODO remove samples in table without a certain amount of reads
            # supporting them?

    print("Running poisson_cat...")
    diff = poisson_cat(loaded_table, metadata_df, category, reference_category)
    print("Done.")
    diff.to_csv(output_path,
                sep="\t",
                header=["Differential"],
                index_label="FeatureID")
 def test_blat_database_mapper(self):
     """blat_database_mapper functions as expected """
     blat_database_mapper(query_fp=self.inseqs1_fp,
                          refseqs_fp=self.refseqs1_fp,
                          output_dir=self.test_out,
                          evalue=1e-10,
                          min_id=0.75,
                          genetic_code=11,
                          HALT_EXEC=False)
     observation_map_fp = join(self.test_out, 'observation_map.txt')
     self.assertTrue(exists(observation_map_fp))
     observation_table_fp = join(self.test_out, 'observation_table.biom')
     table = load_table(observation_table_fp)
     self.assertItemsEqual(table.ids(), ['s2', 's1'])
     self.assertItemsEqual(
         table.ids(axis='observation'),
         ['eco:b0122-pr',
          'eco:b0015-pr'])
     self.assertEqual(table.sum(), 5)
 def test_bwa_sw_database_mapper(self):
     """bwa_sw_database_mapper functions as expected """
     bwa_sw_database_mapper(query_fp=self.inseqs1_fp,
                            refseqs_fp=self.refseqs2_fp,
                            output_dir=self.test_out,
                            HALT_EXEC=False)
     observation_map_fp = join(self.test_out, 'observation_map.txt')
     self.assertTrue(exists(observation_map_fp))
     observation_table_fp = join(self.test_out, 'observation_table.biom')
     table = load_table(observation_table_fp)
     self.assertItemsEqual(table.ids(), ['s2', 's1'])
     self.assertItemsEqual(
         table.ids(axis='observation'),
         ['r1',
          'r2',
          'r3',
          'r4',
          'r5'])
     self.assertEqual(table.sum(), 6)
Example #39
    def _prepare_spar(self):
        """
        Carries out initial work before actually running SparCC.
        The initial writing function cannot be carried out
        in a multiprocessing operation because the Biom object cannot be pickled.
        However, the bash calls can be pickled; therefore, initial data prep
        is done first, then the SparCC calls are in parallel.

        :return:
        """
        filenames = self.get_filenames()
        for x in filenames:
            for y in filenames[x]:
                file = biom.load_table(filenames[x][y])
                otu = file.to_tsv()
                tempname = filenames[x][y][:-5] + '_otus_sparcc.txt'
                with open(tempname, 'w') as text_file:
                    # skip the first 29 characters, i.e. what is presumably
                    # the "# Constructed from biom file" header line that
                    # to_tsv prepends
                    text_file.write(otu[29:])
Example #40
def get_otus(OTU_table):
    """ read the otu table.
        Args:
            OTUS_table: a biom format OTU table.
        Return:
            return a dict include otus and samples.like:{
                 OTU0:{Sample0:12,Sample1:22,Sample3:2},
                 OTU1:{Sample0:2,Sample1:22,Sample3:22},
                 OTU2:{Sample0:133,Sample1:122,Sample3:52},
            }
    """
    table = biom.load_table(OTU_table)
    df = table.to_dataframe().transpose().to_dense()
    otus = {}
    for otu in df.columns:
        tmp = df[otu]
        tmp = series2dict(tmp)
        otus[otu] = tmp
    return otus
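
series2dict is an external helper not shown in this listing; given how it is used here, a minimal stand-in (an assumption) is pandas' own Series.to_dict():

def series2dict(series):
    """Minimal stand-in: convert a pandas Series to a plain {index: value} dict."""
    return series.to_dict()

# usage sketch (path hypothetical):
# otus = get_otus('otu_table.biom')
# otus['OTU0'] -> {'Sample0': 12, 'Sample1': 22, ...}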
 def setUp(self):
     super().setUp()
     self.exp_taxa = pd.read_csv(
         self.get_data_path('mock-3-expected-taxonomy.tsv'), sep='\t',
         index_col=0)
     self.obs_taxa = pd.read_csv(
         self.get_data_path('mock-3-observed-taxonomy.tsv'), sep='\t',
         index_col=0)
     self.obs_table = biom.load_table(
         self.get_data_path('mock-3-obs-table.biom'))
     self.prf_res_unw = pd.DataFrame.from_dict({
         'level': {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7},
         'Precision': {0: 0.96, 1: 0.96, 2: 0.96, 3: 0.96, 4: 0.96,
                       5: 0.9583333333333334, 6: 0.7368421052631579},
         'Recall': {0: 0.96, 1: 0.96, 2: 0.96, 3: 0.96, 4: 0.96,
                    5: 0.92, 6: 0.56},
         'F-measure': {0: 0.96, 1: 0.96, 2: 0.96, 3: 0.96, 4: 0.96,
                       5: 0.9387755102040817, 6: 0.6363636363636364}})[[
                          'level', 'Precision', 'Recall', 'F-measure']]
Example #42
def rpca(in_biom: str, output_dir: str,
         min_sample_depth: int, rank: int) -> None:
    """ Runs RPCA with an rclr preprocessing step"""

    # import table
    table = load_table(in_biom)
    # filter sample to min depth

    def sample_filter(val, id_, md): return sum(val) > min_sample_depth
    table = table.filter(sample_filter, axis='sample')
    table = table.to_dataframe().T.drop_duplicates()
    # rclr preprocessing and OptSpace (RPCA)
    opt = OptSpace(rank=rank).fit(rclr().fit_transform(table.copy()))
    rename_cols = {i - 1: 'PC' + str(i) for i in range(1, rank + 1)}

    # Feature Loadings
    feature_loading = pd.DataFrame(opt.feature_weights, index=table.columns)
    feature_loading = feature_loading.rename(columns=rename_cols)
    feature_loading.sort_values('PC1', inplace=True, ascending=True)

    # Sample Loadings
    sample_loading = pd.DataFrame(opt.sample_weights, index=table.index)
    sample_loading = sample_loading.rename(columns=rename_cols)

    proportion_explained = pd.Series(opt.explained_variance_ratio,
                                     index=list(rename_cols.values()))
    eigvals = pd.Series(opt.eigenvalues,
                        index=list(rename_cols.values()))
    # save ordination results
    ord_res = OrdinationResults(
        'PCoA',
        'Principal Coordinate Analysis',
        eigvals.copy(),
        sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # write files to output folder
    ord_res.write(os.path.join(output_dir, 'RPCA_Ordination.txt'))
    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(
        opt.distance, ids=sample_loading.index)
    dist_res.write(os.path.join(output_dir, 'RPCA_distance.txt'))
    return
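
rclr comes from an external package. As a hedged sketch of the robust centered log-ratio idea the name refers to (log-transform each sample's non-zero entries and centre by the mean log of the observed entries, leaving zeros as missing):

import numpy as np

def rclr_sketch(mat):
    """Robust clr over rows (samples); zeros become NaN rather than -inf."""
    mat = np.asarray(mat, dtype=float)
    with np.errstate(divide='ignore'):
        logm = np.log(mat)           # zeros map to -inf here
    logm[np.isinf(logm)] = np.nan    # mask the zero entries
    return logm - np.nanmean(logm, axis=1, keepdims=True)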
Example #43
def t_test_cmd(table_file, metadata_file, category, output_file):
    metadata = pd.read_table(metadata_file, index_col=0)
    table = load_table(table_file)
    table = pd.DataFrame(np.array(table.matrix_data.todense()).T,
                         index=table.ids(axis='sample'),
                         columns=table.ids(axis='observation'))
    cats = metadata[category]
    cs = np.unique(cats)

    def func(x):
        return ttest_ind(*[x[cats == k] for k in cs])

    m, p = np.apply_along_axis(func, axis=0, arr=table.values)

    reject = p < 0.05
    features = pd.Series(reject, index=table.columns)
    diff_features = list(features.loc[features > 0].index)
    with open(output_file, 'w') as f:
        f.write(','.join(diff_features))
Example #44
def _disallow_empty_tables(wrapped_function, *args, **kwargs):
    bound_arguments = signature(wrapped_function).bind(*args, **kwargs)
    table = bound_arguments.arguments.get('table')
    if table is None:
        raise TypeError("The wrapped function has no parameter 'table'")

    if isinstance(table, BIOMV210Format):
        table = str(table)
        table_obj = biom.load_table(table)
    elif isinstance(table, biom.Table):
        table_obj = table
    else:
        raise ValueError("Invalid view type: table passed as "
                         f"{type(table)}")

    if table_obj.is_empty():
        raise ValueError("The provided table is empty")

    return wrapped_function(*args, **kwargs)
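
The (wrapped_function, *args, **kwargs) signature matches the calling convention of the third-party decorator package, so a plausible way this guard gets applied (an assumption; not shown in the snippet) is:

import biom
from decorator import decorator

# turn the caller above into a reusable decorator
_disallow_empty = decorator(_disallow_empty_tables)

@_disallow_empty
def observed_features(table: biom.Table) -> int:
    # trivial example body; the guard raises before this runs if the
    # table is empty
    return len(table.ids(axis='observation'))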
def main():

    parser = argparse.ArgumentParser(
        description="Creates output FASTA for each sample with each ASV "
                    "repeated for every count in that sample.",
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("-f", "--fasta", metavar="FASTA", type=str,
                        help="Path to full FASTA file.", required=True)

    parser.add_argument("-b", "--biom", metavar="BIOM", type=str,
                        help="Path to BIOM table.", required=True)

    parser.add_argument("-o", "--outdir", metavar="PATH", type=str,
                        help="Name of folder to make for output files.", required=True)

    args = parser.parse_args()

    in_fasta = read_fasta(args.fasta)

    in_table = biom_to_pandas_df(biom.load_table(args.biom))

    # If no sequences in file then stop job.
    if not in_fasta:
        sys.exit("Stopping - no sequences in file.")

    make_output_dir(args.outdir)

    for sample in in_table.columns:
        sample_outfile = args.outdir + "/" + sample + ".fasta"

        sample_outfh = open(sample_outfile, 'wt')

        for asv in in_table.index.values:
            asv_count = in_table.loc[asv, sample]
            if asv_count > 0:
                for i in range(int(asv_count)):
                    print(">" + asv + "_" + sample + "_" + str(i), file=sample_outfh)
                    print(in_fasta[asv], file=sample_outfh)

        sample_outfh.close()
def read_seqabun(infile):
    '''Will read in sequence abundance table in either TSV, BIOM, or mothur
    shared format.'''

    # First check extension of input file. If extension is "biom" then read in
    # as BIOM table and return. This is expected to be the most common input.
    in_name, in_ext = splitext(infile)
    if in_ext == ".biom":
        input_seqabun = biom.load_table(infile).to_dataframe(dense=True)
        input_seqabun.index.astype('str', copy=False)
        return(input_seqabun)

    # Next, check whether the input file is a mothur shared file by reading
    # in the first row only.
    mothur_format = False
    try:
        in_test = pd.read_csv(filepath_or_buffer=infile, sep="\t", nrows=1)
        in_test_col = list(in_test.columns.values)
        if len(in_test_col) >= 4 and (in_test_col[0] == "label" and \
                                      in_test_col[1] == "Group" and \
                                      in_test_col[2] == "numOtus"):
            mothur_format = True
    except Exception:
        pass

    # If identified to be mothur format then remove extra columns, set "Group"
    # to be index (i.e. row) names and then transpose.
    if mothur_format:
        input_seqabun = pd.read_csv(filepath_or_buffer=infile, sep="\t",
                                    dtype={'Group': str}, low_memory=False)
        input_seqabun.drop(labels=["label", "numOtus"], axis=1, inplace=True)
        input_seqabun.set_index(keys="Group", drop=True, inplace=True)
        input_seqabun.index.name = None
        input_seqabun = input_seqabun.transpose()
        input_seqabun.index.astype('str', copy=False)
        return(input_seqabun)
    else:
        first_col = str(pd.read_csv(infile, sep="\t", nrows=0).columns[0])
        input_seqabun = pd.read_csv(filepath_or_buffer=infile, sep="\t",
                                    dtype={first_col: str}, low_memory=False)
        input_seqabun.set_index(first_col, drop=True, inplace=True)
        return(input_seqabun)
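
Usage sketch (paths hypothetical): whatever the input format, the return value is a pandas DataFrame with one row per feature and one column per sample:

abun = read_seqabun('seqtab.biom')  # also accepts .tsv or mothur .shared
print(abun.shape)                   # (n_features, n_samples)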
Example #47
def main(args):
    input_table = load_table(args.biom)
    logging.basicConfig(format='', level=logging.INFO)
    logger = logging.getLogger()

    # If treatments are given, get treatment names.
    # Additionally, if reps are in the treatment file, get the names of reps;
    # else reps are empty and treats = whole data set
    if args.treatments:
        (treats, reps) = _getT(args.treatments)
    else:
        treats = np.ndarray.tolist(input_table.ids())
        reps = ()
    # get treatment indices/data set table indices
    inds = _get_inds(input_table, treats)

    # get abundance threshold
    sums = _get_sample_sums(input_table, inds)

    if args.replicate_threshold:
        core = _get_rep_core_otus(input_table, sums, reps,
                                  args.replicate_threshold, inds,
                                  args.abundance_minimum, args.exclude)
    else:
        core = _get_all_core_otus(input_table, sums, inds,
                                  args.abundance_minimum)

    logger.info("\n[STATUS] Done! No of core OTUs: %d\n\n" % (len(core)))

    output = open(args.output, "w")
    output.write("# Core OTUs of file %s: %d \n" % (args.biom, len(core)))

    for c in core:
        if args.print_taxonomy == "True":
            output.write("%s\t%s\n" % (c, ";".join(
                input_table.metadata(axis="observation")[int(
                    np.where(input_table.ids(
                        axis="observation") == c)[0])]['taxonomy'])))
        else:
            output.write("%s\n" % (c))

    output.close()
Example #48
    def test_unweighted_fp32_inmem(self):
        tree_fp = self.get_data_path('crawford.tre')
        table_fp = self.get_data_path('crawford.biom')

        table = load_table(table_fp)
        tree = skbio.TreeNode.read(tree_fp)

        ids = table.ids()
        otu_ids = table.ids(axis='observation')
        cnts = table.matrix_data.astype(int).toarray().T
        exp = skbio.diversity.beta_diversity('unweighted_unifrac',
                                             cnts,
                                             ids=ids,
                                             otu_ids=otu_ids,
                                             tree=tree)
        obs = ssu_inmem(table, tree, 'unweighted_fp32', False, 1.0, False, 1)
        npt.assert_almost_equal(obs.data, exp.data, decimal=6)

        obs2 = unweighted(table_fp, tree_fp)
        npt.assert_almost_equal(obs2.data, exp.data)
Example #49
 def test_usearch_database_mapper(self):
     """usearch_database_mapper functions as expected """
     usearch_database_mapper(query_fp=self.inseqs1_fp,
                             refseqs_fp=self.refseqs1_fp,
                             output_dir=self.test_out,
                             evalue=1e-10,
                             min_id=0.75,
                             queryalnfract=0.35,
                             targetalnfract=0.0,
                             maxaccepts=1,
                             maxrejects=8,
                             HALT_EXEC=False)
     observation_map_fp = join(self.test_out, 'observation_map.txt')
     self.assertTrue(exists(observation_map_fp))
     observation_table_fp = join(self.test_out, 'observation_table.biom')
     table = load_table(observation_table_fp)
     self.assertItemsEqual(table.ids(), ['s2', 's1'])
     self.assertItemsEqual(table.ids(axis='observation'),
                           ['eco:b0122-pr', 'eco:b0015-pr'])
     self.assertEqual(table.sum(), 5)
Example #50
    def test_build_biom_tables(self):
        samples = {1: ['1.SKB8.640193', '1.SKD8.640184', '1.SKB7.640196']}
        self.analysis._build_biom_tables(samples,
                                         100,
                                         conn_handler=self.conn_handler)
        obs = self.analysis.biom_tables

        self.assertEqual(obs, {'18S': self.biom_fp})

        table = load_table(self.biom_fp)
        obs = set(table.ids(axis='sample'))
        exp = {'1.SKB8.640193', '1.SKD8.640184', '1.SKB7.640196'}
        self.assertEqual(obs, exp)

        obs = table.metadata('1.SKB8.640193')
        exp = {
            'Study': 'Identification of the Microbiomes for Cannabis Soils',
            'Processed_id': 1
        }
        self.assertEqual(obs, exp)
Example #51
def DA_DESeq2(input_path, out_path, mapping_fp, mapping_category,
              subcategory_1, subcategory_2, DESeq2_diagnostic_plots):
    """perform DESeq2 negative binomial Wald differential abundance test on a raw abundance OTU matrix
    """
    tmp_bt = load_table(input_path)
    tmp_pmf, _ = parse_mapping_file_to_dict(mapping_fp)
    check_mapping_file_category(tmp_bt, mapping_fp, mapping_category,
                                subcategory_1, subcategory_2)
    tmp_bt.add_metadata(tmp_pmf, 'sample')
    base_fname, _ = splitext(out_path)
    outfile_diagnostic = base_fname + '_diagnostic_plots.pdf'

    with tempfile.NamedTemporaryFile(
            dir=get_qiime_temp_dir(),
            prefix='QIIME-differential-abundance-temp-table-',
            suffix='.biom') as temp_fh:
        temp_fh.write(tmp_bt.to_json('forR'))
        temp_fh.flush()
        run_DESeq2(temp_fh.name, out_path, mapping_category, subcategory_1,
                   subcategory_2, DESeq2_diagnostic_plots, outfile_diagnostic)
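The flush() before run_DESeq2 matters: R reopens the table by filename, so
the serialized JSON must actually be on disk first. A generic sketch of the
same handoff pattern, with a hypothetical run_tool consumer:

import tempfile

def hand_off(table, run_tool):
    # write, flush, then let the external tool read the file by name;
    # the temp file is removed automatically when the context exits
    with tempfile.NamedTemporaryFile(suffix='.biom') as fh:
        fh.write(table.to_json('generated_by'))
        fh.flush()
        run_tool(fh.name)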
Example #52
    def test_defaults(self):
        exp_tab = biom.load_table(
            self.get_data_path('expected/16S-default.biom'))
        exp_rep_seqs = list(
            skbio.io.read(self.get_data_path('expected/16S-default.fasta'),
                          'fasta',
                          constructor=skbio.DNA,
                          lowercase='ignore'))
        for seq in exp_rep_seqs:
            del seq.metadata['description']

        obs_tab, rep_seqs, stats = denoise_16S(self.demux_seqs, 100)

        rep_seqs = _sort_seqs(rep_seqs)
        exp_rep_seqs = _sort_seqs(exp_rep_seqs)

        self.assertEqual(obs_tab, exp_tab)
        self.assertEqual(rep_seqs, exp_rep_seqs)
        self.assertEqual(list(stats.columns), STATS_HEADER[1:])
        self.assertEqual(len(stats), 0)
Example #53
    def test_validate_run_prefix(self):
        httpretty.register_uri(
            httpretty.POST,
            "https://test_server.com/qiita_db/jobs/job-id/step/")
        httpretty.register_uri(
            httpretty.GET,
            "https://test_server.com/qiita_db/prep_template/1/data",
            body='{"data": {"1.S11": {"run_prefix": "1.S1"}, "1.S22": '
            '{"run_prefix": "1.S2"}, "1.S33": {"run_prefix": "1.S3"}}}')

        obs_success, obs_ainfo, obs_error = validate(self.qclient, 'job-id',
                                                     self.parameters,
                                                     self.out_dir)
        exp_biom_fp = join(self.out_dir, basename(self.biom_fp))
        self._clean_up_files.append(exp_biom_fp)
        self.assertTrue(obs_success)
        self.assertEqual(obs_ainfo, [[None, 'BIOM', [exp_biom_fp, 'biom']]])
        self.assertEqual(obs_error, "")
        obs_t = load_table(exp_biom_fp)
        self.assertItemsEqual(obs_t.ids(), ["1.S11", "1.S22", "1.S33"])
Example #54
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    otu_table_data = load_table(opts.input_otu_table)
    sort_field = opts.sort_field
    mapping_fp = opts.mapping_fp
    sorted_sample_ids_fp = opts.sorted_sample_ids_fp

    if sort_field and mapping_fp:
        mapping_data = parse_mapping_file(open(mapping_fp, 'U'))
        result = sort_otu_table_by_mapping_field(otu_table_data, mapping_data,
                                                 sort_field)
    elif sorted_sample_ids_fp:
        sorted_sample_ids = sample_ids_from_f(open(sorted_sample_ids_fp, 'U'))
        result = sort_otu_table(otu_table_data, sorted_sample_ids)
    else:
        result = sort_otu_table(otu_table_data,
                                natsort_case_insensitive(otu_table_data.ids()))

    write_biom_table(result, opts.output_fp)
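natsort_case_insensitive is QIIME's natural-sort helper. The idea, sketched
below with a hypothetical stand-in (not QIIME's implementation), is that
digit runs compare numerically, so 's2' sorts before 's10':

import re

def natural_sort_key(s):
    # split into digit / non-digit runs; compare digit runs as integers
    return [int(tok) if tok.isdigit() else tok.lower()
            for tok in re.split(r'(\d+)', s)]

# sorted(['s10', 's2', 'S1'], key=natural_sort_key) -> ['S1', 's2', 's10']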
Example #55
def delete_sample_data(study_id, tag, context, path):
    import traceback
    if not os.path.exists(path):
        print("Unable to find: %s" % path)
        return 0

    # load up front so an unreadable BIOM file fails before any deletion
    table = biom.load_table(path)
    try:
        ndeleted = redbiom.admin.delete_studies_by_id(context, tag)
    except ValueError:
        print("unable to delete: %s, %s, %s" % (str(tag), str(context),
                                                str(path)))
        ndeleted = 0
    except Exception:
        # there are some studies in which there are samples in the biom table
        # which lack metadata

        print(tag, context, path)
        traceback.print_exc()
        raise
    return ndeleted
Example #56
def read_biom(redbiom_output: str) -> biom.Table:
    """
    Read biom file

    Parameters
    ----------
    redbiom_output : str
        The biom table returned by redbiom.

    Returns
    -------
    biom_tab : biom.Table
        Feature table retrieved from redbiom.
    """
    print('- Load biom table... ', end='')
    biom_tab = biom.load_table(redbiom_output)
    # biom.Table.shape is (observations, samples): use shape[1] for samples
    print('Done -> %s samples (all preps)' % biom_tab.shape[1])
    return biom_tab
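A usage sketch with a hypothetical filename; note again that
biom.Table.shape is (observations, samples):

biom_tab = read_biom('redbiom_fetch.biom')   # hypothetical path
n_obs, n_samples = biom_tab.shape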
Example #57
def load_b(files):
    # load each BIOM file, then merge all of them into a single table
    b = []
    for fp in files:
        b.append(biom.load_table(fp))

    b_all = b[0]
    for i in range(1, len(b)):
        b_all = b_all.merge(b[i], sample='union', observation='union')

    # per-sample relative abundance normalization
    b_norm = b_all.norm(axis='sample', inplace=False)

    b_rank = {}
    for r, n in ranks.items():
        b_rank[r] = b_norm.collapse(
            lambda id_, md: collapse_to(md['taxonomy'], n, 'Unassigned'),
            axis='observation',
            norm=False).to_dataframe(dense=True)
    b_rank['zotu'] = b_norm.to_dataframe(dense=True)

    return b_rank
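ranks and collapse_to are assumed to be defined at module level. A plausible
sketch of both, treated as assumptions: ranks maps a rank name to the number
of taxonomy levels to keep, and collapse_to truncates an observation's
taxonomy annotation to that depth:

# Hypothetical module-level definitions assumed by load_b()
ranks = {'phylum': 2, 'class': 3, 'order': 4,
         'family': 5, 'genus': 6, 'species': 7}

def collapse_to(taxonomy, n, default):
    # join the first n taxonomy levels, falling back when unannotated
    if not taxonomy:
        return default
    return ';'.join(taxonomy[:n])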
Example #58
def _biom_to_pysurvey_mat(table):
    """Convert a BIOM table to a compatible pysurvey DataFrame

    Parameters
    ----------
    table : str
        Filepath to the BIOM table

    Returns
    -------
    DataFrame
        A pandas DataFrame representing the BIOM table where the rows are
        samples and the columns are observations. Sample identifiers form
        the index and observation identifiers the columns; any BIOM
        metadata are ignored
    """
    table = biom.load_table(table)
    mat = table.matrix_data.toarray().T
    return pd.DataFrame(mat,
                        columns=table.ids(axis='observation'),
                        index=table.ids())
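The .T is the crux here: biom stores data observations x samples, so the
transpose yields the samples x observations layout the docstring promises.
A quick sketch to check that orientation on any table (hypothetical path):

import biom

t = biom.load_table('table.biom')  # hypothetical path
assert t.matrix_data.toarray().shape == (len(t.ids(axis='observation')),
                                         len(t.ids()))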
Example #59
    def test_write_biom_table(self):
        """Test functionality of write_biom_table().
        """
        table_exp = Table(np.array([[1., 1., 1., 0., 0.],
                                    [1., 0., 0., 0., 0.],
                                    [0., 0., 1., 0., 1.],
                                    [0., 0., 0., 1., 0.],
                                    [0., 0., 0., 1., 0.],
                                    [0., 0., 1., 0., 0.]]),
                          ["k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Propionibacteriaceae;g__Propionibacterium",
                           "k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae;g__Staphylococcus",
                           "k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Escherichia",
                           "k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinomycetaceae;g__Mobiluncus",
                           "k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Xanthomonadales;f__Xanthomonadaceae;g__Stenotrophomonas",
                           "k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Corynebacteriaceae;g__Corynebacterium"],
                          ["s1", "s2", "s3", "s4", "s5"])
        self.biom_output_fp = join(self.working_dir, "test_output_biom")
        write_biom_table(table_exp, self.biom_output_fp)
        table_obs = load_table(self.biom_output_fp)
        self.assertEqual(table_obs, table_exp)
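For reference, the positional Table arguments above are (data,
observation ids, sample ids): rows are observations, columns are samples.
A minimal round-trip sketch with a hypothetical output path:

tiny = Table(np.array([[1., 0.], [0., 2.]]), ['o1', 'o2'], ['s1', 's2'])
write_biom_table(tiny, '/tmp/tiny.biom')   # hypothetical path
assert load_table('/tmp/tiny.biom') == tiny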
Example #60
    def test_build_biom_tables(self):
        new_id = get_count('qiita.filepath') + 1
        samples = {1: ['1.SKB8.640193', '1.SKD8.640184', '1.SKB7.640196']}
        self.analysis._build_biom_tables(samples, 100)
        obs = self.analysis.biom_tables

        self.assertEqual(obs, {'18S': self.biom_fp})

        table = load_table(self.biom_fp)
        obs = set(table.ids(axis='sample'))
        exp = {'1.SKB8.640193', '1.SKD8.640184', '1.SKB7.640196'}
        self.assertEqual(obs, exp)

        obs = table.metadata('1.SKB8.640193')
        exp = {'Study':
               'Identification of the Microbiomes for Cannabis Soils',
               'Processed_id': 1}
        self.assertEqual(obs, exp)

        sql = """SELECT EXISTS(SELECT * FROM qiita.filepath
                 WHERE filepath_id=%s)"""
        obs = self.conn_handler.execute_fetchone(sql, (new_id,))[0]

        self.assertTrue(obs)

        sql = """SELECT * FROM qiita.analysis_filepath
                 WHERE analysis_id=%s ORDER BY filepath_id"""
        obs = self.conn_handler.execute_fetchall(sql, (self.analysis.id,))
        exp = [[1L, 14L, 2L], [1L, 15L, None], [1L, new_id, None]]