def _function(inputs, database, output, levels, save_median_taxatable=False):
    """Run functional prediction for each (input taxatable, level) pair.

    Parameters
    ----------
    inputs : iterable of str
        Paths to taxatable files, paired positionally with ``levels``.
    database : str
        Path to the database directory (passed to ``_load_metadata``).
    output : str
        Output directory; created if it does not exist.
    levels : iterable of str
        Taxonomic level per input; only 'genus', 'species', and 'strain'
        are processed, everything else is skipped with a warning.
    save_median_taxatable : bool
        Forwarded to ``function_run_and_save``.
    """
    # Check if output exists, if not then make
    if not os.path.exists(output):
        os.makedirs(output)
    # Load the datafiles to locate function db
    data_files = _load_metadata(database)
    # Load the functional db
    logger.info("Loading the functional database and converting.")
    func_db = parse_function_db(data_files, database)
    # NOTE: zip stops at the shorter of inputs/levels; extras are ignored.
    for input_file, level in zip(inputs, levels):
        # Functional prediction is only defined at genus level or below;
        # previously unsupported levels were skipped silently.
        if level not in ('genus', 'species', 'strain'):
            logger.warning(
                "Skipping %s: level %s not supported for functional prediction."
                % (input_file, level))
            continue
        logger.info(
            "Starting functional prediction with input file %s at level %s"
            % (os.path.abspath(input_file), level))
        function_run_and_save(input_file, func_db, output, TAXAMAP[level],
                              save_median_taxatable=save_median_taxatable)
def summarize_functional(ctx, input, database, output):
    """Summarize a KEGG table into module and pathway abundance/coverage tables.

    Reads the tab-delimited KEGG table at ``input``, summarizes it against the
    functional database's module and pathway mappings, and writes four TSV
    files (abundance + coverage for each) into ``output``.
    """
    # Output prefix: basename minus extension, with any ".kegg" tag removed.
    prefix = ".".join(os.path.basename(input).split('.')[:-1]).replace(".kegg", "")
    # Check if output exists, if not then make
    if not os.path.exists(output):
        os.makedirs(output)
    # Load the datafiles to locate function db
    data_files = _load_metadata(database)
    # Load the functional db
    logger.info("Loading the functional database and converting.")
    func_db = parse_function_db(data_files, database)
    kegg_df = pd.read_csv(input, sep="\t", index_col=0)
    modules_df, modules_coverage = summarize_kegg_table(kegg_df, func_db['modules'])
    pathways_df, pathways_coverage = summarize_kegg_table(kegg_df, func_db['pathways'])
    # (frame, output filename, float format, index label) for each table;
    # abundance tables are written as integers, coverage tables as floats.
    outputs = [
        (modules_df, "%s.kegg.modules.txt" % prefix, "%d", "#MODULE ID"),
        (modules_coverage, "%s.kegg.modules.coverage.txt" % prefix, "%f", "#MODULE ID"),
        (pathways_df, "%s.kegg.pathways.txt" % prefix, "%d", "#PATHWAY ID"),
        (pathways_coverage, "%s.kegg.pathways.coverage.txt" % prefix, "%f", "#PATHWAY ID"),
    ]
    for frame, filename, fmt, label in outputs:
        frame.to_csv(os.path.join(output, filename),
                     sep='\t',
                     float_format=fmt,
                     na_rep=0,
                     index_label=label)
def _post_align(self, outf, **kwargs):
    """Parse a tab-delimited alignment file and return the set of query names
    whose alignment score (column 3) meets ``self.percent_id``.

    Also logs the number of rows that passed the filter (duplicate query
    names are counted once per row but stored once in the returned set).
    """
    passing = set()
    hit_rows = 0
    with open(outf) as fh:
        for record in csv.reader(fh, delimiter="\t"):
            if float(record[2]) >= self.percent_id:
                passing.add(record[0])
                hit_rows += 1
    logger.info("Human hits filter: %d" % hit_rows)
    return passing
def function_run_and_save(input, func_db, output, level, save_median_taxatable=True):
    """Predict KEGG functional profiles from a taxatable and write result tables.

    Parameters
    ----------
    input : str
        Path to a tab-delimited taxatable (taxa rows x sample columns).
    func_db : dict
        Parsed functional database with keys 'pathways', 'modules', 'names',
        'kegg_ids', and 'csr' (the KEGG count matrix in CSR form).
    output : str
        Directory into which all result TSVs are written.
    level : int
        Taxonomic depth, i.e. the number of ';'-separated ranks to keep.
    save_median_taxatable : bool
        When True, also write the median-depth-normalized taxatable.
    """
    # Output prefix: input basename without its final extension.
    prefix = ".".join(os.path.basename(input).split('.')[:-1])
    kegg_pathways_df = func_db['pathways']
    kegg_modules_df = func_db['modules']
    row_names = func_db['names']
    kegg_ids = func_db['kegg_ids']
    kegg_table_csr = func_db['csr']
    logger.debug(
        "Level for summarization %d and starting summarizing KEGG Table at level with median." % level)
    # NOTE(review): 8 appears to be the maximum (strain) depth, at which the
    # db needs no collapsing — confirm against TAXA's definition elsewhere.
    if level < 8:
        # Collapse the functional database rows up to the requested level.
        kegg_table_csr, row_names = summarize_at_level(kegg_table_csr, row_names, kegg_ids, level)
        logger.debug("Number of rows %d" % len(list(row_names.keys())))
        # Tag the output prefix with the rank name unless already present.
        if TAXA[level - 1] not in prefix:
            prefix += "." + TAXA[level - 1]
    logger.info("Reading in taxatable for functional prediction at %s." % os.path.abspath(input))
    taxatable_df = pd.read_csv(input, sep="\t", index_col=0)
    logger.debug("Taxatable for functional prediction shape %s" % str(taxatable_df.shape))
    # Keep only rows whose index is a string (drops NaN/numeric index rows).
    taxatable_df = taxatable_df[[type(_) == str for _ in taxatable_df.index]]
    # Truncate each taxonomy string to `level` ranks; spaces -> underscores.
    taxatable_df['summary'] = [
        ';'.join(_.split(';')[:level]).replace(' ', '_') for _ in taxatable_df.index
    ]
    # Drop names above the requested level (fewer ranks than `level`).
    taxatable_df = taxatable_df[[
        _.count(';') + 1 >= level for _ in taxatable_df['summary']
    ]]
    # Merge rows that collapsed to the same truncated taxonomy.
    taxatable_df = taxatable_df.groupby('summary').sum().fillna(0.)
    # Normalizing for depth at median depth
    taxatable_df = normalize_by_median_depth(taxatable_df)
    if save_median_taxatable:
        taxatable_df.to_csv(os.path.join(output, "%s.normalized.txt" % prefix),
                            sep='\t',
                            float_format="%d",
                            na_rep=0,
                            index_label="#OTU ID")
    logger.debug("Taxatable summarized shape %s" % str(taxatable_df.shape))
    logger.info("Starting functional prediction.")
    # Five result frames: raw KEGG counts, then abundance + coverage for
    # both modules and pathways.
    out_kegg_table_df, out_kegg_modules_df, out_kegg_modules_coverage, out_kegg_pathways_df, out_kegg_pathways_coverage = _do_function(
        taxatable_df, row_names, kegg_ids, kegg_table_csr, kegg_modules_df,
        kegg_pathways_df)
    # Abundance tables are written as integers ("%d"); coverage as floats.
    out_kegg_table_df.to_csv(os.path.join(output, "%s.kegg.txt" % prefix),
                             sep='\t',
                             float_format="%d",
                             na_rep=0,
                             index_label="#KEGG ID")
    out_kegg_modules_df.to_csv(os.path.join(output, "%s.kegg.modules.txt" % prefix),
                               sep='\t',
                               float_format="%d",
                               na_rep=0,
                               index_label="#MODULE ID")
    out_kegg_modules_coverage.to_csv(os.path.join(
        output, "%s.kegg.modules.coverage.txt" % prefix),
                                     sep='\t',
                                     float_format="%f",
                                     na_rep=0,
                                     index_label="#MODULE ID")
    out_kegg_pathways_df.to_csv(os.path.join(output, "%s.kegg.pathways.txt" % prefix),
                                sep='\t',
                                float_format="%d",
                                na_rep=0,
                                index_label="#PATHWAY ID")
    out_kegg_pathways_coverage.to_csv(os.path.join(
        output, "%s.kegg.pathways.coverage.txt" % prefix),
                                      sep='\t',
                                      float_format="%f",
                                      na_rep=0,
                                      index_label="#PATHWAY ID")
def get_coverage_of_microbes(infile, shear, level, parse_taxonomy_from_row=lambda row: row[-1]):
    """Compute per-taxon genome coverage statistics from a b6 alignment file.

    Parameters
    ----------
    infile : str
        Path to a tab-delimited b6 alignment file; column 9 is read as the
        alignment start position and the taxonomy is extracted per row via
        ``parse_taxonomy_from_row`` (default: last column).
    shear : object
        Input to ``summarize_bayes_at_level``; the resulting frame must
        provide a 'genome_length_median' column indexed by taxonomy.
    level : int
        Taxonomic depth (number of ';'-separated ranks) to summarize at.
    parse_taxonomy_from_row : callable
        Extracts the taxonomy string from a parsed b6 row.

    Returns
    -------
    pandas.DataFrame
        One row per observed taxon with coverage statistics (max uncovered
        region, percent covered, expected coverage, etc.).
    """
    #Load in the shear df at level
    shear_df = summarize_bayes_at_level(shear, level=level)
    # taxon -> per-position hit histogram; taxon -> raw hit count
    samples_begin_map = dict()
    taxa_hits = defaultdict(int)
    logger.info("Started the coverage parsing.")
    with open(infile) as utree_f:
        csv_embalm = csv.reader(utree_f, delimiter='\t')
        # qname, lca, confidence, support
        for num, line in enumerate(csv_embalm):
            if num % 10000 == 0:
                logger.info("Parsed %d lines of b6." % num)
            # TODO confidence/support filter
            begin = int(line[8])
            taxaname = parse_taxonomy_from_row(line)
            taxa_level = taxaname.count(';') + 1
            # Ignore rows classified shallower than the requested level.
            if taxa_level >= level:
                if taxa_level != level:
                    # Truncate deeper classifications to exactly `level` ranks.
                    taxaname = ';'.join(taxaname.split(";")[:level])
                if taxaname in shear_df.index:
                    taxa_hits[taxaname] += 1
                    # Bin the alignment start into 100-bp bins.
                    # NOTE(review): the histogram is sized to the full
                    # genome_length_median, not genome_length/100, so most
                    # trailing bins stay empty — confirm this is intended.
                    indx = int(np.floor(begin / 100.))
                    if not taxaname in samples_begin_map:
                        genome_length = shear_df['genome_length_median'][
                            taxaname]
                        samples_begin_map[taxaname] = np.zeros(genome_length)
                    if indx == 0:
                        samples_begin_map[taxaname][0] += 1
                    elif indx >= shear_df['genome_length_median'][taxaname]:
                        samples_begin_map[taxaname][-1] += 1
                    else:
                        # A hit increments its own bin and the next one.
                        # NOTE(review): when indx == genome_length - 1 this
                        # indexes one past the array end (IndexError) — the
                        # elif only guards indx >= genome_length; verify.
                        samples_begin_map[taxaname][indx] += 1
                        samples_begin_map[taxaname][indx + 1] += 1
                else:
                    logger.warning("The taxa %s not found." % taxaname)
    # One output row of 8 statistics per observed taxon.
    xx = np.zeros((len(samples_begin_map), 8))
    for i, taxaname in enumerate(sorted(samples_begin_map.keys())):
        if i % 1000 == 0:
            logger.info("Calculated %d coverages." % i)
        unique_hits = taxa_hits[taxaname]
        hits = samples_begin_map[taxaname]
        # Runs of consecutive zero bins, as (start, end) intervals.
        coverages = zero_runs(hits)
        # If zero-runs touch both ends, merge them into one wrap-around gap
        # (treats the genome as circular).
        if coverages[0][0] == 0:
            if coverages[-1][-1] == hits.shape[0]:
                temp = coverages[:, 1] - coverages[:, 0]
                coverages = np.concatenate(
                    (coverages, np.atleast_2d(np.array([0, temp[0] + temp[-1]]))))
        max_uncovered_region = np.max(coverages[:, 1] - coverages[:, 0])
        percent_max_unconvered = max_uncovered_region / shear_df[
            'genome_length_median'][taxaname]
        percent_covered = np.sum(
            hits > 0) / shear_df['genome_length_median'][taxaname]
        # NOTE(review): assumes column (level - 1) of shear_df holds the
        # unique k-mer/marker counts for this level — confirm against
        # summarize_bayes_at_level's output layout.
        unique_counts = shear_df.iloc[:, level - 1][taxaname]
        expected_c = expected_coverage(unique_counts, unique_hits)
        row = np.array([
            max_uncovered_region, percent_max_unconvered, percent_covered,
            shear_df['genome_length_median'][taxaname], unique_hits,
            unique_counts, expected_c, percent_covered / (expected_c)
        ])
        # Replace NaNs (e.g. division by zero expected coverage) with 0.
        row[np.isnan(row)] = 0
        xx[i] = row
    df = pd.DataFrame(xx,
                      columns=[
                          'max_uncovered_region',
                          'percent_max_uncovered_region',
                          'percent_of_genome_covered', 'median_genome_size',
                          'hits_in_clade', 'unique_counts_of_clade',
                          'expected_coverage', 'ratio_covered_over_expected'
                      ],
                      index=sorted(samples_begin_map.keys()))
    logger.info("Completed the coverage analysis.")
    return df