def main():
    args = parse_args()
    utility.add_tmp_dir(args)
    utility.check_input(args)
    utility.check_dependencies(['prodigal', 'hmmsearch', 'blastp', 'blastn'])
    utility.check_database(args)

    print("\n## Calling genes with Prodigal")
    utility.run_prodigal(args['fna'], args['tmp_dir'])
    print(" all genes: %s/genes.[ffn|faa]" % args['tmp_dir'])

    print("\n## Identifying PhyEco phylogenetic marker genes with HMMER")
    utility.run_hmmsearch(args['db'], args['tmp_dir'], args['tmp_dir'], args['threads'])
    extract_homologs(args['tmp_dir'])
    print(" hmm results: %s/phyeco.hmmsearch" % args['tmp_dir'])
    print(" marker genes: %s/markers" % args['tmp_dir'])

    print("\n## Performing pairwise BLAST alignment of marker genes against database")
    align_homologs(args['db'], args['tmp_dir'], args['seq_type'], args['threads'])
    print(" blast results: %s/alns" % args['tmp_dir'])

    print("\n## Finding taxonomic outliers")
    flagged = flag_contigs(args['db'], args['tmp_dir'], args)
    out = '%s/flagged_contigs' % args['tmp_dir']
    print(" flagged contigs: %s" % out)
    with open(out, 'w') as f:
        for contig in flagged:
            f.write(contig + '\n')
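# A minimal usage sketch (not part of the module), assuming Biopython: once the
# flagged_contigs file above has been written, the flagged sequences can be
# dropped from the input FASTA to produce a cleaned bin. The helper name and the
# output file name ('cleaned.fna') are illustrative, not part of the tool.
import Bio.SeqIO

def remove_flagged(fna, flagged_file, out_fna='cleaned.fna'):
    with open(flagged_file) as f:
        flagged = set(line.strip() for line in f if line.strip())
    records = (r for r in Bio.SeqIO.parse(fna, 'fasta') if r.id not in flagged)
    Bio.SeqIO.write(records, out_fna, 'fasta')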
def main():
    args = fetch_args()
    utility.add_tmp_dir(args)
    utility.check_input(args)
    utility.check_database(args)

    print("\n## Computing mean genome-wide GC content")
    contigs = {}
    for id, seq in utility.parse_fasta(args['fna']):
        contig = Contig()
        contig.id = id
        contig.seq = str(seq)
        contig.gc = compute_gc(seq)
        contigs[id] = contig
    mean = np.mean([c.gc for c in contigs.values()])
    std = np.std([c.gc for c in contigs.values()])

    print("\n## Computing per-contig deviation from mean")
    for contig in contigs.values():
        contig.values = {}
        contig.values['delta'] = abs(contig.gc - mean)
        contig.values['percent'] = 100 * abs(contig.gc - mean) / mean
        contig.values['z-score'] = abs(contig.gc - mean) / std

    print("\n## Identifying outlier contigs")
    flagged = []
    for contig in contigs.values():
        if contig.values['delta'] > args['cutoff']:
            flagged.append(contig.id)
    out = '%s/flagged_contigs' % args['tmp_dir']
    print(" flagged contigs: %s" % out)
    with open(out, 'w') as f:
        for contig in flagged:
            f.write(contig + '\n')
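# A minimal sketch of the compute_gc() helper used above. The real helper is
# defined elsewhere in the module, so the exact handling of ambiguous bases is
# an assumption here: GC content as a percentage of unambiguous A/C/G/T bases,
# which matches the percentage-scale 'delta' cutoff applied in main().
def compute_gc(seq):
    seq = str(seq).upper()
    gc = seq.count('G') + seq.count('C')
    at = seq.count('A') + seq.count('T')
    return 100.0 * gc / (gc + at) if (gc + at) > 0 else 0.0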
def main():
    args = fetch_args()
    utility.add_tmp_dir(args)
    utility.check_input(args)
    utility.check_dependencies(['blastn'])
    utility.check_database(args)
    # utility.add_tmp_dir(args) sets args['tmp_dir']; make sure the directory exists
    if not os.path.exists(args['tmp_dir']):
        os.makedirs(args['tmp_dir'])

    print("\n## Searching database with BLASTN")
    for target in ['hg38', 'phix']:
        db = '%s/known-contam/%s/%s' % (args['db'], target, target)
        out = '%s/%s.m8' % (args['tmp_dir'], target)
        run_blastn(args['fna'], db, out, args['threads'], args['qcov'],
                   args['pid'], args['evalue'])

    print("\n## Identifying contigs with hits to db")
    flagged = set()
    for target in ['hg38', 'phix']:
        out = '%s/%s.m8' % (args['tmp_dir'], target)
        for r in utility.parse_blast(out):
            flagged.add(r['qname'])
    flagged = list(flagged)

    out = '%s/flagged_contigs' % args['tmp_dir']
    print(" flagged contigs: %s" % out)
    with open(out, 'w') as f:
        for contig in flagged:
            f.write(contig + '\n')
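# A minimal sketch of the run_blastn() helper called above. The real helper is
# defined elsewhere; this version is an assumption that simply shells out to
# NCBI BLAST+ with tabular output and the same coverage/identity/e-value filters.
import subprocess

def run_blastn(query, db, out, threads, qcov, pid, evalue):
    cmd = [
        'blastn',
        '-query', query,
        '-db', db,
        '-out', out,
        '-outfmt', '6',               # tabular output, parsed later by utility.parse_blast
        '-num_threads', str(threads),
        '-evalue', str(evalue),
        '-perc_identity', str(pid),   # minimum percent identity
        '-qcov_hsp_perc', str(qcov),  # minimum query coverage per HSP
    ]
    subprocess.check_call(cmd)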
def main():
    args = fetch_args()
    utility.add_tmp_dir(args)
    utility.check_input(args)
    utility.check_database(args)

    print("\n## Reading database info")
    ref_taxonomy = read_ref_taxonomy(args['db'])
    taxon_to_taxonomy = {}
    for taxonomy in set(ref_taxonomy.values()):
        for taxon in taxonomy.split('|'):
            taxon_to_taxonomy[taxon] = taxonomy
    min_pid = {'k': 57, 'p': 77, 'c': 82, 'o': 86, 'f': 87, 'g': 91, 's': 96}
    if args['min_genes'] is not None:
        args['min_genes'] = dict([(r, args['min_genes']) for r in ranks])
    else:
        args['min_genes'] = {'k': 237, 'p': 44, 'c': 30, 'o': 24,
                             'f': 22, 'g': 20, 's': 19}

    print("\n## Calling genes with Prodigal")
    utility.run_prodigal(args['fna'], args['tmp_dir'])
    print(" all genes: %s/genes.[ffn|faa]" % args['tmp_dir'])

    print("\n## Performing pairwise alignment of genes against MetaPhlan2 db of clade-specific genes")
    utility.run_lastal(args['db'], args['tmp_dir'], args['threads'])
    print(" alignments: %s/genes.m8" % args['tmp_dir'])

    print("\n## Finding top hits to db")
    genes = {}
    for aln in utility.parse_last(args['tmp_dir'] + '/genes.m8'):
        # clade exclusion
        ref_taxa = ref_taxonomy[aln['tid']].split('|')
        if (args['exclude_clades'] and
                any(taxon in ref_taxa for taxon in args['exclude_clades'].split(','))):
            continue
        # initialize gene
        if aln['qid'] not in genes:
            genes[aln['qid']] = Gene()
            genes[aln['qid']].id = aln['qid']
            genes[aln['qid']].contig_id = aln['qid'].rsplit('_', 1)[0]
        # keep the top-scoring alignment (and its taxonomy) for each gene
        if genes[aln['qid']].aln is None:
            genes[aln['qid']].aln = aln
            genes[aln['qid']].ref_taxa = ref_taxa
        elif float(aln['score']) > float(genes[aln['qid']].aln['score']):
            genes[aln['qid']].aln = aln
            genes[aln['qid']].ref_taxa = ref_taxa
    print(" %s genes with a database hit" % len(genes))

    print("\n## Classifying genes at each taxonomic rank")
    counts = {}
    for gene in genes.values():
        for ref_taxon in gene.ref_taxa:
            rank = ref_taxon.split('__')[0]
            if rank not in counts:
                counts[rank] = 0
            if rank == 't':
                continue
            elif float(gene.aln['pid']) < min_pid[rank]:
                continue
            elif gene.aln['qcov'] < 0.4:
                continue
            elif gene.aln['tcov'] < 0.4:
                continue
            gene.taxa[rank] = ref_taxon
            counts[rank] += 1
    for rank in ranks:
        print(" %s: %s classified genes" % (rank_names[rank], counts[rank]))

    print("\n## Taxonomically classifying contigs")
    contigs = {}
    for id, seq in utility.parse_fasta(args['fna']):
        contigs[id] = Contig()
        contigs[id].id = id
        contigs[id].length = len(seq)
    # aggregate hits by contig
    for gene in genes.values():
        contigs[gene.contig_id].genes.append(gene)
    # classify contigs at each level
    for contig in contigs.values():
        contig.classify()
    # summarize
    counts = {}
    for contig in contigs.values():
        for rank, taxon in contig.cons_taxa.items():
            if rank not in counts:
                counts[rank] = 0
            if taxon is not None:
                counts[rank] += 1
    print(" total contigs: %s" % len(contigs))
    for rank in ranks:
        print(" %s: %s classified contigs" % (rank_names[rank], counts[rank]))

    print("\n## Taxonomically classifying genome")
    bin = Bin()
    bin.classify(contigs, args['min_bin_fract'], args['min_contig_fract'],
                 args['min_gene_fract'], args['min_genes'], args['lowest_rank'])
    print(" consensus taxon: %s" % bin.cons_taxon)

    print("\n## Identifying taxonomically discordant contigs")
    if bin.cons_taxon is not None:
        bin.rank_index = taxon_to_taxonomy[bin.cons_taxon].split('|').index(bin.cons_taxon)
        bin.taxonomy = taxon_to_taxonomy[bin.cons_taxon].split('|')[0:bin.rank_index + 1]
        flag_contigs(contigs, bin)
    flagged = []
    for contig in contigs.values():
        if contig.flagged:
            flagged.append(contig.id)
    out = '%s/flagged_contigs' % args['tmp_dir']
    print(" flagged contigs: %s" % out)
    with open(out, 'w') as f:
        for contig in flagged:
            f.write(contig + '\n')
def main():
    args = fetch_args()
    utility.add_tmp_dir(args)
    utility.check_input(args)
    utility.check_dependencies(['blastn'])
    utility.check_database(args)

    print("\n## Counting tetranucleotides")
    # init data
    kmer_counts = init_kmers()
    contigs = {}
    for rec in Bio.SeqIO.parse(args['fna'], 'fasta'):
        contig = Contig()
        contig.id = rec.id
        contig.seq = str(rec.seq)
        contig.kmers = kmer_counts.copy()
        contigs[rec.id] = contig
    # count kmers with a sliding window over each contig
    for contig in contigs.values():
        start, stop, step = 0, 4, 1
        while stop <= len(contig.seq):
            kmer_fwd = contig.seq[start:stop]
            kmer_rev = str(Bio.Seq.Seq(kmer_fwd).reverse_complement())
            if kmer_fwd in kmer_counts:
                contig.kmers[kmer_fwd] += 1
            elif kmer_rev in kmer_counts:
                contig.kmers[kmer_rev] += 1
            start += step
            stop += step

    print("\n## Normalizing counts")
    for contig in contigs.values():
        total = float(sum(contig.kmers.values()))
        for kmer, count in contig.kmers.items():
            if total > 0:
                contig.kmers[kmer] = 100 * count / total
            else:
                contig.kmers[kmer] = 0.0

    print("\n## Performing PCA")
    df = pd.DataFrame(dict([(c.id, c.kmers) for c in contigs.values()]))
    pca = PCA(n_components=1)
    pca.fit(df)
    pc1 = pca.components_[0]

    print("\n## Computing per-contig deviation from the mean along the first principal component")
    mean_pc = np.mean(pc1)
    std_pc = np.std(pc1)
    for contig_id, contig_pc in zip(list(df.columns), pc1):
        contigs[contig_id].pc = contig_pc
        contigs[contig_id].values = {}
        contigs[contig_id].values['zscore'] = abs(contig_pc - mean_pc) / std_pc if std_pc > 0 else 0.0
        contigs[contig_id].values['delta'] = abs(contig_pc - mean_pc)
        contigs[contig_id].values['percent'] = 100 * abs(contig_pc - mean_pc) / mean_pc

    print("\n## Identifying outlier contigs")
    flagged = []
    for contig in contigs.values():
        if contig.values['delta'] > args['cutoff']:
            flagged.append(contig.id)
    out = '%s/flagged_contigs' % args['tmp_dir']
    print(" flagged contigs: %s" % out)
    with open(out, 'w') as f:
        for contig in flagged:
            f.write(contig + '\n')
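# A minimal sketch of the init_kmers() helper used above (defined elsewhere in
# the module). The counting loop checks the forward k-mer first and then its
# reverse complement, which implies the dict holds one canonical key per
# forward/reverse-complement pair (136 tetranucleotides), all initialized to zero.
import itertools

def init_kmers():
    complement = str.maketrans('ACGT', 'TGCA')
    counts = {}
    for kmer in (''.join(p) for p in itertools.product('ACGT', repeat=4)):
        rev = kmer.translate(complement)[::-1]
        canonical = min(kmer, rev)   # one key per strand-equivalent pair
        counts[canonical] = 0
    return counts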