def main():
    """Flag contigs whose phylogenetic marker genes look like taxonomic outliers.

    Pipeline: Prodigal gene calling -> HMMER search for PhyEco markers ->
    BLAST of markers against the database -> outlier flagging. Flagged contig
    ids are written to <tmp_dir>/flagged_contigs, one per line.
    """
    # NOTE(review): sibling modules call fetch_args(); confirm parse_args()
    # is the name actually defined alongside this entry point.
    args = parse_args()
    utility.add_tmp_dir(args)
    utility.check_input(args)
    utility.check_dependencies(['prodigal', 'hmmsearch', 'blastp', 'blastn'])
    utility.check_database(args)

    tmp = args['tmp_dir']

    # 1. gene calling
    print("\n## Calling genes with Prodigal")
    utility.run_prodigal(args['fna'], tmp)
    print(" all genes: %s/genes.[ffn|faa]" % tmp)

    # 2. marker-gene identification
    print("\n## Identifying PhyEco phylogenetic marker genes with HMMER")
    utility.run_hmmsearch(args['db'], tmp, tmp, args['threads'])
    extract_homologs(tmp)
    print(" hmm results: %s/phyeco.hmmsearch" % tmp)
    print(" marker genes: %s/markers" % tmp)

    # 3. alignment of markers against the reference database
    print("\n## Performing pairwise BLAST alignment of marker genes against database")
    align_homologs(args['db'], tmp, args['seq_type'], args['threads'])
    print(" blast results: %s/alns" % tmp)

    # 4. outlier detection and report
    print("\n## Finding taxonomic outliers")
    flagged_contigs = flag_contigs(args['db'], tmp, args)
    out_path = '%s/flagged_contigs' % tmp
    print(" flagged contigs: %s" % out_path)
    with open(out_path, 'w') as handle:
        for contig_id in flagged_contigs:
            handle.write(contig_id + '\n')
def main():
    """Flag contigs with BLASTN hits to known-contaminant databases (hg38, phiX).

    Writes per-target tabular BLAST output to <tmp_dir>/<target>.m8 and the
    final list of flagged contig ids to <tmp_dir>/flagged_contigs.
    """
    targets = ['hg38', 'phix']
    args = fetch_args()
    utility.add_tmp_dir(args)
    utility.check_input(args)
    utility.check_dependencies(['blastn'])
    utility.check_database(args)
    # Removed a dead local (`tmp_dir = '%s/%s' % (args['out'], args['program'])`)
    # that was computed but never used; all paths below use args['tmp_dir'],
    # which utility.add_tmp_dir() is expected to set. Keep the existence guard
    # so the output directory is certain to exist before writing into it.
    if not os.path.exists(args['tmp_dir']):
        os.makedirs(args['tmp_dir'])
    print("\n## Searching database with BLASTN")
    for target in targets:
        db = '%s/known-contam/%s/%s' % (args['db'], target, target)
        out = '%s/%s.m8' % (args['tmp_dir'], target)
        run_blastn(args['fna'], db, out, args['threads'], args['qcov'],
                   args['pid'], args['evalue'])
    print("\n## Identifying contigs with hits to db")
    # collect the union of query ids hit in any target's results
    flagged = set()
    for target in targets:
        out = '%s/%s.m8' % (args['tmp_dir'], target)
        for r in utility.parse_blast(out):
            flagged.add(r['qname'])
    flagged = list(flagged)
    out = '%s/flagged_contigs' % args['tmp_dir']
    print(" flagged contigs: %s" % out)
    with open(out, 'w') as f:
        for contig in flagged:
            f.write(contig + '\n')
def main():
    """Flag contigs whose GC content deviates from the bin-wide mean.

    A contig is flagged when |GC - mean GC| exceeds args['cutoff'] (absolute
    difference). Flagged contig ids are written to <tmp_dir>/flagged_contigs.
    """
    args = fetch_args()
    utility.add_tmp_dir(args)
    utility.check_input(args)
    utility.check_database(args)

    # Converted Python-2 print statements to print() calls for consistency
    # with the rest of the file (and Python-3 validity).
    print("\n## Computing mean genome-wide GC content")
    contigs = {}
    for cid, seq in utility.parse_fasta(args['fna']):  # `cid` avoids shadowing builtin id()
        contig = Contig()
        contig.id = cid
        contig.seq = str(seq)
        contig.gc = compute_gc(seq)
        contigs[cid] = contig
    gc_values = [c.gc for c in contigs.values()]  # build once, reuse for mean and std
    mean = np.mean(gc_values)
    std = np.std(gc_values)

    print("\n## Computing per-contig deviation from mean")
    for contig in contigs.values():
        contig.values = {}
        contig.values['delta'] = abs(contig.gc - mean)
        contig.values['percent'] = 100 * abs(contig.gc - mean) / mean
        # guard against zero standard deviation (all contigs with identical GC)
        contig.values['z-score'] = abs(contig.gc - mean) / std if std > 0 else 0.0

    print("\n## Identifying outlier contigs")
    # only the absolute deviation ('delta') is compared against the cutoff
    flagged = [c.id for c in contigs.values() if c.values['delta'] > args['cutoff']]
    out = '%s/flagged_contigs' % args['tmp_dir']
    print(" flagged contigs: %s" % out)
    with open(out, 'w') as f:
        for contig in flagged:
            f.write(contig + '\n')
def main():
    """Classify a genome bin with MetaPhlAn2 clade-specific marker genes and
    flag contigs whose taxonomy conflicts with the bin consensus.

    Flagged contig ids are written to <tmp_dir>/flagged_contigs.
    """
    args = fetch_args()
    utility.add_tmp_dir(args)
    utility.check_input(args)
    utility.check_database(args)

    print("\n## Reading database info")
    ref_taxonomy = read_ref_taxonomy(args['db'])
    # map every taxon name back to its full '|'-delimited lineage string
    taxon_to_taxonomy = {}
    for taxonomy in set(ref_taxonomy.values()):
        for taxon in taxonomy.split('|'):
            taxon_to_taxonomy[taxon] = taxonomy
    # per-rank minimum percent identity for accepting a gene classification
    min_pid = {'k': 57, 'p': 77, 'c': 82, 'o': 86, 'f': 87, 'g': 91, 's': 96}
    if args['min_genes'] is not None:
        # a single user-supplied threshold applies to every rank
        args['min_genes'] = {r: args['min_genes'] for r in ranks}
    else:
        # rank-specific defaults
        args['min_genes'] = {'k': 237, 'p': 44, 'c': 30, 'o': 24,
                             'f': 22, 'g': 20, 's': 19}

    print("\n## Calling genes with Prodigal")
    utility.run_prodigal(args['fna'], args['tmp_dir'])
    print(" all genes: %s/genes.[ffn|faa]" % args['tmp_dir'])

    print(
        "\n## Performing pairwise alignment of genes against MetaPhlan2 db of clade-specific genes"
    )
    utility.run_lastal(args['db'], args['tmp_dir'], args['threads'])
    print(" alignments: %s/genes.m8" % args['tmp_dir'])

    print("\n## Finding top hits to db")
    genes = {}
    for aln in utility.parse_last(args['tmp_dir'] + '/genes.m8'):
        # clade exclusion
        ref_taxa = ref_taxonomy[aln['tid']].split('|')
        if (args['exclude_clades']
                and any(t in ref_taxa for t in args['exclude_clades'].split(','))):
            continue
        # initialize gene
        gene = genes.get(aln['qid'])
        if gene is None:
            gene = Gene()
            gene.id = aln['qid']
            # contig id = gene id minus the trailing '_<gene number>'
            gene.contig_id = aln['qid'].rsplit('_', 1)[0]
            genes[aln['qid']] = gene
        # keep only the top-scoring alignment per gene
        if gene.aln is None:
            gene.aln = aln
            gene.ref_taxa = ref_taxa
        elif float(aln['score']) > float(gene.aln['score']):
            # BUG FIX: the original updated ref_taxa but left the old,
            # lower-scoring hit in gene.aln, so the pid/qcov/tcov filters
            # below were applied to the wrong alignment. Keep both in sync.
            gene.aln = aln
            gene.ref_taxa = ref_taxa
    print(" %s genes with a database hit" % len(genes))

    print("\n## Classifying genes at each taxonomic rank")
    # pre-seed so the per-rank report cannot KeyError when a rank gets no hits
    counts = {r: 0 for r in ranks}
    for gene in genes.values():
        for ref_taxon in gene.ref_taxa:
            rank = ref_taxon.split('__')[0]
            if rank not in counts:
                counts[rank] = 0
            if rank == 't':
                continue  # strain-level markers are not used for classification
            elif float(gene.aln['pid']) < min_pid[rank]:
                continue
            elif float(gene.aln['qcov']) < 0.4:
                continue
            elif float(gene.aln['tcov']) < 0.4:
                continue
            gene.taxa[rank] = ref_taxon
            counts[rank] += 1
    for rank in ranks:
        print(" %s: %s classified genes" % (rank_names[rank], counts[rank]))

    print("\n## Taxonomically classifying contigs")
    contigs = {}
    for cid, seq in utility.parse_fasta(args['fna']):
        contigs[cid] = Contig()
        contigs[cid].id = cid
        contigs[cid].length = len(seq)
    # aggregate hits by contig
    for gene in genes.values():
        contigs[gene.contig_id].genes.append(gene)
    # classify contigs at each level
    for contig in contigs.values():
        contig.classify()
    # summarize
    counts = {r: 0 for r in ranks}
    for contig in contigs.values():
        for rank, taxon in contig.cons_taxa.items():
            if rank not in counts:
                counts[rank] = 0
            if taxon is not None:
                counts[rank] += 1
    print(" total contigs: %s" % len(contigs))
    for rank in ranks:
        print(" %s: %s classified contigs" % (rank_names[rank], counts[rank]))

    print("\n## Taxonomically classifying genome")
    genome_bin = Bin()  # renamed from `bin` to avoid shadowing the builtin
    genome_bin.classify(contigs, args['min_bin_fract'], args['min_contig_fract'],
                        args['min_gene_fract'], args['min_genes'],
                        args['lowest_rank'])
    print(" consensus taxon: %s" % genome_bin.cons_taxon)

    print("\n## Identifying taxonomically discordant contigs")
    if genome_bin.cons_taxon is not None:
        lineage = taxon_to_taxonomy[genome_bin.cons_taxon].split('|')
        genome_bin.rank_index = lineage.index(genome_bin.cons_taxon)
        genome_bin.taxonomy = lineage[0:genome_bin.rank_index + 1]
        flag_contigs(contigs, genome_bin)
    flagged = [c.id for c in contigs.values() if c.flagged]
    out = '%s/flagged_contigs' % args['tmp_dir']
    print(" flagged contigs: %s" % out)
    with open(out, 'w') as f:
        for contig in flagged:
            f.write(contig + '\n')
def main():
    """Flag contigs that align to none of the conspecific reference genomes.

    Finds conspecific genomes with mash, aligns the bin's contigs against
    them, and writes contigs with zero conspecific alignments to
    <tmp_dir>/flagged_contigs.
    """
    args = fetch_args()
    utility.add_tmp_dir(args)
    utility.check_input(args)
    utility.check_dependencies(['mash'])
    if not os.path.exists(args['mash_sketch']):
        sys.exit("\nError: mash sketch '%s' not found\n" % args['mash_sketch'])

    def write_genome_list(path, genome_rows):
        # tab-delimited listing of (genome_id, mash_dist) pairs with a header
        with open(path, 'w') as handle:
            handle.write('genome_id\tmash_dist\n')
            for genome_id, mash_dist in genome_rows:
                handle.write('%s\t%s\n' % (genome_id, mash_dist))

    print("\n## Finding conspecific genomes in database")
    run_mash(args['mash_sketch'], args['fna'], args['tmp_dir'], args['threads'])
    genomes = find_conspecific(args['tmp_dir'], args['mash_dist'], args['exclude'])
    print(" %s genomes within %s mash-dist" % (len(genomes), args['mash_dist']))
    list_path = '%s/conspecific.list' % args['tmp_dir']
    write_genome_list(list_path, genomes)
    print(" list of genomes: %s" % list_path)
    print(" mash output: %s/mash.dist" % args['tmp_dir'])

    if len(genomes) < args['min_genomes']:
        sys.exit("\nError: insufficient number of conspecific genomes\n")
    if len(genomes) > args['max_genomes']:
        # keep only the closest genomes by mash distance
        print("\n## Selecting top %s most-similar genomes" % args['max_genomes'])
        genomes = genomes[0:args['max_genomes']]
        subset_path = '%s/conspecific_subset.list' % args['tmp_dir']
        write_genome_list(subset_path, genomes)
        print(" list of genomes: %s" % subset_path)

    print(
        "\n## Performing pairwise alignment of contigs in bin to database genomes"
    )
    alignments = align_contigs(args, genomes)
    total_alns = sum(len(block.split('\n')) for block in alignments)
    print(" total alignments: %s" % total_alns)

    print("\n## Summarizing alignments")
    contigs = find_contig_targets(args, genomes, alignments)
    hits_path = '%s/contig_hits.tsv' % args['tmp_dir']
    with open(hits_path, 'w') as handle:
        handle.write('contig_id\tlength\talignment_rate\n')
        for contig_id, values in contigs.items():
            rate = '%s/%s' % (values['hits'], len(genomes))
            handle.write('\t'.join([contig_id, str(values['len']), rate]) + '\n')
    print(" contig features: %s" % hits_path)

    print("\n## Identifying contigs with no conspecific alignments")
    flagged_contigs = flag_contigs(args, contigs)
    flagged_kbp = round(
        sum(contigs[cid]['len'] for cid in flagged_contigs) / 1000.0, 2)
    print(" %s flagged contigs, %s Kbp" % (len(flagged_contigs), flagged_kbp))
    flagged_path = '%s/flagged_contigs' % args['tmp_dir']
    with open(flagged_path, 'w') as handle:
        for contig_id in flagged_contigs:
            handle.write(contig_id + '\n')
    print(" flagged contigs: %s" % flagged_path)
def main():
    """Flag contigs with outlier tetranucleotide-frequency (TNF) profiles.

    Counts canonical 4-mers per contig, normalizes counts to percentages,
    projects contigs onto the first principal component, and flags contigs
    whose absolute deviation from the mean PC1 value exceeds args['cutoff'].
    Flagged contig ids are written to <tmp_dir>/flagged_contigs.
    """
    args = fetch_args()
    utility.add_tmp_dir(args)
    utility.check_input(args)
    utility.check_dependencies(['blastn'])
    utility.check_database(args)

    # Converted Python-2 print statements to print() calls for consistency
    # with the rest of the file (and Python-3 validity).
    print("\n## Counting tetranucleotides")
    # init data
    kmer_counts = init_kmers()
    contigs = {}
    for rec in Bio.SeqIO.parse(args['fna'], 'fasta'):
        contig = Contig()
        contig.id = rec.id
        contig.seq = str(rec.seq)
        contig.kmers = kmer_counts.copy()
        contigs[rec.id] = contig
    # count kmers with a sliding window; each 4-mer is credited to whichever
    # of its two strands is a key in kmer_counts
    for contig in contigs.values():
        start, stop, step = 0, 4, 1
        while stop <= len(contig.seq):
            kmer_fwd = contig.seq[start:stop]
            kmer_rev = str(Bio.Seq.Seq(kmer_fwd).reverse_complement())
            # BUG FIX: the original incremented contigs[rec.id].kmers here,
            # where `rec` was the stale loop variable from the parsing loop
            # above — every contig's counts were credited to the last record.
            if kmer_fwd in kmer_counts:
                contig.kmers[kmer_fwd] += 1
            elif kmer_rev in kmer_counts:
                contig.kmers[kmer_rev] += 1
            start += step
            stop += step

    print("\n## Normalizing counts")
    for contig in contigs.values():
        total = float(sum(contig.kmers.values()))
        for kmer, count in contig.kmers.items():
            # percentage of all counted 4-mers; 0.0 for empty/degenerate contigs
            contig.kmers[kmer] = 100 * count / total if total > 0 else 0.00

    print("\n## Performing PCA")
    df = pd.DataFrame({c.id: c.kmers for c in contigs.values()})
    pca = PCA(n_components=1)
    pca.fit(df)
    pc1 = pca.components_[0]

    print("\n## Computing per-contig deviation from the mean along the first principal component")
    mean_pc = np.mean(pc1)
    std_pc = np.std(pc1)
    # df columns are contig ids, in the same order as pc1's entries
    for contig_id, contig_pc in zip(list(df.columns), pc1):
        contig = contigs[contig_id]
        contig.pc = contig_pc
        contig.values = {}
        contig.values['zscore'] = abs(contig_pc - mean_pc) / std_pc if std_pc > 0 else 0.0
        contig.values['delta'] = abs(contig_pc - mean_pc)
        # guard against a zero mean (the original could divide by zero here)
        contig.values['percent'] = (100 * abs(contig_pc - mean_pc) / mean_pc
                                    if mean_pc != 0 else 0.0)

    print("\n## Identifying outlier contigs")
    # only the absolute deviation ('delta') is compared against the cutoff
    flagged = [c.id for c in contigs.values() if c.values['delta'] > args['cutoff']]
    out = '%s/flagged_contigs' % args['tmp_dir']
    print(" flagged contigs: %s" % out)
    with open(out, 'w') as f:
        for contig in flagged:
            f.write(contig + '\n')