Example 1
def main():

    args = parse_args()
    utility.add_tmp_dir(args)
    utility.check_input(args)
    utility.check_dependencies(['prodigal', 'hmmsearch', 'blastp', 'blastn'])
    utility.check_database(args)

    print("\n## Calling genes with Prodigal")
    utility.run_prodigal(args['fna'], args['tmp_dir'])
    print("   all genes: %s/genes.[ffn|faa]" % args['tmp_dir'])

    print("\n## Identifying PhyEco phylogenetic marker genes with HMMER")
    utility.run_hmmsearch(args['db'], args['tmp_dir'], args['tmp_dir'],
                          args['threads'])
    extract_homologs(args['tmp_dir'])
    print("   hmm results: %s/phyeco.hmmsearch" % args['tmp_dir'])
    print("   marker genes: %s/markers" % args['tmp_dir'])

    print(
        "\n## Performing pairwise BLAST alignment of marker genes against database"
    )
    align_homologs(args['db'], args['tmp_dir'], args['seq_type'],
                   args['threads'])
    print("   blast results: %s/alns" % args['tmp_dir'])

    print("\n## Finding taxonomic outliers")
    flagged = flag_contigs(args['db'], args['tmp_dir'], args)
    out = '%s/flagged_contigs' % args['tmp_dir']
    print("   flagged contigs: %s" % out)
    with open(out, 'w') as f:
        for contig in flagged:
            f.write(contig + '\n')
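
The gene-calling step above goes through a small wrapper in the project's utility module. Below is a minimal, hypothetical sketch of what such a wrapper could look like, assuming prodigal is on the PATH and writes its gene predictions into the temporary directory; the flags actually used by utility.run_prodigal may differ.

# Hypothetical sketch of a Prodigal wrapper (not the project's actual utility.run_prodigal)
import subprocess

def run_prodigal_sketch(fna_path, tmp_dir):
    # -p meta: metagenome mode; -a/-d: protein and nucleotide gene output
    cmd = [
        'prodigal',
        '-i', fna_path,
        '-a', '%s/genes.faa' % tmp_dir,
        '-d', '%s/genes.ffn' % tmp_dir,
        '-o', '%s/genes.gbk' % tmp_dir,
        '-p', 'meta',
    ]
    subprocess.run(cmd, check=True, capture_output=True)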
Example 2
def main():

    args = fetch_args()

    utility.add_tmp_dir(args)
    utility.check_input(args)
    utility.check_dependencies(['blastn'])
    utility.check_database(args)

    # place temporary files in a program-specific subdirectory of the output directory
    args['tmp_dir'] = '%s/%s' % (args['out'], args['program'])
    if not os.path.exists(args['tmp_dir']):
        os.makedirs(args['tmp_dir'])

    print("\n## Searching database with BLASTN")
    for target in ['hg38', 'phix']:
        db = '%s/known-contam/%s/%s' % (args['db'], target, target)
        out = '%s/%s.m8' % (args['tmp_dir'], target)
        run_blastn(args['fna'], db, out, args['threads'], args['qcov'],
                   args['pid'], args['evalue'])

    print("\n## Identifying contigs with hits to db")
    flagged = set([])
    for target in ['hg38', 'phix']:
        out = '%s/%s.m8' % (args['tmp_dir'], target)
        for r in utility.parse_blast(out):
            flagged.add(r['qname'])
    flagged = list(flagged)
    out = '%s/flagged_contigs' % args['tmp_dir']
    print("   flagged contigs: %s" % out)
    with open(out, 'w') as f:
        for contig in flagged:
            f.write(contig + '\n')
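
run_blastn and utility.parse_blast are not shown in this excerpt. The sketches below are hypothetical stand-ins, assuming standard 12-column tabular BLAST output (-outfmt 6); the project's own helpers may add or rename columns.

# Hypothetical sketches (the project's run_blastn / utility.parse_blast may differ)
import subprocess

def run_blastn_sketch(query, db, out, threads, qcov, pid, evalue):
    # tabular output, filtered by query coverage, percent identity, and e-value
    cmd = [
        'blastn',
        '-query', query, '-db', db, '-out', out,
        '-outfmt', '6',
        '-num_threads', str(threads),
        '-qcov_hsp_perc', str(qcov),
        '-perc_identity', str(pid),
        '-evalue', str(evalue),
    ]
    subprocess.run(cmd, check=True)

def parse_blast_sketch(path):
    # standard outfmt-6 columns; only 'qname' is used by the caller above
    fields = ['qname', 'tname', 'pid', 'aln', 'mis', 'gap',
              'qstart', 'qend', 'tstart', 'tend', 'evalue', 'score']
    with open(path) as handle:
        for line in handle:
            yield dict(zip(fields, line.rstrip('\n').split('\t')))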
Example 3
def main():

    args = fetch_args()
    utility.add_tmp_dir(args)
    utility.check_input(args)
    utility.check_database(args)

    print("\n## Computing mean genome-wide GC content")
    contigs = {}
    for id, seq in utility.parse_fasta(args['fna']):
        contig = Contig()
        contig.id = id
        contig.seq = str(seq)
        contig.gc = compute_gc(seq)
        contigs[id] = contig
    mean = np.mean([c.gc for c in contigs.values()])
    std = np.std([c.gc for c in contigs.values()])

    print("\n## Computing per-contig deviation from mean")
    for contig in contigs.values():
        contig.values = {}
        contig.values['delta'] = abs(contig.gc - mean)
        contig.values['percent'] = 100 * abs(contig.gc - mean) / mean
        # guard against division by zero when all contigs have identical GC
        contig.values['z-score'] = abs(contig.gc - mean) / std if std > 0 else 0.0

    print("\n## Identifying outlier contigs")
    flagged = []
    for contig in contigs.values():
        if contig.values['delta'] > args['cutoff']:
            flagged.append(contig.id)
    out = '%s/flagged_contigs' % args['tmp_dir']
    print("   flagged contigs: %s" % out)
    with open(out, 'w') as f:
        for contig in flagged:
            f.write(contig + '\n')
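
The per-contig GC values come from a helper that is not shown in this excerpt. A minimal sketch, assuming GC content is expressed as a percentage of sequence length (the project's compute_gc may treat ambiguous bases differently):

# Hypothetical sketch of compute_gc: percent G+C of a sequence
def compute_gc_sketch(seq):
    seq = str(seq).upper()
    if len(seq) == 0:
        return 0.0
    gc = seq.count('G') + seq.count('C')
    return 100.0 * gc / len(seq)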
Example 4
def main():

    args = fetch_args()
    utility.add_tmp_dir(args)
    utility.check_input(args)
    utility.check_database(args)

    print("\n## Reading database info")
    ref_taxonomy = read_ref_taxonomy(args['db'])
    taxon_to_taxonomy = {}
    for taxonomy in set(ref_taxonomy.values()):
        for taxon in taxonomy.split('|'):
            taxon_to_taxonomy[taxon] = taxonomy
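    # rank-specific minimum percent identity required to accept a hit
    # at each rank (k=kingdom, p=phylum, c=class, o=order, f=family, g=genus, s=species)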
    min_pid = {'k': 57, 'p': 77, 'c': 82, 'o': 86, 'f': 87, 'g': 91, 's': 96}
    if args['min_genes'] is not None:
        args['min_genes'] = dict([(r, args['min_genes']) for r in ranks])
    else:
        args['min_genes'] = {
            'k': 237,
            'p': 44,
            'c': 30,
            'o': 24,
            'f': 22,
            'g': 20,
            's': 19
        }

    print("\n## Calling genes with Prodigal")
    utility.run_prodigal(args['fna'], args['tmp_dir'])
    print("   all genes: %s/genes.[ffn|faa]" % args['tmp_dir'])

    print(
        "\n## Performing pairwise alignment of genes against MetaPhlan2 db of clade-specific genes"
    )
    utility.run_lastal(args['db'], args['tmp_dir'], args['threads'])
    print("   alignments: %s/genes.m8" % args['tmp_dir'])

    print("\n## Finding top hits to db")
    genes = {}
    for aln in utility.parse_last(args['tmp_dir'] + '/genes.m8'):

        # clade exclusion
        ref_taxa = ref_taxonomy[aln['tid']].split('|')
        if (args['exclude_clades'] and any(
            [taxon in ref_taxa
             for taxon in args['exclude_clades'].split(',')])):
            continue

        # initialize gene
        if aln['qid'] not in genes:
            genes[aln['qid']] = Gene()
            genes[aln['qid']].id = aln['qid']
            genes[aln['qid']].contig_id = aln['qid'].rsplit('_', 1)[0]

        # get top alignments
        if genes[aln['qid']].aln is None:
            genes[aln['qid']].aln = aln
            genes[aln['qid']].ref_taxa = ref_taxa
        elif float(aln['score']) > float(genes[aln['qid']].aln['score']):
            # keep the best-scoring alignment and its taxonomy in sync
            genes[aln['qid']].aln = aln
            genes[aln['qid']].ref_taxa = ref_taxa
    print("   %s genes with a database hit" % len(genes))

    print("\n## Classifying genes at each taxonomic rank")
    counts = {}
    for gene in genes.values():
        for ref_taxon in gene.ref_taxa:
            rank = ref_taxon.split('__')[0]
            if rank not in counts: counts[rank] = 0
            if rank == 't':
                continue
            elif float(gene.aln['pid']) < min_pid[rank]:
                continue
            elif gene.aln['qcov'] < 0.4:
                continue
            elif gene.aln['tcov'] < 0.4:
                continue
            gene.taxa[rank] = ref_taxon
            counts[rank] += 1
    for rank in ranks:
        # ranks with no classified genes default to 0
        print("   %s: %s classified genes" % (rank_names[rank], counts.get(rank, 0)))

    print("\n## Taxonomically classifying contigs")
    contigs = {}
    for id, seq in utility.parse_fasta(args['fna']):
        contigs[id] = Contig()
        contigs[id].id = id
        contigs[id].length = len(seq)

    # aggregate hits by contig
    for gene in genes.values():
        contigs[gene.contig_id].genes.append(gene)

    # classify contigs at each level
    for contig in contigs.values():
        contig.classify()

    # summarize
    counts = {}
    for contig in contigs.values():
        for rank, taxon in contig.cons_taxa.items():
            if rank not in counts:
                counts[rank] = 0
            if taxon is not None:
                counts[rank] += 1
    print("   total contigs: %s" % len(contigs))
    for rank in ranks:
        print("   %s: %s classified contigs" %
              (rank_names[rank], counts.get(rank, 0)))

    print("\n## Taxonomically classifying genome")
    bin = Bin()
    bin.classify(contigs, args['min_bin_fract'], args['min_contig_fract'],
                 args['min_gene_fract'], args['min_genes'],
                 args['lowest_rank'])
    print("   consensus taxon: %s" % bin.cons_taxon)

    print("\n## Identifying taxonomically discordant contigs")
    if bin.cons_taxon is not None:
        bin.rank_index = taxon_to_taxonomy[bin.cons_taxon].split('|').index(
            bin.cons_taxon)
        bin.taxonomy = taxon_to_taxonomy[bin.cons_taxon].split(
            '|')[0:bin.rank_index + 1]
        flag_contigs(contigs, bin)
    flagged = []
    for contig in contigs.values():
        if contig.flagged:
            flagged.append(contig.id)
    out = '%s/flagged_contigs' % args['tmp_dir']
    print("   flagged contigs: %s" % out)
    with open(out, 'w') as f:
        for contig in flagged:
            f.write(contig + '\n')
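
The Gene and Contig objects used above are simple data containers defined elsewhere in the module. A minimal, hypothetical sketch of just the attributes this main() relies on (the real classes also implement the consensus-voting classify() methods, which are not reproduced here):

# Hypothetical attribute-only sketches of the containers used above
class Gene:
    def __init__(self):
        self.id = None          # gene id, e.g. '<contig>_<n>' from Prodigal
        self.contig_id = None   # parent contig, recovered via rsplit('_', 1)[0]
        self.aln = None         # best-scoring alignment record from the LAST search
        self.ref_taxa = None    # taxonomy of the best hit, split on '|'
        self.taxa = {}          # rank -> taxon assigned after the identity/coverage filters

class Contig:
    def __init__(self):
        self.id = None
        self.length = None
        self.genes = []         # Gene objects aggregated per contig
        self.cons_taxa = {}     # rank -> consensus taxon (filled by classify())
        self.flagged = False    # set when the contig disagrees with the bin consensus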
Example 5
def main():

    args = fetch_args()

    utility.add_tmp_dir(args)
    utility.check_input(args)
    utility.check_dependencies(['mash'])
    if not os.path.exists(args['mash_sketch']):
        sys.exit("\nError: mash sketch '%s' not found\n" % args['mash_sketch'])

    print("\n## Finding conspecific genomes in database")
    run_mash(args['mash_sketch'], args['fna'], args['tmp_dir'],
             args['threads'])
    genomes = find_conspecific(args['tmp_dir'], args['mash_dist'],
                               args['exclude'])
    print("   %s genomes within %s mash-dist" %
          (len(genomes), args['mash_dist']))
    out = '%s/conspecific.list' % args['tmp_dir']
    with open(out, 'w') as f:
        f.write('genome_id\tmash_dist\n')
        for genome_id, mash_dist in genomes:
            f.write(genome_id + '\t' + str(mash_dist) + '\n')
    print("   list of genomes: %s" % (out))
    print("   mash output: %s/mash.dist" % args['tmp_dir'])

    if len(genomes) < args['min_genomes']:
        sys.exit("\nError: insufficient number of conspecific genomes\n")

    if len(genomes) > args['max_genomes']:
        print("\n## Selecting top %s most-similar genomes" %
              args['max_genomes'])
        genomes = genomes[0:args['max_genomes']]
        out = '%s/conspecific_subset.list' % args['tmp_dir']
        with open(out, 'w') as f:
            f.write('genome_id\tmash_dist\n')
            for genome_id, mash_dist in genomes:
                f.write(genome_id + '\t' + str(mash_dist) + '\n')
        print("   list of genomes: %s" % (out))

    print(
        "\n## Performing pairwise alignment of contigs in bin to database genomes"
    )
    alignments = align_contigs(args, genomes)
    num_alns = sum([len(_.split('\n')) for _ in alignments])
    print("   total alignments: %s" % num_alns)

    print("\n## Summarizing alignments")
    contigs = find_contig_targets(args, genomes, alignments)
    out = '%s/contig_hits.tsv' % args['tmp_dir']
    with open(out, 'w') as f:
        f.write('contig_id\tlength\talignment_rate\n')
        for contig, values in contigs.items():
            row = [
                contig,
                str(values['len']),
                '%s/%s' % (values['hits'], len(genomes))
            ]
            f.write('\t'.join(row) + '\n')
    print("   contig features: %s" % out)

    print("\n## Identifying contigs with no conspecific alignments")
    flagged_contigs = flag_contigs(args, contigs)
    flagged_length = round(
        sum([contigs[id]['len'] for id in flagged_contigs]) / 1000.0, 2)
    print("   %s flagged contigs, %s Kbp" %
          (len(flagged_contigs), flagged_length))
    out = '%s/flagged_contigs' % args['tmp_dir']
    with open(out, 'w') as f:
        for contig in flagged_contigs:
            f.write(contig + '\n')
    print("   flagged contigs: %s" % out)
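
run_mash and find_conspecific are thin wrappers around Mash. Rough sketches are shown below, under the assumption that the wrapper writes the standard five-column `mash dist` table (reference-ID, query-ID, distance, p-value, shared-hashes) to <tmp_dir>/mash.dist; the project's own helpers may filter or format differently.

# Hypothetical sketches of the Mash helpers used above
import subprocess

def run_mash_sketch(sketch, fna, tmp_dir, threads):
    # mash dist prints one row per reference genome in the sketch
    with open('%s/mash.dist' % tmp_dir, 'w') as out:
        subprocess.run(
            ['mash', 'dist', '-p', str(threads), sketch, fna],
            stdout=out, check=True)

def find_conspecific_sketch(tmp_dir, max_dist, exclude):
    # keep genomes within the distance cutoff, sorted from most to least similar
    genomes = []
    with open('%s/mash.dist' % tmp_dir) as handle:
        for line in handle:
            ref, query, dist, pvalue, hashes = line.rstrip('\n').split('\t')
            if float(dist) <= max_dist and (not exclude or ref not in exclude):
                genomes.append((ref, float(dist)))
    return sorted(genomes, key=lambda x: x[1])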
Example 6
def main():

    args = fetch_args()
    utility.add_tmp_dir(args)
    utility.check_input(args)
    utility.check_dependencies(['blastn'])
    utility.check_database(args)

    print("\n## Counting tetranucleotides")
    # init data
    kmer_counts = init_kmers()
    contigs = {}
    for rec in Bio.SeqIO.parse(args['fna'], 'fasta'):
        contig = Contig()
        contig.id = rec.id
        contig.seq = str(rec.seq)
        contig.kmers = kmer_counts.copy()
        contigs[rec.id] = contig

    # count kmers with a sliding window; each 4-mer is counted under its
    # canonical (forward or reverse-complement) form
    for contig in contigs.values():
        start, stop, step = 0, 4, 1
        while stop <= len(contig.seq):
            kmer_fwd = contig.seq[start:stop]
            kmer_rev = str(Bio.Seq.Seq(kmer_fwd).reverse_complement())
            if kmer_fwd in kmer_counts:
                contig.kmers[kmer_fwd] += 1
            elif kmer_rev in kmer_counts:
                contig.kmers[kmer_rev] += 1
            start += step
            stop += step

    print("\n## Normalizing counts")
    for contig in contigs.values():
        total = float(sum(contig.kmers.values()))
        for kmer, count in contig.kmers.items():
            if total > 0:
                contig.kmers[kmer] = 100 * count / total
            else:
                contig.kmers[kmer] = 0.00

    print("\n## Performing PCA")
    df = pd.DataFrame(dict([(c.id, c.kmers) for c in contigs.values()]))
    pca = PCA(n_components=1)
    pca.fit(df)
    pc1 = pca.components_[0]

    print("\n## Computing per-contig deviation from the mean along the first principal component")
    mean_pc = np.mean(pc1)
    std_pc = np.std(pc1)
    for contig_id, contig_pc in zip(list(df.columns), pc1):
        contigs[contig_id].pc = contig_pc
        contigs[contig_id].values = {}
        contigs[contig_id].values['zscore'] = abs(contig_pc - mean_pc) / std_pc if std_pc > 0 else 0.0
        contigs[contig_id].values['delta'] = abs(contig_pc - mean_pc)
        # use the magnitude of the mean and guard against division by zero
        contigs[contig_id].values['percent'] = 100 * abs(contig_pc - mean_pc) / abs(mean_pc) if mean_pc != 0 else 0.0

    print("\n## Identifying outlier contigs")
    flagged = []
    for contig in contigs.values():
        if contig.values['delta'] > args['cutoff']:
            flagged.append(contig.id)
    out = '%s/flagged_contigs' % args['tmp_dir']
    print("   flagged contigs: %s" % out)
    with open(out, 'w') as f:
        for contig in flagged:
            f.write(contig + '\n')
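
init_kmers is assumed to return a dict with one zero-initialized entry per canonical tetranucleotide (a 4-mer and its reverse complement share a single key), which is why the counting loop above checks the forward k-mer first and falls back to its reverse complement. A minimal sketch under that assumption:

# Hypothetical sketch of init_kmers: one entry per canonical 4-mer
import itertools

def init_kmers_sketch():
    complement = str.maketrans('ACGT', 'TGCA')
    counts = {}
    for bases in itertools.product('ACGT', repeat=4):
        kmer = ''.join(bases)
        kmer_rev = kmer.translate(complement)[::-1]
        # keep whichever member of the pair was seen first as the canonical key
        if kmer_rev not in counts:
            counts[kmer] = 0
    return counts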