Example #1
0
def compress(outdir):
    """Re-write every non-.gz file of the per-species directories as <name>.gz.

    Walks ``<outdir>/pan_genomes/<species>/`` and
    ``<outdir>/rep_genomes/<species>/``. Each file whose name does not end in
    ``.gz`` is copied line by line into ``<name>.gz`` through
    ``utility.iopen`` (presumably compressing based on the suffix -- confirm
    against utility.iopen) and the original file is then removed.

    Args:
        outdir: Root directory holding the ``pan_genomes`` and
            ``rep_genomes`` sub-directories.
    """
    for module in ['pan_genomes', 'rep_genomes']:
        module_dir = '%s/%s' % (outdir, module)
        for species in os.listdir(module_dir):
            indir = '%s/%s' % (module_dir, species)
            for fname in os.listdir(indir):
                if fname.endswith('.gz'):
                    continue  # already has the target suffix
                inpath = '%s/%s' % (indir, fname)
                # Context managers close both handles, also when the copy
                # fails; previously the input handle was never closed.
                with utility.iopen(inpath) as infile, \
                        utility.iopen(inpath + '.gz', 'w') as outfile:
                    for line in infile:
                        outfile.write(line)
                # Delete the source only once the copy is fully written.
                os.remove(inpath)
Example #2
0
def build_pangenome_db(args, species):
	""" Merge species centroid genes into one FASTA plus a gene-to-species map, then run bowtie2-build on it """
	import Bio.SeqIO
	# merged FASTA and map live in the genes/temp directory
	temp_dir = '/'.join([args['outdir'], 'genes/temp'])
	fasta_path = '/'.join([temp_dir, 'pangenomes.fa'])
	fasta_out = open(fasta_path, 'w')
	map_out = open('/'.join([temp_dir, 'pangenomes.map']), 'w')
	n_species = 0
	n_seqs = 0
	n_bases = 0
	for sp in species.values():
		n_species += 1
		handle = utility.iopen(sp.paths['centroids.ffn'])
		for rec in Bio.SeqIO.parse(handle, 'fasta'):
			# sequences are upper-cased on the way out
			fasta_out.write('>%s\n%s\n' % (rec.id, str(rec.seq).upper()))
			map_out.write('%s\t%s\n' % (rec.id, sp.id))
			n_bases += len(rec.seq)
			n_seqs += 1
		handle.close()
	fasta_out.close()
	map_out.close()
	# report the size of the merged database
	print("  total species: %s" % n_species)
	print("  total genes: %s" % n_seqs)
	print("  total base-pairs: %s" % n_bases)
	# index the FASTA; the command line is logged, then run through the shell
	index_prefix = '/'.join([temp_dir, 'pangenomes'])
	command = ' '.join([args['bowtie2-build'], fasta_path, index_prefix])
	args['log'].write('command: '+command+'\n')
	process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	utility.check_exit_code(process, command)
Example #3
0
File: snps.py Project: zhaoc1/MIDAS
def species_pileup(args, species_id, contigs):
    """Write per-position base counts for one species and tally coverage.

    For every contig of ``species_id`` (visited in sorted contig-id order)
    the four per-base count arrays returned by ``count_coverage`` are written,
    one row per position, to ``<outdir>/snps/output/<species_id>.snps.gz``.

    Returns:
        The tuple ``(species_id, aln_stats)``.
    """
    # keep_read, the pysam read callback, takes its settings from this global
    global global_args
    global_args = args

    # Coverage tallies are kept in a module-level dict. 'aligned_reads' and
    # 'mapped_reads' are not modified in this function (presumably updated by
    # keep_read -- confirm).
    global aln_stats
    aln_stats = dict.fromkeys([
        'genome_length', 'total_depth', 'covered_bases', 'aligned_reads',
        'mapped_reads'
    ], 0)

    # output table and its header row
    snps_path = '%s/snps/output/%s.snps.gz' % (args['outdir'], species_id)
    snps_out = utility.iopen(snps_path, 'w')
    columns = [
        'ref_id', 'ref_pos', 'ref_allele', 'depth', 'count_a', 'count_c',
        'count_g', 'count_t'
    ]
    snps_out.write('\t'.join(columns) + '\n')

    bam_path = '%s/snps/temp/genomes.bam' % args['outdir']
    with pysam.AlignmentFile(bam_path, 'rb') as bam:
        for contig_id in sorted(contigs):
            contig = contigs[contig_id]

            # only contigs that belong to the requested species
            if contig.species_id != species_id:
                continue

            counts = bam.count_coverage(contig.id,
                                        start=0,
                                        end=contig.length,
                                        quality_threshold=args['baseq'],
                                        read_callback=keep_read)

            for idx in range(contig.length):
                ref_allele = contig.seq[idx]
                # counts[0..3] are written to the count_a/c/g/t columns
                count_a, count_c, count_g, count_t = [
                    counts[base][idx] for base in range(4)
                ]
                depth = count_a + count_c + count_g + count_t
                fields = (contig.id, idx + 1, ref_allele, depth, count_a,
                          count_c, count_g, count_t)
                snps_out.write('\t'.join(map(str, fields)) + '\n')
                aln_stats['genome_length'] += 1
                aln_stats['total_depth'] += depth
                if depth > 0:
                    aln_stats['covered_bases'] += 1

    snps_out.close()
    return (species_id, aln_stats)
Example #4
0
File: snps.py Project: palc/MIDAS
def format_vcf(args):
    """ Format vcf files to snp files and fill in missing positions

    For every species found in ``snps/temp/genomes.map`` one
    ``snps/output/<species_id>.snps.gz`` file is written. Reference positions
    that have no record in the species' vcf are written as "missing" records,
    so the output has one row per entry returned by ``read_ref_bases``.
    """
    inpath = os.path.join(args['outdir'], 'snps/temp/genomes.map')
    ref_to_species = utility.read_ref_to_cluster(inpath)
    for species_id in set(ref_to_species.values()):
        outpath = '/'.join(
            [args['outdir'],
             'snps/output/%s.snps.gz' % species_id])
        vcf_path = '/'.join(
            [args['outdir'],
             'snps/temp/vcf/%s.vcf' % species_id])
        # The context manager closes (and thereby flushes) the output file;
        # previously the handle was never closed.
        with utility.iopen(outpath, 'w') as outfile:
            write_snp_record(outfile, header=True)
            # sorted reference positions; ref_index is the next one to emit
            ref = read_ref_bases(args, species_id)
            ref_index = 0
            ref_length = len(ref)
            for snp in parse_vcf(vcf_path):
                # emit reference positions that precede this vcf record
                while [snp['ref_id'], snp['ref_pos']] != ref[ref_index][0:2]:
                    write_snp_record(outfile, None, ref[ref_index])
                    ref_index += 1
                write_snp_record(outfile, snp, None)  # write present record
                ref_index += 1
            # emit reference positions after the last vcf record
            while ref_index < ref_length:
                write_snp_record(outfile, None, ref[ref_index])
                ref_index += 1
Example #5
0
def write_species_info(args, species):
    """Write ``<outdir>/species_info.txt``: one tab-delimited row per species.

    Columns are ``species_id``, ``rep_genome`` and ``count_genomes``, taken
    from the ``id``, ``rep_genome`` and ``ngenomes`` attributes of each item
    of ``species``.
    """
    header = ['species_id', 'rep_genome', 'count_genomes']
    # The context manager closes the file; previously it was never closed.
    with utility.iopen('%s/species_info.txt' % args['outdir'], 'w') as outfile:
        outfile.write('\t'.join(header) + '\n')
        for sp in species:
            values = [str(_) for _ in [sp.id, sp.rep_genome, sp.ngenomes]]
            outfile.write('\t'.join(values) + '\n')
Example #6
0
def main():
    """ Run main pipeline

    Reads every file in args['input'] with readfq and writes each read to
    stdout as '>name_length' followed by the sequence. When
    args['read_length'] is set, shorter reads are dropped and longer reads
    are trimmed to that length. Stops once args['max_reads'] reads have been
    written. The number of reads and base-pairs written is reported on stderr
    as 'reads<TAB>bp'.
    """
    args = parse_args()
    reads = 0
    bp = 0
    limit_reached = False
    for inpath in args['input']:
        # the context manager closes each input; previously none were closed
        with utility.iopen(inpath) as infile:
            for name, seq, qual in readfq(infile):
                if args['read_length']:  # trim/filter reads
                    if len(seq) < args['read_length']:
                        continue
                    seq = seq[0:args['read_length']]
                seq_len = len(seq)
                sys.stdout.write('>%s_%s\n%s\n' % (name, seq_len, seq))
                reads += 1
                bp += seq_len
                if reads == args['max_reads']:
                    limit_reached = True
                    break
        if limit_reached:
            break
    # single exit point: write number of reads, bp to stderr
    sys.stderr.write('%s\t%s' % (reads, bp))
Example #7
0
def write_results(args, species, genes):
	""" Write one gene table per species plus a per-species summary table """
	# one '<id>.genes.gz' table per species; the handle is kept on the species object
	gene_columns = ['gene_id', 'count_reads', 'coverage', 'copy_number']
	for sp in species.values():
		sp.out = utility.iopen('/'.join([args['outdir'], 'genes/output/%s.genes.gz' % sp.id]), 'w')
		sp.out.write('\t'.join(gene_columns)+'\n')
	# genes go, in sorted gene-id order, to the table of their species
	for gene_id in sorted(genes):
		gene = genes[gene_id]
		target = species[gene.species_id]
		fields = (gene.id, gene.reads, gene.depth, gene.copies)
		target.out.write('\t'.join(map(str, fields))+'\n')
	# all gene tables are complete
	for sp in species.values():
		sp.out.close()
	# summary table: one row per species
	summary_columns = ['species_id', 'pangenome_size', 'covered_genes', 'fraction_covered', 'mean_coverage', 'marker_coverage', 'count_reads']
	summary = open('/'.join([args['outdir'], 'genes/summary.txt']), 'w')
	summary.write('\t'.join(summary_columns)+'\n')
	for sp in species.values():
		fields = (sp.id, sp.pangenome_size, sp.covered_genes, sp.fraction_covered, sp.mean_coverage, sp.marker_coverage, sp.reads)
		summary.write('\t'.join(map(str, fields))+'\n')
	summary.close()
Example #8
0
File: genes.py Project: palc/MIDAS
def build_pangenome_db(args, genome_clusters):
	""" Build FASTA and BT2 database from pangene cluster centroids

	Genes of every species in genome_clusters are copied from
	<db>/genome_clusters/<species_id>/pangenome.fa.gz into genes/temp/pangenomes.fa,
	and each gene id is mapped to its species in genes/temp/pangenome.map.
	Genes whose genome id (the first two '.'-separated parts of the gene id)
	is in args['tax_mask'] are left out. bowtie2-build is then run on the FASTA.
	"""
	import Bio.SeqIO
	# fasta database
	outdir = '/'.join([args['outdir'], 'genes/temp'])
	fasta_path = '/'.join([outdir, 'pangenomes.fa'])
	map_path = '/'.join([outdir, 'pangenome.map'])
	db_stats = {'total_length':0, 'total_seqs':0, 'genome_clusters':0}
	# context managers close all handles, also on error; the per-species
	# input handle was previously never closed
	with open(fasta_path, 'w') as pangenome_fasta, open(map_path, 'w') as pangenome_map:
		for species_id in genome_clusters:
			db_stats['genome_clusters'] += 1
			inpath = '/'.join([args['db'], 'genome_clusters', species_id, 'pangenome.fa.gz'])
			with utility.iopen(inpath) as infile:
				for r in Bio.SeqIO.parse(infile, 'fasta'):
					genome_id = '.'.join(r.id.split('.')[0:2])
					if args['tax_mask'] and genome_id in args['tax_mask']:
						continue  # gene belongs to a masked genome
					pangenome_fasta.write('>%s\n%s\n' % (r.id, str(r.seq)))
					pangenome_map.write('%s\t%s\n' % (r.id, species_id))
					db_stats['total_length'] += len(r.seq)
					db_stats['total_seqs'] += 1
	# print out database stats
	print("  total species: %s" % db_stats['genome_clusters'])
	print("  total genes: %s" % db_stats['total_seqs'])
	print("  total base-pairs: %s" % db_stats['total_length'])
	# bowtie2 database; the command is logged, then run through the shell
	outpath = '/'.join([outdir, 'pangenomes'])
	command = ' '.join([args['bowtie2-build'], fasta_path, outpath])
	args['log'].write('command: '+command+'\n')
	process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	utility.check_exit_code(process, command)
Example #9
0
File: genes.py Project: palc/MIDAS
def compute_pangenome_coverage(args):
	""" Compute coverage of pangenome genes and write one table per species to disk

	Reads the gene-to-species map from genes/temp/pangenome.map, obtains
	per-gene coverage from count_mapped_bp and a per-species normalization
	factor from compute_phyeco_cov, then writes gene_id, coverage and
	copy_number (coverage divided by the factor, or 0.0 when the factor is
	not positive) to genes/output/<species_id>.genes.gz.
	"""
	# map ref_id to species_id
	ref_to_cluster = {}
	with open('/'.join([args['outdir'], 'genes/temp/pangenome.map'])) as mapfile:
		for line in mapfile:
			ref_id, species_id = line.rstrip().split()
			ref_to_cluster[ref_id] = species_id
	genome_clusters = set(ref_to_cluster.values())
	outfiles = {}
	try:
		# open outfiles for each species_id
		for species_id in genome_clusters:
			outpath = '/'.join([args['outdir'], 'genes/output/%s.genes.gz' % species_id])
			outfiles[species_id] = utility.iopen(outpath, 'w')
			outfiles[species_id].write('\t'.join(['gene_id', 'coverage', 'copy_number'])+'\n')
		# parse bam into coverage per reference gene
		ref_to_cov = count_mapped_bp(args)
		# compute normalization factor
		cluster_to_norm = compute_phyeco_cov(args, genome_clusters, ref_to_cov, ref_to_cluster)
		# write to output files
		for ref_id in sorted(ref_to_cov):
			cov = ref_to_cov[ref_id]
			species_id = ref_to_cluster[ref_id]
			norm = cluster_to_norm[species_id]
			normcov = cov/norm if norm > 0 else 0.0
			outfiles[species_id].write('\t'.join([str(x) for x in [ref_id, cov, normcov]])+'\n')
	finally:
		# previously the output files were never closed, so buffered rows
		# were not guaranteed to reach disk
		for outfile in outfiles.values():
			outfile.close()
Example #10
0
    def write_readme(self):
        """ Concatenate all genes from pangenome into sequence file """
        file = utility.iopen('%s/readme.txt' % self.dir, 'w')
        file.write("""
Description and statistics for pan-genome files

Summary Statistics
############

Genomes: %(genomes)s
Genes: %(genes)s
Gene clusters (95%% identity): %(centroids_95)s
Gene clusters (90%% identity): %(centroids_90)s
Gene clusters (85%% identity): %(centroids_85)s
Gene clusters (80%% identity): %(centroids_80)s
Gene clusters (75%% identity): %(centroids_75)s

Output files
############
genes.ffn
  all genes from specified genomes

centroids.ffn
  gene sequences from 95%% identity gene clusters
  used for recruiting metagenomic reads

gene_info.txt
  information for all genes from genes.ffn
  the fields centroid_{95,90,95,80,75} indicate mappings between gene_id and gene clusters
""" % self.stats)
        file.close()
Example #11
0
def build_genome_db(args, species):
    """ Merge the genome FASTA of every species into one file and run bowtie2-build on it """
    import Bio.SeqIO
    fasta_path = '/'.join([args['outdir'], 'snps/temp/genomes.fa'])
    index_prefix = '/'.join([args['outdir'], 'snps/temp/genomes'])
    # copy all contigs into the merged FASTA, upper-casing the sequences
    n_genomes = 0
    n_contigs = 0
    n_bases = 0
    fasta_out = open(fasta_path, 'w')
    for sp in species.values():
        n_genomes += 1
        handle = utility.iopen(sp.paths['fna'])
        for rec in Bio.SeqIO.parse(handle, 'fasta'):
            fasta_out.write('>%s\n%s\n' % (rec.id, str(rec.seq).upper()))
            n_bases += len(rec.seq)
            n_contigs += 1
        handle.close()
    fasta_out.close()
    # report the size of the merged database
    print("  total genomes: %s" % n_genomes)
    print("  total contigs: %s" % n_contigs)
    print("  total base-pairs: %s" % n_bases)
    # index the FASTA; the command line is logged, then run through the shell
    command = ' '.join([args['bowtie2-build'], fasta_path, index_prefix])
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command,
                               shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
Example #12
0
def parse_mapping_file(args):
    """Yield one dict per data row of the tab-delimited genome mapping file.

    The first line of ``args['mapfile']`` names the columns. ``genome_id``
    and ``species_id`` are required, ``rep_genome`` is optional and no other
    column is accepted. Blank lines are skipped.

    Yields:
        dict mapping column name to the (string) value of that row.

    Exits (``sys.exit``) with an error message when a required column is
    missing, an unknown column is present, a row has fewer values than there
    are columns, or ``rep_genome`` is neither '0' nor '1'.
    """
    required = ['genome_id', 'species_id']
    allowed = ['genome_id', 'species_id', 'rep_genome']
    # The context manager closes the file when the generator finishes or is
    # discarded; previously the handle was never closed.
    with utility.iopen(args['mapfile']) as infile:
        # The '' default keeps an empty file from raising StopIteration
        # inside this generator (a RuntimeError under PEP 479); it is
        # rejected by the required-column check below instead.
        fields = next(infile, '').rstrip('\n').split('\t')
        for field in required:
            if field not in fields:
                sys.exit("Error: mapping file '%s' has no field labeled '%s'" %
                         (args['mapfile'], field))
        for field in fields:
            if field not in allowed:
                sys.exit(
                    "Error: mapping file '%s' has unknown field labeled '%s'" %
                    (args['mapfile'], field))
        for line in infile:
            if len(line.rstrip()) == 0:
                continue
            values = line.rstrip('\n').split('\t')
            if len(values) < len(fields):
                sys.exit(
                    "Error: mapping file '%s' has different number of fields per row"
                    % args['mapfile'])
            record = dict(zip(fields, values))
            if 'rep_genome' in fields and record['rep_genome'] not in ['0', '1']:
                sys.exit(
                    "Error: mapping file '%s' has unknown value '%s' for field 'rep_genome'"
                    % (args['mapfile'], record['rep_genome']))
            yield record
Example #13
0
 def parse_fasta(self, p_in):
     """ Build a dictionary from sequence id to upper-cased sequence string for the FASTA file p_in """
     handle = utility.iopen(p_in)
     # a repeated id keeps the sequence of its last record
     sequences = {
         rec.id: str(rec.seq).upper()
         for rec in Bio.SeqIO.parse(handle, "fasta")
     }
     handle.close()
     return sequences
Example #14
0
File: snps.py Project: palc/MIDAS
def fetch_centroid(args, species_id):
    """ Get the genome_id corresponding to cluster centroid

    Scans <db>/genome_clusters/<species_id>/genomes.txt.gz and returns the
    second whitespace-separated field of the first line whose third field is
    'Y'. Returns None when no line is flagged.
    """
    inpath = '/'.join(
        [args['db'], 'genome_clusters', species_id, 'genomes.txt.gz'])
    # the context manager closes the file, including on the early return
    with utility.iopen(inpath) as infile:
        for line in infile:
            fields = line.split()
            if fields[2] == 'Y':
                return fields[1]
    return None
Example #15
0
def write_genome_info(args, species):
    """Write ``<outdir>/genome_info.txt``: one tab-delimited row per genome.

    Columns are ``genome_id``, ``species_id`` and ``rep_genome``; the last is
    '1' for the genome equal to the species' ``rep_genome`` attribute and '0'
    for every other genome in ``sp.genomes``.
    """
    header = ['genome_id', 'species_id', 'rep_genome']
    # The context manager closes the file; previously it was never closed.
    with utility.iopen('%s/genome_info.txt' % args['outdir'], 'w') as outfile:
        outfile.write('\t'.join(header) + '\n')
        for sp in species:
            for genome_id in sp.genomes:
                rep_genome = '1' if genome_id == sp.rep_genome else '0'
                values = [genome_id, sp.id, rep_genome]
                outfile.write('\t'.join(values) + '\n')
Example #16
0
def read_genome(db, species_id):
    """ Load '<db>/rep_genomes/<species_id>/genome.fna.gz' as a dict of record id to upper-cased sequence """
    handle = utility.iopen('%s/rep_genomes/%s/genome.fna.gz' % (db, species_id))
    # values are the upper-cased sequence objects, not plain strings
    sequences = {
        rec.id: rec.seq.upper()
        for rec in Bio.SeqIO.parse(handle, 'fasta')
    }
    handle.close()
    return sequences
Example #17
0
def read_run_midas_snps(species_id, samples):
    """ Return one open SNP file handle per sample, each advanced past its first line """

    def _open_past_first_line(sample):
        handle = utility.iopen('%s/snps/output/%s.snps.gz' %
                               (sample.dir, species_id))
        next(handle)  # discard the first line (presumably the column header)
        return handle

    # handles are returned open, in the order of samples
    return [_open_past_first_line(sample) for sample in samples]
Example #18
0
 def write_genes(self, resume):
     """ Write every gene in self.genes to '<self.dir>/genes.ffn' as FASTA

     Nothing is written when resume is true and a non-empty genes.ffn
     already exists.
     """
     ffn_path = '%s/genes.ffn' % self.dir
     already_written = (os.path.exists(ffn_path)
                        and os.stat(ffn_path).st_size > 0)
     if already_written and resume:
         return
     ffn_out = utility.iopen(ffn_path, 'w')
     for gene in self.genes.values():
         ffn_out.write('>%s\n%s\n' % (gene.id, gene.seq))
     ffn_out.close()
Example #19
0
def parse_tsv(inpath):
	""" Yield (row id, list of values) for each data row of a tab-delimited file with row and column names

	The first line (column names) is skipped. For every following line the
	first tab-separated field is the row id and the remaining fields are the
	values. An empty file yields nothing.
	"""
	# the context manager closes the file even when the generator is not exhausted
	with utility.iopen(inpath) as infile:
		# The default keeps an empty file from raising StopIteration inside
		# this generator (a RuntimeError under PEP 479).
		if next(infile, None) is None:
			return
		for line in infile:
			split_line = line.rstrip('\n').split('\t')
			yield split_line[0], split_line[1:]
Example #20
0
File: snps.py Project: palc/MIDAS
def read_ref_bases(args, species_id):
    """ Read in reference genome by position

    Returns a sorted list with one [record id, 1-based position, upper-cased
    base] entry for every base of every record in
    <db>/genome_clusters/<species_id>/genome.fna.gz.
    """
    import Bio.SeqIO
    ref = []
    centroid_path = '/'.join(
        [args['db'], 'genome_clusters', species_id, 'genome.fna.gz'])
    # The context manager closes the file; previously it was never closed.
    with utility.iopen(centroid_path) as infile:
        for rec in Bio.SeqIO.parse(infile, 'fasta'):
            # convert once, then walk the bases with their 1-based position
            for pos, base in enumerate(str(rec.seq), 1):
                ref.append([rec.id, pos, base.upper()])
    return sorted(ref)
Example #21
0
def initialize_contigs(species):
    """ Create a Contig for every FASTA record of every species, keyed by contig id """
    by_id = {}
    for sp in species.values():
        handle = utility.iopen(sp.paths['fna'])
        for record in Bio.SeqIO.parse(handle, 'fasta'):
            entry = Contig(record.id)
            # upper-cased sequence string; length is derived from it
            entry.seq = str(record.seq).upper()
            entry.length = len(entry.seq)
            entry.species_id = sp.id
            by_id[entry.id] = entry
        handle.close()
    return by_id
Example #22
0
def read_gene_map(species_id, args):
    """ Map 99% centroids to gene_ids at lower level

    Reads <db>/genome_clusters/<species_id>/pangenome.map.gz, whose first
    line names the columns, and returns a dict from each row's '99' column to
    its args['cluster_pid'] column.
    """
    gene_to_family = {}
    inpath = '%s/genome_clusters/%s/pangenome.map.gz' % (args['db'],
                                                         species_id)
    # The context manager closes the file; previously it was never closed.
    with utility.iopen(inpath) as infile:
        fields = next(infile).rstrip().split()
        for line in infile:
            # named 'row' so the builtin map() is not shadowed
            row = dict(zip(fields, line.rstrip().split()))
            gene_to_family[row['99']] = row[args['cluster_pid']]
    return gene_to_family
Example #23
0
def initialize_genes(args, species):
	""" Create a Gene for every centroid sequence and attach marker ids from phyeco.map """
	by_id = {}
	# one Gene per FASTA record, with its species and length
	for sp in species.values():
		handle = utility.iopen(sp.paths['centroids.ffn'])
		for rec in Bio.SeqIO.parse(handle, 'fasta'):
			gene = Gene(rec.id)
			gene.species_id = sp.id
			gene.length = len(rec.seq)
			by_id[rec.id] = gene
			# the species counts its own centroid genes
			sp.pangenome_size += 1
		handle.close()
	# rows of the marker map that name a known gene set its marker_id
	handle = utility.iopen('%s/marker_genes/phyeco.map' % args['db'])
	for row in csv.DictReader(handle, delimiter='\t'):
		gene = by_id.get(row['gene_id'])
		if gene is not None:
			gene.marker_id = row['marker_id']
	handle.close()
	return by_id
Example #24
0
def read_function_map(ref_db, species_id, ontology):
    """ Map gene ids to functions for given ontology

    Reads <ref_db>/genome_clusters/<species_id>/pangenome.functions.gz, whose
    lines hold three whitespace-separated fields (gene id, function id,
    ontology), and returns a dict from gene id to the list of function ids
    on lines whose ontology equals the requested one.
    """
    gene_to_functions = {}
    inpath = '%s/genome_clusters/%s/pangenome.functions.gz' % (ref_db,
                                                               species_id)
    # The context manager closes the file; previously it was never closed.
    with utility.iopen(inpath) as infile:
        for line in infile:
            gene_id, function_id, ont = line.rstrip().split()
            if ont == ontology:
                gene_to_functions.setdefault(gene_id, []).append(function_id)
    return gene_to_functions
Example #25
0
 def parse_uclust(self, inpath):
     """ Yield one dictionary per line of a UCLUST output file, keyed by column name """
     # 'type' column: 'S' = centroid, 'H' = non-centroid, 'C' = cluster
     columns = (
         'type', 'cluster_id', 'size', 'pid', 'strand', 'skip1', 'skip2',
         'skip3', 'gene_id', 'centroid_id'
     )
     with utility.iopen(inpath) as handle:
         for raw in handle:
             # zip pairs names with values and stops at the shorter of the two
             yield dict(zip(columns, raw.rstrip('\n').split('\t')))
Example #26
0
 def write_gene_info(self):
     """ Write '<self.dir>/gene_info.txt': one tab-delimited row per gene, in sorted gene-id order """
     columns = [
         'gene_id', 'genome_id', 'gene_length', 'centroid_99',
         'centroid_95', 'centroid_90', 'centroid_85', 'centroid_80',
         'centroid_75'
     ]
     info_out = utility.iopen('%s/gene_info.txt' % self.dir, 'w')
     info_out.write('\t'.join(columns) + '\n')
     for gene_id in sorted(self.genes):
         gene = self.genes[gene_id]
         # same order as the header columns
         fields = (
             gene.id, gene.genome_id, gene.length, gene.centroid_99,
             gene.centroid_95, gene.centroid_90, gene.centroid_85,
             gene.centroid_80, gene.centroid_75
         )
         info_out.write('\t'.join(map(str, fields)) + '\n')
     info_out.close()
Example #27
0
 def parse_hmmsearch(self, p_in):
     """ Parse HMMER domtblout files. Yield one data-type formatted dictionary per line

     Lines starting with '#' and blank lines are skipped. Each yielded
     dictionary has the keys query, target, evalue, qcov, tcov, qlen and
     tlen, read from whitespace-separated columns 0, 3, 12, 19-20 (with
     column 2 as length), 15-16 (with column 5 as length), 2 and 5.
     """
     # The context manager closes the file when the generator finishes or
     # is discarded; previously the handle was never closed.
     with utility.iopen(p_in) as f_in:
         for line in f_in:
             if line.startswith('#'):
                 continue
             x = line.rstrip().split()
             if not x:
                 continue  # blank line: nothing to parse
             qlen = int(x[2])
             tlen = int(x[5])
             yield {
                 'query': x[0],
                 'target': x[3],
                 'evalue': float(x[12]),
                 # covered fraction: inclusive end - start + 1, over length
                 'qcov': (int(x[20]) - int(x[19]) + 1) / float(qlen),
                 'tcov': (int(x[16]) - int(x[15]) + 1) / float(tlen),
                 'qlen': qlen,
                 'tlen': tlen
             }
Example #28
0
File: snps.py Project: palc/MIDAS
def build_genome_db(args, genome_clusters):
    """ Build FASTA and BT2 database from genome cluster centroids

    The genome.fna.gz of every species in genome_clusters is appended to
    snps/temp/genomes.fa and each sequence id is mapped to its species in
    snps/temp/genomes.map. A species is skipped when args['tax_mask'] is set
    and contains its centroid genome (see fetch_centroid). bowtie2-build is
    then run on the merged FASTA.
    """
    # fasta database
    fasta_path = '/'.join([args['outdir'], 'snps/temp/genomes.fa'])
    map_path = '/'.join([args['outdir'], 'snps/temp/genomes.map'])
    db_stats = {'total_length': 0, 'total_seqs': 0, 'genome_clusters': 0}
    # Both outputs must be closed (flushed) before bowtie2-build reads the
    # FASTA; previously neither they nor the inputs were ever closed.
    with open(fasta_path, 'w') as genomes_fasta, \
            open(map_path, 'w') as genomes_map:
        for species_id in genome_clusters:
            if args['tax_mask'] and fetch_centroid(
                    args, species_id) in args['tax_mask']:
                continue
            db_stats['genome_clusters'] += 1
            inpath = '/'.join(
                [args['db'], 'genome_clusters', species_id, 'genome.fna.gz'])
            with utility.iopen(inpath) as infile:
                for line in infile:
                    genomes_fasta.write(line)
                    if line.startswith('>'):
                        sid = line.rstrip().lstrip('>').split()[0]
                        genomes_map.write(sid + '\t' + species_id + '\n')
                        db_stats['total_seqs'] += 1
                    else:
                        # only sequence lines count towards base-pairs;
                        # header lines used to be counted as well
                        db_stats['total_length'] += len(line.rstrip())
    # print out database stats
    print("  total genomes: %s" % db_stats['genome_clusters'])
    print("  total contigs: %s" % db_stats['total_seqs'])
    print("  total base-pairs: %s" % db_stats['total_length'])
    # bowtie2 database; the command is logged, then run through the shell
    outpath = '/'.join([args['outdir'], 'snps/temp/genomes'])
    command = ' '.join([args['bowtie2-build'], fasta_path, outpath])
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command,
                               shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
Example #29
0
File: genes.py Project: palc/MIDAS
def compute_phyeco_cov(args, genome_clusters, ref_to_cov, ref_to_cluster):
	""" Compute the median coverage of PhyEco marker genes for each species

	Marker ids are the first column of <db>/marker_genes/pid_cutoffs.txt.
	For every species in genome_clusters, the coverage (from ref_to_cov) of
	its genes is summed per marker id, using the gene-to-marker map in
	<db>/genome_clusters/<species_id>/pangenome.marker_genes.gz. Markers with
	no covered gene count as 0.0.

	Returns a dict from species_id to the median of its per-marker sums.
	Exits (sys.exit) when pid_cutoffs.txt does not exist.
	"""
	from numpy import median
	# read in set of phyeco markers for normalization
	phyeco_ids = set()
	inpath = '/'.join([args['db'], 'marker_genes/pid_cutoffs.txt'])
	if not os.path.isfile(inpath): sys.exit("File not found: %s" % inpath)
	with open(inpath) as infile:
		for line in infile:
			phyeco_id, pid = line.rstrip().split('\t')
			phyeco_ids.add(phyeco_id)
	# read in map of gene to phyeco marker
	ref_to_phyeco = {}
	for species_id in genome_clusters:
		inpath = '/'.join([args['db'], 'genome_clusters', species_id, 'pangenome.marker_genes.gz'])
		with utility.iopen(inpath) as infile:
			next(infile)  # skip the first line
			for line in infile:
				gene_id, phyeco_id = line.rstrip().split()
				ref_to_phyeco[gene_id] = phyeco_id
	# init phyeco coverage
	cluster_to_phyeco_to_cov = {}
	for species_id in genome_clusters:
		cluster_to_phyeco_to_cov[species_id] = dict.fromkeys(phyeco_ids, 0.0)
	# compute phyeco coverages
	for ref_id, phyeco_id in ref_to_phyeco.items():
		if phyeco_id in phyeco_ids and ref_id in ref_to_cov:
			# Look the species up only for genes that have coverage: marker
			# genes missing from ref_to_cluster used to raise KeyError here.
			species_id = ref_to_cluster[ref_id]
			cluster_to_phyeco_to_cov[species_id][phyeco_id] += ref_to_cov[ref_id]
	# compute median phyeco cov
	cluster_to_norm = {}
	for species_id in cluster_to_phyeco_to_cov:
		covs = list(cluster_to_phyeco_to_cov[species_id].values())
		cluster_to_norm[species_id] = median(covs)
	return cluster_to_norm
Example #30
0
def format_pileup(args, species, contigs):
    """ Parse mpileups and fill in missing positions

    Writes one '<outdir>/snps/output/<species_id>.snps.gz' table per species.
    Every position of every contig of a species gets a row: positions that
    appear in '<outdir>/snps/temp/genomes.mpileup.gz' are written with
    write_present, all others with write_missing.

    Each species object is used as a cursor into its own contigs:
      sp.out      open output handle
      sp.contigs  sorted ids of the contigs that belong to the species
      sp.i        index into sp.contigs of the contig being written
      sp.j        0-based index of the next position to write on that contig

    NOTE(review): the catch-up loops below only terminate correctly when the
    pileup records of a species arrive in sorted contig-id order with
    ascending positions -- confirm against parse_pileup.main.
    """

    # open outfiles, write the header and reset the cursor of every species
    for sp in species.values():
        sp.out = utility.iopen(
            '/'.join([args['outdir'],
                      'snps/output/%s.snps.gz' % sp.id]), 'w')
        header = [
            'ref_id', 'ref_pos', 'ref_allele', 'alt_allele', 'ref_freq',
            'depth', 'count_atcg'
        ]
        sp.out.write('\t'.join(header) + '\n')
        sp.contigs = sorted(
            [c.id for c in contigs.values() if c.species_id == sp.id])
        sp.i = 0  # contig index
        sp.j = 0  # position index

    # parse pileup
    pileup_path = '%s/snps/temp/genomes.mpileup.gz' % args['outdir']
    for p in parse_pileup.main(pileup_path):

        # fetch the species of this record and the contig its cursor is on
        sp = species[contigs[p.ref_id].species_id]
        contig = contigs[sp.contigs[sp.i]]

        # contig ids don't match
        #   indicates that one or more upstream contigs have zero coverage
        #   write missing rows until the cursor reaches the record's contig
        while p.ref_id != contig.id:
            write_missing(sp.out,
                          ref_id=contig.id,
                          ref_pos=str(sp.j + 1),
                          ref_allele=contig.seq[sp.j])
            sp.j += 1
            if sp.j >= contig.length:
                # end of this contig: move the cursor to the next one
                sp.i += 1
                sp.j = 0
                contig = contigs[sp.contigs[sp.i]]

        # positions don't match
        #   indicates that one or more upstream positions have zero coverage
        #   p.ref_pos is compared to the 1-based position sp.j + 1
        while p.ref_pos != sp.j + 1:
            write_missing(sp.out,
                          ref_id=contig.id,
                          ref_pos=str(sp.j + 1),
                          ref_allele=contig.seq[sp.j])
            sp.j += 1
            if sp.j >= contig.length:
                sp.i += 1
                sp.j = 0
                contig = contigs[sp.contigs[sp.i]]

        # match
        #   write info from pileup
        write_present(sp.out, pileup=p)
        sp.j += 1

        # end of contig: advance the cursor to the start of the next contig
        if sp.j >= contig.length:
            sp.i += 1
            sp.j = 0

    # fill in downstream positions & contigs with zero coverage
    for sp in species.values():

        # only when the cursor has not passed the last contig
        if sp.i < len(sp.contigs):

            # leftover positions on the current contig
            #   indicates that one or more downstream positions have zero coverage
            contig = contigs[sp.contigs[sp.i]]
            while sp.j < contig.length:
                write_missing(sp.out,
                              ref_id=contig.id,
                              ref_pos=str(sp.j + 1),
                              ref_allele=contig.seq[sp.j])
                sp.j += 1

            # leftover contigs
            #   indicates that one or more downstream contigs have zero coverage
            #   one missing row is written per loop iteration
            sp.i += 1
            sp.j = 0
            while sp.i < len(sp.contigs):
                contig = contigs[sp.contigs[sp.i]]
                write_missing(sp.out,
                              ref_id=contig.id,
                              ref_pos=str(sp.j + 1),
                              ref_allele=contig.seq[sp.j])
                sp.j += 1
                if sp.j >= contig.length:
                    sp.i += 1
                    sp.j = 0

        # close output files
        sp.out.close()