def compress(outdir):
    """ Gzip any uncompressed files under the pan_genomes and rep_genomes modules """
    for module in ['pan_genomes', 'rep_genomes']:
        for species in os.listdir('%s/%s' % (outdir, module)):
            indir = '%s/%s/%s' % (outdir, module, species)
            for file in os.listdir(indir):
                inpath = '%s/%s' % (indir, file)
                if inpath.split('.')[-1] != 'gz':
                    outfile = utility.iopen('%s/%s.gz' % (indir, file), 'w')
                    for line in utility.iopen(inpath):
                        outfile.write(line)
                    outfile.close()
                    os.remove(inpath)
def build_pangenome_db(args, species):
    """ Build FASTA and BT2 database from pangene species centroids """
    import Bio.SeqIO
    # fasta database
    outdir = '/'.join([args['outdir'], 'genes/temp'])
    pangenome_fasta = open('/'.join([outdir, 'pangenomes.fa']), 'w')
    pangenome_map = open('/'.join([outdir, 'pangenomes.map']), 'w')
    db_stats = {'total_length': 0, 'total_seqs': 0, 'species': 0}
    for sp in species.values():
        db_stats['species'] += 1
        infile = utility.iopen(sp.paths['centroids.ffn'])
        for r in Bio.SeqIO.parse(infile, 'fasta'):
            pangenome_fasta.write('>%s\n%s\n' % (r.id, str(r.seq).upper()))
            pangenome_map.write('%s\t%s\n' % (r.id, sp.id))
            db_stats['total_length'] += len(r.seq)
            db_stats['total_seqs'] += 1
        infile.close()
    pangenome_fasta.close()
    pangenome_map.close()
    # print out database stats
    print(" total species: %s" % db_stats['species'])
    print(" total genes: %s" % db_stats['total_seqs'])
    print(" total base-pairs: %s" % db_stats['total_length'])
    # bowtie2 database
    inpath = '/'.join([outdir, 'pangenomes.fa'])
    outpath = '/'.join([outdir, 'pangenomes'])
    command = ' '.join([args['bowtie2-build'], inpath, outpath])
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
def species_pileup(args, species_id, contigs):
    """ Write per-position nucleotide pileup for one species; return alignment stats """
    # set global variables for read filtering
    global global_args  # need global for keep_read function
    global_args = args
    # summary stats
    global aln_stats
    aln_stats = {'genome_length': 0,
                 'total_depth': 0,
                 'covered_bases': 0,
                 'aligned_reads': 0,
                 'mapped_reads': 0}
    # open outfile
    out_path = '%s/snps/output/%s.snps.gz' % (args['outdir'], species_id)
    out_file = utility.iopen(out_path, 'w')
    header = ['ref_id', 'ref_pos', 'ref_allele', 'depth',
              'count_a', 'count_c', 'count_g', 'count_t']
    out_file.write('\t'.join(header) + '\n')
    # compute coverage
    bampath = '%s/snps/temp/genomes.bam' % args['outdir']
    with pysam.AlignmentFile(bampath, 'rb') as bamfile:
        for contig_id in sorted(list(contigs.keys())):
            contig = contigs[contig_id]
            if contig.species_id != species_id:
                continue
            # pysam returns four per-position count arrays, ordered A, C, G, T
            counts = bamfile.count_coverage(contig.id,
                                            start=0,
                                            end=contig.length,
                                            quality_threshold=args['baseq'],
                                            read_callback=keep_read)
            for i in range(0, contig.length):
                ref_pos = i + 1
                ref_allele = contig.seq[i]
                depth = sum([counts[_][i] for _ in range(4)])
                count_a = counts[0][i]
                count_c = counts[1][i]
                count_g = counts[2][i]
                count_t = counts[3][i]
                row = [contig.id, ref_pos, ref_allele, depth,
                       count_a, count_c, count_g, count_t]
                out_file.write('\t'.join([str(_) for _ in row]) + '\n')
                aln_stats['genome_length'] += 1
                aln_stats['total_depth'] += depth
                if depth > 0:
                    aln_stats['covered_bases'] += 1
    out_file.close()
    return (species_id, aln_stats)
def format_vcf(args):
    """ Format vcf files to snp files and fill in missing positions """
    inpath = os.path.join(args['outdir'], 'snps/temp/genomes.map')
    ref_to_species = utility.read_ref_to_cluster(inpath)
    for species_id in set(ref_to_species.values()):
        # open outfile
        outpath = '/'.join([args['outdir'], 'snps/output/%s.snps.gz' % species_id])
        outfile = utility.iopen(outpath, 'w')
        write_snp_record(outfile, header=True)
        # read sorted reference
        ref = read_ref_bases(args, species_id)
        ref_index = 0
        ref_length = len(ref)
        # write formatted records
        vcf_path = '/'.join([args['outdir'], 'snps/temp/vcf/%s.vcf' % species_id])
        for snp in parse_vcf(vcf_path):  # loop over formatted records from vcf
            while [snp['ref_id'], snp['ref_pos']] != ref[ref_index][0:2]:  # fill in missing snp positions
                write_snp_record(outfile, None, ref[ref_index])  # write missing record
                ref_index += 1
            write_snp_record(outfile, snp, None)  # write present record
            ref_index += 1
        while ref_index < ref_length:  # fill in trailing snps
            write_snp_record(outfile, None, ref[ref_index])  # write trailing record
            ref_index += 1
        outfile.close()
def write_species_info(args, species):
    """ Write species_id, representative genome, and genome count for each species """
    outfile = utility.iopen('%s/species_info.txt' % args['outdir'], 'w')
    header = ['species_id', 'rep_genome', 'count_genomes']
    outfile.write('\t'.join(header) + '\n')
    for sp in species:
        values = [str(_) for _ in [sp.id, sp.rep_genome, sp.ngenomes]]
        outfile.write('\t'.join(values) + '\n')
    outfile.close()
def main():
    """ Run main pipeline """
    args = parse_args()
    reads = 0
    bp = 0
    for inpath in args['input']:
        infile = utility.iopen(inpath)
        for name, seq, qual in readfq(infile):
            seq_len = len(seq)
            if args['read_length']:  # trim/filter reads
                if seq_len < args['read_length']:
                    continue
                else:
                    seq = seq[0:args['read_length']]
                    seq_len = len(seq)
            sys.stdout.write('>%s_%s\n%s\n' % (name, seq_len, seq))
            reads += 1
            bp += seq_len
            if reads == args['max_reads']:
                sys.stderr.write('%s\t%s' % (reads, bp))  # write number of reads, bp to stderr
                return
        infile.close()
    sys.stderr.write('%s\t%s' % (reads, bp))  # write number of reads, bp to stderr
def write_results(args, species, genes):
    """ Write results to disk """
    # open outfiles for each species_id
    header = ['gene_id', 'count_reads', 'coverage', 'copy_number']
    for sp in species.values():
        path = '/'.join([args['outdir'], 'genes/output/%s.genes.gz' % sp.id])
        sp.out = utility.iopen(path, 'w')
        sp.out.write('\t'.join(header) + '\n')
    # write to output files
    for gene_id in sorted(genes):
        gene = genes[gene_id]
        sp = species[gene.species_id]
        values = [gene.id, gene.reads, gene.depth, gene.copies]
        sp.out.write('\t'.join([str(_) for _ in values]) + '\n')
    # close output files
    for sp in species.values():
        sp.out.close()
    # summary stats
    path = '/'.join([args['outdir'], 'genes/summary.txt'])
    file = open(path, 'w')
    header = ['species_id', 'pangenome_size', 'covered_genes', 'fraction_covered',
              'mean_coverage', 'marker_coverage', 'count_reads']
    file.write('\t'.join(header) + '\n')
    for sp in species.values():
        values = [sp.id, sp.pangenome_size, sp.covered_genes, sp.fraction_covered,
                  sp.mean_coverage, sp.marker_coverage, sp.reads]
        file.write('\t'.join([str(_) for _ in values]) + '\n')
    file.close()
def build_pangenome_db(args, genome_clusters):
    """ Build FASTA and BT2 database from pangene cluster centroids """
    import Bio.SeqIO
    # fasta database
    outdir = '/'.join([args['outdir'], 'genes/temp'])
    pangenome_fasta = open('/'.join([outdir, 'pangenomes.fa']), 'w')
    pangenome_map = open('/'.join([outdir, 'pangenome.map']), 'w')
    db_stats = {'total_length': 0, 'total_seqs': 0, 'genome_clusters': 0}
    for species_id in genome_clusters:
        db_stats['genome_clusters'] += 1
        inpath = '/'.join([args['db'], 'genome_clusters', species_id, 'pangenome.fa.gz'])
        infile = utility.iopen(inpath)
        for r in Bio.SeqIO.parse(infile, 'fasta'):
            genome_id = '.'.join(r.id.split('.')[0:2])
            if not args['tax_mask'] or genome_id not in args['tax_mask']:
                pangenome_fasta.write('>%s\n%s\n' % (r.id, str(r.seq)))
                pangenome_map.write('%s\t%s\n' % (r.id, species_id))
                db_stats['total_length'] += len(r.seq)
                db_stats['total_seqs'] += 1
        infile.close()
    pangenome_fasta.close()
    pangenome_map.close()
    # print out database stats
    print(" total species: %s" % db_stats['genome_clusters'])
    print(" total genes: %s" % db_stats['total_seqs'])
    print(" total base-pairs: %s" % db_stats['total_length'])
    # bowtie2 database
    inpath = '/'.join([outdir, 'pangenomes.fa'])
    outpath = '/'.join([outdir, 'pangenomes'])
    command = ' '.join([args['bowtie2-build'], inpath, outpath])
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
def compute_pangenome_coverage(args):
    """ Compute coverage of pangenome for species_id and write results to disk """
    # map ref_id to species_id
    ref_to_cluster = {}
    for line in open('/'.join([args['outdir'], 'genes/temp/pangenome.map'])):
        ref_id, species_id = line.rstrip().split()
        ref_to_cluster[ref_id] = species_id
    # open outfiles for each species_id
    outfiles = {}
    genome_clusters = set(ref_to_cluster.values())
    for species_id in genome_clusters:
        outpath = '/'.join([args['outdir'], 'genes/output/%s.genes.gz' % species_id])
        outfiles[species_id] = utility.iopen(outpath, 'w')
        outfiles[species_id].write('\t'.join(['gene_id', 'coverage', 'copy_number']) + '\n')
    # parse bam into cov files for each species_id
    ref_to_cov = count_mapped_bp(args)
    # compute normalization factor
    cluster_to_norm = compute_phyeco_cov(args, genome_clusters, ref_to_cov, ref_to_cluster)
    # write to output files
    for ref_id in sorted(ref_to_cov):
        cov = ref_to_cov[ref_id]
        species_id = ref_to_cluster[ref_id]
        outfile = outfiles[species_id]
        normcov = cov / cluster_to_norm[species_id] if cluster_to_norm[species_id] > 0 else 0.0
        outfile.write('\t'.join([str(x) for x in [ref_id, cov, normcov]]) + '\n')
    # close output files
    for outfile in outfiles.values():
        outfile.close()
def write_readme(self):
    """ Write readme.txt describing pan-genome output files """
    file = utility.iopen('%s/readme.txt' % self.dir, 'w')
    file.write("""
Description and statistics for pan-genome files

Summary Statistics
############

Genomes: %(genomes)s
Genes: %(genes)s
Gene clusters (95%% identity): %(centroids_95)s
Gene clusters (90%% identity): %(centroids_90)s
Gene clusters (85%% identity): %(centroids_85)s
Gene clusters (80%% identity): %(centroids_80)s
Gene clusters (75%% identity): %(centroids_75)s

Output files
############
genes.ffn
  all genes from specified genomes

centroids.ffn
  gene sequences from 95%% identity gene clusters
  used for recruiting metagenomic reads

gene_info.txt
  information for all genes from genes.ffn
  the fields centroid_{95,90,85,80,75} indicate mappings between gene_id and gene clusters
""" % self.stats)
    file.close()
def build_genome_db(args, species):
    """ Build FASTA and BT2 database of representative genomes """
    import Bio.SeqIO
    # fasta database
    outfile = open('/'.join([args['outdir'], 'snps/temp/genomes.fa']), 'w')
    db_stats = {'total_length': 0, 'total_seqs': 0, 'species': 0}
    for sp in species.values():
        db_stats['species'] += 1
        infile = utility.iopen(sp.paths['fna'])
        for r in Bio.SeqIO.parse(infile, 'fasta'):
            outfile.write('>%s\n%s\n' % (r.id, str(r.seq).upper()))
            db_stats['total_length'] += len(r.seq)
            db_stats['total_seqs'] += 1
        infile.close()
    outfile.close()
    # print out database stats
    print(" total genomes: %s" % db_stats['species'])
    print(" total contigs: %s" % db_stats['total_seqs'])
    print(" total base-pairs: %s" % db_stats['total_length'])
    # bowtie2 database
    inpath = '/'.join([args['outdir'], 'snps/temp/genomes.fa'])
    outpath = '/'.join([args['outdir'], 'snps/temp/genomes'])
    command = ' '.join([args['bowtie2-build'], inpath, outpath])
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
def parse_mapping_file(args):
    """ Validate the tab-delimited mapping file and yield one record per genome """
    infile = utility.iopen(args['mapfile'])
    fields = next(infile).rstrip('\n').split('\t')
    # validate header
    for field in ['genome_id', 'species_id']:
        if field not in fields:
            sys.exit("Error: mapping file '%s' has no field labeled '%s'" % (args['mapfile'], field))
    for field in fields:
        if field not in ['genome_id', 'species_id', 'rep_genome']:
            sys.exit("Error: mapping file '%s' has unknown field labeled '%s'" % (args['mapfile'], field))
    # validate rows before building records
    for line in infile:
        if len(line.rstrip()) == 0:
            continue
        values = line.rstrip('\n').split('\t')
        if len(values) != len(fields):
            sys.exit("Error: mapping file '%s' has different number of fields per row" % args['mapfile'])
        record = dict([(f, v) for f, v in zip(fields, values)])
        if 'rep_genome' in fields and record['rep_genome'] not in ['0', '1']:
            sys.exit("Error: mapping file '%s' has unknown value '%s' for field 'rep_genome'" % (args['mapfile'], record['rep_genome']))
        yield record
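# A minimal sketch of the layout parse_mapping_file expects, based on the
# header checks above; the genome and species identifiers are hypothetical,
# and the optional rep_genome column flags one representative per species:
#
#   genome_id   species_id  rep_genome
#   genome_A    species_1   1
#   genome_B    species_1   0
#   genome_C    species_2   1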
def parse_fasta(self, p_in):
    """ Return lookup of seq_id to sequence for PATRIC genes """
    seqs = {}
    infile = utility.iopen(p_in)
    for r in Bio.SeqIO.parse(infile, "fasta"):
        seqs[r.id] = str(r.seq).upper()
    infile.close()
    return seqs
def fetch_centroid(args, species_id):
    """ Get the genome_id corresponding to cluster centroid """
    inpath = '/'.join([args['db'], 'genome_clusters', species_id, 'genomes.txt.gz'])
    infile = utility.iopen(inpath)
    for line in infile:
        if line.split()[2] == 'Y':
            return line.split()[1]
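# fetch_centroid assumes whitespace-delimited rows where the second column is
# a genome_id and the third a Y/N centroid flag; the first column is shown as
# a cluster label, which is a guess. An illustrative row:
#
#   species_1   genome_A   Y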
def write_genome_info(args, species):
    """ Write genome_id, species_id, and rep_genome flag for each genome """
    outfile = utility.iopen('%s/genome_info.txt' % args['outdir'], 'w')
    header = ['genome_id', 'species_id', 'rep_genome']
    outfile.write('\t'.join(header) + '\n')
    for sp in species:
        for genome_id in sp.genomes:
            rep_genome = '1' if genome_id == sp.rep_genome else '0'
            values = [genome_id, sp.id, rep_genome]
            outfile.write('\t'.join(values) + '\n')
    outfile.close()
def read_genome(db, species_id):
    """ Read in representative genome from reference database """
    inpath = '%s/rep_genomes/%s/genome.fna.gz' % (db, species_id)
    infile = utility.iopen(inpath)
    genome = {}
    for r in Bio.SeqIO.parse(infile, 'fasta'):
        genome[r.id] = r.seq.upper()
    infile.close()
    return genome
def read_run_midas_snps(species_id, samples):
    """ Open SNP files for species across samples """
    infiles = []
    for sample in samples:
        path = '%s/snps/output/%s.snps.gz' % (sample.dir, species_id)
        file = utility.iopen(path)
        next(file)  # skip header
        infiles.append(file)
    return infiles
def write_genes(self, resume):
    """ Concatenate all genes from pangenome into sequence file """
    ffn_path = '%s/genes.ffn' % self.dir
    if os.path.exists(ffn_path) and os.stat(ffn_path).st_size > 0 and resume:
        return  # resume from existing, non-empty file
    file = utility.iopen(ffn_path, 'w')
    for gene in self.genes.values():
        file.write('>%s\n%s\n' % (gene.id, gene.seq))
    file.close()
def parse_tsv(inpath):
    """ yield records from tab-delimited file with row and column names """
    infile = utility.iopen(inpath)
    header = next(infile).rstrip('\n').split('\t')
    for line in infile:
        split_line = line.rstrip('\n').split('\t')
        id = split_line[0]
        values = split_line[1:]
        yield id, values
    infile.close()
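# Usage sketch for parse_tsv; the input path is hypothetical. Rows stream one
# at a time, so a large matrix never needs to be held in memory:
#
#   for row_id, values in parse_tsv('abundance_matrix.tsv'):
#       print(row_id, len(values))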
def read_ref_bases(args, species_id):
    """ Read in reference genome by position """
    import Bio.SeqIO
    ref = []
    genome_path = '/'.join([args['db'], 'genome_clusters', species_id, 'genome.fna.gz'])
    infile = utility.iopen(genome_path)
    for rec in Bio.SeqIO.parse(infile, 'fasta'):
        for pos in range(1, len(rec.seq) + 1):
            ref.append([rec.id, pos, rec.seq[pos - 1].upper()])
    infile.close()
    return sorted(ref)
def initialize_contigs(species):
    contigs = {}
    for sp in species.values():
        infile = utility.iopen(sp.paths['fna'])
        for rec in Bio.SeqIO.parse(infile, 'fasta'):
            contig = Contig(rec.id)
            contig.seq = str(rec.seq).upper()
            contig.length = len(contig.seq)
            contig.species_id = sp.id
            contigs[contig.id] = contig
        infile.close()
    return contigs
def read_gene_map(species_id, args):
    """ Map 99% centroids to gene_ids at lower level """
    gene_to_family = {}
    inpath = '%s/genome_clusters/%s/pangenome.map.gz' % (args['db'], species_id)
    infile = utility.iopen(inpath)
    fields = next(infile).rstrip().split()
    for line in infile:
        values = line.rstrip().split()
        record = dict([(f, v) for f, v in zip(fields, values)])  # avoid shadowing builtin 'map'
        gene_to_family[record['99']] = record[args['cluster_pid']]
    infile.close()
    return gene_to_family
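# read_gene_map assumes a header of clustering thresholds ('99', '95', ...)
# and one row per 99% centroid mapping it to coarser clusters; an illustrative
# layout with hypothetical gene ids:
#
#   99       95       90       85       80       75
#   gene_1   gene_1   gene_7   gene_7   gene_7   gene_9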
def initialize_genes(args, species):
    """ Initialize Gene objects """
    genes = {}
    # fetch gene_id, species_id, gene length
    for sp in species.values():
        path = sp.paths['centroids.ffn']
        file = utility.iopen(path)
        for seq in Bio.SeqIO.parse(file, 'fasta'):
            genes[seq.id] = Gene(seq.id)
            genes[seq.id].species_id = sp.id
            genes[seq.id].length = len(seq.seq)
            sp.pangenome_size += 1
        file.close()
    # fetch marker_id
    path = '%s/marker_genes/phyeco.map' % args['db']
    file = utility.iopen(path)
    reader = csv.DictReader(file, delimiter='\t')
    for r in reader:
        if r['gene_id'] in genes:
            genes[r['gene_id']].marker_id = r['marker_id']
    file.close()
    return genes
def read_function_map(ref_db, species_id, ontology):
    """ Map gene ids to functions for given ontology """
    gene_to_functions = {}
    inpath = '%s/genome_clusters/%s/pangenome.functions.gz' % (ref_db, species_id)
    infile = utility.iopen(inpath)
    for line in infile:
        gene_id, function_id, ont = line.rstrip().split()
        if ont == ontology:
            if gene_id not in gene_to_functions:
                gene_to_functions[gene_id] = []
            gene_to_functions[gene_id].append(function_id)
    infile.close()
    return gene_to_functions
def parse_uclust(self, inpath):
    """ Yield formatted records from UCLUST output file """
    # centroids are type == 'S'
    # non-centroids are type == 'H'
    # clusters are type == 'C'
    fields = ['type', 'cluster_id', 'size', 'pid', 'strand',
              'skip1', 'skip2', 'skip3', 'gene_id', 'centroid_id']
    with utility.iopen(inpath) as infile:
        for line in infile:
            values = line.rstrip('\n').split('\t')
            record = dict([(f, v) for f, v in zip(fields, values)])
            yield record
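# Illustrative UCLUST '.uc' lines as they map onto 'fields' above; gene ids
# and numbers are hypothetical, and the three skipped columns are shown as '*':
#
#   S   0   1032  *     *  *  *  *  gene_1  *        <- centroid of cluster 0
#   H   0   1029  98.7  +  *  *  *  gene_2  gene_1   <- member hit to gene_1
#   C   0   2     *     *  *  *  *  gene_1  *        <- summary for cluster 0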
def write_gene_info(self):
    """ Record gene info in gene_info.txt """
    file = utility.iopen('%s/gene_info.txt' % self.dir, 'w')
    header = ['gene_id', 'genome_id', 'gene_length', 'centroid_99', 'centroid_95',
              'centroid_90', 'centroid_85', 'centroid_80', 'centroid_75']
    file.write('\t'.join(header) + '\n')
    for gene_id in sorted(self.genes.keys()):
        g = self.genes[gene_id]
        values = [g.id, g.genome_id, g.length, g.centroid_99, g.centroid_95,
                  g.centroid_90, g.centroid_85, g.centroid_80, g.centroid_75]
        file.write('\t'.join([str(_) for _ in values]) + '\n')
    file.close()
def parse_hmmsearch(self, p_in):
    """ Parse HMMER domtblout files. Return data-type formatted dictionary """
    # column indices follow HMMER3 --domtblout output: x[2]=sequence length,
    # x[5]=profile length, x[12]=i-Evalue, x[15:17]=hmm coords, x[19:21]=envelope
    # coords; note this code labels the sequence (HMMER's target) as 'query'
    # and the profile as 'target'
    f_in = utility.iopen(p_in)
    for line in f_in:
        if line[0] == '#':  # skip comment lines
            continue
        x = line.rstrip().split()
        query = x[0]
        target = x[3]
        evalue = float(x[12])
        qcov = (int(x[20]) - int(x[19]) + 1) / float(x[2])
        tcov = (int(x[16]) - int(x[15]) + 1) / float(x[5])
        yield {'query': query, 'target': target, 'evalue': evalue,
               'qcov': qcov, 'tcov': tcov, 'qlen': int(x[2]), 'tlen': int(x[5])}
def build_genome_db(args, genome_clusters):
    """ Build FASTA and BT2 database from genome cluster centroids """
    # fasta database
    genomes_fasta = open('/'.join([args['outdir'], 'snps/temp/genomes.fa']), 'w')
    genomes_map = open('/'.join([args['outdir'], 'snps/temp/genomes.map']), 'w')
    db_stats = {'total_length': 0, 'total_seqs': 0, 'genome_clusters': 0}
    for species_id in genome_clusters:
        if args['tax_mask'] and fetch_centroid(args, species_id) in args['tax_mask']:
            continue
        db_stats['genome_clusters'] += 1
        inpath = '/'.join([args['db'], 'genome_clusters', species_id, 'genome.fna.gz'])
        infile = utility.iopen(inpath)
        for line in infile:
            genomes_fasta.write(line)
            if line[0] == '>':
                sid = line.rstrip().lstrip('>').split()[0]
                genomes_map.write(sid + '\t' + species_id + '\n')
                db_stats['total_seqs'] += 1
            else:
                db_stats['total_length'] += len(line.rstrip())  # count sequence lines only
        infile.close()
    # close (and flush) fasta before bowtie2-build reads it
    genomes_fasta.close()
    genomes_map.close()
    # print out database stats
    print(" total genomes: %s" % db_stats['genome_clusters'])
    print(" total contigs: %s" % db_stats['total_seqs'])
    print(" total base-pairs: %s" % db_stats['total_length'])
    # bowtie2 database
    inpath = '/'.join([args['outdir'], 'snps/temp/genomes.fa'])
    outpath = '/'.join([args['outdir'], 'snps/temp/genomes'])
    command = ' '.join([args['bowtie2-build'], inpath, outpath])
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
def compute_phyeco_cov(args, genome_clusters, ref_to_cov, ref_to_cluster):
    """ Count number of bp mapped to each PhyEco marker gene """
    from numpy import median
    # read in set of phyeco markers for normalization
    phyeco_ids = set([])
    inpath = '/'.join([args['db'], 'marker_genes/pid_cutoffs.txt'])
    if not os.path.isfile(inpath):
        sys.exit("File not found: %s" % inpath)
    for line in open(inpath):
        phyeco_id, pid = line.rstrip().split('\t')
        phyeco_ids.add(phyeco_id)
    # read in map of gene to phyeco marker
    ref_to_phyeco = {}
    for species_id in genome_clusters:
        inpath = '/'.join([args['db'], 'genome_clusters', species_id, 'pangenome.marker_genes.gz'])
        infile = utility.iopen(inpath)
        next(infile)  # skip header
        for line in infile:
            gene_id, phyeco_id = line.rstrip().split()
            ref_to_phyeco[gene_id] = phyeco_id
        infile.close()
    # init phyeco coverage
    cluster_to_phyeco_to_cov = {}
    for species_id in genome_clusters:
        cluster_to_phyeco_to_cov[species_id] = {}
        for phyeco_id in phyeco_ids:
            cluster_to_phyeco_to_cov[species_id][phyeco_id] = 0.0
    # compute phyeco coverages
    for ref_id, phyeco_id in ref_to_phyeco.items():
        species_id = ref_to_cluster[ref_id]
        if phyeco_id in phyeco_ids and ref_id in ref_to_cov:
            cluster_to_phyeco_to_cov[species_id][phyeco_id] += ref_to_cov[ref_id]
    # compute median phyeco cov
    cluster_to_norm = {}
    for species_id in cluster_to_phyeco_to_cov:
        covs = list(cluster_to_phyeco_to_cov[species_id].values())
        cluster_to_norm[species_id] = median(covs)
    return cluster_to_norm
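# Worked sketch of how this normalizer is used downstream (numbers are
# hypothetical): if a species' per-marker coverages are [9.0, 10.0, 11.0],
# the median (10.0) serves as its single-copy baseline, so a gene with
# coverage 25.0 is reported by compute_pangenome_coverage with
# copy_number = 25.0 / 10.0 = 2.5.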
def format_pileup(args, species, contigs):
    """ Parse mpileups and fill in missing positions """
    # open outfiles
    for sp in species.values():
        sp.out = utility.iopen('/'.join([args['outdir'], 'snps/output/%s.snps.gz' % sp.id]), 'w')
        header = ['ref_id', 'ref_pos', 'ref_allele', 'alt_allele', 'ref_freq', 'depth', 'count_atcg']
        sp.out.write('\t'.join(header) + '\n')
        sp.contigs = sorted([c.id for c in contigs.values() if c.species_id == sp.id])
        sp.i = 0  # contig index
        sp.j = 0  # position index
    # parse pileup
    pileup_path = '%s/snps/temp/genomes.mpileup.gz' % args['outdir']
    for p in parse_pileup.main(pileup_path):
        # fetch contig info
        sp = species[contigs[p.ref_id].species_id]
        contig = contigs[sp.contigs[sp.i]]
        # contig ids don't match
        # indicates that one or more upstream contigs have zero coverage
        while p.ref_id != contig.id:
            write_missing(sp.out, ref_id=contig.id, ref_pos=str(sp.j + 1), ref_allele=contig.seq[sp.j])
            sp.j += 1
            if sp.j >= contig.length:
                sp.i += 1
                sp.j = 0
                contig = contigs[sp.contigs[sp.i]]
        # positions don't match
        # indicates that one or more upstream positions have zero coverage
        while p.ref_pos != sp.j + 1:
            write_missing(sp.out, ref_id=contig.id, ref_pos=str(sp.j + 1), ref_allele=contig.seq[sp.j])
            sp.j += 1
            if sp.j >= contig.length:
                sp.i += 1
                sp.j = 0
                contig = contigs[sp.contigs[sp.i]]
        # match: write info from pileup
        write_present(sp.out, pileup=p)
        sp.j += 1
        # end of contig
        if sp.j >= contig.length:
            sp.i += 1
            sp.j = 0
    # fill in downstream positions & contigs with zero coverage
    for sp in species.values():
        if sp.i < len(sp.contigs):  # skip species with no remaining contigs
            # leftover positions on last contig
            # indicates that one or more downstream positions have zero coverage
            contig = contigs[sp.contigs[sp.i]]
            while sp.j < contig.length:
                write_missing(sp.out, ref_id=contig.id, ref_pos=str(sp.j + 1), ref_allele=contig.seq[sp.j])
                sp.j += 1
            # leftover contigs
            # indicates that one or more downstream contigs have zero coverage
            sp.i += 1
            sp.j = 0
            while sp.i < len(sp.contigs):
                contig = contigs[sp.contigs[sp.i]]
                write_missing(sp.out, ref_id=contig.id, ref_pos=str(sp.j + 1), ref_allele=contig.seq[sp.j])
                sp.j += 1
                if sp.j >= contig.length:
                    sp.i += 1
                    sp.j = 0
        # close output file
        sp.out.close()