def get_counts(alignment: AlignmentFile, chromosome: str, position: int, minimum_base_quality: int) -> Dict[str, int]:
    """Return the per-nucleotide read counts at one reference position.

    Args:
        alignment: Object whose ``count_coverage`` method is queried.
        chromosome: Passed to ``count_coverage`` as ``contig``.
        position: Start coordinate of the queried window; the window is
            ``[position, position + 1)``, i.e. exactly one position wide.
        minimum_base_quality: Passed to ``count_coverage`` as
            ``quality_threshold``.

    Returns:
        A dict with keys ``'A'``, ``'C'``, ``'G'``, ``'T'`` (in that order),
        each mapped to the first element of the correspondingly indexed
        sequence returned by ``count_coverage``.
    """
    per_base = alignment.count_coverage(
        contig=chromosome,
        start=position,
        stop=position + 1,
        quality_threshold=minimum_base_quality,
    )
    # Index 0..3 of the result correspond to A, C, G, T; the window holds a
    # single position, so only element 0 of each sequence is read.
    return {base: per_base[index][0] for index, base in enumerate("ACGT")}
def _tally_positions(ref_seq, counts):
    """Classify every covered reference position as matching or mismatching.

    Args:
        ref_seq: Reference sequence as a string.
        counts: Four per-position count sequences, indexed 0..3 for
            A, C, G, T, each at least ``len(ref_seq)`` long.

    Returns:
        A tuple ``(matching, mismatching, total_depth)`` where ``matching`` is
        the number of covered A/C/G/T positions at which no read disagrees
        with the reference, ``mismatching`` is the number at which at least
        one read disagrees, and ``total_depth`` is the summed depth over all
        positions (including positions whose reference base is not A/C/G/T).
    """
    base_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    matching = 0
    mismatching = 0
    total_depth = 0
    for ref_pos, ref_allele in enumerate(ref_seq):
        depth = sum(counts[nt][ref_pos] for nt in range(4))
        total_depth += depth
        if depth == 0:
            continue
        # Upper-case so soft-masked (lower-case) reference bases are scored
        # instead of being skipped as if they were N.
        index = base_index.get(ref_allele.upper())
        if index is None:
            # Not A/C/G/T (e.g. N): the position is left out of both tallies.
            continue
        # Deliberately all-or-nothing: a single disagreeing read marks the
        # whole position as incorrect.
        if depth - counts[index][ref_pos] > 0:
            mismatching += 1
        else:
            matching += 1
    return matching, mismatching, total_depth


def main(argv):
    """Print a tab-separated coverage / identity table, one row per reference.

    The alignment file path is read from ``sys.argv[1]`` and the reference
    FASTA path from ``sys.argv[2]``. For every reference name seen on a
    fetched read, the row reports the read count, the number of covered
    positions that fully agree with the reference, the number with at least
    one disagreeing read, their sum, the reference length, the percentage of
    the reference covered by scored positions, and the percentage of scored
    positions that agree. Despite the column names ``Correct_bases`` and
    ``Incorrect_bases``, those columns count positions, not read bases.

    References with zero total depth produce no row. References that appear
    in the alignment but not in the FASTA are skipped with a warning on
    stderr.

    Args:
        argv: Accepted for interface compatibility but not read; the paths
            are taken from ``sys.argv`` directly.
    """
    # The context manager closes the alignment file even if an error occurs.
    with AlignmentFile(sys.argv[1]) as bam:
        # Dict keys give first-seen order with O(1) membership tests.
        observed = dict.fromkeys(read.reference_name for read in bam.fetch())

        ref_seqs = {
            record.id: str(record.seq)
            for record in SeqIO.parse(sys.argv[2], 'fasta')
            if record.id in observed
        }

        print(
            "Subject\tReadcount\tCorrect_bases\tIncorrect_bases\tTotal_bases\tSubjlen\tCoverage\tPercent_ID"
        )

        for name in observed:
            ref_seq = ref_seqs.get(name)
            if ref_seq is None:
                print(
                    f"warning: reference {name!r} not found in FASTA; skipping",
                    file=sys.stderr,
                )
                continue

            seqlen = len(ref_seq)
            contig_counts = bam.count(name, start=0, end=seqlen)
            counts = bam.count_coverage(name, start=0, end=seqlen)

            trues, falses, total = _tally_positions(ref_seq, counts)
            if total == 0:
                # Nothing counted anywhere on this reference: no row.
                continue

            scored = trues + falses
            # total > 0 implies seqlen > 0, so this division is safe.
            coverage = round((scored / seqlen) * 100, 2)
            if scored == 0:
                # Depth exists only at non-A/C/G/T reference positions.
                pid = 0
            else:
                pid = round((trues / scored) * 100, 2)

            row = (name, contig_counts, trues, falses, scored, seqlen, coverage, pid)
            print('\t'.join(str(field) for field in row))