def _generate_pileups(self):
    bam_file = pysam.AlignmentFile(self.bam_file_path)
    info_of_interest = ['A', 'C', 'G', 'T', 'insertions', 'deletions']
    indel_positions = [4, 5]
    # Last number in shape - 5 - is for letters other than A, C, G and T.
    mapping = {'A': 0, 'a': 0, 'C': 1, 'c': 1, 'G': 2, 'g': 2, 'T': 3, 't': 3}
    total_options = len(info_of_interest) + 1
    pileups = [
        np.zeros((bam_file.get_reference_length(contig_name),
                  len(info_of_interest)))
        for contig_name in bam_file.references
    ]
    y_oh = [
        np.zeros((bam_file.get_reference_length(contig_name), total_options))
        for contig_name in bam_file.references
    ]
    total_length = np.sum([
        bam_file.get_reference_length(contig_name)
        for contig_name in bam_file.references
    ])
    progress_counter = 0
    contig_names = bam_file.references
    with progressbar.ProgressBar(max_value=total_length) as progress_bar:
        for contig_id, contig_name in enumerate(contig_names):
            for record in pysamstats.stat_variation(
                    bam_file, chrom=contig_name,
                    fafile=self.reference_fasta_path):
                progress_bar.update(progress_counter)
                progress_counter += 1
                curr_position = record['pos']
                for i, info in enumerate(info_of_interest):
                    pileups[contig_id][curr_position][i] += record[info]
                pileup_argmax = np.argmax(pileups[contig_id][curr_position])
                if pileup_argmax in indel_positions:
                    y_oh[contig_id][curr_position][pileup_argmax] = 1
                else:
                    y_oh[contig_id][curr_position][mapping.get(record['ref'], -1)] = 1
    return pileups, y_oh, contig_names
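A minimal usage sketch for the method above, assuming a hypothetical wrapper class that stores the two attributes the method reads (`bam_file_path`, `reference_fasta_path`); the file names are placeholders:

import numpy as np
import progressbar  # progressbar2 package
import pysam
import pysamstats

class PileupGenerator(object):
    def __init__(self, bam_file_path, reference_fasta_path):
        self.bam_file_path = bam_file_path                # sorted, indexed BAM
        self.reference_fasta_path = reference_fasta_path  # matching reference FASTA

    # ... _generate_pileups() from the example above goes here ...

gen = PileupGenerator('sample.bam', 'reference.fa')  # hypothetical inputs
pileups, y_oh, contig_names = gen._generate_pileups()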
def populateDict():
    global holdingDict
    if not os.path.isfile(JSONProfile):
        for rec in pysamstats.stat_variation(alignmentfile=bamFile, fafile=target):
            # Pull any ARO accessions (e.g. ARO:3000123) out of the contig name
            aros = re.findall(r"(?<=ARO:)3\d{6}", rec['chrom'])
            aro = " ".join(aros)
            # Only record positions where matches outnumber mismatches
            if rec['matches'] - rec['mismatches'] > 0:
                holdingDict.setdefault(rec['chrom'], {})[int(rec['pos'])] = rec['reads_all']
            dotter()
        # Cache the parsed profile as JSON for subsequent runs
        with open(JSONProfile, "w") as JSONreport:
            JSONreport.write(json.dumps(holdingDict, sort_keys=True, indent=4,
                                        separators=(',', ': ')))
    else:
        # Load the previously cached data
        with open(JSONProfile, "r") as jsonReport:
            holdingDict.update(json.load(jsonReport))
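populateDict() leans on module-level names; a minimal sketch of that assumed setup (the paths and the dotter() progress stub are placeholders):

import json
import os
import re
from collections import defaultdict

import pysamstats

bamFile = 'sample_sorted.bam'        # hypothetical sorted BAM
target = 'targets.fa'                # hypothetical reference FASTA
JSONProfile = 'sample_profile.json'  # cached profile location
holdingDict = defaultdict(dict)      # contig -> {position: read depth}

def dotter():
    pass  # stand-in for the project's progress indicator

populateDict()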
def parse_bam(bam, Chr, Pos, End, Ref, Alt, REF):
    # Equivalent result via the pysamstats CLI:
    # pysamstats --fasta <REF> --type variation <bam> -c <Chr> -s <Pos> -e <End + 1> -u
    result = pysamstats.stat_variation(bam, REF, chrom=Chr, start=Pos,
                                       end=End + 1, truncate=True, one_based=True)
    result_dict = {}
    for idx, record in enumerate(result):
        result_dict[idx] = record
    data_df = pd.DataFrame.from_dict(result_dict, orient='index')
    if data_df.shape[0] == 0:
        # No records returned: fall back to a single all-zero row
        # (tmp is a module-level list of the expected pysamstats columns)
        data_df = pd.DataFrame(columns=tmp)
        data_df.loc[0, :] = 0
    if Ref in ['A', 'C', 'G', 'T'] and Alt in ['A', 'C', 'G', 'T']:
        # SNV: read the properly paired counts for the reference base, the
        # alternate base and all reads at the single position
        ref_cov, mut_cov, cov = data_df.loc[0, [Ref + '_pp', Alt + '_pp', 'reads_pp']]
        mut_per = mut_cov / float(cov) if cov != 0 else 0
    else:
        if data_df.sum()['reads_pp'] > 0 and len(data_df) > 0:
            if Ref == '-' or len(Alt) > len(Ref):  # insertions
                mut_per = data_df.sum()['insertions_pp'] / float(data_df.sum()['reads_pp'])
                ref_cov = data_df.sum()['matches_pp'] / float(len(data_df))
                mut_cov = data_df.sum()['insertions_pp'] / float(len(data_df))
            elif Alt == '-' or len(Ref) > len(Alt):  # deletions
                mut_per = data_df.sum(0)['deletions_pp'] / float(data_df.sum()['reads_pp'])
                ref_cov = data_df.sum(0)['matches_pp'] / float(len(data_df))
                mut_cov = data_df.sum(0)['deletions_pp'] / float(len(data_df))
            else:
                return ['Wrong pos'] * 3
        else:
            return ['Off target'] * 3
    return mut_cov, mut_per, ref_cov
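A usage sketch for parse_bam(); the file paths, coordinates, and the module-level tmp column list are assumptions (the column names mirror the keys the function reads):

import pandas as pd
import pysam
import pysamstats

# Assumed module-level fallback columns for the all-zero row
tmp = ['chrom', 'pos', 'ref', 'reads_pp', 'matches_pp', 'mismatches_pp',
       'insertions_pp', 'deletions_pp', 'A_pp', 'C_pp', 'G_pp', 'T_pp']

bam = pysam.AlignmentFile('tumour.bam')  # hypothetical input BAM
mut_cov, mut_per, ref_cov = parse_bam(bam, Chr='chr1', Pos=12345, End=12345,
                                      Ref='C', Alt='T', REF='reference.fa')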
def filter_germline(chr, t_columns_pos, ref_seq, alt_base):
    n_alt_A = n_alt_C = n_alt_G = n_alt_T = n_refcount = 0
    for rec in pysamstats.stat_variation(n_samfile, ref, chrom=str(chr),
                                         start=int(t_columns_pos),
                                         end=int(t_columns_pos) + 1):
        chr = rec['chrom']
        pos = rec['pos']
        ref_seq = rec['ref']
        if pos == t_columns_pos:
            if rec['reads_pp'] > 4:
                # Cast to float before dividing so the mismatch fraction is not
                # truncated by integer division
                if rec['mismatches_pp'] / float(rec['reads_pp']) <= 0.10:
                    for n_columns in n_samfile.pileup(chr, int(t_columns_pos),
                                                      int(t_columns_pos) + 1,
                                                      truncate=True):
                        if n_columns.pos == t_columns_pos:
                            (n_alt_A, n_alt_C, n_alt_G, n_alt_T,
                             n_refcount) = process_reads(n_columns, n_columns.pos, ref_seq, "N")
                else:
                    n_alt_A = n_alt_C = n_alt_G = n_alt_T = rec['mismatches_pp']
                    n_refcount = rec['reads_pp'] - rec['mismatches_pp']
            else:
                n_alt_A = n_alt_G = n_alt_C = n_alt_T = 0
                n_refcount = rec['reads_pp']
    if alt_base == "A":
        return n_alt_A, n_refcount
    if alt_base == "G":
        return n_alt_G, n_refcount
    if alt_base == "T":
        return n_alt_T, n_refcount
    if alt_base == "C":
        return n_alt_C, n_refcount
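filter_germline() reads several globals; a sketch of the assumed setup (the paths are placeholders, and process_reads() is the project's own helper):

import pysam
import pysamstats

n_samfile = pysam.AlignmentFile('normal.bam')  # normal-sample BAM (assumed)
ref = 'reference.fa'                           # reference FASTA path (assumed)

# process_reads(column, pos, ref_seq, flag) is defined elsewhere in the
# project; it returns per-base alternate counts and the reference count.

n_alt, n_ref = filter_germline('chr1', 12345, 'C', 'T')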
def parse(self):
    import pysamstats
    import operator
    import numpy
    while True:
        sample, analysistype = self.parsequeue.get()
        # Initialise dictionaries to store parsed data
        matchdict = dict()
        depthdict = dict()
        seqdict = dict()
        snpdict = dict()
        gapdict = dict()
        maxdict = dict()
        mindict = dict()
        deviationdict = dict()
        sample[analysistype].results = dict()
        sample[analysistype].avgdepth = dict()
        sample[analysistype].resultssnp = dict()
        sample[analysistype].resultsgap = dict()
        sample[analysistype].sequences = dict()
        sample[analysistype].maxcoverage = dict()
        sample[analysistype].mincoverage = dict()
        sample[analysistype].standarddev = dict()
        # Variable to store the expected position in the gene/allele
        pos = 0
        try:
            # Use the stat_variation function of pysamstats to return records parsed from sorted
            # bam files. Values of interest can be retrieved using the appropriate keys
            for rec in pysamstats.stat_variation(alignmentfile=sample[analysistype].sortedbam,
                                                 fafile=sample[analysistype].baitfile,
                                                 max_depth=1000000):
                # Initialise seqdict with the current gene/allele if necessary with an empty string
                if rec['chrom'] not in seqdict:
                    seqdict[rec['chrom']] = str()
                    # Since this is the first position in a "new" gene/allele, reset pos to 0
                    pos = 0
                # Initialise gapdict with 0 gaps
                if rec['chrom'] not in gapdict:
                    gapdict[rec['chrom']] = 0
                # If there is a gap in the alignment, record the size of the gap in gapdict
                if int(rec['pos']) > pos:
                    # Add the gap size to gapdict
                    gapdict[rec['chrom']] += rec['pos'] - pos
                    # Set the expected position to the current position
                    pos = int(rec['pos'])
                # Increment pos in preparation for the next iteration
                pos += 1
                # Initialise snpdict if necessary
                if rec['chrom'] not in snpdict:
                    snpdict[rec['chrom']] = 0
                # Initialise the current gene/allele in depthdict with the depth (reads_all) if
                # necessary, otherwise add the current depth to the running total
                if rec['chrom'] not in depthdict:
                    depthdict[rec['chrom']] = int(rec['reads_all'])
                else:
                    depthdict[rec['chrom']] += int(rec['reads_all'])
                # Dictionary of bases and the number of times each base was observed per position
                bases = {'A': rec['A'], 'C': rec['C'], 'G': rec['G'], 'T': rec['T']}
                # If the most prevalent base (calculated with max() and operator.itemgetter())
                # does not match the reference base, add this prevalent base to seqdict
                if max(bases.items(), key=operator.itemgetter(1))[0] != rec['ref']:
                    seqdict[rec['chrom']] += max(bases.items(), key=operator.itemgetter(1))[0]
                    # Increment the running total of the number of SNPs
                    snpdict[rec['chrom']] += 1
                else:
                    # If the bases match, add the reference base to seqdict
                    seqdict[rec['chrom']] += rec['ref']
                    # Initialise matchdict if necessary, otherwise increment the running total
                    # of matches
                    if rec['chrom'] not in matchdict:
                        matchdict[rec['chrom']] = 1
                    else:
                        matchdict[rec['chrom']] += 1
                # Find the max and min coverage for each strain/gene combination
                try:
                    maxdict[rec['chrom']] = int(rec['reads_all']) if \
                        int(rec['reads_all']) >= maxdict[rec['chrom']] else maxdict[rec['chrom']]
                except KeyError:
                    maxdict[rec['chrom']] = int(rec['reads_all'])
                try:
                    mindict[rec['chrom']] = int(rec['reads_all']) if \
                        int(rec['reads_all']) <= mindict[rec['chrom']] else mindict[rec['chrom']]
                except KeyError:
                    mindict[rec['chrom']] = int(rec['reads_all'])
                # Create a list of all the depths in order to calculate the standard deviation
                try:
                    deviationdict[rec['chrom']].append(int(rec['reads_all']))
                except KeyError:
                    deviationdict[rec['chrom']] = [int(rec['reads_all'])]
        # If there are no results in the bam file, then pass over the strain
        except ValueError:
            pass
        # Iterate through all the genes/alleles with results above
        for allele in sorted(matchdict):
            # If the length of the match is greater than or equal to the length of the
            # gene/allele (multiplied by the cutoff value) as determined using faidx indexing,
            # then proceed
            if matchdict[allele] >= sample[analysistype].faidict[allele] * self.cutoff:
                # Calculate the average depth by dividing the total number of reads observed by
                # the length of the gene
                averagedepth = float(depthdict[allele]) / float(matchdict[allele])
                percentidentity = float(matchdict[allele]) / float(sample[analysistype].faidict[allele]) * 100
                # Only report a positive result if this average depth is greater than 10X
                if averagedepth > 10:
                    # Populate the results with the gene/allele name, the percent identity, and
                    # the average depth
                    sample[analysistype].results.update({allele: '{:.2f}'.format(percentidentity)})
                    sample[analysistype].avgdepth.update({allele: '{:.2f}'.format(averagedepth)})
                    # Add the SNP and gap results to dictionaries
                    sample[analysistype].resultssnp.update({allele: snpdict[allele]})
                    sample[analysistype].resultsgap.update({allele: gapdict[allele]})
                    sample[analysistype].sequences.update({allele: seqdict[allele]})
                    sample[analysistype].maxcoverage.update({allele: maxdict[allele]})
                    sample[analysistype].mincoverage.update({allele: mindict[allele]})
                    sample[analysistype].standarddev.update(
                        {allele: '{:.2f}'.format(numpy.std(deviationdict[allele], ddof=1))})
        self.parsequeue.task_done()
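For context, a sketch of how a queue-driven worker like parse() is typically launched; the driver method, thread count, and cutoff value below are assumptions, not part of the original class:

from queue import Queue
from threading import Thread

def run_parsers(self, samples, analysistype):
    # Hypothetical driver for the parse() worker above; assumes the class
    # stores the queue and identity cutoff that parse() reads
    self.parsequeue = Queue()
    self.cutoff = 0.9  # assumed identity cutoff
    for _ in range(4):  # assumed worker count
        worker = Thread(target=self.parse)
        worker.daemon = True  # let the process exit once the queue drains
        worker.start()
    for sample in samples:
        self.parsequeue.put((sample, analysistype))
    # parse() calls task_done() per item, so join() returns when all work is done
    self.parsequeue.join()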
ref_genome = ''
if 'mature' in file:
    ref_genome = '../../../Reference_Genomes/Modifications_files/Ref_seq/families_tRNA_refgenome.fa'
if 'PG' in file:
    ref_genome = '../../../Reference_Genomes/Modifications_files/Ref_seq/precursor_tRNA_refgenome.fa'

file_base_call = open(sample + '_' + bam_type + '_' + 'base_calling_CORRECT_OK.txt', 'w')

# BASE CALLING
# pysamstats returns the base calls for each tRNA. max_depth is raised because, by default, at
# most 8000 reads are counted per position; a much higher limit ensures the total read count is
# correct at every position.
for record in pysamstats.stat_variation(bamfile, fafile=ref_genome, max_depth=10000000):
    ref_base = record['ref']
    tRNA_info = ('REF-' + str(record['ref']) + ':' + str(record[ref_base]) + ' ' +
                 'A:' + str(record['A']) + ' ' + 'C:' + str(record['C']) + ' ' +
                 'G:' + str(record['G']) + ' ' + 'T:' + str(record['T']))
    tRNA = record['chrom']
    pos = ''
    ref = record['ref']
    # Positions in the precursor are not the same as positions in the mature genome, since the
    # precursor includes the leading and trailing regions and the introns. The precursor
    # positions therefore have to be transformed to match the mature genome.
    if bam_type == 'precursor':
        pos_i = record['pos']
        # Account for the leading and trailing regions
        if pos_i > 49 and pos_i < int(prec_length[tRNA]) - 50:
            pos = pos_i - 49
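This fragment assumes names defined earlier in the script; a hedged sketch of that context (all values are placeholders):

import pysam
import pysamstats

file = 'sample_mature.bam'   # input filename; 'mature'/'PG' selects ref_genome
sample, bam_type = 'sample', 'mature'
bamfile = pysam.AlignmentFile(file)
# Precursor tRNA lengths keyed by tRNA name, used for the
# precursor-to-mature coordinate shift
prec_length = {}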
def parse_bam(bam, Chr, Pos, End, Ref, Alt, REF, sig='N'):
    # Equivalent result via the pysamstats CLI:
    # pysamstats --fasta <REF> --type variation <bam> -c <Chr> -s <Pos> -e <End + 1> -u
    result = pysamstats.stat_variation(bam, REF, chrom=Chr, start=Pos,
                                       end=End + 1, truncate=True, one_based=True)
    result_dict = {}
    for idx, record in enumerate(result):
        result_dict[idx] = record
    t_data_df = pd.DataFrame.from_dict(result_dict, orient='index')
    if t_data_df.shape[0] == 0:
        # No records returned: fall back to a single all-zero row
        # (tmp is a module-level list of the expected pysamstats columns)
        t_data_df = pd.DataFrame(columns=tmp)
        t_data_df.loc[0, :] = 0
    if Ref in ['A', 'C', 'G', 'T'] and Alt in ['A', 'C', 'G', 'T']:
        T_ref_cov, T_mut_cov, T_cov = t_data_df.loc[0, [Ref + '_pp', Alt + '_pp', 'reads_pp']]
        T_mut_per = T_mut_cov / float(T_cov) if T_cov != 0 else 0
        added_col['%s_mut_per' % sig].append(T_mut_per)
        added_col['%s_ref_cov' % sig].append(T_ref_cov)
        added_col['%s_mut_cov' % sig].append(T_mut_cov)
    else:
        if t_data_df.sum()['reads_pp'] > 0 and len(t_data_df) > 0:
            if Ref == '-' or len(Alt) > len(Ref):  # insertions
                added_col['%s_mut_per' % sig].append(
                    t_data_df.sum()['insertions_pp'] / float(t_data_df.sum()['reads_pp']))
                added_col['%s_ref_cov' % sig].append(
                    t_data_df.sum()['matches_pp'] / float(len(t_data_df)))
                added_col['%s_mut_cov' % sig].append(
                    t_data_df.sum()['insertions_pp'] / float(len(t_data_df)))
            elif Alt == '-' or len(Ref) > len(Alt):  # deletions
                added_col['%s_mut_per' % sig].append(
                    t_data_df.sum(0)['deletions_pp'] / float(t_data_df.sum()['reads_pp']))
                added_col['%s_ref_cov' % sig].append(
                    t_data_df.sum(0)['matches_pp'] / float(len(t_data_df)))
                added_col['%s_mut_cov' % sig].append(
                    t_data_df.sum(0)['deletions_pp'] / float(len(t_data_df)))
            else:
                for _key in [_ for _ in added_col.keys() if sig in _]:
                    added_col[_key].append('Wrong pos')
        else:
            for _key in [_ for _ in added_col.keys() if sig in _]:
                added_col[_key].append('Off target')
import pysam
import pysamstats

bamfile = pysam.AlignmentFile('picared.bam')

# Header: position (P), base counts, insertions (I) and deletions (D)
print("P", "A", "C", "G", "T", "I", "D")
for record in pysamstats.stat_variation(bamfile, chrom='gi|11|ref|TL|E1E2J6',
                                        fafile="ref_E1E2.fa"):
    print(record['pos'], record['A'], record['C'], record['G'], record['T'],
          record['insertions'], record['deletions'])
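If the whole table is wanted in memory instead of streamed records, pysamstats also ships load_* counterparts that return NumPy structured arrays; a sketch based on the pysamstats README (verify the exact call against your installed version):

import pysam
import pysamstats

bamfile = pysam.AlignmentFile('picared.bam')
# Array-returning counterpart of stat_variation
a = pysamstats.load_variation(bamfile, chrom='gi|11|ref|TL|E1E2J6',
                              fafile='ref_E1E2.fa')
print(a['pos'][:10], a['A'][:10])  # first ten positions and their A counts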
foundReg = []
raws = []
cov = 0
ftemp = open(tempfile, 'w')
ftemp.write("#chr\tpos\tref\talt\tt_ref\tt_alt\tn_ref\tn_alt\tjudge\n")
print("calling raw variants...")
for bedline in (raw.strip().split() for raw in open(bed)):
    for rec in pysamstats.stat_variation(t_samfile, ref, chrom=str(bedline[0]),
                                         start=int(bedline[1]), end=int(bedline[2])):
        if rec['reads_pp'] > 4:
            if (rec['mismatches_pp'] > 2) and (rec['insertions'] <= 3) and (rec['deletions'] <= 3):
                chr = rec['chrom']
                pos = rec['pos']
                ref_seq = rec['ref']
                for t_columns in t_samfile.pileup(chr, int(pos), int(pos) + 1, truncate=True):
                    # Only process each chromosome/position combination once
                    if str(chr) + "\t" + str(t_columns.pos) not in foundReg:
                        foundReg.append(str(chr) + "\t" + str(t_columns.pos))
                        (t_alt_A, t_alt_C, t_alt_G, t_alt_T, t_refcount,
                         altReadPosE_f_A, altReadPosE_r_A, altReadPosE_f_G, altReadPosE_r_G,
                         altReadPosE_f_C, altReadPosE_r_C, altReadPosE_f_T, altReadPosE_r_T,
                         altReadPosS_f_A, altReadPosS_r_A, altReadPosS_f_G, altReadPosS_r_G,
                         altReadPosS_f_C, altReadPosS_r_C, altReadPosS_f_T, altReadPosS_r_T
                         ) = process_reads(t_columns, t_columns.pos, ref_seq, "Y")
                        totalcount = t_refcount + t_alt_A + t_alt_G + t_alt_C + t_alt_T
                        if totalcount > 4:
                            # Cast to float so the allele fraction is not truncated by
                            # integer division
                            if (t_alt_A > tumor_alt_cutoff) and (float(t_alt_A) / totalcount > tumor_alf_cutoff):
                                # Flag alt reads clustered at the starts or ends of reads
                                if ((all(5 >= i for i in altReadPosS_f_A) and all(5 >= i for i in altReadPosS_r_A)) or
                                        (all(5 >= i for i in altReadPosE_f_A) and all(5 >= i for i in altReadPosE_r_A))):
                                    raw_calls = (chr + "\t" + str(pos + 1) + "\t" + ref_seq.upper() + "\t" + "A" + "\t" +
                                                 str(t_refcount) + "\t" + str(t_alt_A) + "\t" + "0" + "\t" + "0" + "\t" +
                                                 "clustered_pos")
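The calling loop above expects these module-level inputs; a sketch of the assumed setup (cutoffs and paths are placeholders):

import pysam

t_samfile = pysam.AlignmentFile('tumour.bam')  # tumour BAM (assumed)
ref = 'reference.fa'                           # reference FASTA (assumed)
bed = 'targets.bed'                            # regions to call over (assumed)
tempfile = 'raw_calls.txt'                     # output path (assumed)
tumor_alt_cutoff = 3     # minimum alt-supporting reads (assumed)
tumor_alf_cutoff = 0.05  # minimum alt allele fraction (assumed)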
def parse(self):
    import pysamstats
    import operator
    while True:
        sample, vtx = self.parsequeue.get()
        # Initialise dictionaries to store parsed data
        matchdict = dict()
        depthdict = dict()
        seqdict = dict()
        resultsdict = dict()
        snpdict = dict()
        gapdict = dict()
        faidict = dict()
        uniqueresults = dict()
        refdict = dict()
        # Variable to store the expected position in the gene/allele
        pos = 0
        # Get the fai file into a dictionary to be used in parsing results
        with open(sample[self.analysistype].faifile[vtx], 'r') as faifile:
            for line in faifile:
                data = line.split('\t')
                faidict[data[0]] = int(data[1])
        try:
            # Use the stat_variation function of pysamstats to return records parsed from sorted
            # bam files. Values of interest can be retrieved using the appropriate keys
            correction = 0
            for rec in pysamstats.stat_variation(alignmentfile=sample[self.analysistype].sortedbam[vtx],
                                                 fafile=sample[self.analysistype].targetfiles[vtx],
                                                 max_depth=1000000):
                # Add the reference sequence to the dictionary
                if rec['chrom'] not in refdict:
                    refdict[rec['chrom']] = str()
                refdict[rec['chrom']] += rec['ref']
                # Initialise seqdict with the current gene/allele if necessary with an empty string
                if rec['chrom'] not in seqdict:
                    seqdict[rec['chrom']] = str()
                    # Since this is the first position in a "new" gene/allele, reset pos to 0
                    pos = 0
                    # There seems to be a bug in pysamstats with how gaps at the start of the
                    # sequence are treated. Although the position is correct, the whole reference
                    # sequence is still included, rather than starting where the gap ends
                    if rec['pos'] > pos:
                        # If there is a gap of 173 bases at the beginning of the match, the
                        # reference sequence should still start at 0, but it starts at 173;
                        # therefore, the match actually starts at 2 * 173 = 346
                        correction = 2 * rec['pos']
                        # The number of gaps is equal to the starting position
                        gapdict[rec['chrom']] = rec['pos']
                        # The actual position will be rec['pos']
                        pos = rec['pos']
                # Allow the position to reach the calculated correction factor
                if rec['pos'] >= correction:
                    # Initialise gapdict with 0 gaps
                    if rec['chrom'] not in gapdict:
                        gapdict[rec['chrom']] = 0
                    # If there is a gap in the alignment, record the size of the gap in gapdict
                    if int(rec['pos']) > pos:
                        # Add the gap size to gapdict
                        gapdict[rec['chrom']] += rec['pos'] - pos
                        # Add Ns to the sequence to indicate the gap
                        seqdict[rec['chrom']] += 'N' * int(rec['pos'] - pos)
                        # Set the expected position to the current position
                        pos = int(rec['pos'])
                    # Increment pos in preparation for the next iteration
                    pos += 1
                    # Initialise snpdict if necessary
                    if rec['chrom'] not in snpdict:
                        snpdict[rec['chrom']] = 0
                    # Initialise the current gene/allele in depthdict with the depth (reads_all)
                    # if necessary, otherwise add the current depth to the running total
                    if rec['chrom'] not in depthdict:
                        depthdict[rec['chrom']] = int(rec['reads_all'])
                    else:
                        depthdict[rec['chrom']] += int(rec['reads_all'])
                    # Dictionary of bases and the number of times each base was observed per position
                    bases = {'A': rec['A'], 'C': rec['C'], 'G': rec['G'], 'T': rec['T']}
                    # Track any deletions prior to the sequence
                    if rec['deletions'] > rec['matches']:
                        seqdict[rec['chrom']] += 'N'
                        # Increment the running total of the number of SNPs
                        snpdict[rec['chrom']] += 1
                    elif rec['matches'] > 0 or rec['mismatches'] > 0:
                        # If the most prevalent base (calculated with max() and
                        # operator.itemgetter()) doesn't match the reference base, add this
                        # prevalent base to seqdict
                        if max(bases.items(), key=operator.itemgetter(1))[0] != rec['ref']:
                            seqdict[rec['chrom']] += max(bases.items(), key=operator.itemgetter(1))[0]
                            # Increment the running total of the number of SNPs
                            snpdict[rec['chrom']] += 1
                        else:
                            # If the bases match, add the reference base to seqdict
                            seqdict[rec['chrom']] += rec['ref']
                            # Initialise matchdict if necessary, otherwise increment the running
                            # total of matches
                            if rec['chrom'] not in matchdict:
                                matchdict[rec['chrom']] = 1
                            else:
                                matchdict[rec['chrom']] += 1
        # If there are no results in the bam file, then pass over the strain
        except ValueError:
            pass
        # Iterate through all the genes/alleles with results above
        for allele in sorted(matchdict):
            # Calculate the average depth by dividing the total number of reads observed by the
            # length of the match, and the percent identity from the length of the gene/allele
            # as determined using faidx indexing
            averagedepth = float(depthdict[allele]) / float(matchdict[allele])
            percentidentity = float(matchdict[allele]) / float(faidict[allele]) * 100
            # Only accept alleles whose percent identity exactly meets the cutoff
            if percentidentity == self.cutoff * 100:
                # Only report a positive result if the average depth is greater than 4X
                if averagedepth > 4:
                    # Populate resultsdict with the gene/allele name, the percent identity, and
                    # the average depth
                    resultsdict.update({allele: {'{:.2f}'.format(percentidentity):
                                                 '{:.2f}'.format(averagedepth)}})
        # Add the results to the object
        sample[self.analysistype].allelematches[vtx] = resultsdict
        # Determine if there are alleles without a 100% match
        if not resultsdict:
            for allele in sorted(matchdict):
                # Filter the alleles to only include the vtx subunit
                if vtx in allele:
                    percentidentity = float(matchdict[allele]) / float(faidict[allele]) * 100
                    # Use a more relaxed cutoff to find the closest alleles
                    if percentidentity >= self.cutoff * 50:
                        uniqueresults.update({allele: percentidentity})
            try:
                # Find the best match (highest percent identity)
                closestallele = max(uniqueresults.items(), key=operator.itemgetter(1))[0]
                percentidentity = max(uniqueresults.items(), key=operator.itemgetter(1))[1]
                averagedepth = float(depthdict[closestallele]) / float(matchdict[closestallele])
                # Populate the metadata with the results
                sample[self.analysistype].newsequences[vtx] = seqdict[closestallele]
                sample[self.analysistype].newseqclosestmatch[vtx] = \
                    {closestallele: {'{:.2f}'.format(percentidentity): '{:.2f}'.format(averagedepth)}}
                sample[self.analysistype].newseqclosestseq[vtx] = {closestallele: refdict[closestallele]}
            except ValueError:
                pass
        self.parsequeue.task_done()