def bam2fastq(input, output):
    """Write the reads of the BAM file ``input`` to ``output`` as FASTQ.

    The BAM file is opened in binary mode and handed to ``pybam.read``
    together with the three requested fields (``sam_qname``, ``sam_seq``,
    ``sam_qual``).  Every record it yields becomes one four-line entry in
    the output file: ``@name``, the sequence, ``+`` and the quality string.
    """
    import pybam

    wanted_fields = ['sam_qname', 'sam_seq', 'sam_qual']
    # The output is opened first, then the input, as in a nested ``with``.
    with open(output, 'w') as fastq_handle, open(input, 'rb') as bam_handle:
        for record in pybam.read(bam_handle, wanted_fields):
            read_name, bases, qualities = record
            entry = "@%s\n%s\n+\n%s\n" % (read_name, bases, qualities)
            fastq_handle.write(entry)
def total_phy_coverage():
    """Write the physical coverage of the genome as a fixedStep wiggle track.

    Every alignment of ``../data/lact_sorted.bam`` is read through ``pybam``.
    For each alignment whose flag has its lowest bit set and whose template
    length is at most 3000, the span of the whole fragment is recorded in a
    difference array (+1 where the fragment starts, -1 just past its end).
    The running sum of that array is then written, one value per genome
    position, to ``../wig-tracks/tot_phy_coverage.wig``.

    The output file is handled by a ``with`` block so it is closed even if
    reading the BAM file fails.  Fragment boundaries are clamped to the
    genome: a fragment ending on the last base no longer raises
    ``IndexError``, and a boundary that would fall before the first position
    no longer wraps around to the end of the array through negative
    indexing.  Fragments that lie entirely inside the genome produce exactly
    the same output as before.
    """
    print("Total Physical Coverage started...\n")

    genome_length = 3079196
    max_template_length = 3000

    with open('../wig-tracks/tot_phy_coverage.wig', 'w') as wig_file:
        # Difference array over the genome; the extra slot at the end holds
        # the closing mark of a fragment that reaches the last position.
        genome_change = [0] * (genome_length + 1)

        for alignment in pybam.read("../data/lact_sorted.bam"):
            flag = int(alignment.sam_flag)
            start_pos = int(alignment.sam_pos1)  # 4th column
            tlen = int(alignment.sam_tlen)  # 9th column

            # Keep only alignments with the lowest flag bit set and a
            # template length of at most max_template_length.
            if tlen > max_template_length or not flag & 1:
                continue

            if tlen > 0:
                first, past_end = start_pos, start_pos + tlen
            else:
                # Non-positive template length: the fragment extends to the
                # left of this alignment's start position.
                first, past_end = start_pos + tlen + 1, start_pos + 1

            first = max(first, 0)
            past_end = min(past_end, genome_length)
            # An empty span (tlen == 0, or a fragment entirely outside the
            # genome) contributes nothing.
            if first < past_end:
                genome_change[first] += 1
                genome_change[past_end] -= 1

        print("Generating .wig file\n")
        # Emit the genomic profile as a wiggle file.
        wig_file.write("fixedStep chrom=genome start=1 step=1 span=1 \n")
        current_coverage = 0
        for position in range(genome_length):
            current_coverage += genome_change[position]
            wig_file.write(str(current_coverage) + '\n')

    print("done!")
def _count_covered_positions(dicoBam, positionsLstTuple, counterKey):
    """Add one to ``counterKey`` for every position listed in the tuples."""
    for posTuple in positionsLstTuple:
        try:
            dicoBam[posTuple[1] + 1][counterKey] += 1
        except (KeyError, TypeError):
            # TypeError: the tuple holds None instead of a position (the
            # "None case"); KeyError: the position has no entry in dicoBam.
            pass


def Read_alignment(titleBam, dicoInit, lstError):
    """Count reads and soft-clipped reads per reference position of one BAM.

    The BAM file registered under ``dicoInit['dicoBam'][titleBam]`` is read
    with ``pybam``.  An alignment is used when its chromosome name equals the
    entry's ``refName``, its ``sam_mapq`` is at least ``dicoInit['minQ']``
    and its CIGAR string contains no ``"H"``.  Alignments whose
    ``explain_sam_flags`` text is empty or contains "mate reverse strand" are
    treated as forward reads (soft-clip looked for in the last CIGAR item);
    all others as reverse reads (soft-clip looked for in the first item).

    Outputs, both inside ``dicoInit['pathTmpDir']``:
      * ``<titleBam>.fasta``  -- soft-clipped sequences passing the
        ``dicoInit['SCsize']`` filter ("N" bases are not counted);
      * ``<titleBam>_SC.json`` -- the per-position counters.

    Side effect: when the entry's ``path_downsampling`` is not empty, the
    entry's ``path`` is overwritten with it.

    Args:
        titleBam: key of the BAM entry in ``dicoInit['dicoBam']``; also used
            to name the output files.
        dicoInit: configuration dictionary (keys read: ``pathTmpDir``,
            ``dicoGbk``, ``dicoBam``, ``minQ``, ``SCsize``, ``MappedPart``).
        lstError: list receiving one message if processing fails.

    Returns:
        None.  Failures are not raised; they are appended to ``lstError``.
    """
    try:
        pathSCjson = os.path.join(dicoInit['pathTmpDir'], titleBam + "_SC.json")
        pathSCfasta = os.path.join(dicoInit['pathTmpDir'], titleBam + ".fasta")

        # ``with`` guarantees the FASTA handle is released even when an
        # error interrupts the scan.
        with open(pathSCfasta, 'w') as FASTA:
            # Init dicoBam: one counter record per position 1..refLength
            refLength = dicoInit["dicoGbk"]['refLength']
            dicoBam = {}
            for pos in range(1, refLength + 1):
                dicoBam[pos] = {
                    'nb_reads_F': 0, 'nb_reads_R': 0,
                    'nb_sc_reads_F': 0, 'nb_sc_reads_R': 0,
                    'nb_sc_fasta_F': 0, 'nb_sc_fasta_R': 0,
                }

            #***** BROWSE READS & SEARCH SCR *****#
            bamInfo = dicoInit['dicoBam'][titleBam]
            # Switch to downsampled BAM if exist
            if bamInfo['path_downsampling'] != "":
                bamInfo['path'] = bamInfo['path_downsampling']

            for alignment in pybam.read(bamInfo['path']):
                if not (alignment.file_chromosomes[alignment.sam_refID] == bamInfo['refName']
                        and alignment.sam_mapq >= dicoInit['minQ']
                        and "H" not in alignment.sam_cigar_string):
                    continue

                #***** RETRIEVE positions tuple & lastMapped infos *****#
                positionsLstTuple, lastMappedPos, lastMappedPosRead = cigar_list_to_tuple(
                    alignment.sam_cigar_list, alignment.sam_pos0)

                flagsText = explain_sam_flags(alignment.sam_flag)

                #***** FORWARD reads *****#
                if flagsText == "" or "mate reverse strand" in flagsText:
                    # Count reads
                    _count_covered_positions(dicoBam, positionsLstTuple, 'nb_reads_F')
                    # Count softclipped (right soft-clipping)
                    length, operation = alignment.sam_cigar_list[-1]
                    if operation == "S":
                        dicoBam[lastMappedPos]['nb_sc_reads_F'] += 1
                        readSeq = alignment.sam_seq
                        clipped = readSeq[len(readSeq) - length:]
                        # Write to Fasta (apply filter) / not consider 'N'
                        if (length - clipped.count("N") >= dicoInit['SCsize']
                                and lastMappedPos + length <= refLength):
                            nb_mapped_part = min(dicoInit["MappedPart"], lastMappedPosRead)
                            FASTA.write(
                                ">" + str(lastMappedPos) + "_" + str(nb_mapped_part)
                                + "_scrF_" + alignment.sam_qname + "\n"
                                + readSeq[len(readSeq) - length - nb_mapped_part:] + "\n")
                            dicoBam[lastMappedPos]['nb_sc_fasta_F'] += 1

                #***** REVERSE reads *****#
                else:
                    # Count reads
                    _count_covered_positions(dicoBam, positionsLstTuple, 'nb_reads_R')
                    # Count softclipped (left soft-clipping)
                    length, operation = alignment.sam_cigar_list[0]
                    if operation == "S":
                        firstMappedPos = alignment.sam_pos0 + 1
                        dicoBam[firstMappedPos]['nb_sc_reads_R'] += 1
                        readSeq = alignment.sam_seq
                        # Write to Fasta (apply filter)
                        if (length - readSeq[0:length].count("N") >= dicoInit['SCsize']
                                and firstMappedPos - length >= 0):
                            nb_mapped_part = min(dicoInit["MappedPart"], length)
                            FASTA.write(
                                ">" + str(firstMappedPos) + "_" + str(nb_mapped_part)
                                + "_scrR_" + alignment.sam_qname + "\n"
                                + readSeq[0:length + nb_mapped_part] + "\n")
                            dicoBam[firstMappedPos]['nb_sc_fasta_R'] += 1

        # WRITE .json results
        # Text mode: json.dumps returns a str, which a binary-mode handle
        # rejects on Python 3.
        with open(pathSCjson, 'w') as JSON:
            JSON.write(json.dumps(dicoBam))
    except Exception:
        # Report the failure to the caller through lstError instead of raising.
        exc_type, exc_value, exc_traceback = sys.exc_info()
        lstError.append("ReadThread \"" + titleBam + "\": " + str(exc_value) +
                        " (line " + str(exc_traceback.tb_lineno) + ")")
# Debug dump: print the binary flag, the template length and `cont` for every
# row of `bam_f`.
# NOTE(review): `bam_f` is not defined in the visible code -- presumably an
# iterable of pybam alignments supplied by surrounding code; confirm.
print("debugging...\n")
cont = 0  # NOTE(review): printed below but never updated in the visible code
for row in bam_f:
    length = row.sam_tlen
    # flag as a binary string, as produced by bin()
    flag = bin(int(row.sam_flag))
    print(flag, length, cont)

#################### MAIN ######################
if __name__ == '__main__':
    ########################## PART 2 ###########################
    # load sorted Lactobacillus bam file (using pybam library):
    # refer to a BAM file sorted by genomic position!
    sorted_bam = pybam.read('../data/lact_sorted.bam')
    # The numbered analysis steps below are currently disabled (commented out).
    # 9) Calculate PHYSICAL COVERAGE, creating related wig file
    # phy_coverage(sorted_bam)
    # 10) Calculate SEQUENCE COVERAGE, creating related wig file
    # sequence_coverage(sorted_bam)
    # 11) Calculate INSERT STATS
    # get_genome_stats(sorted_bam)
    # 12) Calculate AVERAGE INSERTS LENGTH, creating related wig file
    # avg_inserts_coverage(sorted_bam)
    # Saved values from get_genome_stats() function above
    # (hard-coded so that step 11 does not have to be re-run)
    avg = 2101.0225496051385
import pybam

# Collect the sequence (sam_seq) of every alignment in the BAM file.
bam_data = pybam.read('./pb_467_2_sr_blasr.bam')
bam_rowData = []
for alignment in bam_data:
    bam_rowData.append(alignment.sam_seq)

# Show the first collected sequence.  The call form of print with a single
# argument gives the same output on Python 2 and Python 3, whereas the bare
# print statement is a syntax error on Python 3.
print(bam_rowData[0])