def sj_read_support_TEST(bam_reader, genomic_range):
    """
    Debugging playground: dump every alignment that overlaps 'genomic_range'.

    Args:
        bam_reader = object exposing fetch(region=...) that yields alignment records. Presumably an HTSeq.BAM_Reader instance (the attributes read below look like HTSeq alignment attributes) - confirm against callers
        genomic_range = string handed verbatim to bam_reader.fetch() as 'region' (other functions in this file use the format chrom:start-end)
    Function:
        for each fetched alignment, prints its string form, its dir() listing, the alignment/sequence attributes, its SAM line, its CIGAR list & the tab-split SAM fields, followed by a separator line. Nothing is counted and nothing is returned.
    """

    def show(*parts):
        #same layout as the Python 2 statement "print a, b, c" (items separated by a single space), but valid syntax under both Python 2 & Python 3
        print(" ".join(str(part) for part in parts))

    for i, a in enumerate(bam_reader.fetch(region=genomic_range)):
        show(i, "read ", i, " - ", a)
        show(i, "dir(a) = ", dir(a))
        show(i, "a.aligned = ", a.aligned)
        show(i, "a.read = ", a.read)
        show(i, "a.read_as_aligned = ", a.read_as_aligned)
        show(i, "a._read = ", a._read)
        show(i, "a._read_as_sequenced = ", a._read_as_sequenced)

        #this retrieves the read information & presents it as a .sam file line (fetched once, reused for the split below)
        sam_line = a.get_sam_line()
        show(i, "a.get_sam_line = ", sam_line)

        #a.cigar is an array with one element per CIGAR operation. For example, CIGAR 99M gives 1 element [99M], CIGAR 2M4926N74M gives 3 elements [2M, 4926N, 74M]
        show(i, ": a.cigar = ", a.cigar)

        #the information for each read can be separated by splitting on the tab-delimiter '\t'
        list_sam = sam_line.split('\t')
        show(i, "list_sam = ", list_sam)
        print("----------------\n")
def create_obj_sj(hash_sj_info):
    """
    Build a SpliceJunction instance from one splice junction record.

    Args:
        hash_sj_info = one record from the pandas Dataframe of splice junctions, indexed by the column labels. Columns read here: 'sj_id', 'sj_range', 'strand', 'read_count', 'gene_name', 'prevalence_all', 'prevalence_control'
    Function:
        splits 'sj_range' into chrom/start/end with Isoform.split_genome_pos(), converts the numeric strand into '+'/'-' & returns the resulting SpliceJunction. The isoform ID is always passed as None & the junction is always flagged as intronic (bool_intronic = True).
    """
    junction_id = hash_sj_info['sj_id']

    position = Isoform.split_genome_pos(hash_sj_info['sj_range'])
    chrom = position['chrom']
    start = position['start']
    end = position['end']

    #SpliceJunction needs the strand in string format: -1 becomes '-', anything else becomes '+'
    if int(hash_sj_info['strand']) == -1:
        strand = '-'
    else:
        strand = '+'

    supporting_reads = hash_sj_info['read_count']
    gene_sym = hash_sj_info['gene_name']
    prevalence_samples = hash_sj_info['prevalence_all']
    prevalence_controls = hash_sj_info['prevalence_control']

    return SpliceJunction(
        junction_id,
        chrom,
        start,
        end,
        strand,
        supporting_reads,
        gene_sym,
        None,  #isoform_id - deliberately not read from the record
        prevalence_samples,
        prevalence_controls,
        True)  #bool_intronic
def sj_read_support_pysam(pysam_file, genomic_range, uniq_only=False, uniq_mapq=50):
    """
    Count the gapped reads that support the splice junction at 'genomic_range'.

    Args:
        pysam_file = pysam.AlignmentFile that opens up the mapped reads bam file (e.g. accepted_hits.bam)
        genomic_range = string that is the position of interest (format = chrom:start-end)
        uniq_only = boolean
            -True = will only quantify the uniquely-mapped gapped reads that map to 'genomic_range' ('all_count' stays 0 in this mode)
            -False = will quantify the uniquely & non-uniquely-mapped gapped reads that map to 'genomic_range'
        uniq_mapq = mapping quality that marks a read as uniquely mapped. Defaults to 50, the value this file has always tested for (NOTE: other notes in this file mention 255 as an alternative unique-mapping score - pass it here if the aligner uses that convention)
    Function:
        fetches the reads overlapping 'genomic_range' & keeps a read only when one of its aligned blocks touches the junction start AND one touches the junction end. "Touches" is a membership test against each entry of read.blocks (in pysam each entry is a (start, end) pair, so this matches a block that starts or ends exactly at that position).
        NOTE that this function is much faster (maybe 6x faster) than SpliceJunction's def sj_read_support()
    Returns:
        hash {'all_count': ..., 'unique_count': ...}
            -'all_count' = supporting reads regardless of mapping quality (only counted when uniq_only is False)
            -'unique_count' = supporting reads whose mapq equals 'uniq_mapq'
    """
    hash_gr = Isoform.split_genome_pos(genomic_range)
    sj_start = hash_gr['start']
    sj_end = hash_gr['end']

    all_count = 0
    unique_count = 0
    for read in pysam_file.fetch(hash_gr['chrom'], sj_start, sj_end):
        is_unique = read.mapq == uniq_mapq

        #when only unique reads are wanted, reject on mapping quality first -> skips the block scan for multi-mapped reads (slightly faster)
        if uniq_only and not is_unique:
            continue

        #the read must have a block bordering each side of the junction
        blocks = read.blocks
        if not any(sj_start in block for block in blocks):
            continue
        if not any(sj_end in block for block in blocks):
            continue

        if not uniq_only:
            all_count += 1
        if is_unique:
            unique_count += 1

    return {'all_count': all_count, 'unique_count': unique_count}
def _dump_sj_read_debug(i, a, cigop):
    """
    Print the debugging dump for alignment 'a' (position 'i' in the fetch) whose CIGAR operation 'cigop' matched the splice junction. Output only - nothing is returned.
    """

    def show(*parts):
        #same layout as the Python 2 statement "print a, b, c" (items separated by a single space), but valid syntax under both Python 2 & Python 3
        print(" ".join(str(part) for part in parts))

    show(i, ": a.get_sam_line = ", a.get_sam_line())  #this retrieves the read information & presents it as a sam line
    show(i, ": cigop = ", cigop)
    show(i, ": a.cigar = ", a.cigar)
    show(i, ">>>>>>>>>>>>>>>>>>>>\n")

    ##TEST:: see the output of read
    show(i, ": a = ", dir(a))  #see all possible properties of object
    show(i, ": a.get_sam_line = ", a.get_sam_line())  #this retrieves the read information & presents it as a sam line
    show(i, ": a.from_SAM_line = ", a.from_SAM_line)
    show(i, ": dir( a.from_SAM_line ) = ", dir(a.from_SAM_line))
    show(i, ": a.aligned = ", a.aligned)
    show(i, ": a.flag = ", a.flag)
    show(i, ": a.get_sam_line = ", a.get_sam_line, " & dir = ", dir(a.get_sam_line))
    show(i, ": a.read = ", a.read)  #this is the nucleotide sequence
    show(i, ": a.read_as_aligned = ", a.read_as_aligned)  #this is the nucleotide sequence. "a.read_as_aligned" could be the reverse-complement to "a.read"
    show(i, ": a.from_pysam_AlignedRead = ", a.from_pysam_AlignedRead, " & dir = ", dir(a.from_pysam_AlignedRead))
    show(i, ": NH = ", a.optional_field("NH"))
    show(i, ": aligned = ", a.aligned, " & aQual = ", a.aQual, " & read quality = ", a.read.qual)
    show(i, ": dir( a.cigar ) = ", dir(a.cigar))
    show(i, ": dir( cigop ) = ", dir(cigop))
    show(i, ": cigop.ref_iv = ", cigop.ref_iv, " & chrom = ", cigop.ref_iv.chrom, " & start = ", cigop.ref_iv.start, " & end = ", cigop.ref_iv.end)
    show(i, ": cigop.size = ", cigop.size)
    show(i, ": cigop.type = ", cigop.type)
    show(i, ": cigop.check = ", cigop.check, " & dir = ", dir(cigop.check))
    show(i, ": cigop.query_from = ", cigop.query_from)  #CONJ: I think this refers the start of the range of nucleotides that map to the genome
    show(i, ": cigop.query_to = ", cigop.query_to)  #CONJ: I think this refers the end of the range of nucleotides that map to the genome

    #the information for each read can be separated by splitting on the tab-delimiter '\t'
    sam_line = a.get_sam_line()
    list_sam = sam_line.split('\t')
    show(i, "list_sam = ", list_sam)
    print('------------------\n\n')


def sj_read_support(bam_reader, genomic_range, verbose=True):
    """
    Count the reads whose spliced gap matches the splice junction 'genomic_range' exactly.

    Args:
        bam_reader = HTSeq.BAM_Reader instance, used to quantify the number of reads that map to SJ genomic range 'genomic_range' (command: bam_reader = HTSeq.BAM_Reader(path_to_bam_file) )
        genomic_range = string in format chrom:start-end
        verbose = boolean, defaults to True to keep the historical behavior of this function
            -True = prints a detailed debugging dump for every supporting read
            -False = only counts, prints nothing
    Function:
        goes through every alignment returned by bam_reader.fetch(region=genomic_range) & looks for a CIGAR operation of type 'N' whose reference interval starts at the SJ start & ends at the SJ end. Each such operation is counted as one supporting read.
    Returns:
        hash with the following values:
            -"all_count" = all reads that support the splice junction
            -"unique_count" = supporting reads where optional field NH == 1
            -"unique_count_50" = supporting reads where aQual == 50 (this also counts uniquely mapped reads)
    """
    hash_gr = Isoform.split_genome_pos(genomic_range)  #hash_gr = hash table of genomic range
    sj_start = hash_gr['start']
    sj_end = hash_gr['end']

    #these are the counts
    count = 0  #count all reads that map to "genomic_range"
    uniq_count = 0  #count all uniquely mapped reads to "genomic_range"
    align_score_max_count = 0  #this also counts all uniquely mapped reads to "genomic_range"

    for i, a in enumerate(bam_reader.fetch(region=genomic_range)):
        #NOTE: a.cigar breaks down the meaning for the cigar, finding the end position for each cigar.
        for cigop in a.cigar:
            if cigop.type != 'N':
                continue
            if cigop.ref_iv.start != sj_start or cigop.ref_iv.end != sj_end:
                continue

            if verbose:
                _dump_sj_read_debug(i, a, cigop)

            count += 1
            if a.optional_field("NH") == 1:  #check if read is uniquely-mapped read
                uniq_count += 1
            #this also counts uniquely mapped reads
            if a.aQual == 50:
                align_score_max_count += 1

    return {
        "all_count": count,
        "unique_count": uniq_count,
        "unique_count_50": align_score_max_count
    }
def sj_read_support_variety_reads(bam_reader, genomic_range):
    """
    Count the reads supporting splice junction 'genomic_range' & how many distinct alignments they represent.

    Args:
        bam_reader = HTSeq.BAM_Reader instance, used to quantify the number of reads that map to SJ genomic range 'genomic_range' (command: bam_reader = HTSeq.BAM_Reader(path_to_bam_file) )
        genomic_range = string in format chrom:start-end
    Function:
        finds reads that support the splice junction by looking for a CIGAR operation of type 'N' whose reference interval starts & ends exactly at the SJ position. For each supporting read it records the string "<SAM column 3>|<SAM column 4>|<SAM column 6>" (reference name, position & CIGAR in the SAM format), so reads with identical placement collapse into one entry. Also prints the unique entries & both list sizes.
    Output:
        {"all_count": count, "unique_count": uniq_count, "unique_count_50": align_score_max_count, "count_all_variety_reads": len( list_variety_reads ), "count_unique_variety_reads": len( unique_variety_reads ) }
        returns a hash with the following values:
            -"all_count" = all reads that support splicing event with position 'genomic_range'
            -"unique_count" = uniquely-mapped reads supporting splicing event with position 'genomic_range'. This looks for NH == 1
            -"unique_count_50" = same as "unique_count", but looks at quality of read (aQual == 50), where 50 means uniquely mapped reads
                -see "Protocol: 15.10.30 - Samtools"
                -50 (or 255): unique mapping (NH:i:1)
                -3: maps to 2 locations in the target (NH:i:2, but I've also seen NH:i:3)
                -2: maps to 3 locations
                -1: maps to 4-9 locations (NH:i:4 or higher)
                -0: maps to 10 or more locations
                TO RETRIEVE UNIQUELY MAPPED READS: use command "samtools -q 4 file.bam" means any values above 4 are unique, where for tophat2 values 0 <= x <= 3 means multiple mapping & 50 means unique
            -"count_all_variety_reads" = total count of all reads with different end positions that support the splicing event. This is considering all reads, therefore there will be duplicates (meaning they will have the same start & end points - think of "thickBlocks" & "thinBlocks" for UCSC Genome Browser)
            -"count_unique_variety_reads" = count of # of reads with different end positions. This is the actual number of reads that support the splicing event & have different end points.
                -The hypothesis is the reads with the same end points are just the same RNA fragments sequenced during RNA-sequencing, therefore the more different types of reads supporting a splicing event, the more convincing the support.
    """
    hash_gr = Isoform.split_genome_pos(genomic_range)  #hash_gr = hash table of genomic range
    sj_start = hash_gr['start']
    sj_end = hash_gr['end']

    #these are the counts
    count = 0  #count all reads that map to "genomic_range"
    uniq_count = 0  #count all uniquely mapped reads to "genomic_range"
    align_score_max_count = 0  #this also counts all uniquely mapped reads to "genomic_range"
    list_variety_reads = []

    for a in bam_reader.fetch(region=genomic_range):
        #NOTE: a.cigar breaks down the meaning for the cigar, finding the end position for each cigar.
        for cigop in a.cigar:
            #only keep the splicing event that matches both end positions
            if cigop.type != 'N':
                continue
            if cigop.ref_iv.start != sj_start or cigop.ref_iv.end != sj_end:
                continue

            #the information for each read can be separated by splitting on the tab-delimiter '\t'
            list_sam = a.get_sam_line().split('\t')
            list_variety_reads.append("|".join([list_sam[2], list_sam[3], list_sam[5]]))

            count += 1
            if a.optional_field("NH") == 1:
                uniq_count += 1
            #this also counts uniquely mapped reads
            if a.aQual == 50:
                align_score_max_count += 1

    #sorted so the printed list is the same from run to run
    unique_variety_reads = sorted(set(list_variety_reads))

    print("show unique_variety_reads: ")
    print(unique_variety_reads)
    print("list_variety_reads =  " + str(len(list_variety_reads)))
    print("# of unique variety reads =  " + str(len(unique_variety_reads)))

    #hand back the hash described under "Output" (previously the counts were only printed)
    return {
        "all_count": count,
        "unique_count": uniq_count,
        "unique_count_50": align_score_max_count,
        "count_all_variety_reads": len(list_variety_reads),
        "count_unique_variety_reads": len(unique_variety_reads)
    }
#connect to the genome annotation database & make it available to the Isoform class
g = Genome('sqlite:////tmp/hg19_v2.db')
Isoform.set_cruzdb(g)

#open the mapped reads of the sample of interest
sample_name = 'yuhimo'
path_bam = DIR_RNASEQ + '/tophat_sample_' + sample_name + '/accepted_hits.bam'
bam_reader = HTSeq.BAM_Reader(path_bam)

#find overlapping splice junctions
# gene_sym = 'CDK11B'
# sj_pos = 'chr1:1573952-1575753'
gene_sym = 'MLIP'
isoform_info = g.refGene.filter_by(name2=gene_sym).all()
isoform_id = isoform_info[0].name  #use the first refGene entry returned for this gene
sj_pos = 'chr6:54025230-54034325'
hash_pos = Isoform.split_genome_pos(sj_pos)
chrom = hash_pos['chrom']
start = hash_pos['start']
end = hash_pos['end']

#other parameters that aren't that important for this testing
strand = '+'
sj_id = "TEST"
read_count = 0
sample_prevalence = 0
control_prevalence = 0
bool_intronic = True

obj_mi = MultiIsoform(chrom, start, end, gene_sym)
#NOTE: isoform_id must be passed in the slot after gene_sym (same positional order as the SpliceJunction call in create_obj_sj), otherwise sample_prevalence, control_prevalence & bool_intronic each land one parameter too early
obj_sj = SpliceJunction(sj_id, chrom, start, end, strand, read_count,
                        gene_sym, isoform_id, sample_prevalence,
                        control_prevalence, bool_intronic)
#NOTE: for mutation, assume it is on the + strand, regardless if the gene is on the + or - strand -> I take care of the strand difference later snv_strand = 1 #column 'variant genotype' reports mutation on + strand only, regardless of what strand the mutation or gene is on base_orig = 'C' base_mut = 'T' #mutation 3 # snv_genome_pos = "chr7:151704932-151704932" # gene_sym = "GALNTL5" # snv_strand = 1 # base_orig = 'G' # base_mut = 'A' g = Genome('sqlite:////tmp/hg19_v2.db') Isoform.set_cruzdb(g) hash_snv_pos = Isoform.split_genome_pos(snv_genome_pos) obj_mi = MultiIsoform(hash_snv_pos['chrom'], hash_snv_pos['start'], hash_snv_pos['end'], gene_sym) hash_pos = { 'chrom': hash_snv_pos['chrom'], 'pos_oi': hash_snv_pos['start'] } #use this for IsoformSJ - as an isoform may contain many "versions" (one isoform actually still has multiple isoforms funny enough), find the isoform closest to this position for i3, (k3, v3) in enumerate(obj_mi.hash_isoforms.iteritems() ): #k3 = isoform ID, v3 = Isoform Instance if i3 > 0: break #create the canonical transcript iso_sj = IsoformSJ(k3, [], -10, hash_pos, True) canon_transcript = iso_sj.create_canon_transcript(False)