def writeStopCounts(stop_counts, coverage, references, output_file):
    """
    Write full-length stop counts and coverage to an output file.

    For every chromosome key in ``stop_counts.chrom_vectors`` the file gets:
    the chromosome name, one line of tab-terminated stop counts for positions
    ``0 .. references[chrom] - 1``, one line of tab-terminated coverage values
    for the same positions, and a blank separator line.

    Full-length output was requested by Hongjing (Jitender, 071217).

    Args:
        stop_counts: GenomicArray for stop counts.
        coverage: GenomicArray for coverage.
        references: dict mapping reference_name to the length of the
            reference sequence.
        output_file: string, path of the file to write.
    """
    # The context manager guarantees the handle is closed even when a lookup
    # or write fails part-way through.
    with open(output_file, "w") as fp:
        for chrome in stop_counts.chrom_vectors:
            fp.write("%s\n" % chrome)
            chrom_size = references[chrome]
            # Same layout for both tracks; only the line terminator differs.
            for track, terminator in ((stop_counts, "\n"), (coverage, "\n\n")):
                for index in range(chrom_size):
                    fp.write("%d\t" % track[HTSeq.GenomicPosition(chrome, index, ".")])
                fp.write(terminator)
def softclipping_realignment(mapq_cutoff, max_del_len, input, output, ref_genome, gtf, splice_bin):
    """Re-derive CIGARs for soft-clipped reads and write a sorted, indexed BAM.

    Every read of ``input`` is copied to ``output + '.temp.bam'``.  Reads that
    pass the MAPQ cutoff, are not secondary and carry no ``XA`` tag are passed
    to ``detect_sv_from_cigar``; when that yields a different CIGAR the read
    is rewritten and tagged ``OA`` with its original 1-based position and
    CIGAR string.  A rewritten CIGAR containing a deletion is reverted (and
    tagged ``JM``) when either junction end is covered by the GTF splice-site
    array or the flanking dinucleotides form a known splice motif.

    The temporary BAM is then sorted into ``output`` and indexed with
    samtools.

    Args:
        mapq_cutoff: minimum mapping quality for a read to be re-examined.
        max_del_len: forwarded to ``detect_sv_from_cigar``.
        input: path of the input BAM.
        output: path of the sorted output BAM.
        ref_genome: path of the reference FASTA.
        gtf: path of the GTF annotation.
        splice_bin: forwarded to ``extract_splice_sites``.

    Exits with status 1 when the reference, the GTF or the BAM cannot be read,
    or when ``samtools sort`` fails.
    """
    bwa_bam = pysam.Samfile(input, 'rb')
    output_bam = pysam.Samfile(output + '.temp.bam', 'wb', template=bwa_bam)
    # for RNAseq: GT-AG, GC-AG and AT-AC junctions plus their reverse complements
    splice_motif = ['GTAG', 'CTAC', 'GCAG', 'CTGC', 'ATAC', 'GTAT']
    try:
        fastafile = pysam.Fastafile(ref_genome)
    except IOError as e:
        print('read reference genome ' + ref_genome + ' error! ' + str(e))
        sys.exit(1)
    try:
        cvg = extract_splice_sites(gtf, splice_bin)
    except IOError as e:
        print('read GTF file ' + gtf + ' error! ' + str(e))
        sys.exit(1)
    try:
        for read in bwa_bam.fetch(until_eof=True):
            if read.mapq >= mapq_cutoff and not read.is_secondary and not read.has_tag('XA'):
                chrom = bwa_bam.getrname(read.rname)
                newcigar, newpos = detect_sv_from_cigar(chrom, read, mapq_cutoff, max_del_len)
                if newcigar != 'NA' and newcigar != read.cigar:
                    old_cigarstring, old_cigar, old_pos = read.cigarstring, read.cigar, read.pos
                    read.cigar, read.pos = newcigar, newpos
                    if 'D' in read.cigarstring:
                        junc_start, junc_end = read.blocks[0][1], read.blocks[1][0]
                        htpos1 = HTSeq.GenomicPosition(chrom, junc_start, '.')
                        htpos2 = HTSeq.GenomicPosition(chrom, junc_end, '.')
                        if cvg[htpos1] > 0 or cvg[htpos2] > 0:
                            # Junction matches an annotated splice site: keep
                            # the original alignment.
                            read.cigar, read.pos = old_cigar, old_pos
                            read.setTag('JM', 'GTF')
                            output_bam.write(read)
                            continue
                        m1 = fastafile.fetch(chrom, junc_start, junc_start + 2)
                        m2 = fastafile.fetch(chrom, junc_end - 2, junc_end)
                        motif = m1.upper() + m2.upper()
                        if motif in splice_motif:
                            # Flanking bases look like an intron boundary:
                            # keep the original alignment.
                            read.cigar, read.pos = old_cigar, old_pos
                            read.setTag('JM', motif)
                            output_bam.write(read)
                            continue
                    read.setTag('OA', str(old_pos + 1) + ',' + old_cigarstring)
            output_bam.write(read)
    except ValueError as e:
        sys.stderr.write('Bam index file is not found! ' + str(e) + '\n')
        sys.exit(1)
    bwa_bam.close()
    output_bam.close()
    # Argument lists (no shell) so paths with spaces or shell metacharacters
    # are passed to samtools verbatim.
    try:
        subprocess.check_call(['samtools', 'sort', output + '.temp.bam', '-o', output])
    except subprocess.CalledProcessError as e:
        sys.stderr.write('Execution failed for samtools: ' + str(e) + '\n')
        sys.exit(1)
    subprocess.check_call(['samtools', 'index', output])
def gf_merge(gf_i, gf_j):
    """Return True when two features are mutually centred on each other.

    The centre of A must be contained in the interval of B, and the centre of
    B must be contained in the interval of A.

    Args:
        gf_i, gf_j: objects exposing an ``iv`` with ``chrom``, ``start``,
            ``end`` and ``strand`` attributes.
    """
    # Floor division keeps the midpoint an integer coordinate on Python 3 as
    # well as Python 2.
    gp_i_centre = hts.GenomicPosition(
        gf_i.iv.chrom, (gf_i.iv.start + gf_i.iv.end) // 2, gf_i.iv.strand)
    gp_j_centre = hts.GenomicPosition(
        gf_j.iv.chrom, (gf_j.iv.start + gf_j.iv.end) // 2, gf_j.iv.strand)
    return (gp_i_centre.is_contained_in(gf_j.iv)
            and gp_j_centre.is_contained_in(gf_i.iv))
def extract_break_genes(breaks, transcript):
    """Find the genes at the two breakpoints.

    The breakpoints are taken from ``breaks[0][2]`` and ``breaks[1][0]``; each
    is indexed as ``(chrom, position, ...)``.

    Returns:
        A 4-tuple ``(pos5, pos3, gene5, gene3)``: the two unstranded
        GenomicPosition objects and, for each, the list of entries that
        ``transcript`` holds at that position.

    NOTE: the original notes also mention building a gene model and reporting
    exon number / codon position or the closest intron; none of that is
    implemented here.
    """
    five_prime = breaks[0][2]
    three_prime = breaks[1][0]
    pos5 = HTSeq.GenomicPosition(five_prime[0], int(five_prime[1]), ".")
    pos3 = HTSeq.GenomicPosition(three_prime[0], int(three_prime[1]), ".")
    gene5 = list(transcript[pos5])
    gene3 = list(transcript[pos3])
    return pos5, pos3, gene5, gene3
def add_to_ga(infile, global_ga):
    """Count positions from a tab-separated file into two genomic arrays.

    Column 0 is the chromosome, column 1 the coordinate and column 5 the
    strand.  '+' records are counted at ``int(col1)``, '-' records at
    ``int(col1) - 1``.  Each record increments both a fresh stranded array
    (returned) and ``global_ga`` (modified in place).

    Records whose strand is neither '+' nor '-' are skipped; previously they
    silently re-counted the position of the preceding record, or raised
    NameError when they came first.

    Args:
        infile: path of the tab-separated input file.
        global_ga: array-like accumulator indexed by GenomicPosition.

    Returns:
        The HTSeq.GenomicArray holding the counts for this file only.
    """
    ga = HTSeq.GenomicArray('auto', stranded=True)
    with open(infile, 'r') as f:
        for li in f:
            s = li.rstrip('\n').split('\t')
            strand = s[5]
            if strand == '+':
                coord = int(s[1])
            elif strand == '-':
                coord = int(s[1]) - 1
            else:
                continue
            iv = HTSeq.GenomicPosition(s[0], coord, strand)
            ga[iv] += 1
            global_ga[iv] += 1
    return ga
def read_bed(fname='', use_first_n_lines=False, **kwargs):
    """Load a bed file into a stranded integer GenomicArray.

    Bed format is 0-based and [a,b). So a read on the + strand has its
    5' end at [a], and a read on the - strand has its 5' end at [b-1].

    Args:
        fname: path handed to ``for_split_bed_lines``, which is expected to
            yield ``(chrom, start, end, strand)`` records.
        use_first_n_lines: when >= 1, only that many records are used
            (previously one extra record was consumed); a falsy value means
            the whole file.
        **kwargs:
            verbose: print a progress message when truthy.
            use: 'read start' (default) counts only the 5' end of each read;
                'bed coord' counts the whole interval.  Any other value
                leaves the array empty.

    Returns:
        The populated HTSeq.GenomicArray.
    """
    import itertools

    ga = HTSeq.GenomicArray('auto', stranded=True, typecode='i')
    if kwargs.get('verbose'):
        print('Loading bed file {0}...'.format(fname))
    use = kwargs.get('use', 'read start')
    if use not in ('bed coord', 'read start'):
        return ga

    records = for_split_bed_lines(fname)
    if use_first_n_lines and use_first_n_lines >= 1:
        # islice stops after exactly N records.
        records = itertools.islice(records, int(use_first_n_lines))

    for s in records:
        if use == 'bed coord':
            ga[HTSeq.GenomicInterval(s[0], s[1], s[2], s[3])] += 1
        elif s[3] == '+':
            ga[HTSeq.GenomicPosition(s[0], s[1], s[3])] += 1
        else:
            ga[HTSeq.GenomicPosition(s[0], s[2] - 1, s[3])] += 1
    return ga
def parse_MAF(self):
    ''' maf filetype parser function. Input: InputParser object. Output: Variant object

    Reads the current row through the column-name lookup in ``self.fieldId``.
    Column names are case sensitive ('Start_position', not 'Start_Position').
    '''
    row, columns = self.row, self.fieldId

    def _whole_number(value):
        # Values such as "123.0" are truncated at the decimal point.
        return int(str(value).split('.')[0])

    position = _whole_number(row[columns['Start_position']])
    dp = _whole_number(row[columns['TTotCov']])
    vf = float(row[columns['TVarCov']]) / float(dp)
    chrom = str(row[columns['Chromosome']])
    ref = str(row[columns['Reference_Allele']])
    alt = str(row[columns['Tumor_Seq_Allele2']])

    # "-" marks an absent allele; it is stored as the empty string.
    if ref == "-":
        ref = ""
    if alt == "-":
        alt = ""

    return Variant(source=self.fn.split('/')[-1],
                   pos=HTSeq.GenomicPosition(chrom, int(position)),
                   ref=ref,
                   alt=alt,
                   frac=vf,
                   dp=dp,
                   eff=self.eff.strip(';'),
                   fc=self.fc.strip(';'))
def parse_MuTectOUT(self):
    ''' MuTect '.out' parser function. Input: InputParser object. Output: Variant object

    Chromosome, ref and alt come from fixed columns 0, 3 and 4; the remaining
    values are looked up by column name through ``self.fieldId``.  Depth is
    the sum of the tumor ref and alt read counts.
    '''
    row, columns = self.row, self.fieldId
    ref_reads = int(str(row[columns['t_ref_count']]).strip())
    alt_reads = int(str(row[columns['t_alt_count']]).strip())
    allele_fraction = float(row[columns['tumor_f']])
    location = HTSeq.GenomicPosition(row[0], int(row[columns['position']]))
    return Variant(source=self.fn.split('/')[-1],
                   pos=location,
                   ref=row[3],
                   alt=row[4],
                   frac=allele_fraction,
                   dp=int(ref_reads + alt_reads),
                   eff=self.eff.strip(';'),
                   fc=self.fc.strip(';'))
def parse_SomaticIndelDetector(self):
    ''' GATK SomaticIndelDetector vcf parser function. Input: InputParser object. Output: Variant object

    The alt read count is the second value of the sample's AD field and the
    depth is the sample's DP field; the variant fraction is their ratio.
    The fraction is computed after the whole FORMAT column has been scanned,
    so it no longer depends on AD appearing before DP.

    Raises:
        ValueError: when FORMAT lacks AD or DP.
    '''
    row = self.row
    fieldId = self.fieldId
    # Assumes the sample ID is the final column of the header - always true?
    # If not, adopt the parse_mutect solution here as well.
    tmpsampID = self.header[-1]
    sample_fields = row[fieldId[tmpsampID]].split(':')

    alt_count = None
    dp = None
    for idx, key in enumerate(row[fieldId['FORMAT']].split(':')):
        if key == "AD":
            alt_count = sample_fields[idx].split(',')[1]
        elif key == "DP":
            dp = sample_fields[idx]
    if alt_count is None or dp is None:
        raise ValueError(
            "FORMAT column must contain both AD and DP: {0}".format(
                row[fieldId['FORMAT']]))
    vf = float(alt_count) / float(dp)

    position = int(row[fieldId['POS']])
    # dp is handed on as the raw string taken from the VCF, as before.
    return Variant(source=self.fn.split('/')[-1],
                   pos=HTSeq.GenomicPosition(row[0], int(position)),
                   ref=row[3],
                   alt=row[4],
                   frac=vf,
                   dp=dp,
                   eff=self.eff.strip(';'),
                   fc=self.fc.strip(';'))
def parse_SamTools(self):
    ''' samtools vcf parser function. Input: InputParser object. Output: Variant object

    Depth and variant fraction are derived from the INFO ``DP4=`` entry: the
    first two numbers are summed as reference observations, the last two as
    alternate observations.
    '''
    row, columns = self.row, self.fieldId
    position = int(row[columns['POS']])

    for info_item in row[columns['INFO']].split(';'):
        if not info_item.startswith("DP4="):
            continue
        dp4 = info_item.split('=')[1].split(',')
        ref_obs = int(dp4[0]) + int(dp4[1])
        alt_obs = int(dp4[2]) + int(dp4[3])
        dp = ref_obs + alt_obs
        vf = float(alt_obs) / float(dp)

    return Variant(source=self.fn.split('/')[-1],
                   pos=HTSeq.GenomicPosition(row[0], int(position)),
                   ref=row[3],
                   alt=row[4],
                   frac=vf,
                   dp=dp,
                   eff=self.eff.strip(';'),
                   fc=self.fc.strip(';'))
def read_region(bam_filename, iv):
    """Count read 5' ends on one strand within a region of a BAM file.

    Args:
        bam_filename: path of the BAM file to read.
        iv: indexable as ``(chrom, start, end, strand)``; ``start`` is clamped
            to 0 before fetching.

    Returns:
        A stranded HTSeq.GenomicArray for ``chrom``.  For a '+' region each
        forward read adds 1 at its ``reference_start``; for a '-' region each
        reverse read adds 1 at ``reference_end - 1``.
    """
    chrom, start, end, strand = iv[0], iv[1], iv[2], iv[3]
    bamfile = pysam.AlignmentFile(bam_filename, "rb")
    # try/finally so the handle is released even if fetching or counting fails.
    try:
        ga = HTSeq.GenomicArray([chrom], stranded=True)
        for r in bamfile.fetch(chrom, max(0, int(start)), end):
            if strand == "+" and not r.is_reverse:
                ga[HTSeq.GenomicPosition(chrom, r.reference_start, strand)] += 1
            elif strand == "-" and r.is_reverse:
                ga[HTSeq.GenomicPosition(chrom, r.reference_end - 1, strand)] += 1
    finally:
        bamfile.close()
    return ga
def seq_around_point_of_highest_coverage(self, point):
    """Return the sequence in an asymmetric window around ``point``.

    Args:
        point: an object with ``chrom``, ``start``, ``end`` and ``strand``
            attributes, or a string of the form ``"chrom:pos/strand"``.

    Returns:
        The result of ``self.grab_sequence_from_iv_with_offset`` for a window
        of 15 before / 30 after the point on the '+' strand and 30 before /
        15 after otherwise.  The empty string when ``point`` is falsy or a
        string that cannot be parsed.
    """
    if not point:
        return ''
    try:
        point.strand
    except AttributeError:
        # Not a position object: try the "chrom:pos/strand" text form.
        try:
            fields = point.split(':')
            pos, strand = fields[-1].split('/')
            point = HTSeq.GenomicPosition(fields[0], int(pos), strand)
        except Exception:
            # Best effort: any parse failure means "no sequence".  Unlike a
            # bare except, this lets KeyboardInterrupt/SystemExit through.
            return ''
    if point.strand == '+':
        pad_left, pad_right = 15, 30
    else:
        pad_left, pad_right = 30, 15
    iv_for_seq = HTSeq.GenomicInterval(
        point.chrom, point.start - pad_left, point.end + pad_right,
        point.strand)
    return self.grab_sequence_from_iv_with_offset(iv_for_seq)
def parse_VarScan(self):
    ''' varscan vcf parser function. Input: InputParser object. Output: Variant object

    Depth comes from the sample's DP field and the variant fraction from its
    FREQ field (a percentage, converted to a 0-1 fraction).  The sample
    column is the one named by the last header entry.
    '''
    row, columns, header = self.row, self.fieldId, self.header
    position = int(row[columns['POS']])

    for idx, key in enumerate(row[columns['FORMAT']].split(':')):
        if str(key) == "DP":
            dp = int(row[columns[header[-1]]].split(':')[idx])
        if str(key) == "FREQ":
            percent = str(row[columns[header[-1]]].split(':')[idx]).strip('%')
            vf = float(percent) / float(100)

    return Variant(source=self.fn.split('/')[-1],
                   pos=HTSeq.GenomicPosition(row[0], int(position)),
                   ref=row[3],
                   alt=row[4],
                   frac=vf,
                   dp=dp,
                   eff=self.eff.strip(';'),
                   fc=self.fc.strip(';'))
def parse_HapCaller(self):
    ''' GATK haplotype caller vcf parser function. Input: InputParser object. Output: Variant object

    Depth comes from the sample's DP field.  The variant fraction is
    alt / (ref + alt) using the first two values of the sample's AD field,
    or 0.0 when both counts are zero.  When FORMAT provides no usable AD or
    DP, the row is echoed to stderr and the value defaults to 0.0.
    (An earlier variant read DP= and AF= from INFO instead; it is not used.)
    '''
    row = self.row
    fieldId = self.fieldId
    header = self.header
    position = int(row[fieldId['POS']])

    # None marks "not found in FORMAT" instead of probing for a NameError.
    vf = None
    dp = None
    for j, key in enumerate(row[fieldId['FORMAT']].split(':')):
        if str(key) == "DP":
            dp = int(row[fieldId[header[-1]]].split(':')[j])
        if str(key) == "AD":
            ad = str(row[fieldId[header[-1]]].split(':')[j])
            if ',' in ad:
                ref_count = int(ad.split(',')[0])
                alt_count = int(ad.split(',')[1])
                try:
                    vf = float(alt_count) / (float(ref_count) + float(alt_count))
                except ZeroDivisionError:
                    vf = 0.0
            else:
                abortWithMessage(
                    "Sample {0} may not have Haplotype Caller mutations with no ALT or vf"
                    .format(header[-1]))

    if vf is None:
        print(row, file=sys.stderr)
        vf = 0.0
    if dp is None:
        print(row, file=sys.stderr)
        dp = 0.0

    return Variant(source=self.fn.split('/')[-1],
                   pos=HTSeq.GenomicPosition(row[0], int(position)),
                   ref=row[3],
                   alt=row[4],
                   frac=vf,
                   dp=dp,
                   eff=self.eff.strip(';'),
                   fc=self.fc.strip(';'))
def to_iv(self, start, end=None):
    """Translate exon-coverage indices into genomic coordinates.

    Args:
        start: key into ``self.pos_in_exon_coverage_mapped_to_genomic_pos``.
        end: optional second key; when given an interval is returned.

    Returns:
        With ``end``: a GenomicInterval between the two mapped coordinates.
        Without ``end``: a GenomicPosition at the mapped coordinate, or at
        coordinate 1 (after printing a diagnostic) when ``start`` is not in
        the mapping.  Chromosome and strand come from ``self.iv``.
    """
    lookup = self.pos_in_exon_coverage_mapped_to_genomic_pos

    if end is not None:
        return HTSeq.GenomicInterval(
            self.iv.chrom, lookup[start], lookup[end], self.iv.strand)

    if start in lookup:
        return HTSeq.GenomicPosition(
            self.iv.chrom, lookup[start], self.iv.strand)

    print("{0} pos not in exon ({1} -> {2}, {3} keys())".format(
        start, min(lookup.keys()), max(lookup.keys()), len(lookup)))
    return HTSeq.GenomicPosition(self.iv.chrom, 1, self.iv.strand)
def is_blacklisted_by_regions(self, pos1, pos2):
    """Collect the region ids stored in ``self.idx_regions`` at two positions.

    Args:
        pos1, pos2: indexable as ``(chrom, coordinate, strand)``.

    Returns:
        A set with every entry found at either position.  The positions are
        first put in (chrom, coordinate) order.
    """
    out_of_order = (pos2[0] < pos1[0]) or (
        pos1[0] == pos2[0] and pos2[1] < pos1[1])
    if out_of_order:
        pos1, pos2 = pos2, pos1

    ids = set()
    for endpoint in (pos1, pos2):
        hits = self.idx_regions[HTSeq.GenomicPosition(
            endpoint[0], endpoint[1], endpoint[2])]
        ids.update(hits)
    return ids
def bed_to_genomic_interval(bed):
    """
    Lazily convert bed records to HTSeq positions.

    Despite the function name, each yielded object is a GenomicPosition built
    from the record's ``chrom``, ``start`` and ``strand``; the record's end
    coordinate is not used.
    """
    for record in bed:
        position = HTSeq.GenomicPosition(record.chrom, record.start,
                                         record.strand)
        yield position
def parse_MiSeq(self):
    ''' MiSeq vcf parser function. Input: InputParser object. Output: Variant object

    Depth is the raw string of the INFO ``DP=`` entry and the variant
    fraction is the sample's VF field.  If the file carries no snpEff
    annotation (``self.fc`` is empty), the functional consequence and effect
    reported by the MiSeq software in INFO ``FC=`` are used instead; missing
    values are reported as "?".
    '''
    row = self.row
    fieldId = self.fieldId
    header = self.header
    fc = self.fc
    effect = self.eff

    for i in row[fieldId['INFO']].split(';'):
        if i.startswith("DP="):
            dp = i.split('=')[1]
        # Each FC item looks like "<consequence>_<effect>"; the effect part
        # may be absent.
        if i.startswith("FC=") and not fc:
            for j in i.split('=')[1].split(','):
                parts = j.split('_')
                if str(parts[0]) not in str(fc):
                    fc += str(parts[0]) + ";"
                # Explicit length check instead of a silent bare except.
                if len(parts) > 1 and str(parts[1]) not in str(effect):
                    effect += str(parts[1]) + ";"
        elif str(i) == "EXON":
            fc += 'EXON'

    if not fc:
        fc = str("?")
    if not effect:
        effect = str("?")

    # NOTE: a fallback that derives depth from AD when VF is absent was
    # sketched in the original but never enabled.
    for k, key in enumerate(row[fieldId['FORMAT']].split(':')):
        if str(key) == "VF":
            vf = float(row[fieldId[header[-1]]].split(':')[k])

    position = int(row[fieldId['POS']])
    return Variant(source=self.fn.split('/')[-1],
                   pos=HTSeq.GenomicPosition(row[0], int(position)),
                   ref=row[3],
                   alt=row[4],
                   frac=vf,
                   dp=dp,
                   eff=effect.strip(';'),
                   fc=fc.strip(';'))
def main():
    """Small HTSeq demo: store a list on one interval, then print the steps.

    An object-typed, unstranded GenomicArray gets a list assigned to
    chr1:100000-101000; the steps of the wider query chr1:100000-130000 are
    then printed one per line.
    """
    ga = HTSeq.GenomicArray("auto", typecode='O', stranded=False)
    # Constructed but not used afterwards (kept from the original demo).
    position = HTSeq.GenomicPosition('chr1', 123203, '.')

    stored_iv = HTSeq.GenomicInterval("chr1", 100000, 101000, ".")
    ga[stored_iv] = [0.05, 0.002, 0.04, 0.005]

    query_iv = HTSeq.GenomicInterval("chr1", 100000, 130000, ".")
    for step_iv, step_value in ga[query_iv].steps():
        print(step_iv, step_value)
def TSS_Profile(ifile1, ifile2):
    '''Build a read-coverage profile around a set of genomic positions.

    Args:
        ifile1: path of a sorted BAM file (prepared with samtools).
        ifile2: path of a whitespace-separated position file whose first two
            columns are chromosome and position.  A third strand column may
            be present but is not read; every position is created
            unstranded.  Lines with fewer than two columns are skipped.

    Returns:
        A numpy int array of length 6000 (a 3000 bp half-window either side)
        where each alignment, resized to a 200 bp fragment, adds 1 over the
        part of the window it covers.

    Positions whose window cannot be processed are skipped (best effort).
    '''
    import HTSeq
    import numpy
    import itertools

    sortedbamfile = HTSeq.BAM_Reader(ifile1)
    halfwinwidth = 3000
    fragmentsize = 200

    HREpos = set()
    # Context manager: the original never closed this file (its close()
    # calls sat after the return and targeted the path strings).
    with open(ifile2) as HRE_file:
        for line in HRE_file:
            linelist = line.split()
            if len(linelist) < 2:
                # Blank or truncated line; previously an IndexError.
                continue
            HREpos.add(HTSeq.GenomicPosition(linelist[0], int(linelist[1]), '.'))

    # Print out 10 HRE positions as a sanity check.
    for HRE in itertools.islice(HREpos, 10):
        print(HRE)

    profile = numpy.zeros(2 * halfwinwidth, dtype='i')
    for p in HREpos:
        try:
            window = HTSeq.GenomicInterval(
                p.chrom,
                p.pos - halfwinwidth - fragmentsize,
                p.pos + halfwinwidth + fragmentsize,
                ".")
            for almnt in sortedbamfile[window]:
                almnt.iv.length = fragmentsize
                if p.strand == ".":
                    start_in_window = almnt.iv.start - p.pos + halfwinwidth
                    end_in_window = almnt.iv.end - p.pos + halfwinwidth
                else:
                    start_in_window = p.pos + halfwinwidth - almnt.iv.end
                    end_in_window = p.pos + halfwinwidth - almnt.iv.start
                start_in_window = max(start_in_window, 0)
                end_in_window = min(end_in_window, 2 * halfwinwidth)
                if start_in_window >= 2 * halfwinwidth or end_in_window < 0:
                    continue
                profile[start_in_window:end_in_window] += 1
        except Exception:
            # Deliberate best effort: skip positions whose window fails, but
            # let KeyboardInterrupt/SystemExit propagate.
            continue
    return profile
def highest_point(cls, ga, iv, arr=None, gene_name='', **kwargs):
    """Locate the genomic position of the coverage maximum.

    Args:
        ga, iv: used to build the coverage array through
            ``cls.array_from_htseq`` when ``arr`` is not supplied; ``iv`` (or
            its first element when it is a list) also anchors the result.
        arr: optional precomputed coverage values.
        gene_name, **kwargs: accepted but not used.

    Returns:
        False for an empty array; None when the maximum occurs at positions
        more than 3 apart (non-adjacent maxima); otherwise a GenomicPosition
        at ``start + index``, where index is the single maximum or the
        midpoint of the run of maxima.
    """
    if arr is None:
        arr = cls.array_from_htseq(ga, iv)
    if len(arr) == 0:
        return False

    peak = np.max(arr)
    peak_indexes = [n for n, v in enumerate(arr) if v == peak]

    if len(peak_indexes) > 1:
        # Allow 2 nt gaps between neighbouring maxima; reject anything wider.
        for previous, current in zip(peak_indexes, peak_indexes[1:]):
            if current > previous + 3:
                return None
        # Take the peak as the middle point.
        max_index = int((peak_indexes[0] + peak_indexes[-1]) / 2)
    else:
        max_index = peak_indexes[0]

    anchor = iv[0] if type(iv) is list else iv
    return HTSeq.GenomicPosition(
        anchor.chrom, anchor.start + max_index, anchor.strand)
def writeDMSprofile(self):
    """Build the stranded DMS array and write it as two bedgraph files.

    ``self.dms`` is (re)created as a float ndarray-backed GenomicArray
    covering the genome.  For every feature, each data row stores ``row[4]``
    at coordinate ``row[0]`` on the feature's strand.  The array is then
    passed to ``write_bed`` with the plus/minus file names derived from
    ``self.sample_name``.
    """
    chrom = self.genome.id
    self.dms = HTSeq.GenomicArray({chrom: self.genome_length},
                                  stranded=True,
                                  storage='ndarray',
                                  typecode="d")
    for feature in self.features.values():
        for record in feature.getData():
            where = HTSeq.GenomicPosition(chrom, record[0],
                                          strand=feature.strand)
            self.dms[where] = record[4]

    plus_path = self.sample_name + '_dms_plus.bedgraph'
    minus_path = self.sample_name + '_dms_minus.bedgraph'
    write_bed(self.dms, plus_path, minus_path)
def filter_raw_tags_by_islands_bam(iterator, island_array, filename, fragment_size, window_size, genome_data):
    """Write reads that land in islands to a BED file and index them.

    Args:
        iterator: iterable of alignments exposing ``aligned``, ``iv``,
            ``read.name`` and ``aQual``.
        island_array: array indexed by GenomicPosition; a value >= 1 marks a
            position inside an island.
        filename: path of the BED file to write.
        fragment_size: forwarded to ``get_read_pos`` to shift the read.
        window_size: width used to bin read positions into windows.
        genome_data: mapping of chromosome name to chromosome length.

    Returns:
        ``(reads, windows, total)``: per-chromosome sorted read positions,
        per-chromosome window starts after ``remove_duplicates_and_sort``,
        and the number of reads written.
    """
    total_reads_in_islands = 0
    islandfiltered_reads_dict = {chrom: [] for chrom in genome_data}
    islandfiltered_windows_dict = {chrom: [] for chrom in genome_data}

    # Context manager closes the BED file even if a read fails to process.
    with open(filename, 'w') as bed_out:
        for read in iterator:
            if not read.aligned:
                continue
            chrom = read.iv.chrom
            if chrom not in genome_data:
                continue

            # Apply the fragment shift, then clamp to the chromosome end.
            read_pos = get_read_pos(read, fragment_size, genome_data)
            position = HTSeq.GenomicPosition(chrom, read_pos)
            if position.pos >= genome_data[chrom]:
                position.pos = genome_data[chrom] - 1

            if island_array[position] >= 1:
                fields = [read.iv.chrom, read.iv.start, read.iv.end,
                          read.read.name, read.aQual, read.iv.strand]
                bed_out.write("\t".join(str(field) for field in fields) + "\n")
                total_reads_in_islands += 1
                islandfiltered_reads_dict[chrom].append(read_pos)

                # Floor division snaps to the window start on Python 3 too
                # (true division would leave read_pos unchanged as a float).
                window_start = (read_pos // window_size) * window_size
                if window_start < 0:
                    window_start = 0
                islandfiltered_windows_dict[chrom].append(window_start)

    # Sort reads; sort and de-duplicate windows.
    for chrom in genome_data:
        islandfiltered_reads_dict[chrom].sort()
        islandfiltered_windows_dict[chrom] = remove_duplicates_and_sort(
            islandfiltered_windows_dict[chrom])
    return islandfiltered_reads_dict, islandfiltered_windows_dict, total_reads_in_islands
def get_bed(bed_filename, debug_mode=False):
    """Count read 5' ends from a tab-separated BED file.

    Column 0 is the chromosome, columns 1 and 2 the start and end, column 5
    the strand.  '+' reads are counted at ``start``, '-' reads at ``end``.

    Args:
        bed_filename: path of the BED file.
        debug_mode: when true only the first 10000 lines are read.  They are
            now read directly from the file; the original shelled out to
            ``head`` (with the filename interpolated into a shell string)
            and wrote ``tmp.bed`` in the working directory.

    Returns:
        An HTSeq.GenomicArray of counts.
    """
    import itertools

    ga_five = HTSeq.GenomicArray(chroms='auto')
    with open(bed_filename, 'r') as f:
        lines = itertools.islice(f, 10000) if debug_mode else f
        for n, read in enumerate(lines):
            if not n % 1e6:
                print("get_bed({fn}) read {n}".format(fn=bed_filename, n=n))
            s = read.rstrip('\n').split('\t')
            if s[5] == '-':
                # NOTE(review): BED ends are exclusive, so the 5' end of a
                # '-' read is arguably end - 1.  The original author flagged
                # this; the coordinate is left unchanged pending confirmation.
                ga_five[HTSeq.GenomicPosition(s[0], int(s[2]), s[5])] += 1
            if s[5] == '+':
                ga_five[HTSeq.GenomicPosition(s[0], int(s[1]), s[5])] += 1
    return ga_five
def to_ga(self, ga, start, end=None):
    """Copy exon coverage values into ``ga`` at their genomic coordinates.

    Args:
        ga: array-like indexed by GenomicPosition; modified in place.
        start: index into the exon coverage.
        end: optional inclusive last index.

    Without ``end`` a single value is written; if ``start`` is not in the
    index-to-genome mapping a diagnostic is printed and 1 is stored at
    coordinate 1 instead.  With ``end`` every index in ``start..end`` is
    written at its own mapped coordinate (previously every value was written
    to the coordinate of ``start``).  Chromosome and strand come from
    ``self.iv``.
    """
    to_genome = self.pos_in_exon_coverage_mapped_to_genomic_pos
    if end is None:
        if start not in to_genome:
            print("{0} pos not in exon ({1} -> {2}, {3} keys())".format(
                start, min(to_genome.keys()), max(to_genome.keys()),
                len(to_genome)))
            ga[HTSeq.GenomicPosition(self.iv.chrom, 1, self.iv.strand)] = 1
            return
        ga[HTSeq.GenomicPosition(
            self.iv.chrom, to_genome[start],
            self.iv.strand)] = self.exon_coverage[start]
        return
    for pos in range(start, end + 1):
        ga[HTSeq.GenomicPosition(
            self.iv.chrom, to_genome[pos],
            self.iv.strand)] = self.exon_coverage[pos]
def uniqueVariants(self):
    '''Return the set of unique variants from the set of all variants (for this feature)

    Variants are grouped by (contig, pos, ref, alt).  Each group becomes one
    Variant whose source, frac, dp, eff and fc fields are the ", "-joined
    values of its members, in the order they appear in ``self.variants``.

    Grouping is a single pass over the variants; the original rescanned every
    variant for each unique key.
    '''
    grouped = {}
    for var in self.variants:
        key = (var.pos.chrom, var.pos.pos, var.ref, var.alt)
        grouped.setdefault(key, []).append(var)

    def _joined(values):
        # Same text the original built with "+= value + ', '" then strip.
        return ", ".join(values).strip(", ")

    # Sorted by chr then position as text (chr1, chr10, chr2, ...).  The
    # result is a set, so this only fixes the insertion order.
    ordered_keys = sorted(grouped, key=lambda varx: (varx[0] + str(varx[1])))

    uniqueVariants = set()
    for uniqueVarTup in ordered_keys:
        members = grouped[uniqueVarTup]
        pos = HTSeq.GenomicPosition(uniqueVarTup[0], uniqueVarTup[1])
        uniqueVar = Variant(
            _joined([member.source for member in members]),
            pos,
            ref=uniqueVarTup[2],
            alt=uniqueVarTup[3],
            frac=_joined([str(member.frac) for member in members]),
            dp=_joined([str(member.dp) for member in members]),
            eff=_joined([str(member.eff) for member in members]),
            fc=_joined([str(member.fc) for member in members]))
        uniqueVariants.add(uniqueVar)
    return uniqueVariants
def find_summits_in_anchors(anchor, chrom_summits):
    """
    Return the first summit overlapping ``anchor``, else the anchor midpoint.

    anchor = HTSeq.iv
    chrom_summits = []  # list of summit positions on one chrom

    The result is a string.  The midpoint uses floor division so it stays an
    integer coordinate ("150", not "150.5") on Python 3 as well as Python 2.
    """
    chrom = anchor.chrom
    for summit in chrom_summits:
        if HTSeq.GenomicPosition(chrom, summit, '.').overlaps(anchor):
            return str(summit)
    return str((anchor.start + anchor.end) // 2)
def parse_IonTorrent(self):
    ''' Ion Torrent vcf parser function. Input: InputParser object. Output: Variant object

    AO, RO and DP are read from the INFO column.  A comma-separated AO (one
    count per alternate allele) is summed.  The variant fraction is
    AO / (RO + AO); depth is passed on as the raw DP string.

    A non-integer value in a comma-separated AO now reaches
    ``abortWithMessage``; previously the conversion happened outside the
    ``try`` and raised ValueError before the message could be produced.
    '''
    row = self.row
    fieldId = self.fieldId

    for i in row[fieldId['INFO']].split(';'):
        if i.startswith("AO="):
            tempval = i.split('=')[1]
        if i.startswith("RO="):
            ro = i.split('=')[1]
        if i.startswith("DP="):
            dp = i.split("=")[1]

    if ',' in str(tempval):
        try:
            ao = sum(int(numeric_string)
                     for numeric_string in tempval.split(','))
        except ValueError:
            abortWithMessage(
                "AO should be an int, or a list of ints: AO = {0}/".format(
                    tempval))
    else:
        ao = tempval
    vf = float(float(ao) / float(float(ro) + float(ao)))

    # NOTE(review): the original followed this with a loop that detected
    # deletions (REF longer than an ALT) but re-assigned the same POS value,
    # so it had no effect and was dropped.  Confirm whether deletions were
    # meant to shift the position.
    position = int(row[fieldId['POS']])

    return Variant(source=self.fn.split('/')[-1],
                   pos=HTSeq.GenomicPosition(row[0], int(position)),
                   ref=row[3],
                   alt=row[4],
                   frac=vf,
                   dp=dp,
                   eff=self.eff.strip(';'),
                   fc=self.fc.strip(';'))
def updateStopCountsForPairedEnd(first, second, stop_counts, coverage):
    """
    Update stop counts and coverage for a paired-end read (assumes both ends
    are mapped to the same reference).

    The fragment spans from the smaller start to the larger end of the two
    mates.  The stop count at the fragment start is incremented by one, and
    coverage is incremented by one over the whole fragment.  Chromosome and
    strand are taken from the first mate.

    Args:
        first: a HTSeq.Alignment object, the first end of a paired-end read.
        second: a HTSeq.Alignment object, the second end of the same read.
        stop_counts: a HTSeq.GenomicArray object for stop counts.
        coverage: a HTSeq.GenomicArray object for coverage.
    """
    chrom, strand = first.iv.chrom, first.iv.strand
    fragment_start = min(first.iv.start, second.iv.start)
    fragment_end = max(first.iv.end, second.iv.end)

    stop_counts[HTSeq.GenomicPosition(chrom, fragment_start, strand)] += 1
    coverage[HTSeq.GenomicInterval(chrom, fragment_start, fragment_end,
                                   strand)] += 1
def parse_GenericGATK(self):
    ''' Generic GATK parser function. This was written for the Illumina BaseSpace BWA Enrichment Workflow vcf files, but may apply to more filetypes Input: InputParser object. Output: Variant object

    Depth and variant fraction come from the sample's AD field: the first
    value is the reference count and all remaining values are summed as the
    alternate count, giving one VF for all alternate alleles on the line.
    A zero total depth prints a warning and yields a fraction of 0.
    '''
    row = self.row
    fieldId = self.fieldId
    header = self.header
    position = int(row[fieldId['POS']])

    for j, key in enumerate(row[fieldId['FORMAT']].split(':')):
        if str(key) != "AD":
            continue
        allele_depths = row[fieldId[header[-1]]].split(':')[j].split(',')
        ro = int(allele_depths[0])
        # Taking only the last value would fail when the mutation has two
        # alternate alleles in the same VCF line, so sum them all.
        ao = sum([int(x) for x in allele_depths[1:]])
        dp = ro + ao
        try:
            vf = float(float(ao) / float(dp))
        except ZeroDivisionError:
            print("\nwarning: no vaf?\n" + str(row) + "\n")
            vf = 0
        break

    return Variant(source=self.fn.split('/')[-1],
                   pos=HTSeq.GenomicPosition(row[0], int(position)),
                   ref=row[3],
                   alt=row[4],
                   frac=vf,
                   dp=dp,
                   eff=self.eff.strip(';'),
                   fc=self.fc.strip(';'))