コード例 #1
0
def writeStopCounts(stop_counts, coverage, references, output_file):
    """
    Write per-position stop counts and coverage for every chromosome.

    For each chromosome key of ``stop_counts.chrom_vectors`` the output holds:
    the chromosome name on its own line, one line of stop counts for positions
    ``0 .. references[chrome] - 1``, one line of coverage values for the same
    positions, and a blank line. Every value is followed by a tab, so both
    value lines end with a trailing tab (full-length counts, as requested by
    the original authors).

    Args:
        stop_counts: GenomicArray for stop counts.
        coverage: GenomicArray for coverage.
        references: dict, mapping from reference_name to the length of the
            reference sequence.
        output_file: string, output file to write stop counts and coverage.

    Raises:
        KeyError: if a chromosome of ``stop_counts`` is missing from
            ``references``.
    """
    # The context manager guarantees the file is closed even when a lookup
    # or a write fails half-way through.
    with open(output_file, "w") as fp:
        for chrome in stop_counts.chrom_vectors:
            fp.write("%s\n" % chrome)
            chrom_size = references[chrome]
            # Same layout for both arrays; only the line terminator differs.
            for array, terminator in ((stop_counts, "\n"), (coverage, "\n\n")):
                for index in range(chrom_size):
                    fp.write(
                        "%d\t" %
                        array[HTSeq.GenomicPosition(chrome, index, ".")])
                fp.write(terminator)
コード例 #2
0
def softclipping_realignment(mapq_cutoff, max_del_len, input, output, ref_genome, gtf, splice_bin):
	"""Rewrite the CIGAR of qualifying reads and emit a sorted, indexed BAM.

	Every read of the `input` BAM is copied to `output`+'.temp.bam'. Reads that
	pass the filters below are handed to detect_sv_from_cigar() (defined
	elsewhere); when it returns a different CIGAR, the new CIGAR/position are
	kept unless the resulting deletion looks like a splice junction, in which
	case the original alignment is restored and tagged 'JM'. The temporary BAM
	is then sorted into `output` and indexed by calling samtools through the
	shell.

	Args:
		mapq_cutoff: minimum read.mapq for a read to be considered; also
			forwarded to detect_sv_from_cigar().
		max_del_len: forwarded to detect_sv_from_cigar().
		input: path of the input BAM (note: shadows the builtin of the same name).
		output: path of the final sorted BAM; output+'.temp.bam' is the
			unsorted intermediate.
		ref_genome: path of the reference FASTA opened with pysam.Fastafile.
		gtf: annotation file passed to extract_splice_sites().
		splice_bin: passed to extract_splice_sites().

	The function calls sys.exit(1) when the reference or GTF cannot be read
	(IOError), when a ValueError is raised while iterating the reads, or when
	`samtools sort` fails. A failure of `samtools index` propagates as
	subprocess.CalledProcessError.
	"""
	bwa_bam = pysam.Samfile(input,'rb')
	# The temporary output reuses the header of the input BAM.
	output_bam = pysam.Samfile(output+'.temp.bam','wb', template=bwa_bam)

	# for RNAseq
	# Four-base strings compared against (2 bases after junc_start) + (2 bases
	# before junc_end) further down.
	splice_motif = ['GTAG', 'CTAC', 'GCAG', 'CTGC', 'ATAC', 'GTAT']
	try:
		fastafile = pysam.Fastafile(ref_genome)
	except IOError as e:
		print 'read reference genome '+ref_genome+' error!',e
		sys.exit(1)
	try:
		# cvg is indexed with HTSeq.GenomicPosition below; presumably a
		# GenomicArray marking annotated splice sites -- confirm against
		# extract_splice_sites().
		cvg = extract_splice_sites(gtf, splice_bin)
	except IOError as e:
		print 'read GTF file '+gtf+' error!',e
		sys.exit(1)

	try:
		for read in bwa_bam.fetch(until_eof=True):
			# Only reads at or above the MAPQ cutoff, not secondary, and without an XA tag are re-examined.
			if read.mapq >= mapq_cutoff and not read.is_secondary and not read.has_tag('XA'): 
				chr = bwa_bam.getrname(read.rname)
				newcigar,newpos =  detect_sv_from_cigar(chr,read,mapq_cutoff,max_del_len)
				# 'NA' is the helper's "no change" marker.
				if newcigar != 'NA' and newcigar != read.cigar: 
					# Keep the original alignment so it can be restored or recorded in the OA tag.
					old_cigarstring, old_cigar, old_pos = read.cigarstring, read.cigar, read.pos
					# Apply the new alignment tentatively; it is reverted below if it looks like a splice junction.
					read.cigar, read.pos = newcigar, newpos
					if 'D' in read.cigarstring:
						# End of the first aligned block and start of the second one
						# (presumably the boundaries of the deletion -- confirm).
						junc_start, junc_end = read.blocks[0][1], read.blocks[1][0]
						htpos1 = HTSeq.GenomicPosition(chr,junc_start,'.')
						htpos2 = HTSeq.GenomicPosition(chr,junc_end,'.')
						# Either boundary has a positive value in cvg: restore the
						# original alignment and tag the read JM=GTF.
						if cvg[htpos1] > 0 or cvg[htpos2] > 0:
							read.cigar, read.pos = old_cigar, old_pos
							read.setTag('JM', 'GTF')
							output_bam.write(read)
							continue
						# Two reference bases after junc_start plus two before junc_end.
						m1=fastafile.fetch(chr,junc_start,junc_start+2)
						m2=fastafile.fetch(chr,junc_end-2,junc_end)
						motif = m1.upper()+m2.upper()
						# Motif found in splice_motif: restore the original alignment
						# and record the motif in the JM tag.
						if motif in splice_motif:
							read.cigar, read.pos = old_cigar, old_pos
							read.setTag('JM', motif)
							output_bam.write(read)
							continue
					# New alignment kept: OA stores the old position (old_pos+1) and old CIGAR string.
					read.setTag('OA', str(old_pos+1)+','+old_cigarstring)
			# Every read is written, whether modified or not.
			output_bam.write(read)
	except ValueError as e:
		# NOTE(review): any ValueError raised inside the loop is reported with this message.
		print >> sys.stderr, 'Bam index file is not found!', e
		sys.exit(1)
	bwa_bam.close()
	output_bam.close()
	try:
		# NOTE(review): `output` is interpolated into a shell command line unquoted.
		subprocess.check_call("samtools sort {0}.temp.bam -o {0}".format(output), shell=True)
	except subprocess.CalledProcessError as e:
		print >> sys.stderr, 'Execution failed for samtools:', e
		sys.exit(1)

	subprocess.check_call("samtools index {}".format(output), shell=True)
コード例 #3
0
 def gf_merge(gf_i, gf_j):
     """Return True when each feature's centre lies inside the other feature.

     The centre of a feature is the integer midpoint of its interval
     (``(start + end) // 2``) on the feature's own chromosome and strand.

     Args:
         gf_i, gf_j: objects exposing an ``iv`` interval with ``chrom``,
             ``start``, ``end`` and ``strand`` attributes.
     """
     def _centre(gf):
         # Floor division keeps the coordinate an int under Python 3 as well;
         # plain "/" would produce a float position there.
         return hts.GenomicPosition(
             gf.iv.chrom, (gf.iv.start + gf.iv.end) // 2, gf.iv.strand)

     gp_i_centre = _centre(gf_i)
     gp_j_centre = _centre(gf_j)
     return (gp_i_centre.is_contained_in(gf_j.iv)
             and gp_j_centre.is_contained_in(gf_i.iv))
コード例 #4
0
def extract_break_genes(breaks, transcript):
    """Look up what `transcript` holds at the two break points.

    The 5' break is taken from ``breaks[0][2]`` and the 3' break from
    ``breaks[1][0]``; each is a sequence whose first item is the chromosome
    and whose second item is the coordinate (converted with ``int``).

    Returns:
        A 4-tuple ``(pos5, pos3, gene5, gene3)``: the two unstranded
        HTSeq.GenomicPosition objects and, for each, ``list(transcript[pos])``.
    """
    def _position(brk):
        # Unstranded position for one (chrom, coordinate, ...) break record.
        return HTSeq.GenomicPosition(brk[0], int(brk[1]), ".")

    five_prime = breaks[0][2]
    three_prime = breaks[1][0]
    gene5 = list(transcript[_position(five_prime)])
    gene3 = list(transcript[_position(three_prime)])
    return _position(five_prime), _position(three_prime), gene5, gene3
コード例 #5
0
ファイル: bed_to_wig.py プロジェクト: dfporter/fbf_clip
def add_to_ga(infile, global_ga):
    """Count one position per line of a tab-separated file.

    Column 0 is the chromosome, column 1 the coordinate and column 5 the
    strand. '+' lines are counted at ``int(s[1])`` and '-' lines at
    ``int(s[1]) - 1``. Lines whose strand is neither '+' nor '-' are skipped.

    Args:
        infile: path of the tab-separated input file.
        global_ga: array that is incremented in place at the same positions.

    Returns:
        A new stranded HTSeq.GenomicArray holding the counts for this file.
    """
    ga = HTSeq.GenomicArray('auto', stranded=True)
    with open(infile, 'r') as f:
        for li in f:
            s = li.rstrip('\n').split('\t')
            strand = s[5]
            if strand == '+':
                pos = int(s[1])
            elif strand == '-':
                # NOTE(review): uses the start column minus one; confirm this
                # matches the input format.
                pos = int(s[1]) - 1
            else:
                # Previously an unknown strand re-counted the position of the
                # preceding line (or raised NameError on the first line).
                continue
            iv = HTSeq.GenomicPosition(s[0], pos, strand)
            ga[iv] += 1
            global_ga[iv] += 1
    return ga
コード例 #6
0
ファイル: bedgraphs.py プロジェクト: dfporter/easyCLIP
def read_bed(fname='', use_first_n_lines=False, **kwargs):
    """Count bed records into a stranded HTSeq.GenomicArray.

    Bed format is 0-based and [a,b).
    So a read on the + strand has its 5' end at [a], and a
    read on the - strand is counted at [b-1].

    Records come from for_split_bed_lines(fname); each record is used as
    (chrom, start, end, strand) -- presumably with start/end already
    converted to int, since ``end - 1`` is computed on them.

    Args:
        fname: file name handed to for_split_bed_lines().
        use_first_n_lines: when >= 1, only that many leading records are
            counted; any falsy value or a value below 1 means the whole file.
        **kwargs:
            verbose: print a loading message when truthy.
            use: 'read start' (default) adds 1 at a single position per
                record ([start] for '+', [end-1] otherwise); 'bed coord'
                adds 1 over the whole interval. Any other value leaves the
                array empty.

    Returns:
        The populated HTSeq.GenomicArray (typecode 'i').
    """
    import itertools

    ga = HTSeq.GenomicArray('auto', stranded=True, typecode='i')

    if kwargs.get('verbose'):
        print('Loading bed file {0}...'.format(fname))

    use = kwargs.get('use', 'read start')
    if use not in ('bed coord', 'read start'):
        return ga

    records = for_split_bed_lines(fname)
    if use_first_n_lines and use_first_n_lines >= 1:
        # islice stops after exactly n records; the previous
        # "if n >= limit: break" check ran after counting and so let
        # n + 1 records through.
        records = itertools.islice(records, int(use_first_n_lines))

    for s in records:
        if use == 'bed coord':
            ga[HTSeq.GenomicInterval(s[0], s[1], s[2], s[3])] += 1
        elif s[3] == '+':
            ga[HTSeq.GenomicPosition(s[0], s[1], s[3])] += 1
        else:
            ga[HTSeq.GenomicPosition(s[0], s[2] - 1, s[3])] += 1

    return ga
コード例 #7
0
    def parse_MAF(self):
        ''' maf filetype parser function. Input: InputParser object. Output: Variant object

        Reads the columns Start_position, TTotCov, TVarCov, Chromosome,
        Reference_Allele and Tumor_Seq_Allele2 of the current row. Column
        names are looked up case-sensitively. A "-" allele is stored as an
        empty string. The variant fraction is TVarCov / TTotCov, or 0.0 when
        the total coverage is zero.
        '''

        row = self.row
        fieldId = self.fieldId
        fn = self.fn
        # Numeric fields may arrive as "123.0"; keep only the integer part.
        position = int(str(row[fieldId['Start_position']]).split('.')[0])
        dp = int(str(row[fieldId['TTotCov']]).split('.')[0])
        var_cov = float(row[fieldId['TVarCov']])
        # Zero total coverage would otherwise raise ZeroDivisionError and
        # abort the whole parse; report a fraction of 0.0 instead, as
        # parse_HapCaller does.
        vf = var_cov / float(dp) if dp else 0.0
        chrom = str(row[fieldId['Chromosome']])
        ref = str(row[fieldId['Reference_Allele']])
        alt = str(row[fieldId['Tumor_Seq_Allele2']])
        effect = self.eff
        fc = self.fc
        if ref == "-":
            ref = ""
        if alt == "-":
            alt = ""
        return Variant(source=fn.split('/')[-1],
                       pos=HTSeq.GenomicPosition(chrom, position),
                       ref=ref,
                       alt=alt,
                       frac=vf,
                       dp=dp,
                       eff=effect.strip(';'),
                       fc=fc.strip(';'))
コード例 #8
0
    def parse_MuTectOUT(self):
        ''' MuTect '.out' parser function. Input: InputParser object. Output: Variant object

        Chromosome, reference and alternate allele come from columns 0, 3
        and 4 of the current row; the fraction is the 'tumor_f' column and
        the depth is t_ref_count + t_alt_count.
        '''

        record = self.row
        columns = self.fieldId
        file_name = self.fn
        chrom, ref, alt = record[0], record[3], record[4]
        effect = self.eff
        fc = self.fc
        vf = float(record[columns['tumor_f']])
        ref_reads = int(str(record[columns['t_ref_count']]).strip())
        alt_reads = int(str(record[columns['t_alt_count']]).strip())
        dp = ref_reads + alt_reads
        position = int(record[columns['position']])

        return Variant(source=file_name.split('/')[-1],
                       pos=HTSeq.GenomicPosition(chrom, position),
                       ref=ref,
                       alt=alt,
                       frac=vf,
                       dp=dp,
                       eff=effect.strip(';'),
                       fc=fc.strip(';'))
コード例 #9
0
    def parse_SomaticIndelDetector(self):
        ''' GATK SomaticIndelDetector vcf parser function. Input: InputParser object. Output: Variant object

        The sample column is assumed to be the last entry of self.header.
        AD and DP are read from that column by pairing it with the FORMAT
        keys, so their relative order in FORMAT does not matter. The
        fraction is (second AD value) / DP, or 0.0 when DP is zero. The
        depth is passed on as the raw string from the file.

        Raises:
            KeyError: if FORMAT (or the sample column) lacks AD or DP.
        '''

        row = self.row
        fieldId = self.fieldId
        header = self.header
        fn = self.fn
        chrom = row[0]
        ref = row[3]
        alt = row[4]
        effect = self.eff
        fc = self.fc
        # Below attempts to grab sample ID.
        # assumes that sample ID is the final column in the self.header. always true?
        # if not always true, adopt the parse_mutect solution here as well
        tmpsampID = header[-1]

        # Pair each FORMAT key with its sample value. The earlier loop only
        # computed the fraction if AD happened to come before DP.
        sample_values = dict(
            zip(row[fieldId['FORMAT']].split(':'),
                row[fieldId[tmpsampID]].split(':')))
        ALT_count = sample_values['AD'].split(',')[1]
        dp = sample_values['DP']
        depth = float(dp)
        vf = float(ALT_count) / depth if depth else 0.0

        position = int(row[fieldId['POS']])
        return Variant(source=fn.split('/')[-1],
                       pos=HTSeq.GenomicPosition(chrom, position),
                       ref=ref,
                       alt=alt,
                       frac=vf,
                       dp=dp,
                       eff=effect.strip(';'),
                       fc=fc.strip(';'))
コード例 #10
0
    def parse_SamTools(self):
        ''' samtools vcf parser function. Input: InputParser object. Output: Variant object

        Depth and fraction come from the first "DP4=" entry of the INFO
        column: the first two values are summed as reference observations,
        the next two as alternate observations. The fraction is
        alt / (ref + alt), or 0.0 when all four counts are zero.

        Returns None when INFO has no DP4 entry.
        '''

        row = self.row
        fieldId = self.fieldId
        fn = self.fn
        chrom = row[0]
        ref = row[3]
        alt = row[4]
        effect = self.eff
        fc = self.fc
        position = int(row[fieldId['POS']])
        for i in row[fieldId['INFO']].split(';'):
            if not i.startswith("DP4="):
                continue
            counts = [int(c) for c in i.split('=')[1].split(',')[:4]]
            ro = counts[0] + counts[1]
            ao = counts[2] + counts[3]
            dp = ro + ao
            # DP4=0,0,0,0 used to raise ZeroDivisionError.
            vf = float(ao) / float(dp) if dp else 0.0
            return Variant(source=fn.split('/')[-1],
                           pos=HTSeq.GenomicPosition(chrom, position),
                           ref=ref,
                           alt=alt,
                           frac=vf,
                           dp=dp,
                           eff=effect.strip(';'),
                           fc=fc.strip(';'))
        # No DP4 entry: same result as before, now explicit.
        return None
コード例 #11
0
ファイル: peakslib.py プロジェクト: dfporter/fbf_clip
def read_region(bam_filename, iv):
    """Count read ends of one strand inside a region of a BAM file.

    Args:
        bam_filename: path of the BAM file, opened with pysam.AlignmentFile.
        iv: sequence ``(chrom, start, end, strand)``; a negative start is
            clamped to 0 before fetching.

    Returns:
        A stranded HTSeq.GenomicArray restricted to ``chrom``. For '+'
        regions, forward reads add 1 at ``reference_start``; for '-'
        regions, reverse reads add 1 at ``reference_end - 1``. Reads on the
        other strand are ignored.
    """
    bamfile = pysam.AlignmentFile(bam_filename, "rb")
    try:
        ga = HTSeq.GenomicArray([iv[0]], stranded=True)
        for r in bamfile.fetch(iv[0], max(0, int(iv[1])), iv[2]):
            if iv[3] == "+" and not r.is_reverse:
                ga[HTSeq.GenomicPosition(
                    iv[0], r.reference_start, iv[3])] += 1
            elif iv[3] == "-" and r.is_reverse:
                ga[HTSeq.GenomicPosition(
                    iv[0], r.reference_end - 1, iv[3])] += 1
    finally:
        # Close the handle even when fetching or counting raises.
        bamfile.close()
    return ga
コード例 #12
0
ファイル: peakSeqFinder.py プロジェクト: dfporter/easyCLIP
    def seq_around_point_of_highest_coverage(self, point):
        """Return the sequence in a strand-aware window around `point`.

        Args:
            point: an object with chrom/start/end/strand attributes, or a
                string of the form "chrom:pos/strand". Falsy values and
                strings that cannot be parsed yield ''.

        Returns:
            The result of self.grab_sequence_from_iv_with_offset() for the
            window [start - 15, end + 30) on the '+' strand, or
            [start - 30, end + 15) on any other strand; '' when no usable
            point is given.
        """
        if not point:
            return ''

        try:
            point.strand
        except Exception:
            # Not an interval-like object: try the "chrom:pos/strand" form.
            # Catching Exception (instead of a bare except) keeps the
            # best-effort behaviour without swallowing KeyboardInterrupt
            # or SystemExit.
            try:
                _str = point.split(':')
                (pos, strand) = _str[-1].split('/')
                point = HTSeq.GenomicPosition(_str[0], int(pos), strand)
            except Exception:
                return ''

        if point.strand == '+':
            upstream, downstream = 15, 30
        else:
            upstream, downstream = 30, 15

        iv_for_seq = HTSeq.GenomicInterval(
            point.chrom, point.start - upstream, point.end + downstream,
            point.strand)
        return self.grab_sequence_from_iv_with_offset(iv_for_seq)
コード例 #13
0
    def parse_VarScan(self):
        ''' varscan vcf parser function. Input: InputParser object. Output: Variant object

        DP and FREQ are read from the sample column named by the last
        header entry, at the index of the matching FORMAT key. FREQ is a
        percentage string (e.g. "12.5%") and is converted to a fraction.
        '''

        row = self.row
        fieldId = self.fieldId
        header = self.header
        fn = self.fn
        chrom, ref, alt = row[0], row[3], row[4]
        effect = self.eff
        fc = self.fc
        position = int(row[fieldId['POS']])
        for idx, key in enumerate(row[fieldId['FORMAT']].split(':')):
            if str(key) == "DP":
                dp = int(row[fieldId[header[-1]]].split(':')[idx])
            if str(key) == "FREQ":
                percent = str(row[fieldId[header[-1]]].split(':')[idx])
                vf = float(percent.strip('%')) / float(100)
        return Variant(source=fn.split('/')[-1],
                       pos=HTSeq.GenomicPosition(chrom, position),
                       ref=ref,
                       alt=alt,
                       frac=vf,
                       dp=dp,
                       eff=effect.strip(';'),
                       fc=fc.strip(';'))
コード例 #14
0
    def parse_HapCaller(self):
        ''' GATK haplotype caller vcf parser function. Input: InputParser object. Output: Variant object

        DP and AD are read from the sample column named by the last header
        entry, at the index of the matching FORMAT key. The fraction is
        alt / (ref + alt) from the first two AD values, or 0.0 when both
        are zero. When FORMAT provides no usable AD or no DP, the row is
        printed to stderr and the missing value defaults to 0.0. An AD
        value without a comma is reported through abortWithMessage().
        '''

        row = self.row
        fieldId = self.fieldId
        header = self.header
        fn = self.fn
        chrom = row[0]
        ref = row[3]
        alt = row[4]
        effect = self.eff
        fc = self.fc
        position = int(row[fieldId['POS']])
        sample = header[-1]
        # None marks "not found in FORMAT"; this replaces probing for
        # unbound names with bare try/except blocks.
        vf = None
        dp = None
        for j, key in enumerate(row[fieldId['FORMAT']].split(':')):
            if key == "DP":
                dp = int(row[fieldId[sample]].split(':')[j])
            if key == "AD":
                ad = str(row[fieldId[sample]].split(':')[j])
                if ',' in ad:
                    ref_count = int(ad.split(',')[0])
                    alt_count = int(ad.split(',')[1])
                    try:
                        vf = float(alt_count) / (
                            float(ref_count) + float(alt_count))
                    except ZeroDivisionError:
                        # No supporting reads at all.
                        vf = 0.0
                else:
                    abortWithMessage(
                        "Sample {0} may not have Haplotype Caller mutations with no ALT or vf"
                        .format(sample))
        if vf is None:
            print(row, file=sys.stderr)
            vf = 0.0
        if dp is None:
            print(row, file=sys.stderr)
            dp = 0.0
        return Variant(source=fn.split('/')[-1],
                       pos=HTSeq.GenomicPosition(chrom, position),
                       ref=ref,
                       alt=alt,
                       frac=vf,
                       dp=dp,
                       eff=effect.strip(';'),
                       fc=fc.strip(';'))
コード例 #15
0
 def to_iv(self, start, end=None):
     """Translate exon-coverage coordinates into genomic coordinates.

     With only `start`, returns an HTSeq.GenomicPosition on this object's
     chromosome and strand; if `start` has no mapping, a message is printed
     and position 1 is returned instead. With `end` as well, returns an
     HTSeq.GenomicInterval between the mapped `start` and mapped `end`
     (a missing key raises KeyError).
     """
     mapping = self.pos_in_exon_coverage_mapped_to_genomic_pos

     if end is not None:
         return HTSeq.GenomicInterval(
             self.iv.chrom, mapping[start], mapping[end], self.iv.strand)

     if start in mapping:
         return HTSeq.GenomicPosition(
             self.iv.chrom, mapping[start], self.iv.strand)

     print("{0} pos not in exon ({1} -> {2}, {3} keys())".format(
         start, min(mapping.keys()), max(mapping.keys()), len(mapping)))
     return HTSeq.GenomicPosition(self.iv.chrom, 1, self.iv.strand)
コード例 #16
0
ファイル: Classify.py プロジェクト: yhoogstrate/dr-disco
    def is_blacklisted_by_regions(self, pos1, pos2):
        """Collect the entries of self.idx_regions found at two positions.

        Each position is indexed as (chrom, coordinate, strand). The two
        are first put in (chrom, coordinate) order, then everything
        self.idx_regions yields at each of them is merged into one set.

        Returns:
            The set of entries found at either position (empty if none).
        """
        out_of_order = (pos2[0] < pos1[0]) or (
            pos1[0] == pos2[0] and pos2[1] < pos1[1])
        if out_of_order:
            pos1, pos2 = pos2, pos1

        ids = set()
        for pos in (pos1, pos2):
            ids.update(
                self.idx_regions[HTSeq.GenomicPosition(pos[0], pos[1], pos[2])])

        return ids
コード例 #17
0
ファイル: riboseq_utils.py プロジェクト: 452990729/ClipSeq
def bed_to_genomic_interval(bed):
    """
    Lazily convert bed records to HTSeq positions.

    For every item of `bed`, yields an HTSeq.GenomicPosition built from the
    item's ``chrom``, ``start`` and ``strand`` attributes (the record's end
    is not used).
    """
    for record in bed:
        position = HTSeq.GenomicPosition(
            record.chrom, record.start, record.strand)
        yield position
コード例 #18
0
    def parse_MiSeq(self):
        ''' MiSeq vcf parser function. Input: InputParser object. Output: Variant object

        Depth is the raw string of the INFO "DP=" entry. The fraction is the
        VF value of the sample column named by the last header entry. When
        no functional consequence is already set on the parser, the INFO
        "FC=" entry supplies it: each comma-separated item is split on "_",
        the first part feeding the functional consequence and the second
        part (if present) the effect. Empty results become "?".
        '''

        row = self.row
        fieldId = self.fieldId
        header = self.header
        fn = self.fn
        chrom = row[0]
        ref = row[3]
        alt = row[4]
        fc = self.fc
        effect = self.eff
        for i in row[fieldId['INFO']].split(';'):
            if i.startswith("DP="):
                dp = i.split('=')[1]

            # if the MiSeq software reported functional consequence and effect and the file is not snpEff anotated, the MiSeq annotations will be used instead
            if i.startswith("FC=") and not fc:
                for j in i.split('=')[1].split(','):
                    parts = j.split('_')
                    if parts[0] not in str(fc):
                        fc += parts[0] + ";"
                    # Items without an "_" carry no effect part; an explicit
                    # length check replaces the former bare "except: pass".
                    if len(parts) > 1 and parts[1] not in str(effect):
                        effect += parts[1] + ";"
            elif str(i) == "EXON":
                fc += 'EXON'
        if not fc:
            fc = str("?")
        if not effect:
            effect = str("?")
        # NOTE: a fallback that derives depth from AD when VF is absent was
        # drafted by the original author but never enabled.
        for k, i in enumerate(row[fieldId['FORMAT']].split(':')):
            if str(i) == "VF":
                vf = float(row[fieldId[header[-1]]].split(':')[k])

        position = int(row[fieldId['POS']])
        return Variant(source=fn.split('/')[-1],
                       pos=HTSeq.GenomicPosition(chrom, position),
                       ref=ref,
                       alt=alt,
                       frac=vf,
                       dp=dp,
                       eff=effect.strip(';'),
                       fc=fc.strip(';'))
コード例 #19
0
def main():
    """Demonstrate step iteration over an object-typed GenomicArray.

    Stores a list on chr1:[100000, 101000) of an unstranded array with
    typecode 'O', then prints every (interval, value) step of
    chr1:[100000, 130000).
    """
    ga = HTSeq.GenomicArray("auto", typecode='O', stranded=False)

    ga[HTSeq.GenomicInterval("chr1", 100000, 101000, ".")] = [
        0.05, 0.002, 0.04, 0.005]

    iv = HTSeq.GenomicInterval("chr1", 100000, 130000, ".")

    for interval, value in ga[iv].steps():
        print(interval, value)
コード例 #20
0
def TSS_Profile(ifile1,ifile2):
    '''Build an aggregate coverage profile around a set of genomic positions.

    ifile1 is the sorted BAM file prepared by samtools (read with
    HTSeq.BAM_Reader). ifile2 is a whitespace-separated position file whose
    first two columns are chromosome and position; further columns are
    ignored and every position is treated as unstranded. Lines with fewer
    than two fields (e.g. blank lines) are skipped.

    Each alignment overlapping a position's window is resized to a fixed
    fragment length of 200 and added to the profile.

    Returns a numpy int array of length 6000 (3000 on either side of the
    position).
    '''

    import HTSeq
    import numpy
    import itertools

    sortedbamfile = HTSeq.BAM_Reader(ifile1)

    halfwinwidth = 3000
    fragmentsize = 200

    HREpos = set()
    # The context manager closes the position file; the old close() calls sat
    # after "return" and targeted the filename strings, so they never ran.
    with open(ifile2) as HRE_file:
        for line in HRE_file:
            linelist = line.split()
            if len(linelist) < 2:
                continue  # blank or truncated line
            # create Genomic position objects by HTSeq
            HREpos.add(HTSeq.GenomicPosition(linelist[0], int(linelist[1]), '.'))

    for HRE in itertools.islice(HREpos, 10):
        print(HRE)  # print out 10 HRE positions

    profile = numpy.zeros(2 * halfwinwidth, dtype='i')
    for p in HREpos:
        try:
            window = HTSeq.GenomicInterval(
                p.chrom, p.pos - halfwinwidth - fragmentsize,
                p.pos + halfwinwidth + fragmentsize, ".")
            for almnt in sortedbamfile[window]:
                almnt.iv.length = fragmentsize
                if p.strand == ".":
                    start_in_window = almnt.iv.start - p.pos + halfwinwidth
                    end_in_window = almnt.iv.end - p.pos + halfwinwidth
                else:
                    # Not reached at present: all positions are built with '.'.
                    start_in_window = p.pos + halfwinwidth - almnt.iv.end
                    end_in_window = p.pos + halfwinwidth - almnt.iv.start
                start_in_window = max(start_in_window, 0)
                end_in_window = min(end_in_window, 2 * halfwinwidth)
                if start_in_window >= 2 * halfwinwidth or end_in_window < 0:
                    continue
                profile[start_in_window:end_in_window] += 1
        except Exception:
            # Best effort: a position whose window cannot be processed is
            # skipped. Unlike a bare "except", this lets KeyboardInterrupt
            # and SystemExit through.
            continue
    return profile
コード例 #21
0
ファイル: peakFinder.py プロジェクト: dfporter/easyCLIP
    def highest_point(cls, ga, iv, arr=None, gene_name='', **kwargs):
        """Locate the genomic position of the maximum of a coverage array.

        Args:
            ga: array passed to cls.array_from_htseq() when `arr` is None.
            iv: an interval, or a list of intervals of which the first gives
                chromosome, start and strand of the result.
            arr: optional precomputed values, index 0 corresponding to the
                start of `iv`.
            gene_name, **kwargs: accepted but not used here.

        Returns:
            False when `arr` is empty; None when the maximum occurs at
            several indexes that are more than 3 apart from their
            predecessor; otherwise an HTSeq.GenomicPosition at
            ``start + index``, where index is the single maximum or the
            midpoint between the first and last maximum.
        """
        if arr is None:
            arr = cls.array_from_htseq(ga, iv)

        # len() rather than truthiness: arr may be a numpy array.
        if len(arr) == 0:
            return False

        _max = np.max(arr)
        indexes_of_maxes = [n for n, v in enumerate(arr) if v == _max]

        if len(indexes_of_maxes) > 1:
            # Allow 2 nt gaps between neighbouring maxima; reject otherwise.
            for previous, index in zip(indexes_of_maxes,
                                       indexes_of_maxes[1:]):
                if index > previous + 3:
                    return None
            # Take the peak as the middle point.
            max_index = (indexes_of_maxes[0] + indexes_of_maxes[-1]) // 2
        else:
            max_index = indexes_of_maxes[0]

        anchor = iv[0] if isinstance(iv, list) else iv
        return HTSeq.GenomicPosition(
            anchor.chrom, anchor.start + max_index, anchor.strand)
コード例 #22
0
 def writeDMSprofile(self):
     """Fill self.dms from all features and write it as two bedgraph files.

     self.dms is a stranded, ndarray-backed HTSeq.GenomicArray (typecode
     "d") covering the single genome ``self.genome.id``. For every feature,
     each record of getData() stores its item [4] at position item [0] on
     the feature's strand. The array is then passed to write_bed() together
     with "<sample_name>_dms_plus.bedgraph" and
     "<sample_name>_dms_minus.bedgraph".
     """
     genome_id = self.genome.id
     self.dms = HTSeq.GenomicArray(
         {genome_id: self.genome_length},
         stranded=True, storage='ndarray', typecode="d")

     for feature in self.features.values():
         for record in feature.getData():
             position = HTSeq.GenomicPosition(
                 self.genome.id, record[0], strand=feature.strand)
             self.dms[position] = record[4]

     plus_path = self.sample_name + '_dms_plus.bedgraph'
     minus_path = self.sample_name + '_dms_minus.bedgraph'
     write_bed(self.dms, plus_path, minus_path)
コード例 #23
0
ファイル: SICER_MS.py プロジェクト: gabe0001/SICER_v2.3
def filter_raw_tags_by_islands_bam(iterator, island_array, filename, fragment_size, window_size, genome_data):
    """Write the reads that fall in islands to a BED file and index them.

    Args:
        iterator: iterable of alignments (with aligned, iv, read.name and
            aQual attributes).
        island_array: genomic array; a read is kept when its value at the
            read's shifted position is >= 1.
        filename: path of the BED file to write (one line per kept read).
        fragment_size: passed to get_read_pos() to compute the shifted
            position.
        window_size: width used to snap a kept read to its window start.
        genome_data: mapping of chromosome name to chromosome length; reads
            on other chromosomes are ignored.

    Returns:
        A tuple ``(reads, windows, total)``: per-chromosome sorted lists of
        kept read positions, per-chromosome window starts as returned by
        remove_duplicates_and_sort(), and the number of kept reads.
    """
    total_reads_in_islands = 0

    # reads / windows located in islands, keyed by chromosome
    islandfiltered_reads_dict = {chrom: [] for chrom in genome_data}
    islandfiltered_windows_dict = {chrom: [] for chrom in genome_data}

    # The context manager closes the output even if a read fails half-way.
    with open(filename, 'w') as out_file:
        for read in iterator:
            if not read.aligned:
                continue
            chrom = read.iv.chrom
            if chrom not in genome_data:
                continue

            # apply shift to determine coordinate
            read_pos = get_read_pos(read, fragment_size, genome_data)
            position = HTSeq.GenomicPosition(chrom, read_pos)

            # Clamp positions shifted past the chromosome end.
            if position.pos >= genome_data[chrom]:
                position.pos = genome_data[chrom] - 1

            if island_array[position] < 1:
                continue

            out_file.write("\t".join([
                str(chrom), str(read.iv.start), str(read.iv.end),
                str(read.read.name), str(read.aQual), str(read.iv.strand),
            ]) + "\n")
            total_reads_in_islands += 1
            islandfiltered_reads_dict[chrom].append(read_pos)

            # Floor division snaps to the window start under Python 2 and 3;
            # with "/" Python 3 would give back read_pos as a float.
            window_start = read_pos // window_size * window_size
            if window_start < 0:
                window_start = 0
            islandfiltered_windows_dict[chrom].append(window_start)

    # sort and remove duplicates for reads dictionary and windows dictionary
    for chrom in genome_data:
        islandfiltered_reads_dict[chrom].sort()
        islandfiltered_windows_dict[chrom] = remove_duplicates_and_sort(
            islandfiltered_windows_dict[chrom])

    return islandfiltered_reads_dict, islandfiltered_windows_dict, total_reads_in_islands
コード例 #24
0
def get_bed(bed_filename, debug_mode=False):
    """Count one position per bed record into an HTSeq.GenomicArray.

    '+' records are counted at column 1 (start) and '-' records at column 2
    (end); the strand is read from column 5. Records with any other strand
    are ignored.

    Args:
        bed_filename: path of the tab-separated bed file.
        debug_mode: when true, the first 10000 lines are copied to
            'tmp.bed' in the current directory and only those are read.

    Returns:
        The populated HTSeq.GenomicArray.
    """
    if debug_mode:
        # Copy the head of the file in Python rather than via a shell
        # "head" command built from the unquoted filename.
        with open(bed_filename, 'r') as src, open('tmp.bed', 'w') as dst:
            for n, line in enumerate(src):
                if n >= 10000:
                    break
                dst.write(line)
        path = 'tmp.bed'
    else:
        path = bed_filename

    ga_five = HTSeq.GenomicArray(chroms='auto')
    with open(path, 'r') as f:
        for n, read in enumerate(f):
            if n % 1000000 == 0:
                print("get_bed({fn}) read {n}".format(fn=bed_filename, n=n))
            s = read.rstrip('\n').split('\t')
            if s[5] == '-':
                # NOTE(review): the original author asked whether a 1 offset
                # is needed here (bed ends are exclusive) -- still to be
                # confirmed; left unchanged.
                ga_five[HTSeq.GenomicPosition(s[0], int(s[2]), s[5])] += 1
            elif s[5] == '+':
                ga_five[HTSeq.GenomicPosition(s[0], int(s[1]), s[5])] += 1
    return ga_five
コード例 #25
0
    def to_ga(self, ga, start, end=None):
        """Copy exon coverage values into `ga` at their genomic positions.

        Args:
            ga: genomic array that is written in place.
            start: index into self.exon_coverage (exon coordinates).
            end: optional last index (inclusive). When omitted, only `start`
                is written.

        With only `start`: if it has no genomic mapping, a message is
        printed and the value 1 is written at position 1 of this object's
        chromosome; otherwise the coverage at `start` is written at its
        mapped position. With `end`: the coverage at every index in
        start..end is written at that index's own mapped position.
        """
        to_genome = self.pos_in_exon_coverage_mapped_to_genomic_pos
        if end is None:
            if start not in to_genome:
                print("{0} pos not in exon ({1} -> {2}, {3} keys())".format(
                    start, min(to_genome.keys()), max(to_genome.keys()),
                    len(to_genome)))
                ga[HTSeq.GenomicPosition(self.iv.chrom, 1, self.iv.strand)] = 1
                return
            ga[HTSeq.GenomicPosition(
                self.iv.chrom, to_genome[start],
                self.iv.strand)] = self.exon_coverage[start]
            return
        for pos in range(start, end + 1):
            # Map each index itself; using to_genome[start] here wrote every
            # value onto the same genomic position.
            ga[HTSeq.GenomicPosition(
                self.iv.chrom, to_genome[pos],
                self.iv.strand)] = self.exon_coverage[pos]
コード例 #26
0
ファイル: vaggregatefeature.py プロジェクト: xtmgah/mucor
    def uniqueVariants(self):
        '''Return the set of unique variants from the set of all variants (for this feature)

        Variants are considered identical when they share
        (contig, pos, ref, alt). Each returned Variant merges its group: the
        source, frac, dp, eff and fc of all members are joined with ", " in
        the order the members appear in self.variants.
        '''
        # Group in a single pass instead of re-scanning every variant for
        # every unique key (formerly #unique * #total comparisons).
        grouped = {}
        for var in self.variants:
            key = (var.pos.chrom, var.pos.pos, var.ref, var.alt)
            grouped.setdefault(key, []).append(var)

        # sort by chr, then position
        # TO DO: this string key sorts as: chr1, chr10, chr2, chr20, chrX. Fix.
        ordered_keys = sorted(grouped,
                              key=lambda varx: (varx[0] + str(varx[1])))

        uniqueVariants = set()
        for uniqueVarTup in ordered_keys:
            members = grouped[uniqueVarTup]
            source = ", ".join(v.source for v in members)
            frac = ", ".join(str(v.frac) for v in members)
            dp = ", ".join(str(v.dp) for v in members)
            eff = ", ".join(str(v.eff) for v in members)
            fc = ", ".join(str(v.fc) for v in members)
            pos = HTSeq.GenomicPosition(uniqueVarTup[0], uniqueVarTup[1])
            # strip(", ") is kept so leading/trailing commas and spaces in
            # the member values themselves are trimmed exactly as before.
            uniqueVar = Variant(
                source.strip(", "),
                pos,
                ref=uniqueVarTup[2],
                alt=uniqueVarTup[3],
                frac=frac.strip(", "),
                dp=dp.strip(", "),
                eff=eff.strip(", "),
                fc=fc.strip(", "))
            uniqueVariants.add(uniqueVar)

        return uniqueVariants
コード例 #27
0
def find_summits_in_anchors(anchor, chrom_summits):
    """
    Pick a representative position for an anchor interval.

    Returns the first entry of ``chrom_summits`` (in list order) whose
    position overlaps ``anchor``; when none overlaps, falls back to the
    integer midpoint of the anchor.

    Args:
        anchor: interval object exposing ``chrom``, ``start`` and ``end``
            (the original note says "HTSeq.iv", presumably an
            HTSeq.GenomicInterval).
        chrom_summits: list of summit positions on one chromosome.

    Returns:
        The chosen position converted with ``str()``.
    """
    chrom = anchor.chrom
    for summit in chrom_summits:
        if HTSeq.GenomicPosition(chrom, summit, '.').overlaps(anchor):
            return str(summit)
    # Floor division keeps the midpoint an integer. With "/" Python 3 would
    # produce a float and the result would read e.g. "150.0" instead of "150".
    return str((anchor.start + anchor.end) // 2)
コード例 #28
0
    def parse_IonTorrent(self):
        ''' Ion Torrent vcf parser function. Input: InputParser object. Output: Variant object

        Reads the current row from ``self.row`` (indexed through
        ``self.fieldId``) together with ``self.fn``, ``self.eff`` and
        ``self.fc``.

        The INFO column is split on ';' and the AO, RO and DP entries are
        extracted. AO may be a comma-separated list (one count per alternate
        allele); in that case the counts are summed. The variant fraction is
        AO / (RO + AO).

        Calls abortWithMessage when AO, RO or DP is absent from INFO, or when
        a comma-separated AO contains a non-integer entry. Raises
        ZeroDivisionError when RO and AO are both zero.
        '''
        row = self.row
        fieldId = self.fieldId
        fn = self.fn
        chrom = row[0]
        ref = row[3]
        alt = row[4]
        effect = self.eff
        fc = self.fc

        # Collect the INFO entries we need; if a key repeats, the last one wins.
        required = ('AO', 'RO', 'DP')
        info = {}
        for entry in row[fieldId['INFO']].split(';'):
            for key in required:
                if entry.startswith(key + '='):
                    info[key] = entry.split('=')[1]
        missing = [key for key in required if key not in info]
        if missing:
            # abortWithMessage is defined elsewhere in the file; presumably it
            # terminates the run. Without this check a missing key surfaced
            # later as an opaque UnboundLocalError.
            abortWithMessage(
                "Ion Torrent INFO column is missing {0}: {1}".format(
                    ", ".join(missing), row[fieldId['INFO']]))
        tempval = info['AO']
        ro = info['RO']
        dp = info['DP']

        if ',' in tempval:
            # The int() conversion must sit inside the try: it is the step
            # that fails on malformed input, not the sum.
            try:
                ao = sum(int(count) for count in tempval.split(','))
            except ValueError:
                abortWithMessage(
                    "AO should be an int, or a list of ints: AO = {0}/".format(
                        tempval))
        else:
            ao = tempval
        vf = float(ao) / (float(ro) + float(ao))

        # NOTE: deletions (REF longer than an ALT allele) get no position
        # adjustment; POS is used as-is.
        position = int(row[fieldId['POS']])

        var = Variant(source=fn.split('/')[-1],
                      pos=HTSeq.GenomicPosition(chrom, int(position)),
                      ref=ref,
                      alt=alt,
                      frac=vf,
                      dp=dp,
                      eff=effect.strip(';'),
                      fc=fc.strip(';'))
        return var
コード例 #29
0
def updateStopCountsForPairedEnd(first, second, stop_counts, coverage):
    """
    Record one paired-end fragment in the stop-count and coverage arrays.

    The fragment runs from the smaller of the two mates' start coordinates to
    the larger of their end coordinates. Chromosome and strand are taken from
    ``first`` only (both ends are assumed to be mapped to the same reference).

    Args:
        first: a HTSeq.Alignment object, pointing to the first end of a paired-end read.
        second: a HTSeq.Alignment object, pointing to the second end of a paired-end read.
        stop_counts: a HTSeq.GenomicArray object; incremented by 1 at the fragment start.
        coverage: a HTSeq.GenomicArray object; incremented by 1 over the whole fragment.
    """
    first_iv, second_iv = first.iv, second.iv
    chrom, strand = first_iv.chrom, first_iv.strand
    fragment_start = min(first_iv.start, second_iv.start)
    fragment_end = max(first_iv.end, second_iv.end)

    # The stop is counted at the fragment's start coordinate.
    stop_site = HTSeq.GenomicPosition(chrom, fragment_start, strand)
    stop_counts[stop_site] += 1

    # Coverage is credited across the full span of both mates.
    fragment = HTSeq.GenomicInterval(chrom, fragment_start, fragment_end,
                                     strand)
    coverage[fragment] += 1
コード例 #30
0
    def parse_GenericGATK(self):
        '''
        Generic GATK parser function. This was written for the Illumina BaseSpace BWA Enrichment Workflow vcf files, but may apply to more filetypes
        Input: InputParser object. Output: Variant object

        Reads the current row from ``self.row`` (indexed through
        ``self.fieldId``) together with ``self.header``, ``self.fn``,
        ``self.eff`` and ``self.fc``. The sample column is the one named by
        the last entry of ``self.header``.

        The AD entry of the sample column is split on ',': the first value is
        the reference depth and the remaining values are summed into a single
        alternate depth, so a row with several alternate alleles gets one
        combined variant fraction. Depth is reference + alternate.

        When the depth is zero a warning is printed and the variant fraction
        is set to 0.

        Raises ValueError when the FORMAT column has no AD key (depth and
        variant fraction cannot be computed) or when an AD value is not an
        integer.
        '''
        row = self.row
        fieldId = self.fieldId
        header = self.header
        fn = self.fn
        chrom = row[0]
        ref = row[3]
        alt = row[4]
        effect = self.eff
        fc = self.fc
        position = int(row[fieldId['POS']])

        format_keys = row[fieldId['FORMAT']].split(':')
        if "AD" not in format_keys:
            # Without AD neither dp nor vf can be derived; fail with a clear
            # message instead of an UnboundLocalError further down.
            raise ValueError(
                "no AD field in FORMAT column, cannot compute depth and "
                "variant fraction: {0}".format(row))

        # FORMAT keys and sample values are positionally aligned.
        sample_values = row[fieldId[header[-1]]].split(':')
        depths = [
            int(x) for x in sample_values[format_keys.index("AD")].split(',')
        ]
        ro = depths[0]
        # Sum over every alternate allele rather than taking only the last one.
        ao = sum(depths[1:])
        dp = ro + ao
        try:
            vf = float(ao) / float(dp)
        except ZeroDivisionError:
            print("\nwarning: no vaf?\n" + str(row) + "\n")
            vf = 0

        var = Variant(source=fn.split('/')[-1],
                      pos=HTSeq.GenomicPosition(chrom, int(position)),
                      ref=ref,
                      alt=alt,
                      frac=vf,
                      dp=dp,
                      eff=effect.strip(';'),
                      fc=fc.strip(';'))
        return var