Ejemplo n.º 1
0
def count_reads(transcripts, bam_iter, number_of_counts=1):
    """ Count the reads compatible with each transcript

    A read is counted for a transcript when every aligned block of the read
    lies inside an exon of that transcript and every junction length of the
    read is also a junction length of that transcript.

    :TODO rename
    :TODO change to cython
    Arguments
    ---------
    transcripts : list
        one entry per transcript, each a position-sorted list of exons
        indexed as ``exon[0]`` (start) and ``exon[1]`` (end)
    bam_iter : pysam.BamFileIterator
        obtained from a pysam.BamFile.fetch() call; each read must provide
        ``get_blocks()``
    number_of_counts : int
        currently unused; kept for interface compatibility

    Returns
    -------
    Array with one count per transcript, in the order of ``transcripts``.
    """
    n_transcripts = len(transcripts)
    out_counts = zeros(n_transcripts)
    tree = Intersecter()
    # One set of junction lengths per transcript, built once so that the
    # per-read subset test below does not rebuild it for every read.
    intron_sets = []
    # Assume exons are position sorted
    for ti, transcript in enumerate(transcripts):
        introns = set()
        for j, exon in enumerate(transcript):
            tree.add_interval(
                Interval(int(exon[0]), int(exon[1]), value={'anno': ti}))
            if j != 0:
                # Same sign convention as the read junctions below:
                # previous end minus current start.
                introns.add(transcript[j - 1][1] - exon[0])
        intron_sets.append(introns)
    for read in bam_iter:
        block_counter = zeros((n_transcripts, ))
        intron_match = zeros((n_transcripts, ))
        blocks = read.get_blocks()
        junction_lengths = set(
            blocks[i - 1][1] - blocks[i][0] for i in range(1, len(blocks)))
        for block in blocks:
            overlap = tree.find(block[0], block[1])
            if len(overlap) == 0:
                # A block outside every exon: no transcript can reach the
                # full block count, so stop scanning this read.
                break
            for hit in overlap:
                if (block[0] >= hit.start) and (block[1] <= hit.end):
                    block_counter[hit.value['anno']] += 1
        for ij, introns in enumerate(intron_sets):
            if junction_lengths.issubset(introns):
                intron_match[ij] = 1
        smatch = nrepeat(len(blocks), n_transcripts)
        # The per-read compatibility vectors used to be accumulated into a
        # matrix that was never returned; only the running sum is needed.
        out_counts += logical_and(block_counter == smatch, intron_match)
    return (out_counts)
def get_reads_and_ranges(bam_region, cid, chrom, region_start, region_end, strand, options):
    """Collect the accepted fragments of a region and their per-position tallies.

    Reads come from ``filter_reads`` on ``options.chrom_prefix + chrom``.
    A read accepted by ``is_valid_paired`` contributes the span derived from
    ``pos``/``pnext``/``isize``; otherwise a read accepted by
    ``is_valid_single`` contributes its aligned span. Every accepted span is
    stored in an ``Intersecter`` and passed to ``inc_pr``; its endpoints are
    passed to ``inc_pr_at`` (both ends for paired, merged or trimmed reads,
    otherwise only the end for reverse reads and only the start for forward
    reads).

    ``cid`` and ``strand`` are accepted but not used here.

    Returns ``(filtered_reads, pos_range)`` where ``pos_range`` is a
    ``defaultdict`` whose missing entries start as ``[0, 0]``.
    """
    pos_range = defaultdict(lambda: [0,0])
    filtered_reads = Intersecter()
    contig = options.chrom_prefix + chrom
    for read in filter_reads(bam_region, contig, region_start, region_end, options):
        paired = is_valid_paired(read, region_start, options)
        if paired:
            # 1-based, end-inclusive span of the whole fragment
            first = min(read.pos, read.pnext) + 1
            last = first + abs(read.isize) - 1
        elif is_valid_single(read, options):
            # 1-based, end-inclusive span of the aligned read
            first = read.pos + 1
            last = first + aln_length(read.cigar) - 1
        else:
            continue

        filtered_reads.add_interval(Interval(first, last))
        inc_pr(pos_range, first, last, region_start, region_end)

        if paired or as_merged(read, options) or as_trimmed(read, options):
            endpoints = (first, last)
        elif read.is_reverse:
            endpoints = (last,)
        else:
            endpoints = (first,)
        for endpoint in endpoints:
            inc_pr_at(pos_range, endpoint, region_start, region_end)
    return filtered_reads, pos_range
Ejemplo n.º 3
0
def buildIntervalTree(exons):
    '''Index exon annotations in an interval tree.

    Every exon contributes one interval from ``exon.start`` to ``exon.end``
    whose value is a dict holding the exon's ``cStart`` and ``cEnd``.
    Returns the populated Intersecter.
    '''
    index = Intersecter()
    for record in exons:
        begin, finish = record.start, record.end
        payload = {'cStart': record.cStart, 'cEnd': record.cEnd}
        index.add_interval(Interval(begin, finish, value=payload))
    return index
Ejemplo n.º 4
0
def calculate_score(chrom, start, end, ctype="WPS"):
  """Compute a per-position score for the window ``start``..``end`` (inclusive).

  Reads are taken from ``readIterator(args, chrom, start, end)``; duplicate,
  QC-failed, unmapped and soft-clipped reads are ignored. Paired reads are
  represented by their whole fragment (derived from ``pos``/``pnext``/
  ``isize``), other reads by their aligned span. Fragments whose length is
  outside ``options.minLength``..``options.maxLength`` are dropped, and
  ``options.downsample`` (when not None) randomly discards reads.

  ctype selects the score:
    "COV"    - number of fragments covering each position
    "STARTS" - number of fragment end points at each position
    "WPS"    - for a window of ``options.protection`` around each position,
               fragments spanning the window minus fragments that only
               partially overlap it

  Relies on the module-level ``args`` and ``options`` objects.

  Returns a list with one value per position from ``start`` to ``end``.
  """
  filteredReads = Intersecter()
  posRange = defaultdict(int)

  def dropped_by_downsampling():
    # The RNG is consulted only when downsampling is enabled, so the
    # random stream is consumed exactly as before.
    return options.downsample is not None and random.random() >= options.downsample

  def record_fragment(rstart, rend):
    # Shared bookkeeping for paired and single-end fragments.
    rlength = rend-rstart+1
    if not (options.minLength <= rlength <= options.maxLength):
      return
    filteredReads.add_interval(Interval(rstart, rend))
    if ctype == "COV":
      # Clamp to the window instead of testing every read position.
      for i in range(max(rstart, start), min(rend, end)+1):
        posRange[i] += 1
    elif ctype == "STARTS":
      if start <= rstart <= end:
        posRange[rstart] += 1
      if start <= rend <= end:
        posRange[rend] += 1

  for read in readIterator(args, chrom, start, end):
    if read.is_duplicate or read.is_qcfail or read.is_unmapped: continue
    if isSoftClipped(read.cigar): continue

    if read.is_paired:
      if read.mate_is_unmapped: continue
      if read.rnext != read.tid: continue
      # Use read1, or read2 only when the test on pnext+qlen puts its mate
      # before the window, so that a pair is recorded once.
      if not (read.is_read1 or (read.is_read2 and read.pnext+read.qlen < start)):
        continue
      if read.isize == 0: continue
      if dropped_by_downsampling(): continue
      rstart = min(read.pos, read.pnext)+1 # 1-based
      rend = rstart+abs(read.isize)-1 # end included
    else:
      if dropped_by_downsampling(): continue
      rstart = read.pos+1 # 1-based
      rend = rstart+aln_length(read.cigar)-1 # end included
    record_fragment(rstart, rend)

  if ctype == "WPS":
    protection = options.protection//2
    for pos in xrange(start, end+1):
      win_start, win_end = pos-protection, pos+protection
      gcount, bcount = 0, 0
      for frag in filteredReads.find(win_start, win_end):
        # A fragment with an end point inside the window counts against
        # the position; one spanning the whole window counts for it.
        if (frag.start > win_start) or (frag.end < win_end): bcount += 1
        else: gcount += 1
      posRange[pos] += gcount-bcount

  return [posRange[pos] for pos in xrange(start, end+1)]
Ejemplo n.º 5
0
def count_reads(transcripts, bam_iter, number_of_counts=1):
    """ Count the reads compatible with each transcript

    A read is counted for a transcript when every aligned block of the read
    lies inside an exon of that transcript and every junction length of the
    read is also a junction length of that transcript.

    :TODO rename
    :TODO change to cython
    Arguments
    ---------
    transcripts : list
        one entry per transcript, each a position-sorted list of exons
        indexed as ``exon[0]`` (start) and ``exon[1]`` (end)
    bam_iter : pysam.BamFileIterator
        obtained from a pysam.BamFile.fetch() call; each read must provide
        ``get_blocks()``
    number_of_counts : int
        currently unused; kept for interface compatibility

    Returns
    -------
    Array with one count per transcript, in the order of ``transcripts``.
    """
    n_transcripts = len(transcripts)
    out_counts = zeros(n_transcripts)
    tree = Intersecter()
    # Junction lengths per transcript are turned into sets once, up front,
    # rather than once per read inside the main loop.
    intron_sets = []
    # Assume exons are position sorted
    for ti, transcript in enumerate(transcripts):
        introns = set()
        for j, exon in enumerate(transcript):
            tree.add_interval(Interval(int(exon[0]), int(exon[1]),
                value={'anno': ti}))
            if j != 0:
                # previous exon end minus current exon start, matching the
                # sign convention used for read junctions below
                introns.add(transcript[j - 1][1] - exon[0])
        intron_sets.append(introns)
    for read in bam_iter:
        block_counter = zeros((n_transcripts,))
        intron_match = zeros((n_transcripts,))
        blocks = read.get_blocks()
        junction_lengths = set(
            blocks[i - 1][1] - blocks[i][0] for i in range(1, len(blocks)))
        for block in blocks:
            overlap = tree.find(block[0], block[1])
            if len(overlap) == 0:
                # One block outside all exons rules the read out for every
                # transcript, so the remaining blocks need not be checked.
                break
            for hit in overlap:
                if (block[0] >= hit.start) and (block[1] <= hit.end):
                    block_counter[hit.value['anno']] += 1
        for ij, introns in enumerate(intron_sets):
            if junction_lengths.issubset(introns):
                intron_match[ij] = 1
        smatch = nrepeat(len(blocks), n_transcripts)
        # Only the running total is returned, so the per-read vectors are
        # no longer collected into an unused matrix.
        out_counts += logical_and(block_counter == smatch, intron_match)
    return(out_counts)
Ejemplo n.º 6
0
def init_intersecter(hits):
    '''Build an Intersecter holding one interval per hit.

    Each hit is read positionally: element 0 is the interval start and
    element 1 is the interval end.
    '''
    index = Intersecter()
    add = index.add_interval
    for hit in hits:
        add(Interval(hit[0], hit[1]))
    return index
Ejemplo n.º 7
0
def init_intersecter(hits):
    '''Return an Intersecter populated from ``hits``.

    ``hits`` is an iterable of indexable records; ``record[0]`` and
    ``record[1]`` are used as the start and end of each stored interval.
    '''
    tree = Intersecter()
    for record in hits:
        span = Interval(record[0], record[1])
        tree.add_interval(span)
    return tree
Ejemplo n.º 8
0
def load_restriction_fragment(in_file, minfragsize=None, maxfragsize=None, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Adapted from the function of the same name in HiC-Pro
    (https://github.com/nservant/HiC-Pro/blob/master/scripts/mapped_2hic_fragments.py)
    by Nicolas Servant, Eric Viara.

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome. Each interval carries a value dict with
    the fragment 'name' and its 'midPoint'.

    in_file = input file [character]
    minfragsize = fragments shorter than this are discarded (None = no limit)
    maxfragsize = fragments longer than this are discarded (None = no limit)
    verbose = verbose mode [logical]

    Lines with fewer than four tab-separated fields are reported and skipped.
    """
    resFrag = {}
    if verbose:
        print("## Loading Restriction File Intervals '" + in_file + "'...")

    # 'with' guarantees the handle is released even if a line fails to parse.
    with open(in_file) as bed_handle:
        for nline, line in enumerate(bed_handle, 1):
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                # nline is an int: it must be converted before concatenation,
                # otherwise the warning itself raises TypeError.
                print("Warning : wrong input format in line " + str(nline) + ". Not a BED file !?")
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)  # + 1
            end = int(end)
            midPoint = (start + end)/2
            fragl = abs(end - start)
            name = name.strip()

            ## Discard fragments outside the size range
            if ((minfragsize is not None and fragl < int(minfragsize)) or
                    (maxfragsize is not None and fragl > int(maxfragsize))):
                print("Warning : fragment " + name + " [" + str(fragl) + "] outside of range. Discarded")
                continue

            tree = resFrag.get(chromosome)
            if tree is None:
                tree = resFrag[chromosome] = Intersecter()
            tree.add_interval(Interval(start, end, value={'name': name, 'midPoint': midPoint}))

    return resFrag
def load_restriction_fragment(in_file, minfragsize=None, maxfragsize=None, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome. Each interval carries a value dict with
    the fragment 'name' and a boolean 'filter' that is True when the fragment
    length is below minfragsize or above maxfragsize.

    in_file = input file [character]
    minfragsize = lower bound on fragment length (None = no limit)
    maxfragsize = upper bound on fragment length (None = no limit)
    verbose = verbose mode [logical]

    Lines with fewer than four tab-separated fields are reported and skipped.
    """
    resFrag = {}
    if verbose:
        # Single-argument print(...) emits the same text under Python 2 and 3.
        print("## Loading Restriction File Intervals ' %s '..." % (in_file,))

    nline = 0
    nfilt = 0
    # 'with' releases the handle even when int() fails on a bad line.
    with open(in_file) as bed_handle:
        for line in bed_handle:
            nline += 1
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                print("Warning : wrong input format in line %d . Not a BED file !?" % nline)
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)  # + 1
            end = int(end)
            fragl = abs(end - start)
            name = name.strip()

            ## Flag fragments outside the size range
            filt = ((minfragsize is not None and fragl < int(minfragsize)) or
                    (maxfragsize is not None and fragl > int(maxfragsize)))
            if filt:
                nfilt += 1

            tree = resFrag.get(chromosome)
            if tree is None:
                tree = resFrag[chromosome] = Intersecter()
            # Out-of-range fragments are still stored; they are only marked.
            tree.add_interval(Interval(start, end, value={'name': name, 'filter': filt}))

    if nfilt > 0:
        print("Warning :  %d fragment(s) outside of range and discarded.  %d  remaining."
              % (nfilt, nline - nfilt))

    return resFrag
Ejemplo n.º 10
0
def load_restriction_fragment(in_file, minfragsize=None, maxfragsize=None, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome. Each interval carries a value dict with
    the fragment 'name'.

    in_file = input file [character]
    minfragsize = fragments shorter than this are discarded (None = no limit)
    maxfragsize = fragments longer than this are discarded (None = no limit)
    verbose = verbose mode [logical]

    Lines with fewer than four tab-separated fields are reported and skipped.
    """
    resFrag = {}
    if verbose:
        # Single-argument print(...) emits the same text under Python 2 and 3.
        print("## Loading Restriction File Intervals ' %s '..." % (in_file,))

    # 'with' releases the handle even when int() fails on a bad line.
    with open(in_file) as bed_handle:
        for nline, line in enumerate(bed_handle, 1):
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                print("Warning : wrong input format in line %d . Not a BED file !?" % nline)
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)  # + 1
            end = int(end)
            fragl = abs(end - start)
            name = name.strip()

            ## Discard fragments outside the size range
            if ((minfragsize is not None and fragl < int(minfragsize)) or
                    (maxfragsize is not None and fragl > int(maxfragsize))):
                print("Warning : fragment  %s  [ %d ] outside of range. Discarded" % (name, fragl))
                continue

            # Direct dict lookup instead of scanning resFrag.keys()
            tree = resFrag.get(chromosome)
            if tree is None:
                tree = resFrag[chromosome] = Intersecter()
            tree.add_interval(Interval(start, end, value={'name': name}))

    return resFrag
Ejemplo n.º 11
0
def load_BED(in_file, verbose=False):
    """
    Read a BED file and store the intervals in a tree
    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome

    BED files are half-open, meaning that a bin ]100, 200] covers the bases 101 to 200

    in_file = input file [character]
    verbose = verbose mode [logical]

    Returns a tuple (trees, featureNames): the per-chromosome trees, whose
    intervals carry value {'pos': zero-based line index}, and the list of
    feature names in file order. Lines with fewer than four tab-separated
    fields are reported on stderr and skipped.
    """
    x = {}
    if verbose:
        # Single-argument print(...) emits the same text under Python 2 and 3.
        print("## Loading BED file ' %s '..." % (in_file,))
    featureNames = []
    nline = 0
    with open(in_file) as bed_handle:
        for line in bed_handle:
            if nline > 0 and nline % 5000 == 0 and verbose:
                print("## %d features loaded ..." % nline)
            nline += 1
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                sys.stderr.write(
                    "Warning : wrong input format in line %d . Not a BED file !?\n" % nline)
                continue

            # BED files are zero-based, half-open as Intervals objects
            start = int(start)
            end = int(end)
            featureNames.append(name.strip())
            tree = x.get(chromosome)
            if tree is None:
                tree = x[chromosome] = Intersecter()
            # NOTE(review): 'pos' is the line index, which no longer matches
            # the index into featureNames once a malformed line has been
            # skipped - confirm which one callers expect.
            tree.add_interval(Interval(start, end, value={'pos': nline - 1}))
    # The 'with' block has already closed the file.
    return (x, featureNames)
Ejemplo n.º 12
0
def load_bed(in_file, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome

    in_file = input file [character]
    verbose = verbose mode [logical]

    Only the first three tab-separated fields (chromosome, start, end) are
    used. Lines with fewer fields are reported on stderr and skipped.
    """
    intervals = {}
    if verbose:
        sys.stderr.write("## Loading BED file ' %s '...\n" % (in_file,))

    # 'with' releases the handle even when int() fails on a bad line.
    with open(in_file) as bed_handle:
        for nline, line in enumerate(bed_handle, 1):
            bedtab = line.strip().split("\t")
            try:
                chromosome, start, end = bedtab[:3]
            except ValueError:
                sys.stderr.write(
                    "Warning : wrong input format in line %d . Not a BED file !?\n" % nline)
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)  # + 1
            end = int(end)

            tree = intervals.get(chromosome)
            if tree is None:
                tree = intervals[chromosome] = Intersecter()
            tree.add_interval(Interval(start, end))

    return intervals
Ejemplo n.º 13
0
def load_restriction_fragment(in_file, verbose):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome. Each interval carries a value dict with
    the fragment 'name'.

    in_file = input file [character]
    verbose = verbose mode [logical]

    Lines with fewer than four tab-separated fields are reported on stderr
    and skipped.
    """
    import sys

    resFrag = {}
    if verbose:
        # Single-argument print(...) emits the same text under Python 2 and 3.
        print("## Loading Restriction File Intervals ' %s '..." % (in_file,))

    # 'with' releases the handle even when int() fails on a bad line.
    with open(in_file) as bed_handle:
        for nline, line in enumerate(bed_handle, 1):
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                # Still skipped, but no longer silently. The message goes to
                # stderr so that regular output is left untouched.
                sys.stderr.write(
                    "Warning : wrong input format in line %d. Not a BED file !?\n" % nline)
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)  # + 1
            end = int(end)
            name = name.strip()
            # Direct dict lookup instead of scanning resFrag.keys()
            tree = resFrag.get(chromosome)
            if tree is None:
                tree = resFrag[chromosome] = Intersecter()
            tree.add_interval(Interval(start, end, value={'name': name}))
    return resFrag
Ejemplo n.º 14
0
def maps_gene(mapped):
    '''Determine if the mapped alignment falls within a gene.

    ``mapped`` must provide the keys 'genome', 'refStart' and 'refEnd'.
    The gene intervals of each genome are loaded from ``db.feature`` on
    first use and cached in the module-level ``intersecters`` dict.

    Returns the result of the interval query for refStart..refEnd, with
    gene uids as interval values.
    '''
    global intersecters

    genome = mapped['genome']
    try:
        intersecter = intersecters[genome]
    except KeyError:
        genes = db.feature.find({'genome': genome, 'type': 'gene'})

        intersecter = Intersecter()

        # Interval end is exclusive, need to +1 to line up with actual position
        # (a plain loop: nothing is collected, so no throwaway list is built)
        for gene in genes:
            intersecter.add_interval(
                Interval(gene['start'], gene['end'] + 1, gene['uid']))

        intersecters[genome] = intersecter

    return intersecter.find(mapped['refStart'], mapped['refEnd'])
Ejemplo n.º 15
0
def load_BED(in_file, exclusionSize=0, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome

    in_file = input file [character]
    exclusionSize = size of the flanking regions recorded on both sides of
                    each feature (0 = none) [integer]
    verbose = verbose mode [logical]

    Returns a tuple (x, x_ex): the feature trees and the exclusion-region
    trees, both keyed by chromosome. Exclusion intervals are named
    '<name>_up' and '<name>_dwn'. Lines with fewer than four tab-separated
    fields are reported and skipped.
    """
    x = {}
    x_ex = {}
    if verbose:
        # Single-argument print(...) emits the same text under Python 2 and 3.
        print("## Loading BED file ' %s '..." % (in_file,))
    with open(in_file) as bed_handle:
        for nline, line in enumerate(bed_handle, 1):
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                print("Warning : wrong input format in line %d . Not a BED file !?" % nline)
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)  # + 1
            end = int(end)
            name = name.strip()
            # Direct dict lookup instead of scanning x.keys()
            tree = x.get(chromosome)
            if tree is None:
                tree = x[chromosome] = Intersecter()
            tree.add_interval(Interval(start, end, value={'name': name}))

            ## Exclusion regions
            if exclusionSize > 0:
                flank = int(exclusionSize)
                tree_ex = x_ex.get(chromosome)
                if tree_ex is None:
                    tree_ex = x_ex[chromosome] = Intersecter()
                tree_ex.add_interval(Interval(start - flank, start, value={'name': str(name) + "_up"}))
                tree_ex.add_interval(Interval(end, end + flank, value={'name': str(name) + "_dwn"}))
    # The 'with' block has already closed the file.
    return (x, x_ex)
Ejemplo n.º 16
0
def maps_gene(mapped):
    """Determine if the mapped alignment falls within a gene.

    ``mapped`` must provide the keys 'genome', 'refStart' and 'refEnd'.
    On the first query for a genome its gene features are read from
    ``db.feature`` and indexed; the index is then kept in the module-level
    ``intersecters`` dict for later calls.

    Returns the result of the interval query for refStart..refEnd, with
    gene uids as interval values.
    """
    global intersecters

    genome = mapped['genome']
    try:
        intersecter = intersecters[genome]
    except KeyError:
        genes = db.feature.find({'genome': genome, 'type': 'gene'})

        intersecter = Intersecter()

        # Interval end is exclusive, need to +1 to line up with actual position
        # (explicit loop instead of a list comprehension run for side effects)
        for gene in genes:
            intersecter.add_interval(
                Interval(gene['start'], gene['end'] + 1, gene['uid']))

        intersecters[genome] = intersecter

    return intersecter.find(mapped['refStart'], mapped['refEnd'])
Ejemplo n.º 17
0
def load_BED(in_file, exclusionSize=0, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome

    in_file = input file [character]
    exclusionSize = size of the flanking regions recorded on both sides of
                    each feature (0 = none) [integer]
    verbose = verbose mode [logical]

    Returns a tuple (x, x_ex): the feature trees and the exclusion-region
    trees, both keyed by chromosome. Exclusion intervals are named
    '<name>_up' and '<name>_dwn'. Lines with fewer than four tab-separated
    fields are reported and skipped.
    """
    x = {}
    x_ex = {}
    if verbose:
        # Single-argument print(...) emits the same text under Python 2 and 3.
        print("## Loading BED file ' %s '..." % (in_file,))
    nline = 0
    with open(in_file) as bed_handle:
        for line in bed_handle:
            nline += 1
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                print("Warning : wrong input format in line %d . Not a BED file !?" % nline)
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)  # + 1
            end = int(end)
            name = name.strip()

            tree = x.get(chromosome)
            if tree is None:
                tree = x[chromosome] = Intersecter()
            tree.add_interval(Interval(start, end, value={'name': name}))

            ## Exclusion regions
            if exclusionSize > 0:
                flank = int(exclusionSize)
                tree_ex = x_ex.get(chromosome)
                if tree_ex is None:
                    tree_ex = x_ex[chromosome] = Intersecter()
                tree_ex.add_interval(
                    Interval(start - flank,
                             start,
                             value={'name': str(name) + "_up"}))
                tree_ex.add_interval(
                    Interval(end,
                             end + flank,
                             value={'name': str(name) + "_dwn"}))
    # The 'with' block has already closed the file.
    return (x, x_ex)
Ejemplo n.º 18
0
def compare_two_transcripts(trans1, trans2, transcript_dict, 
        afe=False):
    """
    Returns the splice differences between two transcripts.
    Single exon-comparisons are ignored.

    The transcript with the smaller minimum exon start becomes the
    reference (``s1``); its exons are indexed in an interval tree and the
    exons of the other transcript (``s2``) are classified against it.

    Parameters
    ----------
    trans1 : string 
        transcript of interest
    trans2 : string 
        second transcript of interest
    transcript_dict : a dictionary of transcript names with 
    values being a list of exons, each exon indexed as
    (start, end, exon number)
    afe : bool
       whether to include alternate start and ends
       (not referenced in the body of this function)

    :TODO make a better return
    :TODO maybe include something similar to to_plot
    Returns
    -------
    ``([], [])`` when either transcript has at most one exon, otherwise a
    tuple ``(matching_exons, skipped_exons)``:

    matching_exons : list of (start, end, (s1 exon number, s2 exon number),
        (0, 0)) for exons with identical coordinates in both transcripts
    skipped_exons : EventCollection holding the 'skipped_exon' DiffEvents
        followed by the alternate start/end ('AS'/'AE') DiffEvents

    Exclusive junctions are collected internally but are not returned.

    Notes
    -----
    The exon lists taken from ``transcript_dict`` are sorted in place.
    """
    # TODO refactor this
    t1 = transcript_dict[trans1]
    t2 = transcript_dict[trans2]
    tree = Intersecter()
    starts1 = [i[0] for i in t1]
    starts2 = [i[0] for i in t2]
    reverse = False
    # s1 is whichever transcript starts first; s2 is compared against it.
    # s2_beg is assigned here but not read anywhere below.
    if min(starts1) <= min(starts2):
        s1 = t1
        s2 = t2
        s2_beg = min(starts2)
    else:
        s1 = t2 
        s2 = t1
        reverse = True
        s2_beg = min(starts1)
    # torder lists the transcript names in (s1, s2) order.
    if reverse: torder = (trans2, trans1)
    else: torder = (trans1, trans2)
    # Ignore single-exon stuff
    if len(s1) <= 1 or len(s2) <= 1:
        return([], [])
    # Index the reference exons; the interval value keeps the exon number.
    for i in s1:
        tree.add_interval(Interval(int(i[0]), int(i[1]), 
            value={'anno':i[2]}))
    matching_exons = []
    exclusive_juncs = []
    skipped_exons = []
    altends = []
    # Maps an s2 exon number to the s1 exon number it overlapped.
    exon_match = {}
    # In-place sort by start: this reorders the lists stored in
    # transcript_dict as well.
    s1.sort(key=lambda x: x[0])
    s2.sort(key=lambda x: x[0])
    max_exon_1 =  s1[-1][2]
    max_exon_2 = s2[-1][2]
    #end_position_s2 = max([i[1] for i in s2])
    s1_end = max([i[1] for i in s1])
    prev_match = None
    # Exon numbers decreasing with position are taken as the minus strand.
    if max_exon_1 < s1[0][2]:
        strand = -1
    else:
        strand = 1
    for pcurr in range(len(s2)):
        start, end, exon_n = s2[pcurr]
        overlap = tree.find(int(start), int(end))
        if len(overlap) == 0:
            # The s2 exon overlaps no s1 exon.
            if prev_match and (start < s1_end):
                #skipped exons
                cigar = _generate_cigar(s2, pcurr, mskip=1)
                try:
                    # NOTE(review): when this comparison is False, ocigar
                    # and nexon are not assigned in this iteration, so the
                    # DiffEvent below uses values left over from an earlier
                    # iteration or fails on an unbound name - confirm.
                    if exon_match[exon_n - strand] == prev_match.value['anno']:
                        try:
                            nm = tree.find(*s2[pcurr + 1][0:2])[0]
                            ocigar = [(3, nm.start - prev_match.end)]
                            nexon = nm.value['anno']
                        except IndexError:
                            # No following s2 exon, or it has no overlap:
                            # fall back to the s1 exon after the last match.
                            nm=s1[_get_by_exonn(prev_match.value['anno']+strand,s1)] 
                            ocigar = [(3,nm[0] - prev_match.end)]
                            nexon = nm[2]
                    skipped_exons.append(DiffEvent('skipped_exon', start, end,
                            torder, cigar2=cigar, cigar1 = ocigar, 
                            exon_num = (None, exon_n), 
                            exon2=(prev_match.value['anno'], nexon))
                            )
                except KeyError:
                    # Multiple skipped exons
                    # (the previous s2 exon had no recorded match, so the
                    # last skipped-exon event is extended instead)
                    ncig = _generate_cigar(s2, pcurr, mskip=1)[1:]
                    skipped_exons[-1]._extend(ncig, cig=2)
            elif start > s1_end: 
                # The s2 exon lies beyond the end of s1: alternate end.
                if prev_match:
                    cigar = _generate_cigar(s2, pcurr, mskip=1)
                    pm = tree.find(*s2[pcurr - 1][0:2])[0]
                    pexon = pm.value['anno']
                    ocigar = []
                    # Walk the s1 exons from the previous match to the last
                    # one, emitting (0, start, end) and (3, gap) entries.
                    for i in range(pexon, max_exon_1+strand, strand):
                        narg = _get_by_exonn(i, s1)
                        ocigar.append((0, s1[narg][0], s1[narg][1]))
                        try:
                            ocigar.append((3, s1[narg][1] - s1[narg+1][0]))
                        except IndexError:
                            pass
                    #:TODO extend ocigar till end?
                    altends.append(DiffEvent('AE', start, end,
                        torder, cigar2=cigar, cigar1=ocigar,
                        exon_num = (None, exon_n)))
                else: 
                    pass
            else: 
                # Alternate start site that starts in between exons
                # of other transcript
                cigar = _generate_cigar(s2, pcurr, mskip=1)[:-1]
                try:
                    nm = tree.find(*s2[pcurr + 1][0:2])[0]
                except IndexError:
                    # NOTE(review): leftover interactive debugging hook.
                    # nm is not assigned on this path, so the next
                    # statement fails once the shell exits.
                    from IPython import embed
                    embed()
                nexon = nm.value['anno']
                narg = _get_by_exonn(nexon - strand, s1)
                pmatch = s1[narg]
                ocigar = [(0, pmatch[1] - pmatch[0]),
                        (3, nm.start - pmatch[1])]
                altends.append(DiffEvent('AS', start, end,
                        torder, cigar2=cigar, cigar1 = ocigar))
        elif len(overlap) == 1:
            if start == overlap[0].start and end == overlap[0].end:
                # Identical coordinates: record the exon as matching.
                s1_exon_n = overlap[0].value['anno']
                matching_exons.append((start, end, (s1_exon_n, 
                    exon_n), (0, 0)))
                if prev_match:
                    if s1_exon_n - prev_match.value['anno']  == strand: 
                        pass
                    else:
                        # Difference in exon matches
                        # (s1 exons between the two matches were skipped
                        # by s2)
                        mskip = abs(s1_exon_n - prev_match.value['anno'] ) - 1
                        narg = _get_by_exonn(prev_match.value['anno']+strand, s1) 
                        s_s1 = s1[narg] # skipped s1
                        cigar = _generate_cigar(s1, narg, mskip=mskip)
                        ocigar = [(3, start - s2[pcurr-1][1])]
                        # Remove previous one
                        skipped_exons.append(
                                DiffEvent('skipped_exon', 
                                s_s1[0], s_s1[1], torder, 
                                cigar2 = ocigar, cigar1 = cigar, 
                                exon_num = (s_s1[2], None), 
                                exon2 = (exon_n-strand, exon_n)))
                prev_match = overlap[0]
            else:
                # Partial overlap: boundaries differ on at least one side.
                sstart = min(start, overlap[0].start)
                ssend = max(end, overlap[0].end)
                # Ignore 5' or 3' differences
                if (exon_n == max_exon_2 and
                        overlap[0].value['anno'] == max_exon_1):
                    if end == overlap[0].end:
                        prev_match = overlap[0]
                else:
                    exclusive_juncs.append(
                            (sstart, ssend,
                            (overlap[0].value['anno'], exon_n), 
                            (overlap[0].start - start, overlap[0].end - end) ))
            # Deal with partial matches
            # (this runs for every single-overlap case, so it supersedes
            # the conditional prev_match assignments above)
            prev_match = overlap[0]
            exon_match[exon_n] = int(overlap[0].value['anno'])
        else:
            # An s2 exon overlapping several s1 exons is ignored.
            pass
    # NOTE(review): transcript_ids receives the exon lists s1 and s2, not
    # the transcript names - confirm this is what EventCollection expects.
    skipped_exons = EventCollection(transcript_ids = [s1, s2], events=skipped_exons)
    skipped_exons.events.extend(altends)
    return(matching_exons, skipped_exons)
Ejemplo n.º 19
0
The threshold can also be adjusted (thresh_max).
"""
# Scan the values of smooth_list and collect candidate regions in `nucl`.
# `pos` is the offset of the current value; `startpos` shifts it to an
# absolute coordinate. The initial values of pos, started, thresh and count
# are set outside the visible part of this snippet.
for line in smooth_list:
    # A positive value while no region is open starts a new region here.
    if line > 0 and not started:
        started = True
        start_base = pos + startpos

    # Each negative value inside an open region counts towards closing it.
    # thresh is only reset when a region closes, so the negative values do
    # not have to be consecutive.
    if line < 0 and started:
        thresh += 1

    # Enough negative values seen: close the region and keep it when its
    # length lies within nucl_min..nucl_max.
    if thresh >= thresh_max:
        thresh = 0
        started = False
        end_base = pos + startpos
        region_len = end_base - start_base
        if region_len >= nucl_min and region_len <= nucl_max:
            nucl.add_interval(Interval(start_base, end_base))
            count += 1

    pos += 1
## initialize a list for data from find_contig() method
contig_list = []

# NOTE(review): end_base is assigned only when a region was closed in the
# loop above - confirm it is initialised before the loop.
for read in nucl.find(0, end_base + 1):
    ## for each potential nucleosome interval, call find contig method.
    find_contig(read, smooth_list, startpos, contig_list)

## lastly, call this method to print all probable nucleosome ranges to a bed file
calc_nucl_distance(contig_list, smooth_list, startpos, chrom_num)
                        if read.rnext != read.tid: continue
                        if read.is_read1 or (read.is_read2
                                             and read.pnext + read.qlen <
                                             regionStart - protection - 1):
                            if read.isize == 0: continue
                            if options.downsample != None and random.random(
                            ) >= options.downsample:
                                continue
                            rstart = min(read.pos, read.pnext) + 1  # 1-based
                            lseq = abs(read.isize)
                            rend = rstart + lseq - 1  # end included
                            if minInsSize != None and ((lseq < minInsSize) or
                                                       (lseq > maxInsSize)):
                                continue

                            filteredReads.add_interval(Interval(rstart, rend))
                            #print read.qname,rstart,rend,rend-rstart,abs(read.isize)
                            for i in range(rstart, rend + 1):
                                if i >= regionStart and i <= regionEnd:
                                    posRange[i][0] += 1
                            if rstart >= regionStart and rstart <= regionEnd:
                                posRange[rstart][1] += 1
                            if rend >= regionStart and rend <= regionEnd:
                                posRange[rend][1] += 1
                    else:
                        if options.downsample != None and random.random(
                        ) >= options.downsample:
                            continue
                        rstart = read.pos + 1  # 1-based
                        lseq = aln_length(read.cigar)
                        rend = rstart + lseq - 1  # end included
Ejemplo n.º 21
0
def regionTree1(mylist):
    """Build an interval tree from an iterable of ``(start, end)`` pairs.

    Every pair is wrapped in an ``Interval`` and inserted into a fresh
    ``Intersecter``, which is returned once all pairs have been added.
    """
    tree = Intersecter()
    for bounds in mylist:
        lower, upper = bounds
        tree.add_interval(Interval(lower, upper))
    return tree
Ejemplo n.º 22
0
def compare_two_transcripts(trans1, trans2, transcript_dict, afe=False):
    """
    Return the splice differences between two transcripts.

    The transcript whose smallest exon start is lowest (ties go to
    ``trans1``) is loaded into an interval tree; the exons of the other
    transcript are then walked in start order and classified against that
    tree as exact matches, skipped exons, or alternate starts/ends.
    Comparisons involving a single-exon transcript are ignored.

    Parameters
    ----------
    trans1 : string
        transcript of interest (key into ``transcript_dict``)
    trans2 : string
        second transcript of interest (key into ``transcript_dict``)
    transcript_dict : dict
        maps transcript names to a list of exons; each exon is indexed as
        ``exon[0]`` = start, ``exon[1]`` = end, ``exon[2]`` = exon number.
        NOTE: both exon lists are sorted in place by start.
    afe : bool
        whether to include alternate start and ends.
        NOTE(review): this argument is never read in the body.

    :TODO make a better return
    :TODO maybe include something similar to to_plot
    Returns
    -------
    matching_exons : list
        one ``(start, end, (s1_exon_n, exon_n), (0, 0))`` tuple per exon
        whose coordinates are identical in both transcripts
    skipped_exons : EventCollection
        the 'skipped_exon' DiffEvents followed by the 'AE' / 'AS'
        DiffEvents

    If either transcript has at most one exon, ``([], [])`` is returned
    instead.
    """
    # TODO refactor this
    t1 = transcript_dict[trans1]
    t2 = transcript_dict[trans2]
    tree = Intersecter()
    starts1 = [i[0] for i in t1]
    starts2 = [i[0] for i in t2]
    # s1 is the transcript that begins first (it gets indexed in the tree);
    # s2 is the one that is walked exon by exon. `reverse` records whether
    # the caller's order was swapped.
    reverse = False
    if min(starts1) <= min(starts2):
        s1 = t1
        s2 = t2
        s2_beg = min(starts2)  # NOTE(review): s2_beg is never used below
    else:
        s1 = t2
        s2 = t1
        reverse = True
        s2_beg = min(starts1)
    # Transcript names in (s1, s2) order, passed to every DiffEvent
    if reverse: torder = (trans2, trans1)
    else: torder = (trans1, trans2)
    # Ignore single-exon stuff
    if len(s1) <= 1 or len(s2) <= 1:
        return ([], [])
    # Index every s1 exon, tagged with its exon number
    for i in s1:
        tree.add_interval(Interval(int(i[0]), int(i[1]), value={'anno': i[2]}))
    matching_exons = []
    # NOTE(review): exclusive_juncs is filled below but never returned
    exclusive_juncs = []
    skipped_exons = []
    altends = []
    # Maps an s2 exon number to the s1 exon number it overlapped
    exon_match = {}
    # In-place sorts: s1/s2 alias the lists stored in transcript_dict
    s1.sort(key=lambda x: x[0])
    s2.sort(key=lambda x: x[0])
    # Exon numbers of the exons with the highest start coordinate
    max_exon_1 = s1[-1][2]
    max_exon_2 = s2[-1][2]
    #end_position_s2 = max([i[1] for i in s2])
    s1_end = max([i[1] for i in s1])
    # Tree interval of the most recent s1 exon that overlapped an s2 exon
    prev_match = None
    # Exon numbers that decrease along the coordinate axis are treated as
    # strand -1; `strand` is then used as the step between exon numbers.
    if max_exon_1 < s1[0][2]:
        strand = -1
    else:
        strand = 1
    for pcurr in range(len(s2)):
        start, end, exon_n = s2[pcurr]
        overlap = tree.find(int(start), int(end))
        if len(overlap) == 0:
            # This s2 exon overlaps nothing in s1
            if prev_match and (start < s1_end):
                #skipped exons
                # Exon lies after an earlier match and before the end of s1
                cigar = _generate_cigar(s2, pcurr, mskip=1)
                try:
                    # A KeyError here means the preceding s2 exon number
                    # has no recorded match (handled below).
                    # NOTE(review): when this comparison is False, ocigar
                    # and nexon are not assigned in this iteration before
                    # being used in the DiffEvent call - confirm intent.
                    if exon_match[exon_n - strand] == prev_match.value['anno']:
                        try:
                            # s1 exon overlapped by the next s2 exon
                            nm = tree.find(*s2[pcurr + 1][0:2])[0]
                            ocigar = [(3, nm.start - prev_match.end)]
                            nexon = nm.value['anno']
                        except IndexError:
                            # No next s2 exon, or it overlaps nothing:
                            # fall back to the s1 exon numbered right after
                            # the previous match
                            nm = s1[_get_by_exonn(
                                prev_match.value['anno'] + strand, s1)]
                            ocigar = [(3, nm[0] - prev_match.end)]
                            nexon = nm[2]
                    skipped_exons.append(
                        DiffEvent('skipped_exon',
                                  start,
                                  end,
                                  torder,
                                  cigar2=cigar,
                                  cigar1=ocigar,
                                  exon_num=(None, exon_n),
                                  exon2=(prev_match.value['anno'], nexon)))
                except KeyError:
                    # Multiple skipped exons
                    # Extend the most recent skipped-exon event instead of
                    # creating a new one
                    ncig = _generate_cigar(s2, pcurr, mskip=1)[1:]
                    skipped_exons[-1]._extend(ncig, cig=2)
            elif start > s1_end:
                # Exon starts beyond the last coordinate of s1
                if prev_match:
                    cigar = _generate_cigar(s2, pcurr, mskip=1)
                    # s1 exon overlapped by the previous s2 exon
                    pm = tree.find(*s2[pcurr - 1][0:2])[0]
                    pexon = pm.value['anno']
                    ocigar = []
                    # Walk s1 by exon number from that exon up to and
                    # including max_exon_1
                    for i in range(pexon, max_exon_1 + strand, strand):
                        narg = _get_by_exonn(i, s1)
                        ocigar.append((0, s1[narg][0], s1[narg][1]))
                        try:
                            ocigar.append((3, s1[narg][1] - s1[narg + 1][0]))
                        except IndexError:
                            # narg is the last element of s1: no gap entry
                            pass
                    #:TODO extend ocigar till end?
                    altends.append(
                        DiffEvent('AE',
                                  start,
                                  end,
                                  torder,
                                  cigar2=cigar,
                                  cigar1=ocigar,
                                  exon_num=(None, exon_n)))
                else:
                    pass
            else:
                # Alternate start site that starts in between exons
                # of other transcript
                cigar = _generate_cigar(s2, pcurr, mskip=1)[:-1]
                try:
                    nm = tree.find(*s2[pcurr + 1][0:2])[0]
                except IndexError:
                    # NOTE(review): leftover debugging hook - this opens an
                    # interactive IPython shell, and `nm` is still unbound
                    # when execution continues past it.
                    from IPython import embed
                    embed()
                nexon = nm.value['anno']
                narg = _get_by_exonn(nexon - strand, s1)
                pmatch = s1[narg]
                ocigar = [(0, pmatch[1] - pmatch[0]),
                          (3, nm.start - pmatch[1])]
                altends.append(
                    DiffEvent('AS',
                              start,
                              end,
                              torder,
                              cigar2=cigar,
                              cigar1=ocigar))
        elif len(overlap) == 1:
            if start == overlap[0].start and end == overlap[0].end:
                # Identical coordinates in both transcripts
                s1_exon_n = overlap[0].value['anno']
                matching_exons.append(
                    (start, end, (s1_exon_n, exon_n), (0, 0)))
                if prev_match:
                    if s1_exon_n - prev_match.value['anno'] == strand:
                        # Consecutive s1 exon numbers: nothing was skipped
                        pass
                    else:
                        # Difference in exon matches
                        # mskip = number of s1 exon numbers between the
                        # previous match and this one
                        mskip = abs(s1_exon_n - prev_match.value['anno']) - 1
                        narg = _get_by_exonn(prev_match.value['anno'] + strand,
                                             s1)
                        s_s1 = s1[narg]  # skipped s1
                        cigar = _generate_cigar(s1, narg, mskip=mskip)
                        # Gap between the previous s2 exon's end and this
                        # exon's start
                        ocigar = [(3, start - s2[pcurr - 1][1])]
                        # Remove previous one
                        skipped_exons.append(
                            DiffEvent('skipped_exon',
                                      s_s1[0],
                                      s_s1[1],
                                      torder,
                                      cigar2=ocigar,
                                      cigar1=cigar,
                                      exon_num=(s_s1[2], None),
                                      exon2=(exon_n - strand, exon_n)))
                prev_match = overlap[0]
            else:
                # Partial overlap: span covering both exons
                sstart = min(start, overlap[0].start)
                ssend = max(end, overlap[0].end)
                # Ignore 5' or 3' differences
                if (exon_n == max_exon_2
                        and overlap[0].value['anno'] == max_exon_1):
                    if end == overlap[0].end:
                        prev_match = overlap[0]
                else:
                    exclusive_juncs.append(
                        (sstart, ssend, (overlap[0].value['anno'], exon_n),
                         (overlap[0].start - start, overlap[0].end - end)))
            # Deal with partial matches
            # Runs for every single-overlap exon, so the prev_match
            # assignments inside the branches above are superseded here.
            prev_match = overlap[0]
            exon_match[exon_n] = int(overlap[0].value['anno'])
        else:
            # s2 exon overlapping more than one s1 exon: not classified
            pass
    # NOTE(review): transcript_ids receives the exon lists (s1, s2), not
    # the transcript names held in torder - confirm this is intended.
    skipped_exons = EventCollection(transcript_ids=[s1, s2],
                                    events=skipped_exons)
    skipped_exons.events.extend(altends)
    return (matching_exons, skipped_exons)
 if isSoftClipped(read.cigar): continue
 
 if read.is_paired:
   if read.mate_is_unmapped: continue
   if read.rnext != read.tid: continue
   if read.is_read1 or (not options.pipe and read.is_read2 and read.pnext+read.qlen < start):
     if read.isize == 0: continue
     if options.downsample != None and random.random() >= options.downsample: continue
     if options.random:
       rstart = min(read.pos,read.pnext)+1+random.randint(-5,5) # 1-based
       rend = rstart+abs(read.isize)-1+random.randint(-5,5) # end included
     else:
       rstart = min(read.pos,read.pnext)+1 # 1-based
       rend = rstart+abs(read.isize)-1 # end included
     
     filteredReads.add_interval(Interval(rstart,rend))
     #print read.qname,rstart,rend,rend-rstart,abs(read.isize)
     for i in range(rstart,rend+1):
       if i >= start and i <= end:
         posRange[i][0]+=1
     if rstart >= start and rstart <= end:
       posRange[rstart][1]+=1
     if rend >= start and rend <= end:
       posRange[rend][1]+=1
 else:
   if options.downsample != None and random.random() >= options.downsample: continue
   if options.random:
     rstart = read.pos+1+random.randint(-5,5) # 1-based
     rend = rstart+aln_length(read.cigar)-1+random.randint(-5,5) # end included
   else:
     rstart = read.pos+1 # 1-based
Ejemplo n.º 24
0
def calculate_score(chrom, start, end, ctype="WPS"):
    """Return one score per position of ``[start, end]`` on ``chrom``.

    Reads are pulled from ``readIterator(args, chrom, start, end)``.
    Duplicate, QC-failed, unmapped and soft-clipped reads are dropped, as
    are paired reads whose mate is unmapped or on another reference. Each
    surviving read is turned into a 1-based, end-inclusive interval (the
    whole fragment for paired reads, the aligned span for single reads),
    optionally downsampled via ``options.downsample``, and kept only when
    its length lies within ``options.minLength``..``options.maxLength``.

    Parameters
    ----------
    chrom : passed through unchanged to ``readIterator``
    start, end : int
        first and last position (inclusive) to score
    ctype : str
        "COV"    - number of kept intervals covering each position
        "STARTS" - number of kept interval end points at each position
        "WPS"    - for a window of ``options.protection // 2`` positions on
                   either side, intervals spanning the whole window minus
                   intervals that begin or end inside it
        Any other value yields zeros.

    Returns
    -------
    list
        ``end - start + 1`` scores, in position order.
    """
    filteredReads = Intersecter()
    posRange = defaultdict(int)
    for read in readIterator(args, chrom, start, end):
        if read.is_duplicate or read.is_qcfail or read.is_unmapped:
            continue
        if isSoftClipped(read.cigar):
            continue

        if read.is_paired:
            if read.mate_is_unmapped:
                continue
            if read.rnext != read.tid:
                continue
            # Use read1, or read2 when its mate position plus the read
            # length falls before the region start.
            if not (read.is_read1 or
                    (read.is_read2 and read.pnext + read.qlen < start)):
                continue
            if read.isize == 0:
                continue
            if (options.downsample is not None
                    and random.random() >= options.downsample):
                continue
            rstart = min(read.pos, read.pnext) + 1  # 1-based
            rend = rstart + abs(read.isize) - 1  # end included
        else:
            if (options.downsample is not None
                    and random.random() >= options.downsample):
                continue
            rstart = read.pos + 1  # 1-based
            rend = rstart + aln_length(read.cigar) - 1  # end included

        # Shared tail: identical for paired and single-end intervals.
        rlength = rend - rstart + 1
        if not (options.minLength <= rlength <= options.maxLength):
            continue
        filteredReads.add_interval(Interval(rstart, rend))
        if ctype == "COV":
            # Only the part of the interval inside [start, end] is counted.
            for i in range(max(rstart, start), min(rend, end) + 1):
                posRange[i] += 1
        elif ctype == "STARTS":
            if start <= rstart <= end:
                posRange[rstart] += 1
            if start <= rend <= end:
                posRange[rend] += 1

    if ctype == "WPS":
        protection = options.protection // 2
        for pos in range(start, end + 1):
            win_start, win_end = pos - protection, pos + protection
            gcount, bcount = 0, 0
            for hit in filteredReads.find(win_start, win_end):
                # An interval with an end point inside the window counts
                # against the position; one spanning it counts for it.
                if (hit.start > win_start) or (hit.end < win_end):
                    bcount += 1
                else:
                    gcount += 1
            posRange[pos] += gcount - bcount

    return [posRange[pos] for pos in range(start, end + 1)]
Ejemplo n.º 25
0
def load_BED(in_file, exclusionSize=0, chroms=None, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome

    in_file = input file [character]
    exclusionSize = when > 0, width of the flanking regions stored on each
        side of every interval in the second returned table, named
        "<name>_up" and "<name>_dwn" [numeric]
    chroms = if not None, only chromosomes contained in it are loaded; a
        warning is printed once for every other chromosome [container]
    verbose = verbose mode [logical]

    Returns a tuple (intervals, exclusions) of dictionaries mapping a
    chromosome name to its Intersecter. Blank lines are ignored. A line
    with fewer than four tab-separated fields terminates the program with
    exit status 1.
    """
    x = {}
    x_ex = {}
    skipped_chrom = set()
    if verbose:
        print("## Loading BED file {} ...".format(in_file), file=sys.stderr)
    nline = 0  # non-blank records consumed so far (drives the progress log)
    with open(in_file) as bed_handle:
        # lineno is the physical line number, so the error message stays
        # accurate even when blank lines were skipped.
        for lineno, line in enumerate(bed_handle, 1):
            if not line.strip():
                continue
            if verbose and nline != 0 and nline % 1000000 == 0:
                sys.stderr.write(
                    "{} million lines loaded\n".format(nline // 1000000))
            nline += 1
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                print("Warning : wrong input format in line {}. Not a BED file !?".format(lineno), file=sys.stderr)
                sys.exit(1)

            # BED files are zero-based as Intervals objects
            start = int(start)
            end = int(end)
            name = name.strip()

            if chroms is not None and chromosome not in chroms:
                if chromosome not in skipped_chrom:
                    print("Warning : Restrict to cis interactions - {} skipped".format(chromosome), file=sys.stderr)
                    skipped_chrom.add(chromosome)
                continue

            if chromosome not in x:
                x[chromosome] = Intersecter()
            x[chromosome].add_interval(
                Interval(start, end, value={'name': name}))

            ## Exclusion regions
            if exclusionSize > 0:
                flank = int(exclusionSize)
                if chromosome not in x_ex:
                    x_ex[chromosome] = Intersecter()
                tree_ex = x_ex[chromosome]
                tree_ex.add_interval(
                    Interval(start - flank, start,
                             value={'name': str(name) + "_up"}))
                tree_ex.add_interval(
                    Interval(end, end + flank,
                             value={'name': str(name) + "_dwn"}))
    return (x, x_ex)
Ejemplo n.º 26
0
def _add_fragment_bin(binTree, chromosome, startBin, endBin):
    """Insert the bin [startBin, endBin] into the tree of ``chromosome``,
    creating that tree on first use; the bin midpoint is stored as value."""
    midBin = round((startBin + endBin) / 2)
    if chromosome not in binTree:
        binTree[chromosome] = Intersecter()
    binTree[chromosome].add_interval(
        Interval(startBin, endBin, value={'midPoint': midBin}))


def build_resFragBin_tree(in_file, resolution=None, verbose=False):
    """
    build restriction fragments bins tree based on the user defined resolution, e.g. 10fragment

    in_file = BED file of restriction fragments; the first three
        tab-separated columns are chromosome, start, end. Fragments are
        expected to be grouped by chromosome (bins are closed whenever the
        chromosome changes).
    resolution = number of fragments per bin; the program exits when it is
        None.
    verbose = print a progress message.

    Returns a dictionary mapping each chromosome to an Intersecter holding
    its bins; every bin carries {'midPoint': ...} as value. Lines that
    cannot be parsed are reported and skipped.
    """
    binTree = {}
    if resolution is None:
        print(
            "Please provide suitable resolution to bin contact! For example resolution = 10 for 10 RE fragment as one bin."
        )
        sys.exit()
    resolution = int(resolution)

    if verbose:
        print("## Building RE fragment bins tree from ordered'" + in_file +
              "'...")

    # nline counts lines within the current chromosome (it is reset when
    # the chromosome changes) and decides where a bin is closed.
    nline = 0
    seen_fragment = False
    with open(in_file) as bed_handle:
        for lineno, line in enumerate(bed_handle, 1):
            nline += 1

            bedtab = line.split("\t")
            try:
                chrom_field, start_field, end_field = bedtab[:3]
                parsed_start = int(start_field)
                parsed_end = int(end_field)
            except ValueError:
                # The original message concatenated str and int, which
                # raised TypeError instead of printing the warning.
                print("Warning : wrong input format in line {}. Not a BED file !?".format(lineno))
                continue
            # Commit only after the whole line parsed, so a bad line never
            # leaves half-converted values behind.
            chromosome, start, end = chrom_field, parsed_start, parsed_end

            if not seen_fragment:
                # First valid fragment opens the first bin
                startBin = start
                chromosomeBin = chromosome
                seen_fragment = True
            elif chromosomePre != chromosome:  # Start of another chromosome
                nline = 1
                # Close the bin left open on the previous chromosome
                if startBin != endPre:
                    _add_fragment_bin(binTree, chromosomeBin, startBin,
                                      endPre)
                startBin = start
                chromosomeBin = chromosome
            elif nline % resolution == 0:
                # Bin is full: close it at this fragment's end and open the
                # next one there
                _add_fragment_bin(binTree, chromosomeBin, startBin, end)
                startBin = end
                chromosomeBin = chromosome

            # Remember the previous fragment for the chromosome-change test
            endPre = end
            chromosomePre = chromosome

    # for the last bin (seen_fragment guards against a file without any
    # valid fragment, where startBin would be unbound)
    if (nline % resolution != 0 and seen_fragment and startBin != start
            and chromosomeBin == chromosome):
        _add_fragment_bin(binTree, chromosomeBin, startBin, end)

    return binTree
Ejemplo n.º 27
0
        continue

    ## only want proper pair reads
    if read.is_proper_pair:  ## if read is proper pair, add entire PE read start/end to list
        if abs(
                read.template_length
        ) < 10000:  ## precaution for weird reads - sometimes have huge length
            if read.is_read1:

                ## get start/end of read
                read_start = min(read.reference_start,
                                 read.next_reference_start) + 1
                read_end = read_start + abs(read.template_length)

                ## add start/end to interval list
                start_end_list.add_interval(Interval(read_start, read_end))

## end coordinate of the fragment described by the last read iterated above
leftmost = min(read.reference_start, read.next_reference_start)
final_pos = leftmost + abs(read.template_length)

## scoring region: an explicit "chrom:start-end" option wins, otherwise use
## the span of the reads that were just processed
if options.region is None:
    chrom = read.reference_name
    start, end = init_pos, final_pos
else:
    region_fields = options.region.split(':')
    chrom = region_fields[0]
    start, end = map(int, region_fields[1].split('-'))

## window size in positions (could be made an option)
window_size = 120
## half of the window; definitely a parameter worth messing with
prot_region = window_size // 2
Ejemplo n.º 28
0
def generate(x):
    """Return a random ``(start, end)`` interval; ``x`` is ignored.

    The start is drawn from ``[10000, SIZE]``. The interval length is drawn
    from ``[1, m]``, where ``m`` is itself drawn from ``[1, 10**4]``.
    """
    begin = randint(10000, SIZE)
    longest = randint(1, 10**4)
    length = randint(1, longest)
    return (begin, begin + length)

def generate_point(x):
    """Return a random zero-width interval ``(p, p)``; ``x`` is ignored.

    ``p`` is drawn from ``[10000, SIZE]``.
    """
    position = randint(10000, SIZE)
    return position, position

# use this to force both examples to generate the same data
seed(10)

# generate N random intervals; a list (not a lazy map) so that all of them
# are drawn from the RNG before any query is generated
data = [generate(i) for i in range(N)]

# generate the 1000 point intervals to query over
query = [generate_point(i) for i in range(1000)]

# create the interval tree
tree = Intersecter()

# build an interval tree from the generated data
for start, end in data:
    tree.add_interval(Interval(start, end))

# perform the query; both members of a query pair are the same position
for q, _ in query:
    overlap = tree.find(q, q)
    out = [(x.start, x.end) for x in overlap]
    # parenthesised single argument prints identically on Python 2 and 3
    print('(%s) -> %s' % (q, out))