Python Intersecter.findの例、bx.intervals.intersection.Intersecter.find Pythonの例

コード例 #1

0

ファイルを表示

ファイル: get_estimation.py プロジェクト: jeffhsu3/genda

def count_reads(transcripts, bam_iter, number_of_counts=1):
    """ Count the reads compatible with each transcript

    For every read, the number of (read block, containing exon) pairs is
    tallied per transcript.  The read is counted for a transcript when that
    tally equals the number of blocks in the read and every gap between
    consecutive read blocks also occurs between consecutive exons of the
    transcript.

    :TODO rename
    :TODO change to cython
    Arguments
    ---------
    transcripts : list
        one entry per transcript, each a position-sorted sequence of exons
        indexed as (start, end, ...)
    bam_iter : iterable of reads exposing ``get_blocks()``
        presumably obtained from a pysam ``fetch()`` call -- confirm
        against the caller
    number_of_counts : int
        not used by this function; kept so existing callers keep working

    Returns
    -------
    Array created by ``zeros(len(transcripts))`` holding, per transcript,
    the number of compatible reads.
    """
    # Convert this to Cython
    n_transcripts = len(transcripts)
    out_counts = zeros(n_transcripts)
    tree = Intersecter()
    # Gap sets are built once here instead of once per read and transcript.
    intron_lengths = []
    # Assume exons are position sorted
    for ti, transcript in enumerate(transcripts):
        for exon in transcript:
            tree.add_interval(
                Interval(int(exon[0]), int(exon[1]), value={'anno': ti}))
        # Gap is "previous end - next start", the same (negative) convention
        # used for the read junctions below.
        intron_lengths.append(
            {transcript[j - 1][1] - transcript[j][0]
             for j in range(1, len(transcript))})
    for read in bam_iter:
        blocks = read.get_blocks()
        junction_lengths = {
            blocks[i - 1][1] - blocks[i][0] for i in range(1, len(blocks))}
        block_counter = zeros((n_transcripts, ))
        for block in blocks:
            overlap = tree.find(block[0], block[1])
            if len(overlap) == 0:
                # A block outside every exon: stop tallying.  The tally can
                # then only match if earlier blocks were counted more than
                # once for the same transcript.
                break
            for exon in overlap:
                if (block[0] >= exon.start) and (block[1] <= exon.end):
                    block_counter[exon.value['anno']] += 1
        intron_match = zeros((n_transcripts, ))
        for ti, gaps in enumerate(intron_lengths):
            if junction_lengths.issubset(gaps):
                intron_match[ti] = 1
        smatch = nrepeat(len(blocks), n_transcripts)
        # The per-read match vectors are no longer accumulated: they were
        # only fed into an unused "unique reads" computation.
        out_counts += logical_and(block_counter == smatch, intron_match)
    #normalization_constant = [for i in transcripts]
    return (out_counts)

コード例 #2

0

ファイルを表示

ファイル: extractSignalBAMs.py プロジェクト: shendurelab/cfDNA

def calculate_score(chrom, start, end,ctype="WPS"):
  """Return one score per position of ``start..end`` (inclusive) on ``chrom``.

  Reads come from ``readIterator(args, chrom, start, end)``; filtering uses
  the module-level ``options`` (``downsample``, ``minLength``, ``maxLength``,
  ``protection``).  Duplicate, QC-failed, unmapped and soft-clipped reads are
  skipped.  A paired read contributes one fragment spanning both mates, taken
  from read 1, or from read 2 when ``pnext + qlen < start``.  An unpaired
  read contributes its own alignment.

  ctype selects what is accumulated per position:
    "COV"    -- number of kept fragments covering the position
    "STARTS" -- number of kept fragment first/last positions at the position
    "WPS"    -- for the window ``pos +/- options.protection//2``: fragments
                spanning the whole window minus fragments found in the
                window that do not span it
  Any other value yields a list of zeros.

  Returns a list of length ``end - start + 1``.
  """
  filteredReads = Intersecter()
  posRange = defaultdict(int)

  def downsampled_out():
    # A random number is drawn only when downsampling is configured, so the
    # random stream is consumed exactly as before.
    return options.downsample is not None and random.random() >= options.downsample

  def record_fragment(rstart, rend):
    # Shared bookkeeping for paired and single-end fragments
    # (1-based coordinates, end included).
    rlength = rend-rstart+1
    if not (options.minLength <= rlength <= options.maxLength):
      return
    filteredReads.add_interval(Interval(rstart,rend))
    if ctype == "COV":
      # Only walk the part of the fragment inside the requested region.
      for i in range(max(rstart,start),min(rend,end)+1):
        posRange[i]+=1
    elif ctype == "STARTS":
      if start <= rstart <= end:
        posRange[rstart]+=1
      if start <= rend <= end:
        posRange[rend]+=1

  for read in readIterator(args,chrom,start,end):
    if read.is_duplicate or read.is_qcfail or read.is_unmapped: continue
    if isSoftClipped(read.cigar): continue

    if read.is_paired:
      if read.mate_is_unmapped: continue
      if read.rnext != read.tid: continue
      # Use one mate per pair so that a fragment is recorded only once.
      if not (read.is_read1 or (read.is_read2 and read.pnext+read.qlen < start)): continue
      if read.isize == 0: continue
      if downsampled_out(): continue
      rstart = min(read.pos,read.pnext)+1 # 1-based
      rend = rstart+abs(read.isize)-1 # end included
    else:
      if downsampled_out(): continue
      rstart = read.pos+1 # 1-based
      rend = rstart+aln_length(read.cigar)-1 # end included
    record_fragment(rstart,rend)

  if ctype == "WPS":
    protection = options.protection//2
    # range() rather than xrange(): works on Python 2 and 3 alike.
    for pos in range(start,end+1):
      rstart,rend = pos-protection,pos+protection
      gcount,bcount = 0,0
      for read in filteredReads.find(rstart,rend):
        if (read.start > rstart) or (read.end < rend): bcount +=1
        else: gcount +=1
      posRange[pos]+=gcount-bcount

  return [posRange[pos] for pos in range(start,end+1)]

コード例 #3

0

ファイルを表示

ファイル: get_estimation.py プロジェクト: jeffhsu3/genda

def count_reads(transcripts, bam_iter, number_of_counts=1):
    """ Count the reads compatible with each transcript

    For every read, the number of (read block, containing exon) pairs is
    tallied per transcript.  The read is counted for a transcript when that
    tally equals the number of blocks in the read and every gap between
    consecutive read blocks also occurs between consecutive exons of the
    transcript.

    :TODO rename
    :TODO change to cython
    Arguments
    ---------
    transcripts : list
        one entry per transcript, each a position-sorted sequence of exons
        indexed as (start, end, ...)
    bam_iter : iterable of reads exposing ``get_blocks()``
        presumably obtained from a pysam ``fetch()`` call -- confirm
        against the caller
    number_of_counts : int
        not used by this function; kept so existing callers keep working

    Returns
    -------
    Array created by ``zeros(len(transcripts))`` holding, per transcript,
    the number of compatible reads.
    """
    # Convert this to Cython
    n_transcripts = len(transcripts)
    out_counts = zeros(n_transcripts)
    tree = Intersecter()
    # Gap sets are built once here instead of once per read and transcript.
    intron_lengths = []
    # Assume exons are position sorted
    for ti, transcript in enumerate(transcripts):
        for exon in transcript:
            tree.add_interval(
                Interval(int(exon[0]), int(exon[1]), value={'anno': ti}))
        # Gap is "previous end - next start", the same (negative) convention
        # used for the read junctions below.
        intron_lengths.append(
            {transcript[j - 1][1] - transcript[j][0]
             for j in range(1, len(transcript))})
    for read in bam_iter:
        blocks = read.get_blocks()
        junction_lengths = {
            blocks[i - 1][1] - blocks[i][0] for i in range(1, len(blocks))}
        block_counter = zeros((n_transcripts,))
        for block in blocks:
            overlap = tree.find(block[0], block[1])
            if len(overlap) == 0:
                # A block outside every exon: stop tallying.  The tally can
                # then only match if earlier blocks were counted more than
                # once for the same transcript.
                break
            for exon in overlap:
                if (block[0] >= exon.start) and (block[1] <= exon.end):
                    block_counter[exon.value['anno']] += 1
        intron_match = zeros((n_transcripts,))
        for ti, gaps in enumerate(intron_lengths):
            if junction_lengths.issubset(gaps):
                intron_match[ti] = 1
        smatch = nrepeat(len(blocks), n_transcripts)
        # The per-read match vectors are no longer accumulated: they were
        # only fed into an unused "unique reads" computation.
        out_counts += logical_and(block_counter == smatch, intron_match)
    #normalization_constant = [for i in transcripts]
    return(out_counts)

コード例 #4

0

ファイルを表示

ファイル: subtraction.py プロジェクト: achenge07/capsid-pipeline

def maps_gene(mapped):
    '''Determine if the mapped alignment falls within a gene.

    ``mapped`` must provide the keys ``genome``, ``refStart`` and ``refEnd``.
    One interval index per genome is built on first use from the ``gene``
    features returned by ``db.feature.find`` and cached in the module-level
    ``intersecters`` mapping.

    Returns whatever ``Intersecter.find`` yields for the alignment range;
    each gene interval was stored with the gene's ``uid`` as its value.
    '''
    genome = mapped['genome']

    try:
        intersecter = intersecters[genome]
    except KeyError:
        intersecter = Intersecter()

        # A plain loop: the intervals are added for their side effect, so no
        # throwaway list of return values is built.
        for gene in db.feature.find({'genome': genome, 'type': 'gene'}):
            # Interval end is exclusive, need to +1 to line up with actual position
            intersecter.add_interval(
                Interval(gene['start'], gene['end'] + 1, gene['uid']))

        # Only the contents of the cache change (the name is never rebound),
        # so no ``global`` statement is required.
        intersecters[genome] = intersecter

    return intersecter.find(mapped['refStart'], mapped['refEnd'])

コード例 #5

0

ファイルを表示

ファイル: subtraction.py プロジェクト: ChiZhou-TJ/capsid-pipeline

def maps_gene(mapped):
    """Determine if the mapped alignment falls within a gene.

    ``mapped`` must provide the keys ``genome``, ``refStart`` and ``refEnd``.
    One interval index per genome is built on first use from the ``gene``
    features returned by ``db.feature.find`` and cached in the module-level
    ``intersecters`` mapping.

    Returns whatever ``Intersecter.find`` yields for the alignment range;
    each gene interval was stored with the gene's ``uid`` as its value.
    """
    genome = mapped['genome']

    try:
        intersecter = intersecters[genome]
    except KeyError:
        intersecter = Intersecter()

        # A plain loop: the intervals are added for their side effect, so no
        # throwaway list of return values is built.
        for gene in db.feature.find({'genome': genome, 'type': 'gene'}):
            # Interval end is exclusive, need to +1 to line up with actual position
            intersecter.add_interval(
                Interval(gene['start'], gene['end'] + 1, gene['uid']))

        # Only the contents of the cache change (the name is never rebound),
        # so no ``global`` statement is required.
        intersecters[genome] = intersecter

    return intersecter.find(mapped['refStart'], mapped['refEnd'])

コード例 #6

0

ファイルを表示

    start, end = init_pos, final_pos

## Full window width used for scoring (could be made an option).
window_size = 120
## Half-width: the window around a position is pos +/- prot_region.
prot_region = window_size // 2

## Write the header line of the output file.  No trailing newline here:
## every score below is written with a leading "\n" instead.
outfile.write("fixedStep chrom=chr%s start=%d step=1" % (chrom, start))

## Score every position of the region, start and end both included.
for pos in range(start, end + 1):
    ## Window boundaries for the current position.
    w_start, w_end = pos - prot_region, pos + prot_region
    ## Both counters restart at every window shift.
    end_points = 0  ## fragments found in the window that do not span it
    intact_reads = 0  ## fragments that span the whole window

    ## Classify each fragment returned for the window: a fragment that
    ## starts after w_start or ends before w_end counts against the score
    ## (-1), any other fragment counts for it (+1).
    for read in start_end_list.find(w_start, w_end):
        if (read.start > w_start) or (read.end < w_end):
            end_points += 1
        else:
            intact_reads += 1
    ## Write the score for this position on its own line.
    outfile.write("\n" + str(intact_reads - end_points))

## Close the output file.
## NOTE(review): not protected by try/finally or a with-block, so the file
## stays open if the loop above raises.
outfile.close()

## Report the elapsed time since start_time on stderr.
sys.stderr.write("--- %s seconds ---\n" % (time.time() - start_time))

コード例 #7

0

ファイルを表示

ファイル: extractReadStartsFromBAM_Region_WPS.py プロジェクト: zhang-siting/cfDNA

                                posRange[rend][1] += 1
                        elif read.is_reverse:
                            if rend >= regionStart and rend <= regionEnd:
                                posRange[rend][1] += 1
                        else:
                            if (rstart >= regionStart and rstart <= regionEnd):
                                posRange[rstart][1] += 1

        filename = options.outfile % cid
        outfile = gzip.open(filename, 'w')
        cov_sites = 0
        outLines = []
        for pos in range(regionStart, regionEnd + 1):
            rstart, rend = pos - protection, pos + protection
            gcount, bcount = 0, 0
            for read in filteredReads.find(rstart, rend):
                if (read.start > rstart) or (read.end < rend): bcount += 1
                else: gcount += 1
            covCount, startCount = posRange[pos]
            cov_sites += covCount
            outLines.append(
                "%s\t%d\t%d\t%d\t%d\n" %
                (chrom, pos, covCount, startCount, gcount - bcount))

        if strand == "-": outLines = outLines[::-1]
        for line in outLines:
            outfile.write(line)
        outfile.close()

        if cov_sites == 0 and not options.empty:
            os.remove(filename)

コード例 #8

0

ファイルを表示

def compare_two_transcripts(trans1, trans2, transcript_dict, afe=False):
    """
    Return the splice differences between two transcripts.

    Comparisons involving a single-exon transcript are ignored.

    Parameters
    ----------
    trans1 : string
        transcript of interest
    trans2 : string
        second transcript of interest
    transcript_dict : dict
        maps transcript names to a list of exons; each exon is indexed as
        (start, end, exon number).  The two looked-up lists are sorted in
        place by start position.
    afe : bool
        whether to include alternate start and ends (not read anywhere in
        this function body)

    :TODO make a better return
    :TODO maybe include something similar to to_plot

    Returns
    -------
    tuple
        ``([], [])`` when either transcript has at most one exon; otherwise
        ``(matching_exons, skipped_exons)`` where ``matching_exons`` is a
        list of ``(start, end, (s1_exon_n, exon_n), (0, 0))`` tuples for
        exons with identical coordinates, and ``skipped_exons`` is an
        ``EventCollection`` holding the skipped-exon events followed by the
        alternate start/end events.
    """
    # TODO refactor this
    t1 = transcript_dict[trans1]
    t2 = transcript_dict[trans2]
    tree = Intersecter()
    starts1 = [i[0] for i in t1]
    starts2 = [i[0] for i in t2]
    # s1 is the transcript with the smaller first start; its exons go into
    # the interval tree and the exons of s2 are queried against it.
    # `reverse` records that the caller's order was swapped.
    reverse = False
    if min(starts1) <= min(starts2):
        s1 = t1
        s2 = t2
        s2_beg = min(starts2)  # NOTE(review): s2_beg is never read below
    else:
        s1 = t2
        s2 = t1
        reverse = True
        s2_beg = min(starts1)
    # Transcript names in (s1, s2) order, passed to every DiffEvent.
    if reverse: torder = (trans2, trans1)
    else: torder = (trans1, trans2)
    # Ignore single-exon stuff
    if len(s1) <= 1 or len(s2) <= 1:
        return ([], [])
    # Index the s1 exons; each interval carries its exon number as 'anno'.
    for i in s1:
        tree.add_interval(Interval(int(i[0]), int(i[1]), value={'anno': i[2]}))
    matching_exons = []
    # NOTE(review): exclusive_juncs is filled below but never returned.
    exclusive_juncs = []
    skipped_exons = []
    altends = []
    # s2 exon number -> exon number of the s1 exon it overlapped.
    exon_match = {}
    # In-place sort: this reorders the lists stored in transcript_dict.
    s1.sort(key=lambda x: x[0])
    s2.sort(key=lambda x: x[0])
    # Exon numbers of the positionally last exon of each transcript.
    max_exon_1 = s1[-1][2]
    max_exon_2 = s2[-1][2]
    #end_position_s2 = max([i[1] for i in s2])
    s1_end = max([i[1] for i in s1])
    prev_match = None
    # Exon numbers decreasing with position means the step between
    # neighbouring exon numbers is -1, otherwise +1.
    if max_exon_1 < s1[0][2]:
        strand = -1
    else:
        strand = 1
    for pcurr in range(len(s2)):
        start, end, exon_n = s2[pcurr]
        overlap = tree.find(int(start), int(end))
        if len(overlap) == 0:
            # The s2 exon overlaps no s1 exon.
            if prev_match and (start < s1_end):
                # Skipped exon: an earlier s2 exon matched and this one
                # starts before the end of s1.
                cigar = _generate_cigar(s2, pcurr, mskip=1)
                try:
                    # A KeyError from the exon_match lookup means the
                    # neighbouring s2 exon had no match either; that case
                    # is handled by the except clause below.
                    if exon_match[exon_n - strand] == prev_match.value['anno']:
                        try:
                            nm = tree.find(*s2[pcurr + 1][0:2])[0]
                            ocigar = [(3, nm.start - prev_match.end)]
                            nexon = nm.value['anno']
                        except IndexError:
                            # Either there is no next s2 exon or it overlaps
                            # nothing in s1: fall back to the s1 exon that
                            # follows the previous match.
                            nm = s1[_get_by_exonn(
                                prev_match.value['anno'] + strand, s1)]
                            ocigar = [(3, nm[0] - prev_match.end)]
                            nexon = nm[2]
                    # NOTE(review): when the comparison above is False,
                    # ocigar and nexon are not assigned in this iteration;
                    # they are either unbound or left over from an earlier
                    # iteration -- confirm whether that case can occur.
                    skipped_exons.append(
                        DiffEvent('skipped_exon',
                                  start,
                                  end,
                                  torder,
                                  cigar2=cigar,
                                  cigar1=ocigar,
                                  exon_num=(None, exon_n),
                                  exon2=(prev_match.value['anno'], nexon)))
                except KeyError:
                    # Multiple skipped exons: extend the previous event.
                    # NOTE(review): skipped_exons[-1] raises IndexError if
                    # no event has been recorded yet.
                    ncig = _generate_cigar(s2, pcurr, mskip=1)[1:]
                    skipped_exons[-1]._extend(ncig, cig=2)
            elif start > s1_end:
                # The s2 exon lies past the end of s1: alternate end ('AE').
                if prev_match:
                    cigar = _generate_cigar(s2, pcurr, mskip=1)
                    pm = tree.find(*s2[pcurr - 1][0:2])[0]
                    pexon = pm.value['anno']
                    ocigar = []
                    # Walk the s1 exons from the previous match to the last
                    # one, emitting an exon entry and, when a following
                    # list element exists, a gap entry.
                    for i in range(pexon, max_exon_1 + strand, strand):
                        narg = _get_by_exonn(i, s1)
                        ocigar.append((0, s1[narg][0], s1[narg][1]))
                        try:
                            ocigar.append((3, s1[narg][1] - s1[narg + 1][0]))
                        except IndexError:
                            pass
                    #:TODO extend ocigar till end?
                    altends.append(
                        DiffEvent('AE',
                                  start,
                                  end,
                                  torder,
                                  cigar2=cigar,
                                  cigar1=ocigar,
                                  exon_num=(None, exon_n)))
                else:
                    pass
            else:
                # Alternate start site that starts in between exons
                # of other transcript
                cigar = _generate_cigar(s2, pcurr, mskip=1)[:-1]
                try:
                    nm = tree.find(*s2[pcurr + 1][0:2])[0]
                except IndexError:
                    # NOTE(review): leftover debugging hook -- opens an
                    # interactive IPython shell.  After it returns, nm has
                    # not been assigned by this iteration, so the line
                    # below uses a stale value or fails as unbound.
                    from IPython import embed
                    embed()
                nexon = nm.value['anno']
                narg = _get_by_exonn(nexon - strand, s1)
                pmatch = s1[narg]
                ocigar = [(0, pmatch[1] - pmatch[0]),
                          (3, nm.start - pmatch[1])]
                altends.append(
                    DiffEvent('AS',
                              start,
                              end,
                              torder,
                              cigar2=cigar,
                              cigar1=ocigar))
        elif len(overlap) == 1:
            if start == overlap[0].start and end == overlap[0].end:
                # Identical coordinates: record a matching exon.
                s1_exon_n = overlap[0].value['anno']
                matching_exons.append(
                    (start, end, (s1_exon_n, exon_n), (0, 0)))
                if prev_match:
                    if s1_exon_n - prev_match.value['anno'] == strand:
                        pass
                    else:
                        # Difference in exon matches: the matched s1 exon
                        # numbers are not consecutive, so s1 exons in
                        # between were skipped by s2.
                        mskip = abs(s1_exon_n - prev_match.value['anno']) - 1
                        narg = _get_by_exonn(prev_match.value['anno'] + strand,
                                             s1)
                        s_s1 = s1[narg]  # skipped s1
                        cigar = _generate_cigar(s1, narg, mskip=mskip)
                        ocigar = [(3, start - s2[pcurr - 1][1])]
                        # Remove previous one
                        skipped_exons.append(
                            DiffEvent('skipped_exon',
                                      s_s1[0],
                                      s_s1[1],
                                      torder,
                                      cigar2=ocigar,
                                      cigar1=cigar,
                                      exon_num=(s_s1[2], None),
                                      exon2=(exon_n - strand, exon_n)))
                prev_match = overlap[0]
            else:
                # Partial overlap with a single s1 exon.
                sstart = min(start, overlap[0].start)
                ssend = max(end, overlap[0].end)
                # Ignore 5' or 3' differences
                if (exon_n == max_exon_2
                        and overlap[0].value['anno'] == max_exon_1):
                    if end == overlap[0].end:
                        prev_match = overlap[0]
                else:
                    exclusive_juncs.append(
                        (sstart, ssend, (overlap[0].value['anno'], exon_n),
                         (overlap[0].start - start, overlap[0].end - end)))
            # Deal with partial matches: whatever branch ran above, the
            # overlapped s1 exon becomes the previous match (this makes the
            # earlier prev_match assignments redundant).
            prev_match = overlap[0]
            exon_match[exon_n] = int(overlap[0].value['anno'])
        else:
            # An s2 exon overlapping several s1 exons is ignored.
            pass
    # NOTE(review): transcript_ids receives the exon lists s1 and s2, not
    # the transcript names -- confirm this is what EventCollection expects.
    skipped_exons = EventCollection(transcript_ids=[s1, s2],
                                    events=skipped_exons)
    skipped_exons.events.extend(altends)
    return (matching_exons, skipped_exons)

コード例 #9

0

ファイルを表示

ファイル: extractSignalBAMs.py プロジェクト: zhang-siting/cfDNA

def calculate_score(chrom, start, end, ctype="WPS"):
    """Return one score per position of ``start..end`` (inclusive) on ``chrom``.

    Reads come from ``readIterator(args, chrom, start, end)``; filtering uses
    the module-level ``options`` (``downsample``, ``minLength``,
    ``maxLength``, ``protection``).  Duplicate, QC-failed, unmapped and
    soft-clipped reads are skipped.  A paired read contributes one fragment
    spanning both mates, taken from read 1, or from read 2 when
    ``pnext + qlen < start``.  An unpaired read contributes its own
    alignment.

    ctype selects what is accumulated per position:
      "COV"    -- number of kept fragments covering the position
      "STARTS" -- number of kept fragment first/last positions at the
                  position
      "WPS"    -- for the window ``pos +/- options.protection // 2``:
                  fragments spanning the whole window minus fragments found
                  in the window that do not span it
    Any other value yields a list of zeros.

    Returns a list of length ``end - start + 1``.
    """
    filteredReads = Intersecter()
    posRange = defaultdict(int)

    def downsampled_out():
        # A random number is drawn only when downsampling is configured, so
        # the random stream is consumed exactly as before.
        return (options.downsample is not None
                and random.random() >= options.downsample)

    def record_fragment(rstart, rend):
        # Shared bookkeeping for paired and single-end fragments
        # (1-based coordinates, end included).
        rlength = rend - rstart + 1
        if not (options.minLength <= rlength <= options.maxLength):
            return
        filteredReads.add_interval(Interval(rstart, rend))
        if ctype == "COV":
            # Only walk the part of the fragment inside the requested region.
            for i in range(max(rstart, start), min(rend, end) + 1):
                posRange[i] += 1
        elif ctype == "STARTS":
            if start <= rstart <= end:
                posRange[rstart] += 1
            if start <= rend <= end:
                posRange[rend] += 1

    for read in readIterator(args, chrom, start, end):
        if read.is_duplicate or read.is_qcfail or read.is_unmapped: continue
        if isSoftClipped(read.cigar): continue

        if read.is_paired:
            if read.mate_is_unmapped: continue
            if read.rnext != read.tid: continue
            # Use one mate per pair so that a fragment is recorded only once.
            if not (read.is_read1 or
                    (read.is_read2 and read.pnext + read.qlen < start)):
                continue
            if read.isize == 0: continue
            if downsampled_out(): continue
            rstart = min(read.pos, read.pnext) + 1  # 1-based
            rend = rstart + abs(read.isize) - 1  # end included
        else:
            if downsampled_out(): continue
            rstart = read.pos + 1  # 1-based
            rend = rstart + aln_length(read.cigar) - 1  # end included
        record_fragment(rstart, rend)

    if ctype == "WPS":
        protection = options.protection // 2
        # range() rather than xrange(): works on Python 2 and 3 alike.
        for pos in range(start, end + 1):
            rstart, rend = pos - protection, pos + protection
            gcount, bcount = 0, 0
            for read in filteredReads.find(rstart, rend):
                if (read.start > rstart) or (read.end < rend): bcount += 1
                else: gcount += 1
            posRange[pos] += gcount - bcount

    return [posRange[pos] for pos in range(start, end + 1)]

コード例 #10

0

ファイルを表示

ファイル: extractReadStartsFromBAM2Wig.py プロジェクト: ernesto-up/cfDNA

      if (rstart >= start and rstart <= end):
        posRange[rstart][1]+=1
      if rend >= start and rend <= end:
        posRange[rend][1]+=1
    elif read.is_reverse:
      if rend >= start and rend <= end:
        posRange[rend][1]+=1
    else:
      if (rstart >= start and rstart <= end):
        posRange[rstart][1]+=1

# Half of the configured protection size: the window around a position is
# pos +/- protection.
protection = options.protection//2
for pos in range(start,end+1):
  rstart,rend = pos-protection,pos+protection
  # gcount: fragments spanning the whole window; bcount: fragments found in
  # the window that start after rstart or end before rend.
  gcount,bcount = 0,0
  for read in filteredReads.find(rstart,rend):
     if (read.start > rstart) or (read.end < rend): bcount +=1
     else: gcount +=1
  # Slot 2 of the per-position record accumulates the window score.
  posRange[pos][2]+=gcount-bcount


# Coverage track: skipped when the option is "OFF", written to stdout when
# the option is the empty string, otherwise written to a gzip file.
if options.coverage != "OFF":
  output = sys.stdout
  # NOTE(review): under Python 3, gzip.open(..., 'w') is a binary stream and
  # would reject the str values written below -- confirm the target
  # interpreter.
  if options.coverage != "": output = gzip.open(options.coverage,'w')
  output.write("fixedStep chrom=chr%s start=%d step=1\n"%(outchrom,start))
  # One line per position, taken from slot 0 of the per-position record.
  for pos in range(start,end+1):
    output.write("%d\n"%(posRange[pos][0]))
  # Close only the file opened here; on stdout a blank line ends the track.
  if options.coverage != "": output.close()
  else: output.write("\n")

if options.starts != "OFF":

コード例 #11

0

ファイルを表示

ファイル: peak_call.py プロジェクト: rafebatch/cfDNANucleosomeDetection

The threshold can also be adjusted (thresh_max).
"""
## Walk the smoothed values and collect candidate regions.  Relies on
## started, thresh, pos, count, startpos, thresh_max, nucl_min, nucl_max
## and nucl being set up before this point (not visible in this excerpt).
for line in smooth_list:
    ## a positive value opens a region if none is open
    if line > 0 and not started:
        started = True
        start_base = pos + startpos

    ## every negative value seen while a region is open is counted
    if line < 0 and started:
        thresh += 1

    ## once thresh_max negative values have been counted, close the region
    ## and keep it only if its length lies within [nucl_min, nucl_max]
    if thresh >= thresh_max:
        thresh = 0
        started = False
        end_base = pos + startpos
        region_len = end_base - start_base
        if region_len >= nucl_min and region_len <= nucl_max:
            nucl.add_interval(Interval(start_base, end_base))
            count += 1

    pos += 1
## initialize a list for data from find_contig() method
contig_list = []

## NOTE(review): end_base is only assigned above when a region is closed;
## if none is closed and it was not initialised earlier, this line fails.
for read in nucl.find(0, end_base + 1):
    ## for each potential nucleosome interval, call find contig method.
    find_contig(read, smooth_list, startpos, contig_list)

## lastly, call this method to print all probable nucleosome ranges to a bed file
calc_nucl_distance(contig_list, smooth_list, startpos, chrom_num)

コード例 #12

0

ファイルを表示

ファイル: transcripts_utils.py プロジェクト: jeffhsu3/genda

def compare_two_transcripts(trans1, trans2, transcript_dict, 
        afe=False):
    """
    Return the splice differences between two transcripts.

    Comparisons involving a single-exon transcript are ignored.

    Parameters
    ----------
    trans1 : string
        transcript of interest
    trans2 : string
        second transcript of interest
    transcript_dict : dict
        maps transcript names to a list of exons; each exon is indexed as
        (start, end, exon number).  The two looked-up lists are sorted in
        place by start position.
    afe : bool
        whether to include alternate start and ends (not read anywhere in
        this function body)

    :TODO make a better return
    :TODO maybe include something similar to to_plot

    Returns
    -------
    tuple
        ``([], [])`` when either transcript has at most one exon; otherwise
        ``(matching_exons, skipped_exons)`` where ``matching_exons`` is a
        list of ``(start, end, (s1_exon_n, exon_n), (0, 0))`` tuples for
        exons with identical coordinates, and ``skipped_exons`` is an
        ``EventCollection`` holding the skipped-exon events followed by the
        alternate start/end events.
    """
    # TODO refactor this
    t1 = transcript_dict[trans1]
    t2 = transcript_dict[trans2]
    tree = Intersecter()
    starts1 = [i[0] for i in t1]
    starts2 = [i[0] for i in t2]
    # s1 is the transcript with the smaller first start; its exons go into
    # the interval tree and the exons of s2 are queried against it.
    # `reverse` records that the caller's order was swapped.
    reverse = False
    if min(starts1) <= min(starts2):
        s1 = t1
        s2 = t2
        s2_beg = min(starts2)  # NOTE(review): s2_beg is never read below
    else:
        s1 = t2 
        s2 = t1
        reverse = True
        s2_beg = min(starts1)
    # Transcript names in (s1, s2) order, passed to every DiffEvent.
    if reverse: torder = (trans2, trans1)
    else: torder = (trans1, trans2)
    # Ignore single-exon stuff
    if len(s1) <= 1 or len(s2) <= 1:
        return([], [])
    # Index the s1 exons; each interval carries its exon number as 'anno'.
    for i in s1:
        tree.add_interval(Interval(int(i[0]), int(i[1]), 
            value={'anno':i[2]}))
    matching_exons = []
    # NOTE(review): exclusive_juncs is filled below but never returned.
    exclusive_juncs = []
    skipped_exons = []
    altends = []
    # s2 exon number -> exon number of the s1 exon it overlapped.
    exon_match = {}
    # In-place sort: this reorders the lists stored in transcript_dict.
    s1.sort(key=lambda x: x[0])
    s2.sort(key=lambda x: x[0])
    # Exon numbers of the positionally last exon of each transcript.
    max_exon_1 =  s1[-1][2]
    max_exon_2 = s2[-1][2]
    #end_position_s2 = max([i[1] for i in s2])
    s1_end = max([i[1] for i in s1])
    prev_match = None
    # Exon numbers decreasing with position means the step between
    # neighbouring exon numbers is -1, otherwise +1.
    if max_exon_1 < s1[0][2]:
        strand = -1
    else:
        strand = 1
    for pcurr in range(len(s2)):
        start, end, exon_n = s2[pcurr]
        overlap = tree.find(int(start), int(end))
        if len(overlap) == 0:
            # The s2 exon overlaps no s1 exon.
            if prev_match and (start < s1_end):
                # Skipped exon: an earlier s2 exon matched and this one
                # starts before the end of s1.
                cigar = _generate_cigar(s2, pcurr, mskip=1)
                try:
                    # A KeyError from the exon_match lookup means the
                    # neighbouring s2 exon had no match either; that case
                    # is handled by the except clause below.
                    if exon_match[exon_n - strand] == prev_match.value['anno']:
                        try:
                            nm = tree.find(*s2[pcurr + 1][0:2])[0]
                            ocigar = [(3, nm.start - prev_match.end)]
                            nexon = nm.value['anno']
                        except IndexError:
                            # Either there is no next s2 exon or it overlaps
                            # nothing in s1: fall back to the s1 exon that
                            # follows the previous match.
                            nm=s1[_get_by_exonn(prev_match.value['anno']+strand,s1)] 
                            ocigar = [(3,nm[0] - prev_match.end)]
                            nexon = nm[2]
                    # NOTE(review): when the comparison above is False,
                    # ocigar and nexon are not assigned in this iteration;
                    # they are either unbound or left over from an earlier
                    # iteration -- confirm whether that case can occur.
                    skipped_exons.append(DiffEvent('skipped_exon', start, end,
                            torder, cigar2=cigar, cigar1 = ocigar, 
                            exon_num = (None, exon_n), 
                            exon2=(prev_match.value['anno'], nexon))
                            )
                except KeyError:
                    # Multiple skipped exons: extend the previous event.
                    # NOTE(review): skipped_exons[-1] raises IndexError if
                    # no event has been recorded yet.
                    ncig = _generate_cigar(s2, pcurr, mskip=1)[1:]
                    skipped_exons[-1]._extend(ncig, cig=2)
            elif start > s1_end: 
                # The s2 exon lies past the end of s1: alternate end ('AE').
                if prev_match:
                    cigar = _generate_cigar(s2, pcurr, mskip=1)
                    pm = tree.find(*s2[pcurr - 1][0:2])[0]
                    pexon = pm.value['anno']
                    ocigar = []
                    # Walk the s1 exons from the previous match to the last
                    # one, emitting an exon entry and, when a following
                    # list element exists, a gap entry.
                    for i in range(pexon, max_exon_1+strand, strand):
                        narg = _get_by_exonn(i, s1)
                        ocigar.append((0, s1[narg][0], s1[narg][1]))
                        try:
                            ocigar.append((3, s1[narg][1] - s1[narg+1][0]))
                        except IndexError:
                            pass
                    #:TODO extend ocigar till end?
                    altends.append(DiffEvent('AE', start, end,
                        torder, cigar2=cigar, cigar1=ocigar,
                        exon_num = (None, exon_n)))
                else: 
                    pass
            else: 
                # Alternate start site that starts in between exons
                # of other transcript
                cigar = _generate_cigar(s2, pcurr, mskip=1)[:-1]
                try:
                    nm = tree.find(*s2[pcurr + 1][0:2])[0]
                except IndexError:
                    # NOTE(review): leftover debugging hook -- opens an
                    # interactive IPython shell.  After it returns, nm has
                    # not been assigned by this iteration, so the line
                    # below uses a stale value or fails as unbound.
                    from IPython import embed
                    embed()
                nexon = nm.value['anno']
                narg = _get_by_exonn(nexon - strand, s1)
                pmatch = s1[narg]
                ocigar = [(0, pmatch[1] - pmatch[0]),
                        (3, nm.start - pmatch[1])]
                altends.append(DiffEvent('AS', start, end,
                        torder, cigar2=cigar, cigar1 = ocigar))
        elif len(overlap) == 1:
            if start == overlap[0].start and end == overlap[0].end:
                # Identical coordinates: record a matching exon.
                s1_exon_n = overlap[0].value['anno']
                matching_exons.append((start, end, (s1_exon_n, 
                    exon_n), (0, 0)))
                if prev_match:
                    if s1_exon_n - prev_match.value['anno']  == strand: 
                        pass
                    else:
                        # Difference in exon matches: the matched s1 exon
                        # numbers are not consecutive, so s1 exons in
                        # between were skipped by s2.
                        mskip = abs(s1_exon_n - prev_match.value['anno'] ) - 1
                        narg = _get_by_exonn(prev_match.value['anno']+strand, s1) 
                        s_s1 = s1[narg] # skipped s1
                        cigar = _generate_cigar(s1, narg, mskip=mskip)
                        ocigar = [(3, start - s2[pcurr-1][1])]
                        # Remove previous one
                        skipped_exons.append(
                                DiffEvent('skipped_exon', 
                                s_s1[0], s_s1[1], torder, 
                                cigar2 = ocigar, cigar1 = cigar, 
                                exon_num = (s_s1[2], None), 
                                exon2 = (exon_n-strand, exon_n)))
                prev_match = overlap[0]
            else:
                # Partial overlap with a single s1 exon.
                sstart = min(start, overlap[0].start)
                ssend = max(end, overlap[0].end)
                # Ignore 5' or 3' differences
                if (exon_n == max_exon_2 and
                        overlap[0].value['anno'] == max_exon_1):
                    if end == overlap[0].end:
                        prev_match = overlap[0]
                else:
                    exclusive_juncs.append(
                            (sstart, ssend,
                            (overlap[0].value['anno'], exon_n), 
                            (overlap[0].start - start, overlap[0].end - end) ))
            # Deal with partial matches: whatever branch ran above, the
            # overlapped s1 exon becomes the previous match (this makes the
            # earlier prev_match assignments redundant).
            prev_match = overlap[0]
            exon_match[exon_n] = int(overlap[0].value['anno'])
        else:
            # An s2 exon overlapping several s1 exons is ignored.
            pass
    # NOTE(review): transcript_ids receives the exon lists s1 and s2, not
    # the transcript names -- confirm this is what EventCollection expects.
    skipped_exons = EventCollection(transcript_ids = [s1, s2], events=skipped_exons)
    skipped_exons.events.extend(altends)
    return(matching_exons, skipped_exons)

コード例 #13

0

ファイルを表示

ファイル: interval.py プロジェクト: dlrice/python-notes

def generate(x):
    """Return a random ``(lo, hi)`` interval; the argument ``x`` is ignored.

    ``lo`` is drawn from ``[10000, SIZE]`` and the width from
    ``[1, w]``, where ``w`` is itself drawn from ``[1, 10**4]``.
    """
    lower = randint(10000, SIZE)
    # Draw the width cap first, then the width, so the random stream is
    # consumed in the same order as a nested call would consume it.
    width_cap = randint(1, 10**4)
    upper = lower + randint(1, width_cap)
    return (lower, upper)

def generate_point(x):
    """Return a random zero-width interval ``(lo, lo)``; ``x`` is ignored.

    ``lo`` is drawn from ``[10000, SIZE]``.
    """
    lo = randint(10000, SIZE)
    return (lo, lo)

# use this to force both examples to generate the same data
seed(10)

# generate N random intervals (built eagerly so that every interval is drawn
# before any query point, whatever the Python version)
data = [generate(i) for i in range(N)]

# generate the point intervals to query over
query = [generate_point(i) for i in range(1000)]

# create the interval tree
tree = Intersecter()

# build an interval tree from the data
for start, end in data:
    tree.add_interval(Interval(start, end))

# perform the query; each query is a (q, q) pair, so one coordinate suffices
for q, _ in query:
    overlap = tree.find(q, q)
    out = [(x.start, x.end) for x in overlap]
    # a single parenthesised argument prints identically on Python 2 and 3
    print('(%s) -> %s' % (q, out))