Ejemplo n.º 1
0
def count_reads(transcripts, bam_iter, number_of_counts=1):
    """Count, per transcript, the reads whose structure matches it.

    :TODO rename
    :TODO change to cython

    A read is credited to a transcript when both conditions hold:

    * the number of (block, exon) pairs in which an aligned block lies
      completely inside an exon of the transcript equals the number of
      blocks of the read, and
    * every gap between consecutive blocks of the read also occurs as a
      gap between consecutive exons of the transcript.

    Arguments
    ---------
    transcripts : list
        One entry per transcript; each entry is a sequence of exons that
        are indexed as ``exon[0]`` (start) and ``exon[1]`` (end).  Exons
        are assumed to be position sorted.
    bam_iter : pysam.BamFileIterator
        Iterable of reads, each providing ``get_blocks()``; typically
        obtained from a ``fetch()`` call.
    number_of_counts : int
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    Array holding one read count per transcript, in the order of
    ``transcripts``.
    """
    n_transcripts = len(transcripts)
    out_counts = zeros(n_transcripts)
    tree = Intersecter()
    # Per transcript: the set of gaps between consecutive exons
    # (previous exon end minus next exon start).  Built once here instead
    # of being rebuilt for every read.
    intron_lengths = []
    for ti, transcript in enumerate(transcripts):
        gaps = set()
        for j, exon in enumerate(transcript):
            tree.add_interval(
                Interval(int(exon[0]), int(exon[1]), value={'anno': ti}))
            if j != 0:
                gaps.add(transcript[j - 1][1] - exon[0])
        intron_lengths.append(gaps)

    for read in bam_iter:
        block_counter = zeros((n_transcripts, ))
        intron_match = zeros((n_transcripts, ))
        blocks = read.get_blocks()
        # Gaps between consecutive aligned blocks, using the same sign
        # convention as the exon gaps above.
        junction_lengths = set(
            blocks[i - 1][1] - blocks[i][0] for i in range(1, len(blocks)))
        for block in blocks:
            overlap = tree.find(block[0], block[1])
            if len(overlap) == 0:
                # This block touches no exon at all; stop scanning the
                # remaining blocks of the read.
                break
            for s in overlap:
                if block[0] >= s.start and block[1] <= s.end:
                    block_counter[s.value['anno']] += 1
        for ti, gaps in enumerate(intron_lengths):
            if junction_lengths.issubset(gaps):
                intron_match[ti] = 1
        smatch = nrepeat(len(blocks), n_transcripts)
        # The per-read match vector is added straight into the totals; it
        # is no longer stored, since nothing consumed the stored copies.
        out_counts += logical_and(block_counter == smatch, intron_match)
    return out_counts
Ejemplo n.º 2
0
def buildIntervalTree(exons):
    '''Return an Intersecter holding one interval per exon.

    Each interval spans ``exon.start``..``exon.end`` and carries the exon's
    ``cStart`` and ``cEnd`` attributes in its value dictionary.
    '''
    exon_tree = Intersecter()
    for ex in exons:
        begin, finish = ex.start, ex.end
        payload = {'cStart': ex.cStart, 'cEnd': ex.cEnd}
        exon_tree.add_interval(Interval(begin, finish, value=payload))
    return exon_tree
Ejemplo n.º 3
0
def count_reads(transcripts, bam_iter, number_of_counts=1):
    """Count, per transcript, the reads whose structure matches it.

    :TODO rename
    :TODO change to cython

    A read is credited to a transcript when both conditions hold:

    * the number of (block, exon) pairs in which an aligned block lies
      completely inside an exon of the transcript equals the number of
      blocks of the read, and
    * every gap between consecutive blocks of the read also occurs as a
      gap between consecutive exons of the transcript.

    Arguments
    ---------
    transcripts : list
        One entry per transcript; each entry is a sequence of exons that
        are indexed as ``exon[0]`` (start) and ``exon[1]`` (end).  Exons
        are assumed to be position sorted.
    bam_iter : pysam.BamFileIterator
        Iterable of reads, each providing ``get_blocks()``; typically
        obtained from a ``fetch()`` call.
    number_of_counts : int
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    Array holding one read count per transcript, in the order of
    ``transcripts``.
    """
    n_transcripts = len(transcripts)
    out_counts = zeros(n_transcripts)
    tree = Intersecter()
    # Per transcript: the set of gaps between consecutive exons
    # (previous exon end minus next exon start).  Built once here instead
    # of being rebuilt for every read.
    intron_lengths = []
    for ti, transcript in enumerate(transcripts):
        gaps = set()
        for j, exon in enumerate(transcript):
            tree.add_interval(
                Interval(int(exon[0]), int(exon[1]), value={'anno': ti}))
            if j != 0:
                gaps.add(transcript[j - 1][1] - exon[0])
        intron_lengths.append(gaps)

    for read in bam_iter:
        block_counter = zeros((n_transcripts, ))
        intron_match = zeros((n_transcripts, ))
        blocks = read.get_blocks()
        # Gaps between consecutive aligned blocks, using the same sign
        # convention as the exon gaps above.
        junction_lengths = set(
            blocks[i - 1][1] - blocks[i][0] for i in range(1, len(blocks)))
        for block in blocks:
            overlap = tree.find(block[0], block[1])
            if len(overlap) == 0:
                # This block touches no exon at all; stop scanning the
                # remaining blocks of the read.
                break
            for s in overlap:
                if block[0] >= s.start and block[1] <= s.end:
                    block_counter[s.value['anno']] += 1
        for ti, gaps in enumerate(intron_lengths):
            if junction_lengths.issubset(gaps):
                intron_match[ti] = 1
        smatch = nrepeat(len(blocks), n_transcripts)
        # The per-read match vector is added straight into the totals; it
        # is no longer stored, since nothing consumed the stored copies.
        out_counts += logical_and(block_counter == smatch, intron_match)
    return out_counts
Ejemplo n.º 4
0
def init_intersecter(hits):
    '''Return an Intersecter with one interval per hit.

    Each hit is indexed as ``hit[0]`` (start) and ``hit[1]`` (end).
    '''
    tree = Intersecter()
    for hit in hits:
        begin, finish = hit[0], hit[1]
        tree.add_interval(Interval(begin, finish))
    return tree
Ejemplo n.º 5
0
def init_intersecter(hits):
    '''Build an interval tree from ``hits``.

    ``hits`` is an iterable whose items provide the interval start at
    index 0 and the interval end at index 1.
    '''
    result = Intersecter()
    insert = result.add_interval
    for item in hits:
        insert(Interval(item[0], item[1]))
    return result
def load_restriction_fragment(in_file, minfragsize=None, maxfragsize=None, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome.

    Fragments whose length is below minfragsize or above maxfragsize are
    still stored, but their interval value carries 'filter': True.

    in_file = input file [character]
    minfragsize = minimum fragment length, None for no lower bound
    maxfragsize = maximum fragment length, None for no upper bound
    verbose = verbose mode [logical]

    Returns a dict mapping chromosome name to an Intersecter.
    """
    resFrag = {}
    if verbose:
        # Single-argument print() emits the same text on Python 2 and 3.
        print("## Loading Restriction File Intervals ' %s '..." % (in_file,))

    nline = 0
    nfilt = 0
    # The with-block closes the file even when a line fails to parse.
    with open(in_file) as bed_handle:
        for line in bed_handle:
            nline += 1
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                print("Warning : wrong input format in line %s . Not a BED file !?" % (nline,))
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)
            end = int(end)
            fragl = abs(end - start)
            name = name.strip()

            ## Flag fragments outside the size range
            filt = (
                (minfragsize is not None and fragl < int(minfragsize))
                or (maxfragsize is not None and fragl > int(maxfragsize))
            )
            if filt:
                nfilt += 1

            tree = resFrag.get(chromosome)
            if tree is None:
                tree = resFrag[chromosome] = Intersecter()
            tree.add_interval(Interval(start, end, value={'name': name, 'filter': filt}))

    if nfilt > 0:
        print("Warning :  %s fragment(s) outside of range and discarded.  %s  remaining."
              % (nfilt, nline - nfilt))

    return resFrag
Ejemplo n.º 7
0
def load_restriction_fragment(in_file, minfragsize=None, maxfragsize=None, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Based on the function of the same name from HiC-Pro
    (https://github.com/nservant/HiC-Pro/blob/master/scripts/mapped_2hic_fragments.py)
    by Nicolas Servant, Eric Viara.

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome. Each interval value holds the fragment
    'name' and its 'midPoint'. Fragments outside the size range are skipped
    with a warning.

    in_file = input file [character]
    minfragsize = minimum fragment length, None for no lower bound
    maxfragsize = maximum fragment length, None for no upper bound
    verbose = verbose mode [logical]
    """
    resFrag = {}
    if verbose:
        print("## Loading Restriction File Intervals '" + in_file + "'...")

    nline = 0
    # The with-block closes the file even when a line fails to parse.
    with open(in_file) as bed_handle:
        for line in bed_handle:
            nline += 1
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                # Formatted rather than concatenated: nline is an int and
                # "str" + int raises TypeError.
                print("Warning : wrong input format in line %d. Not a BED file !?" % nline)
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)
            end = int(end)
            # NOTE(review): "/" is true division on Python 3, so midPoint can
            # be a float -- confirm callers expect that.
            midPoint = (start + end)/2
            fragl = abs(end - start)
            name = name.strip()

            ## Discard fragments outside the size range
            too_small = minfragsize is not None and fragl < int(minfragsize)
            if too_small or (maxfragsize is not None and fragl > int(maxfragsize)):
                print("Warning : fragment %s [%d] outside of range. Discarded" % (name, fragl))
                continue

            tree = resFrag.get(chromosome)
            if tree is None:
                tree = resFrag[chromosome] = Intersecter()
            tree.add_interval(Interval(start, end, value={'name': name, 'midPoint': midPoint}))

    return resFrag
Ejemplo n.º 8
0
def add_pvalues_to_peaks_frame_macs_bf(peaks_frame,experiment_peaks_frame,TTAA_frame,lam_win_size,pseudocounts = 0.2,macs_pvalue=True):
    """Add "Lambda Type", "Lambda" and "Poisson pvalue" columns to peaks_frame.

    For every peak, lambda is the number of experiment hops divided by the
    number of TTAA sites (at least 1) found in a window of lam_win_size
    around the peak's "Center".  The p-value is
    ``1 - poisson.cdf(hops + pseudocounts, lambda * max(TTAAs in peak, 1)
    + pseudocounts)``.

    peaks_frame: frame with "Chr", "Start", "End", "Center" and
        "Experiment Hops" columns; modified in place and returned.
    experiment_peaks_frame: frame with "Chr" and "Start" columns, one row
        per experiment hop.
    TTAA_frame: frame with "Chr" and "Start" columns, one row per TTAA.
    lam_win_size: window size used to estimate lambda.
    pseudocounts: value added to both the observed hops and the expectation.
    macs_pvalue: unused; kept for backward compatibility.
    """
    # Single-argument print() emits the same text on Python 2 and 3.
    print("lab specific hohoho")

    print("Making interval tree for experiment hops...")
    experiment_dict_of_trees = {}
    for name, group in experiment_peaks_frame.groupby('Chr'):
        tree = Intersecter()
        # Every hop becomes the interval [Start, Start + 3].
        for hop_start in group["Start"]:
            tree.add_interval(Interval(int(hop_start), int(hop_start) + 3))
        experiment_dict_of_trees[name] = tree

    print("Making interval tree for TTAAs...")
    TTAA_dict_of_trees = {}
    for name, group in TTAA_frame.groupby('Chr'):
        tree = Intersecter()
        for site_start in group["Start"]:
            tree.add_interval(Interval(int(site_start), int(site_start + 3)))
        TTAA_dict_of_trees[name] = tree

    # Go through the peaks and compute lambda and the p-value of each one.
    # NOTE(review): "/" is integer division on Python 2 and true division on
    # Python 3 for an int lam_win_size -- confirm which is intended.
    half_window = lam_win_size/2
    lambda_type_list = []
    lambda_list = []
    pvalue_list = []
    for _, row in peaks_frame.iterrows():
        chrom = row["Chr"]
        cluster_center = row["Center"]
        win_start = cluster_center - (half_window - 1)
        win_end = cluster_center + half_window
        ttaa_tree = TTAA_dict_of_trees[chrom]

        # TTAAs inside the peak itself scale the expectation below.
        num_TTAAs = len(ttaa_tree.find(row["Start"], row["End"]))
        # Hops and TTAAs inside the lambda window give the per-TTAA rate.
        num_exp_hops_lam_win_size = len(experiment_dict_of_trees[chrom].find(win_start, win_end))
        num_TTAAs_lam_win_size = len(ttaa_tree.find(win_start, win_end))
        lambda_f = float(num_exp_hops_lam_win_size)/(max(num_TTAAs_lam_win_size,1))

        lambda_type_list.append(lam_win_size)
        lambda_list.append(lambda_f)
        pvalue = 1-scistat.poisson.cdf((row["Experiment Hops"]+pseudocounts),lambda_f*max(num_TTAAs,1)+pseudocounts)
        pvalue_list.append(pvalue)

    peaks_frame["Lambda Type"] = lambda_type_list
    peaks_frame["Lambda"] = lambda_list
    peaks_frame["Poisson pvalue"] = pvalue_list
    return peaks_frame
Ejemplo n.º 9
0
def load_restriction_fragment(in_file, minfragsize=None, maxfragsize=None, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome. Fragments whose length is outside the
    size range are skipped with a warning.

    in_file = input file [character]
    minfragsize = minimum fragment length, None for no lower bound
    maxfragsize = maximum fragment length, None for no upper bound
    verbose = verbose mode [logical]

    """
    resFrag = {}
    if verbose:
        # Single-argument print() emits the same text on Python 2 and 3.
        print("## Loading Restriction File Intervals ' %s '..." % (in_file,))

    nline = 0
    # The with-block closes the file even when a line fails to parse.
    with open(in_file) as bed_handle:
        for line in bed_handle:
            nline += 1
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                print("Warning : wrong input format in line %s . Not a BED file !?" % (nline,))
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)
            end = int(end)
            fragl = abs(end - start)
            name = name.strip()

            ## Discard fragments outside the size range
            too_small = minfragsize is not None and fragl < int(minfragsize)
            if too_small or (maxfragsize is not None and fragl > int(maxfragsize)):
                print("Warning : fragment  %s  [ %s ] outside of range. Discarded" % (name, fragl))
                continue

            tree = resFrag.get(chromosome)
            if tree is None:
                tree = resFrag[chromosome] = Intersecter()
            tree.add_interval(Interval(start, end, value={'name': name}))

    return resFrag
Ejemplo n.º 10
0
def find_overlap_dataframes(query, hits):
    '''
    Find overlaps between query and hits regions.

    :param query: [pd.DataFrame] query peaks; the first three columns are
        read as chrom, start, end
    :param hits: [pd.DataFrame] hits regions with ``chrom``, ``start`` and
        ``end`` columns
    :return: tuple ``(query_idx, hit_indexs)``: an array with the positional
        index of the query row for every overlap found (a row appears once
        per overlapping hit), and the index labels of the hits rows whose
        ``chrom:start-end`` label matched at least one overlap

    '''
    query_idx, hits_labs = [], []
    tree_hash = {chrom: Intersecter() for chrom in hits.chrom.unique()}
    for chrom, start, end in zip(hits.chrom, hits.start, hits.end):
        tree_hash[chrom].add_interval(Interval(start, end))

    for idx, line in enumerate(query.values):
        chrom, start, end = line[0:3]
        tree = tree_hash.get(chrom)
        if tree is None:
            # No hits on this chromosome, so nothing can overlap.
            continue
        for ovp in tree.find(start, end):
            hits_labs.append(chrom + ':' + str(ovp.start) + '-' + str(ovp.end))
            query_idx.append(idx)

    tmp_labels = hits.chrom + ':' + hits.start.astype(
        str) + '-' + hits.end.astype(str)
    hit_indexs = tmp_labels[tmp_labels.isin(hits_labs)].index.values
    return np.array(query_idx), hit_indexs
Ejemplo n.º 11
0
def load_bed(read_length, path, flanking):
    """
    Parse a BED file and keep only targets that do not overlap earlier ones.

    Every record is widened by ``flanking`` on both sides. A target is kept
    when no previously kept target on the same chromosome overlaps it and
    its span is at least twice ``read_length``.

    :param read_length: Read length; targets shorter than twice this value
        are dropped
    :param path: BED file path
    :param flanking: Integer to add to each target's start and end
    :return: list of kept targets, each one as [chrom, start, end]
    """
    trees_by_chrom = {}
    kept = []
    for rec in BedTool(path):
        chrom = rec[0]
        left = int(rec[1]) - flanking
        right = int(rec[2]) + flanking
        if int(right) < int(left):
            # target on the minus strand: flip start and end
            left, right = right, left
        if chrom not in trees_by_chrom:
            trees_by_chrom[chrom] = Intersecter()
        tree = trees_by_chrom[chrom]
        is_free = tree.find(left, right) == []  # no overlaps yet
        if not (is_free and abs(left - right) >= (read_length * 2)):
            continue
        tree.add_interval(Interval(left, right))
        kept.append([chrom, left, right])

    return kept
Ejemplo n.º 12
0
def regionTree2(tmp,resFrag,strand,info,geneid):
    """
    Add one region to the per-strand, per-chromosome interval trees.

    tmp: sequence holding chrom, start, end at indices 0, 1, 2
    resFrag: dict of dicts (strand -> chrom -> Intersecter), updated in place
    strand: key of the outer dictionary
    info: stored under "exon" in the interval value
    geneid: stored under 'geneid' in the interval value
    """
    if strand not in resFrag:
        resFrag[strand] = {}
    by_chrom = resFrag[strand]
    chrom = tmp[0]
    if chrom not in by_chrom:
        by_chrom[chrom] = Intersecter()
    region = Interval(int(tmp[1]), int(tmp[2]),
                      value={"exon": info, 'geneid': geneid})
    by_chrom[chrom].add_interval(region)
Ejemplo n.º 13
0
def regionTree(tmp, resFrag):
    """
    Add one region to the per-chromosome interval trees.

    tmp: list of at least 3 items (chrom, start, end); any further items
        are stored as the interval's value
    resFrag: dictionary mapping chrom to its Intersecter, updated in place
    """
    chrom = tmp[0]
    if chrom not in resFrag:
        resFrag[chrom] = Intersecter()
    begin, finish = int(tmp[1]), int(tmp[2])
    extra = tmp[3:]
    resFrag[chrom].add_interval(Interval(begin, finish, extra))
Ejemplo n.º 14
0
    def __init__(self,coordinates):
        """Index ``coordinates`` in one interval tree per chromosome.

        Each coordinate provides ``chr_id``, ``bpstart`` and ``bpend``; the
        coordinate object itself is stored as the interval's value.
        """
        trees = dict()
        self.interval_tree = trees

        for coord in coordinates:
            chrom = coord.chr_id
            if chrom not in trees:
                trees[chrom] = Intersecter()
            trees[chrom].add_interval(Interval(coord.bpstart, coord.bpend, coord))
Ejemplo n.º 15
0
def maps_gene(mapped):
    '''Return the gene intervals overlapped by the mapped alignment.

    The interval tree of a genome is built from the database on first use
    and cached in the module-level ``intersecters`` dictionary.
    '''
    global intersecters

    try:
        tree = intersecters[mapped['genome']]
    except KeyError:
        genome = mapped['genome']
        genes = db.feature.find({'genome': genome, 'type': 'gene'})

        tree = Intersecter()
        for gene in genes:
            # Interval end is exclusive, need to +1 to line up with actual position
            tree.add_interval(Interval(gene['start'], gene['end'] + 1, gene['uid']))

        intersecters[genome] = tree

    return tree.find(mapped['refStart'], mapped['refEnd'])
Ejemplo n.º 16
0
def load_BED(in_file, verbose=False):
    """
    Read a BED file and store the intervals in a tree
    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome

    BED file are half-open, meaning that a bin ]100, 200] covered the bases 101 to 200

    in_file = input file [character]
    verbose = verbose mode [logical]

    Returns (x, featureNames): x maps each chromosome to an Intersecter whose
    interval values are {'pos': i}, and featureNames[i] is the stripped name
    of that feature.
    """
    x = {}
    if verbose:
        # Single-argument print() emits the same text on Python 2 and 3.
        print("## Loading BED file ' %s '..." % (in_file,))
    featureNames = []
    nline = 0
    with open(in_file) as bed_handle:
        for line in bed_handle:
            if nline > 0 and nline % 5000 == 0 and verbose:
                print("## %d features loaded ..." % nline)
            nline += 1
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                sys.stderr.write(
                    "Warning : wrong input format in line %s . Not a BED file !?\n" % (nline,))
                continue

            # BED files are zero-based, half-open as Intervals objects
            start = int(start)
            end = int(end)
            # 'pos' is the index of this feature in featureNames.  It is taken
            # from the list itself, not from the line counter, so a skipped
            # malformed line cannot shift later features off by one.
            pos = len(featureNames)
            featureNames.append(name.strip())
            tree = x.get(chromosome)
            if tree is None:
                tree = x[chromosome] = Intersecter()
            tree.add_interval(Interval(start, end, value={'pos': pos}))
    return (x, featureNames)
Ejemplo n.º 17
0
def load_bed(in_file, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome

    in_file = input file [character]
    verbose = verbose mode [logical]

    Progress and warning messages are written to stderr.
    """
    intervals = {}
    if verbose:
        # sys.stderr.write works on Python 2 and 3; "print >> sys.stderr"
        # is Python 2 only.
        sys.stderr.write("## Loading BED file ' %s '...\n" % (in_file,))

    nline = 0
    # The with-block closes the file even when a line fails to parse.
    with open(in_file) as bed_handle:
        for line in bed_handle:
            nline += 1
            bedtab = line.strip().split("\t")
            try:
                chromosome, start, end = bedtab[:3]
            except ValueError:
                sys.stderr.write(
                    "Warning : wrong input format in line %s . Not a BED file !?\n" % (nline,))
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)
            end = int(end)

            tree = intervals.get(chromosome)
            if tree is None:
                tree = intervals[chromosome] = Intersecter()
            tree.add_interval(Interval(start, end))

    return intervals
Ejemplo n.º 18
0
def maps_gene(mapped):
    """Look up the genes that the mapped alignment falls within.

    Uses the cached interval tree of ``mapped['genome']`` from the
    module-level ``intersecters`` dictionary, building and caching it from
    the database when it is missing.
    """
    global intersecters

    try:
        gene_tree = intersecters[mapped['genome']]
    except KeyError:
        gene_query = {'genome': mapped['genome'], 'type': 'gene'}
        genes = db.feature.find(gene_query)

        gene_tree = Intersecter()
        for record in genes:
            # Interval end is exclusive, need to +1 to line up with actual position
            span = Interval(record['start'], record['end'] + 1, record['uid'])
            gene_tree.add_interval(span)

        intersecters[mapped['genome']] = gene_tree

    ref_start, ref_end = mapped['refStart'], mapped['refEnd']
    return gene_tree.find(ref_start, ref_end)
Ejemplo n.º 19
0
def load_BED(in_file, exclusionSize=0, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome

    in_file = input file [character]
    exclusionSize = when > 0, width of the flanking regions stored on each
        side of every feature in the second returned table; these are named
        <name>_up and <name>_dwn
    verbose = verbose mode [logical]

    Returns (x, x_ex): the feature trees and the exclusion-region trees,
    both keyed by chromosome. x_ex stays empty when exclusionSize <= 0.
    """
    x = {}
    x_ex = {}
    if verbose:
        # Single-argument print() emits the same text on Python 2 and 3.
        print("## Loading BED file ' %s '..." % (in_file,))
    nline = 0
    with open(in_file) as bed_handle:
        for line in bed_handle:
            nline += 1
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                print("Warning : wrong input format in line %s . Not a BED file !?" % (nline,))
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)
            end = int(end)
            name = name.strip()

            tree = x.get(chromosome)
            if tree is None:
                tree = x[chromosome] = Intersecter()
            tree.add_interval(Interval(start, end, value={'name': name}))

            ## Exclusion regions
            if exclusionSize > 0:
                margin = int(exclusionSize)
                tree_ex = x_ex.get(chromosome)
                if tree_ex is None:
                    tree_ex = x_ex[chromosome] = Intersecter()
                tree_ex.add_interval(
                    Interval(start - margin, start, value={'name': str(name) + "_up"}))
                tree_ex.add_interval(
                    Interval(end, end + margin, value={'name': str(name) + "_dwn"}))
    return (x, x_ex)
def get_reads_and_ranges(bam_region, cid, chrom, region_start, region_end, strand, options):
    """Collect the valid reads of a region and update per-position tallies.

    Valid paired reads contribute the 1-based interval derived from
    ``pos``/``pnext``/``isize``; valid single reads contribute the interval
    derived from ``pos`` and the aligned length of the CIGAR.  Every kept
    interval is passed to ``inc_pr``.  ``inc_pr_at`` is then called for both
    interval ends (paired, merged or trimmed reads), for the end only
    (reverse single reads) or for the start only (other single reads).

    ``cid`` and ``strand`` are not used by this function.

    Returns ``(filtered_reads, pos_range)``: the Intersecter of kept
    intervals and the defaultdict that was passed to ``inc_pr``/``inc_pr_at``.
    """
    pos_range = defaultdict(lambda: [0,0])
    filtered_reads = Intersecter()
    contig = options.chrom_prefix + chrom
    for aln in filter_reads(bam_region, contig, region_start, region_end, options):
        paired = is_valid_paired(aln, region_start, options)
        if paired:
            first = min(aln.pos, aln.pnext) + 1
            last = first + abs(aln.isize) - 1
        elif is_valid_single(aln, options):
            first = aln.pos + 1
            last = first + aln_length(aln.cigar) - 1
        else:
            continue

        filtered_reads.add_interval(Interval(first, last))
        inc_pr(pos_range, first, last, region_start, region_end)

        # Choose which interval ends get tallied individually.
        if paired or as_merged(aln, options) or as_trimmed(aln, options):
            ends = (first, last)
        elif aln.is_reverse:
            ends = (last,)
        else:
            ends = (first,)
        for coord in ends:
            inc_pr_at(pos_range, coord, region_start, region_end)
    return filtered_reads, pos_range
Ejemplo n.º 21
0
 def _index_record_in_intervaldict(self, rec):
     '''
     Add the features of a chromosome SeqRecord to the cached interval
     tree dict, using the tree stored under the record's id (created on
     first use).
     '''
     tree = self.intervaldict.setdefault(rec.id, Intersecter())
     for cds in rec.features:  # Each top-level CDS
         extras = {'chrom': rec.id, 'value': {'cdsobj': cds}}
         if hasattr(cds, 'strand'):
             extras['strand'] = cds.strand  # could still be None.
         span = cds.location
         tree.add_interval(Interval(int(span.start), int(span.end), **extras))
Ejemplo n.º 22
0
def purify_introns(Introns_Dict, Exons_Sect):
    '''
    select only introns that don't contain exons, these are likely to be "pure"
    introns that lack any real polyA sites.

    Introns_Dict: dict mapping (chrm, strand) to introns, each indexed as
        intron[0] (start) and intron[1] (end)
    Exons_Sect: dict mapping (chrm, strand) to an object whose
        find(start, end) returns the exons overlapping that range

    Returns a dict mapping (chrm, strand) to an Intersecter holding the
    introns for which find() returned nothing; keys without any such
    intron are absent.
    '''
    pure = dict()
    for (chrm, strand), introns in Introns_Dict.items():
        key = (chrm, strand)
        for intron in introns:
            if Exons_Sect[key].find(intron[0], intron[1]):
                continue
            # "in" replaces dict.has_key(), which no longer exists on
            # Python 3.
            if key not in pure:
                pure[key] = Intersecter()
            pure[key].add(intron[0], intron[1], intron)
    return pure
Ejemplo n.º 23
0
def calculate_score(chrom, start, end,ctype="WPS"):
  """Return one score per position of start..end (both included).

  ctype selects the score:
    "COV"    - number of kept fragments covering the position
    "STARTS" - number of kept fragment starts and ends at the position
    "WPS"    - for the window pos +/- options.protection//2: kept fragments
               spanning the whole window minus kept fragments that overlap
               the window but start after its start or end before its end

  Duplicate, QC-failed, unmapped and soft-clipped reads are skipped. A
  fragment is kept when its length lies within options.minLength ..
  options.maxLength. When options.downsample is set, a read is dropped
  whenever random.random() >= options.downsample.
  """
  filteredReads = Intersecter()
  posRange = defaultdict(int)
  # NOTE(review): the iterator receives `args` while every other setting is
  # read from `options` -- confirm both names exist at module level.
  for read in readIterator(args,chrom,start,end):
    if read.is_duplicate or read.is_qcfail or read.is_unmapped: continue
    if isSoftClipped(read.cigar): continue

    if read.is_paired:
      if read.mate_is_unmapped: continue
      if read.rnext != read.tid: continue
      # A pair is handled through read1, or through read2 only when
      # pnext + qlen lies before the region start.
      if not (read.is_read1 or (read.is_read2 and read.pnext+read.qlen < start)): continue
      if read.isize == 0: continue
      if options.downsample is not None and random.random() >= options.downsample: continue
      rstart = min(read.pos,read.pnext)+1 # 1-based
      rend = rstart+abs(read.isize)-1 # end included
    else:
      if options.downsample is not None and random.random() >= options.downsample: continue
      rstart = read.pos+1 # 1-based
      rend = rstart+aln_length(read.cigar)-1 # end included

    # From here on paired and single reads are treated the same way.
    rlength = rend-rstart+1
    if not (options.minLength <= rlength <= options.maxLength): continue
    filteredReads.add_interval(Interval(rstart,rend))
    if ctype == "COV":
      # Only the part of the fragment inside the region is counted.
      for i in range(max(rstart,start),min(rend,end)+1):
        posRange[i]+=1
    elif ctype == "STARTS":
      if start <= rstart <= end:
        posRange[rstart]+=1
      if start <= rend <= end:
        posRange[rend]+=1

  if ctype == "WPS":
    protection = options.protection//2
    # range() replaces xrange(), which does not exist on Python 3.
    for pos in range(start,end+1):
      rstart,rend = pos-protection,pos+protection
      gcount,bcount = 0,0
      for read in filteredReads.find(rstart,rend):
        if (read.start > rstart) or (read.end < rend): bcount +=1
        else: gcount +=1
      posRange[pos]+=gcount-bcount

  return [posRange[pos] for pos in range(start,end+1)]
Ejemplo n.º 24
0
 def GenomicInterval_2_intersecter_and_dict(GI):
     """Index genomic intervals by (chromosome, strand).

     GI: iterable of intervals providing chr, strand, start and stop.

     Returns (II, ID): II maps (chrm, strand) to an Intersecter whose
     entries carry [start, stop] as value; ID maps the same keys to the
     list of [start, stop] pairs in input order.
     """
     II = dict()
     ID = dict()
     for intron in GI:
         key = (clean_chr_name(intron.chr), intron.strand)
         # "in" replaces dict.has_key(), which no longer exists on Python 3.
         if key not in II:
             II[key] = Intersecter()
         II[key].add(intron.start, intron.stop,
                     [intron.start, intron.stop])
         if key not in ID:
             ID[key] = []
         ID[key].append([intron.start, intron.stop])
     return II, ID
Ejemplo n.º 25
0
def polyA_dict_2_intersecter( polyA ):
    '''
    load a polyA dictionary into a dictionary/intersecter object

    polyA maps (chrm, strand) to a mapping from polyA site position to its
    associated data. Every site is stored as the zero-length interval
    (p_site, p_site) with [p_site, data] as value.

    NOTE: bx.python is (open,open) in its interval searchers,
    e.g. T.find( 1,10 ) will not return true if T contains (1,1) or (10,10)
    '''
    polyA_I = dict()
    for (chrm, strand), sites in polyA.items():
        key = (chrm, strand)
        # "in" replaces dict.has_key(), which no longer exists on Python 3.
        if key not in polyA_I:
            polyA_I[key] = Intersecter()
        for p_site in sites:
            polyA_I[key].add(p_site, p_site, [p_site, sites[p_site]])
    return polyA_I
Ejemplo n.º 26
0
def load_restriction_fragment(in_file, verbose):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome

    in_file = input file [character]
    verbose = verbose mode [logical]

    Lines with fewer than four tab-separated fields are skipped silently.
    """
    resFrag = {}
    if verbose:
        # Single-argument print() emits the same text on Python 2 and 3.
        print("## Loading Restriction File Intervals ' %s '..." % (in_file,))

    # The with-block closes the file even when a line fails to parse.
    with open(in_file) as bed_handle:
        for line in bed_handle:
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                # FIXME we might want a proper warning message here !
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)
            end = int(end)
            name = name.strip()
            tree = resFrag.get(chromosome)
            if tree is None:
                tree = resFrag[chromosome] = Intersecter()
            tree.add_interval(Interval(start, end, value={'name': name}))
    return resFrag
def index_bed(genes, un_stranded):
    """Index annotations in one interval tree per contig.

    genes: annotations, each a mapping with 'chrom', 'strand', 'beg', 'end',
        'annotation_type' and 'annotation_name'
    un_stranded: when true, trees are keyed by chromosome; otherwise by
        chromosome + ":" + strand, with trees prepared for '+' and '-'

    Returns the dict of Intersecter objects.
    """
    chroms = set([i['chrom'] for i in genes])
    contigs = set(chroms)
    if un_stranded:
        bed_quicksect = {name: Intersecter() for name in contigs}
    else:
        bed_quicksect = {}
        for sign in ['+', '-']:
            for name in contigs:
                bed_quicksect[name + ":" + sign] = Intersecter()

    for entry in genes:
        chrom = entry['chrom']
        strand = entry['strand']
        key = chrom if un_stranded else chrom + ":" + strand
        tree = bed_quicksect[key]
        payload = {
            "annotation_type": entry['annotation_type'],
            "annotation_name": entry['annotation_name'],
        }
        tree.insert_interval(
            Interval(entry['beg'], entry['end'],
                     chrom=chrom, strand=strand, value=payload))
    return bed_quicksect
Ejemplo n.º 28
0
 def gtf_CDSs_2_intersecter_and_dict(gtf_fn):
     """Index the CDS records of a GTF file by (chromosome, strand).

     gtf_fn: path of the GTF file. Only lines whose third tab-separated
         field is 'CDS' are used; blank lines and lines starting with '#'
         are skipped.

     Returns (CDS_I, CDS_D): CDS_I maps (chrm, strand) to an Intersecter
     whose entries carry [start, end] as value; CDS_D maps the same keys
     to the list of [start, end] pairs in file order.
     """
     CDS_I = dict()
     CDS_D = dict()
     # Opens the gtf_fn parameter (the previous code referenced an undefined
     # name) and closes the file when done.
     with open(gtf_fn) as fid:
         for line in fid:
             if not line.strip() or line.startswith('#'):
                 continue
             data = line.strip().split('\t')
             if data[2] != 'CDS':
                 continue
             key = (clean_chr_name(data[0]), data[6])
             start = int(data[3])
             end = int(data[4])
             # "in" replaces dict.has_key(), which no longer exists on
             # Python 3.
             if key not in CDS_I:
                 CDS_I[key] = Intersecter()
             CDS_I[key].add(start, end, [start, end])
             if key not in CDS_D:
                 CDS_D[key] = []
             CDS_D[key].append([start, end])
     return CDS_I, CDS_D
Ejemplo n.º 29
0
def emf_init_index(filename):
  """Load an EMF index file into per-chromosome lookup tables.

  Arguments
  ---------
  filename : str
      Path of the index file.  Lines starting with '#' are ignored; every
      other line is expected to hold six whitespace-separated columns
      (Chr, Start, End, File, Byte, Line).  Lines with a different number
      of columns are reported on stderr and skipped.

  Returns
  -------
  dict or None
      None when ``filename`` is not an existing file.  Otherwise a dict
      mapping each chromosome to a dict that holds an Intersecter under the
      key "coords" and, for every indexed region, a (start, end) key whose
      value is the tuple (file, byte offset as int, line count as int).
  """
  if not os.path.isfile(filename):
    return None

  # Imported lazily so that a missing index file can be reported without
  # requiring bx-python to be importable.
  from bx.intervals.intersection import Intersecter, Interval

  res = {}
  # The context manager closes the handle even when a malformed number makes
  # int() raise half-way through the file.
  with open(filename) as infile:
    for line in infile:
      if line.startswith("#"):
        continue
      fields = line.split()  # Chr\tStart\tEnd\tFile\tByte\tLine
      if len(fields) != 6:
        sys.stderr.write("Unexpected line [%s]: %s"%(filename,line))
        continue
      chrom, start, end, cfile, byte_offset, line_count = fields
      # Both coordinates are shifted by one; the original author describes
      # the result as one-based half-open.
      start, end = int(start)+1, int(end)+1
      if chrom not in res:
        res[chrom] = {"coords": Intersecter()}
      res[chrom]["coords"].add_interval(Interval(start, end))
      res[chrom][(start, end)] = cfile, int(byte_offset), int(line_count)
  return res
Ejemplo n.º 30
0
 def calculate_intersection(cls,coords1,coords2,build_matrix=False):
     """Find which coordinates of ``coords1`` are overlapped by ``coords2``.

     Arguments
     ---------
     coords1 : sequence
         Hashable coordinate objects exposing ``chr_id``, ``bpstart`` and
         ``bpend``; they define the rows.
     coords2 : sequence
         Coordinate objects with the same attributes; they define the
         columns and are used as queries.
     build_matrix : bool
         When true a ``sparse_matrix`` of shape (len(coords1), len(coords2))
         is filled with the number of hits per (row, column) pair.

     Returns
     -------
     (coords_in_common, intersection_indexes) as lists, followed by the
     intersection matrix when ``build_matrix`` is true.  Both lists come
     from sets, so their order is not defined.

     Every hit is turned back into a coordinate with
     ``Coordinate.coordinates_from_interval`` and looked up in the row
     index, so that helper must return objects equal to (and hashing like)
     the members of ``coords1``; otherwise a KeyError is raised.
     """
     coords_in_common=set()
     intersection_indexes=set()
     if build_matrix:
         intersection_matrix=sparse_matrix((len(coords1),len(coords2)))
     coord_to_row_index=dict()
     interval_tree=dict()

     # Build one interval tree per chromosome from the first set and
     # remember the row of every coordinate.
     for row_index,c in enumerate(coords1):
         if c.chr_id not in interval_tree:
             interval_tree[c.chr_id]=Intersecter()

         interval_tree[c.chr_id].add_interval(Interval(c.bpstart,c.bpend))
         coord_to_row_index[c]=row_index

     # Query the trees with every coordinate of the second set.
     for cl_index,c in enumerate(coords2):
         # ``in`` replaces dict.has_key(), which was removed in Python 3.
         if c.chr_id not in interval_tree:
             continue
         coords_hits=interval_tree[c.chr_id].find(c.bpstart, c.bpend)
         for coord_hit in coords_hits:
             c_to_add=Coordinate.coordinates_from_interval(c.chr_id, coord_hit)
             coords_in_common.add(c_to_add)
             row_index=coord_to_row_index[c_to_add]
             intersection_indexes.add(row_index)

             if build_matrix:
                 intersection_matrix[row_index,cl_index]+=1

     if build_matrix:
         return list(coords_in_common),list(intersection_indexes),intersection_matrix
     else:
         return list(coords_in_common),list(intersection_indexes)
Ejemplo n.º 31
0
    def annotate(self):
        """Encode, per input coordinate, which annotation files overlap it.

        Reads ``self.input_coordinates``, ``self.annotations_filenames``,
        ``self.annotation_names`` and ``self.annotation_names_to_prime``.

        Sets ``self.annotation_track`` (integer array with one entry per
        input coordinate, initialised to 1), ``self.interval_tree`` (one
        Intersecter per ``chr_id``), ``self.coord_to_row_index`` and
        ``self.row_index`` (ends up equal to the number of coordinates).

        For every annotation file the track entry of each intersecting
        coordinate is multiplied by the number registered for that
        annotation in ``annotation_names_to_prime``, so an entry is the
        product of the numbers of all annotations that hit it.
        """
        # np.int was only an alias of the builtin int; it was deprecated in
        # NumPy 1.20 and removed in 1.24, so use int directly.
        self.annotation_track=np.ones(len(self.input_coordinates),dtype=int)

        self.interval_tree=dict()
        self.coord_to_row_index=dict()
        self.row_index=0

        # Build the interval tree and the coordinate -> row lookup.
        for c in self.input_coordinates:
            if c.chr_id not in self.interval_tree:
                self.interval_tree[c.chr_id]=Intersecter()

            self.interval_tree[c.chr_id].add_interval(Interval(c.bpstart,c.bpend))
            self.coord_to_row_index[c]=self.row_index
            self.row_index+=1

        for idx,bed_filename in enumerate(self.annotations_filenames):
            coordinates=Coordinate.bed_to_coordinates(bed_filename)

            # annotation_names is indexed in parallel with annotations_filenames.
            prime_number=self.annotation_names_to_prime[self.annotation_names[idx]]
            for idx_intersection in  self.__intersection_indexes(coordinates):
                self.annotation_track[idx_intersection]*=prime_number
Ejemplo n.º 32
0
def load_restriction_fragment(enzyme_bed):
    """
    Parse a restriction-fragment BED file into per-chromosome interval trees.

    Lines with fewer than four whitespace-separated columns are reported
    with a warning on stdout and skipped; a non-integer start or end raises
    ValueError.

    :param enzyme_bed: bed file path
    :return: res_frag, a defaultdict mapping chromosome name to an
        Intersecter whose intervals carry ``{'name': <4th column>}``
    """
    def _new_tree():
        return Intersecter()

    res_frag = defaultdict(_new_tree)
    with open(enzyme_bed) as fp:
        nline = 0
        for line in fp:
            nline += 1
            columns = line.strip().split()
            if len(columns) < 4:
                print("[Warning] wrong input format in line :{}. "
                      "Not a bed file?!".format(nline))
                continue

            chrn, start_txt, end_txt, name = columns[:4]
            start = int(start_txt)
            end = int(end_txt)
            fragment = Interval(start, end, value={'name':name})
            res_frag[chrn].add_interval(fragment)

    return res_frag
Ejemplo n.º 33
0
# fields three and six are set to default value '-'
# field 8 is set to default value '.' to meet compatibility with GFF3
# Name the nine columns of the first table (f1 is created before this
# fragment) after the GFF-style fields.
f1.columns = [
    'chr', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase',
    'attributes'
]

# Second input: a header-less, tab-separated table given as the second
# command-line argument; it gets the same nine column names.
f2 = pd.read_csv(sys.argv[2], header=None, sep="\t")
f2.columns = [
    'chr', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase',
    'attributes'
]

# One interval tree per chromosome name, filled in below.
tree_dict = {}

# NOTE(review): this stand-alone tree is not used in the visible part of the
# script - confirm whether later code needs it.
tree = Intersecter()

# first input file - gvf file containing the annotation
# Pre-create empty trees for chr1..chr22 ...
for chrom in range(1, 23):
    tree_dict['chr' + str(chrom)] = Intersecter()

# ... and for the two sex chromosomes.
tree_dict['chrX'] = Intersecter()
tree_dict['chrY'] = Intersecter()

# Patterns that pull "GeneID=<digits>" and "GeneSymbol=<chars>" out of the
# attributes column.
# NOTE(review): the class [A-z0-9] also matches the six ASCII characters
# between 'Z' and 'a' ([ \ ] ^ _ `); confirm whether [A-Za-z0-9_] was meant.
gene_id_re = re.compile("(GeneID=[0-9]+)")
gene_symbol = re.compile("(GeneSymbol=[A-z0-9]+)")

# Walk the annotation rows; j[-1] is the last column ('attributes').
# search() returns None when the tag is absent, in which case .group(0)
# raises AttributeError.
for i, j in f1.iterrows():
    geneid = gene_id_re.search(j[-1]).group(0)
    # Keep only the numeric part of the GeneID tag.
    geneid_new = geneid.replace("GeneID=", "")
    genesymbol = gene_symbol.search(j[-1]).group(0)
Ejemplo n.º 34
0
def load_BED(in_file, exclusionSize=0, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome

    in_file = input file [character]
    exclusionSize = width of the flanking regions to index on both sides of
        every interval; nothing is indexed when it is not > 0 [integer]
    verbose = verbose mode [logical]

    Returns a tuple (x, x_ex):
      x    = chromosome -> Intersecter of the BED intervals, each carrying
             value {'name': <4th column>}
      x_ex = chromosome -> Intersecter of the flanking regions
             [start - exclusionSize, start] named "<name>_up" and
             [end, end + exclusionSize] named "<name>_dwn"; empty when
             exclusionSize is not > 0

    Lines with fewer than four tab-separated columns are reported on stdout
    and skipped.
    """
    x = {}
    x_ex = {}
    # Messages are built as single strings so that the output is the same
    # under the Python 2 print statement and the Python 3 print function.
    if verbose:
        print("## Loading BED file ' %s '..." % in_file)
    with open(in_file) as bed_handle:
        for nline, line in enumerate(bed_handle, 1):
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                print("Warning : wrong input format in line %s . "
                      "Not a BED file !?" % nline)
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)
            end = int(end)
            name = name.strip()

            tree = x.get(chromosome)
            if tree is None:
                tree = x[chromosome] = Intersecter()
            tree.add_interval(Interval(start, end, value={'name': name}))

            ## Exclusion regions
            if exclusionSize > 0:
                flank = int(exclusionSize)
                tree_ex = x_ex.get(chromosome)
                if tree_ex is None:
                    tree_ex = x_ex[chromosome] = Intersecter()
                tree_ex.add_interval(
                    Interval(start - flank,
                             start,
                             value={'name': str(name) + "_up"}))
                tree_ex.add_interval(
                    Interval(end,
                             end + flank,
                             value={'name': str(name) + "_dwn"}))
    # The with-block has already closed the file; no explicit close needed.
    return (x, x_ex)
Ejemplo n.º 35
0
# mdict collects, per gene name, a flat list of chrom, start, end, strand
# (four entries are appended for every matching 'gene' record).
mdict = {}
# enumerate() gives the position of each GFF collection directly; the
# original gffs.index(i) rescanned the list on every pass and returned the
# first *equal* element, which is the wrong slot if two collections compare
# equal.
for k, i in enumerate(gffs):
    syn_region[k] = {}
    for y in i:
        # Normalise the record name; the rule depends on which collection
        # the record comes from.
        if k == 0:
            if y.name.startswith('GRMZM'):
                y.name = y.name.split('.')[0]
            else:
                y.name = '_'.join(y.name.split('.')[:2])
        if k == 1 or k == 2:
            y.name = '.'.join(y.name.split('.')[:2])
        # Only 'gene' records that are listed in syn[k] are indexed.
        if y[2] != 'gene': continue
        if y.name not in syn[k]: continue
        if y[0] not in syn_region[k]:
            syn_region[k][y[0]] = Intersecter()
        syn_region[k][y[0]].add_interval(Interval(int(y[3]), int(y[4]),
                                                  y.name))
        if y.name not in mdict:
            mdict[y.name] = []
#		if '_' in y.name:continue
#		if 'scaffold' in y.chrom:continue
        mdict[y.name].append(y.chrom)
        mdict[y.name].append(int(y[3]))
        mdict[y.name].append(int(y[4]))
        mdict[y.name].append(y.strand)

cand = open(sys.argv[2])
count = 0
for line in cand:
    count += 1
  chrom = options.region.split(':')[0]
  start,end = map(int,options.region.split(':')[1].replace(",","").split("-"))
  outchrom = chrom
  outchrom = outchrom.replace("chr","")
  if outchrom.startswith('gi|'): # gi|224589803|ref|NC_000012.11|
    NCfield = outchrom.split("|")[-2]
    if NCfield.startswith("NC"):
      outchrom = "%d"%(int(NCfield.split(".")[0].split("_")[-1]))
  if options.verbose: 
    sys.stderr.write("Coordinates parsed: Chrom %s Start %d End %d\n"%(chrom,start,end))
except:
  sys.stderr.write("Error: Region not defined in a correct format!")
  sys.exit()

posRange = defaultdict(lambda:[0,0,0])
filteredReads = Intersecter()
for read in readIterator(args,options):
  if read.is_duplicate or read.is_qcfail or read.is_unmapped: continue
  if isSoftClipped(read.cigar): continue
  
  if read.is_paired:
    if read.mate_is_unmapped: continue
    if read.rnext != read.tid: continue
    if read.is_read1 or (not options.pipe and read.is_read2 and read.pnext+read.qlen < start):
      if read.isize == 0: continue
      if options.downsample != None and random.random() >= options.downsample: continue
      if options.random:
        rstart = min(read.pos,read.pnext)+1+random.randint(-5,5) # 1-based
        rend = rstart+abs(read.isize)-1+random.randint(-5,5) # end included
      else:
        rstart = min(read.pos,read.pnext)+1 # 1-based
Ejemplo n.º 37
0
# One interval tree per sequence name (column 1 of the subject file).
trees = dict()
# NOTE(review): gzip.open() defaults to binary mode; on Python 3 the lines
# would be bytes and the string comparisons below would not work for .gz
# input - confirm which interpreter this script targets.
with gzip.open(args.subject) if args.subject.endswith('.gz') else open(
        args.subject) as fh:
    genome = ''
    for line in fh:
        # Skip blank lines and '#' comment lines.
        if line.isspace() or line[0] == '#':
            continue

        data = line.rstrip().split('\t')
        if len(data) != 9:
            # Only reported; the line is still processed below.
            sys.stderr.write('number of columns != 9: {}'.format(line))

        g, start, end, strand = data[0], int(data[3]), int(data[4]), data[6]
        if g != genome:
            genome = g
            # Create the tree only the first time a sequence name is seen.
            # The original replaced the tree on every change of name, which
            # discarded the intervals already stored for a sequence whose
            # records are not contiguous in the file.
            if genome not in trees:
                trees[genome] = Intersecter()

        # Extend the feature; upstream/downstream are relative to its strand.
        if strand == '+':
            start -= args.extend_upstream
            end += args.extend_downstream
        else:
            start -= args.extend_downstream
            end += args.extend_upstream

        # Unless --embeded is set, minus-strand features are stored with
        # negated, swapped coordinates so the two strands never overlap.
        if not args.embeded and strand == '-':  # complement strand
            start, end = -end, -start
        trees[genome].add_interval(Interval(start, end, value=data))

if args.split:
    if args.split_dir is None:
        outdir = '{}.intersect@{}'.format(
Ejemplo n.º 38
0
def read_chain_file(chain_file, print_table=False):
    '''
    Parse a chain file into per-chromosome interval lookups.

    Parameters
    ----------
    chain_file : file
        Chain format file, handed to ``ireader.reader``. According to the
        original documentation it may be plain text, a compressed file
        (".gz", ".Z", ".z", ".bz", ".bz2", ".bzip2"), or a URL
        ("http://", "https://", "ftp://") pointing to a plain text file.

    print_table : bool, optional
        Also print every aligned block as a tab-separated row.

    Returns
    -------
    maps : dict
        Source chromosome name -> Intersecter. Every interval spans an
        aligned block on the source and carries the tuple
        (target_name, target_start, target_end, target_strand) as value.
        For '-' target strands the target coordinates are mirrored with
        the target chromosome size.

    target_chromSize : dict
        Chromosome sizes of target genome

    source_chromSize : dict
        Chromosome sizes of source genome

    Raises
    ------
    Exception
        If a header has a source strand other than '+', a target strand
        other than '+'/'-', or a line is neither a 12/13-field "chain"
        header nor a 1- or 3-field block line.
    '''

    logging.info("Read the chain file \"%s\" " % chain_file)
    maps = {}
    target_chromSize = {}
    source_chromSize = {}
    if print_table:
        blocks = []

    for line in ireader.reader(chain_file):
        # Example: chain 4900 chrY 58368225 + 25985403 25985638 chr5 151006098 - 43257292 43257528 1
        line = line.strip()
        if not line:
            continue
        if line.startswith(('#', ' ')):
            continue
        fields = line.split()
        n_fields = len(fields)
        is_chain_header = fields[0] == 'chain'

        if is_chain_header and n_fields in [12, 13]:
            # Header: remember names, sizes, strands and the start offsets
            # for the block lines that follow.
            source_name = fields[2]  # E.g. chrY
            source_size = int(fields[3])  # Full length of the chromosome
            source_strand = fields[4]  # Must be +
            if source_strand != '+':
                raise Exception(
                    "Source strand in a chain file must be +. (%s)" % line)
            source_start = int(fields[5])  # Start of source region

            target_name = fields[7]  # E.g. chr5
            target_size = int(fields[8])  # Full length of the chromosome
            target_strand = fields[9]  # + or -
            target_start = int(fields[10])
            target_chromSize[target_name] = target_size
            source_chromSize[source_name] = source_size

            if target_strand not in ['+', '-']:
                raise Exception("Target strand must be - or +. (%s)" % line)
            if source_name not in maps:
                maps[source_name] = Intersecter()

            sfrom, tfrom = source_start, target_start

        elif not is_chain_header and n_fields in (1, 3):
            # Block line: "size sgap tgap" inside a chain, or just "size"
            # for the last block of a chain.
            size = int(fields[0])
            if n_fields == 3:
                sgap, tgap = int(fields[1]), int(fields[2])

            if target_strand == '+':
                target_span = (tfrom, tfrom + size)
            elif target_strand == '-':
                target_span = (target_size - (tfrom + size),
                               target_size - tfrom)
            else:
                target_span = None

            if target_span is not None:
                source_end = sfrom + size
                if print_table:
                    blocks.append(
                        (source_name, sfrom, source_end, source_strand,
                         target_name) + target_span + (target_strand, ))
                maps[source_name].add_interval(
                    Interval(sfrom, source_end,
                             (target_name, ) + target_span +
                             (target_strand, )))

            # Only the three-field form advances to the next block.
            if n_fields == 3:
                sfrom += size + sgap
                tfrom += size + tgap

        else:
            raise Exception("Invalid chain format. (%s)" % line)

    if print_table:
        for row in blocks:
            print('\t'.join(map(str, row)))

    return (maps, target_chromSize, source_chromSize)
Ejemplo n.º 39
0
def compare_two_transcripts(trans1, trans2, transcript_dict, 
        afe=False):
    """
    Returns the splice differences between two transcripts.
    Single exon-comparisons are ignored.

    The transcript whose first exon starts furthest upstream (trans1 on a
    tie) is loaded into an interval tree; the exons of the other one are
    then walked in position order and classified by how they overlap it.

    Parameters
    ----------
    trans1 : string 
        transcript of interest
    trans2 : string 
        second transcript of interest
    transcript_dict : a dictionary of transcript names with 
    values being a list of exons.  Each exon is indexed as
    (start, end, exon_number).  The two exon lists that are looked up are
    sorted in place by start position.
    afe : bool
       whether to include alternate start and ends
       NOTE(review): not read anywhere in this function body.

    :TODO make a better return
    :TODO maybe include something similar to to_plot
    Returns
    -------
    ([], []) when either transcript has at most one exon.  Otherwise a
    tuple (matching_exons, skipped_exons):

    matching_exons : list of (start, end, (exon_n_tree, exon_n_query), (0, 0))
        for exons with identical coordinates in both transcripts
    skipped_exons : EventCollection built from the 'skipped_exon'
        DiffEvents, whose ``events`` list is then extended with the
        alternate start/end ('AS'/'AE') DiffEvents

    NOTE(review): exclusive junctions are collected in a local list but are
    not part of the return value.
    """
    # TODO refactor this
    t1 = transcript_dict[trans1]
    t2 = transcript_dict[trans2]
    tree = Intersecter()
    starts1 = [i[0] for i in t1]
    starts2 = [i[0] for i in t2]
    reverse = False
    # s1 is the transcript that starts first (it goes into the tree),
    # s2 is the one that is compared against it.
    if min(starts1) <= min(starts2):
        s1 = t1
        s2 = t2
        s2_beg = min(starts2)
    else:
        s1 = t2 
        s2 = t1
        reverse = True
        s2_beg = min(starts1)
    # torder records the transcript names in (s1, s2) order for the events.
    if reverse: torder = (trans2, trans1)
    else: torder = (trans1, trans2)
    # Ignore single-exon stuff
    if len(s1) <= 1 or len(s2) <= 1:
        return([], [])
    # Index every exon of s1, remembering its exon number as 'anno'.
    for i in s1:
        tree.add_interval(Interval(int(i[0]), int(i[1]), 
            value={'anno':i[2]}))
    matching_exons = []
    exclusive_juncs = []
    skipped_exons = []
    altends = []
    # exon_match maps an s2 exon number to the s1 exon number it overlapped.
    exon_match = {}
    # In-place sort: this reorders the lists stored in transcript_dict.
    s1.sort(key=lambda x: x[0])
    s2.sort(key=lambda x: x[0])
    # Exon numbers of the right-most exon of each transcript.
    max_exon_1 =  s1[-1][2]
    max_exon_2 = s2[-1][2]
    #end_position_s2 = max([i[1] for i in s2])
    s1_end = max([i[1] for i in s1])
    prev_match = None
    # Exon numbers that decrease with position are taken to mean the minus
    # strand; strand is used as the step between consecutive exon numbers.
    if max_exon_1 < s1[0][2]:
        strand = -1
    else:
        strand = 1
    for pcurr in range(len(s2)):
        start, end, exon_n = s2[pcurr]
        overlap = tree.find(int(start), int(end))
        if len(overlap) == 0:
            # The s2 exon overlaps nothing in s1.
            if prev_match and (start < s1_end):
                #skipped exons
                cigar = _generate_cigar(s2, pcurr, mskip=1)
                try:
                    # A KeyError from this lookup (previous s2 exon had no
                    # recorded match) is handled below as a run of several
                    # skipped exons.
                    if exon_match[exon_n - strand] == prev_match.value['anno']:
                        try:
                            nm = tree.find(*s2[pcurr + 1][0:2])[0]
                            ocigar = [(3, nm.start - prev_match.end)]
                            nexon = nm.value['anno']
                        except IndexError:
                            # No next s2 exon, or it overlaps nothing: fall
                            # back to the s1 exon after the previous match.
                            nm=s1[_get_by_exonn(prev_match.value['anno']+strand,s1)] 
                            ocigar = [(3,nm[0] - prev_match.end)]
                            nexon = nm[2]
                    # NOTE(review): this append is outside the `if` above, so
                    # when the condition is false `ocigar`/`nexon` are either
                    # unbound or left over from an earlier iteration.
                    skipped_exons.append(DiffEvent('skipped_exon', start, end,
                            torder, cigar2=cigar, cigar1 = ocigar, 
                            exon_num = (None, exon_n), 
                            exon2=(prev_match.value['anno'], nexon))
                            )
                except KeyError:
                    # Multiple skipped exons
                    # NOTE(review): assumes skipped_exons is non-empty here.
                    ncig = _generate_cigar(s2, pcurr, mskip=1)[1:]
                    skipped_exons[-1]._extend(ncig, cig=2)
            elif start > s1_end: 
                # The s2 exon lies entirely past the end of s1: alternate end.
                if prev_match:
                    cigar = _generate_cigar(s2, pcurr, mskip=1)
                    pm = tree.find(*s2[pcurr - 1][0:2])[0]
                    pexon = pm.value['anno']
                    ocigar = []
                    # Describe the rest of s1 from the last matched exon to
                    # its final exon.
                    for i in range(pexon, max_exon_1+strand, strand):
                        narg = _get_by_exonn(i, s1)
                        ocigar.append((0, s1[narg][0], s1[narg][1]))
                        try:
                            ocigar.append((3, s1[narg][1] - s1[narg+1][0]))
                        except IndexError:
                            # Last exon of s1: no following gap to record.
                            pass
                    #:TODO extend ocigar till end?
                    altends.append(DiffEvent('AE', start, end,
                        torder, cigar2=cigar, cigar1=ocigar,
                        exon_num = (None, exon_n)))
                else: 
                    pass
            else: 
                # Alternate start site that starts in between exons
                # of other transcript
                cigar = _generate_cigar(s2, pcurr, mskip=1)[:-1]
                try:
                    nm = tree.find(*s2[pcurr + 1][0:2])[0]
                except IndexError:
                    # NOTE(review): leftover interactive debugging hook; if
                    # the shell is exited, `nm` is still unbound below.
                    from IPython import embed
                    embed()
                nexon = nm.value['anno']
                narg = _get_by_exonn(nexon - strand, s1)
                pmatch = s1[narg]
                ocigar = [(0, pmatch[1] - pmatch[0]),
                        (3, nm.start - pmatch[1])]
                altends.append(DiffEvent('AS', start, end,
                        torder, cigar2=cigar, cigar1 = ocigar))
        elif len(overlap) == 1:
            # The s2 exon overlaps exactly one s1 exon.
            if start == overlap[0].start and end == overlap[0].end:
                # Identical coordinates: a matching exon.
                s1_exon_n = overlap[0].value['anno']
                matching_exons.append((start, end, (s1_exon_n, 
                    exon_n), (0, 0)))
                if prev_match:
                    if s1_exon_n - prev_match.value['anno']  == strand: 
                        # Consecutive s1 exons matched: nothing was skipped.
                        pass
                    else:
                        # Difference in exon matches
                        # s1 exons between the two matches are absent from s2.
                        mskip = abs(s1_exon_n - prev_match.value['anno'] ) - 1
                        narg = _get_by_exonn(prev_match.value['anno']+strand, s1) 
                        s_s1 = s1[narg] # skipped s1
                        cigar = _generate_cigar(s1, narg, mskip=mskip)
                        ocigar = [(3, start - s2[pcurr-1][1])]
                        # Remove previous one
                        skipped_exons.append(
                                DiffEvent('skipped_exon', 
                                s_s1[0], s_s1[1], torder, 
                                cigar2 = ocigar, cigar1 = cigar, 
                                exon_num = (s_s1[2], None), 
                                exon2 = (exon_n-strand, exon_n)))
                prev_match = overlap[0]
            else:
                # Partial overlap: the boundaries differ.
                sstart = min(start, overlap[0].start)
                ssend = max(end, overlap[0].end)
                # Ignore 5' or 3' differences
                if (exon_n == max_exon_2 and
                        overlap[0].value['anno'] == max_exon_1):
                    if end == overlap[0].end:
                        prev_match = overlap[0]
                else:
                    exclusive_juncs.append(
                            (sstart, ssend,
                            (overlap[0].value['anno'], exon_n), 
                            (overlap[0].start - start, overlap[0].end - end) ))
            # Deal with partial matches
            # Runs for every single-overlap case, so it supersedes the
            # conditional prev_match assignments made above.
            prev_match = overlap[0]
            exon_match[exon_n] = int(overlap[0].value['anno'])
        else:
            # More than one overlapping s1 exon: not handled.
            pass
    skipped_exons = EventCollection(transcript_ids = [s1, s2], events=skipped_exons)
    skipped_exons.events.extend(altends)
    return(matching_exons, skipped_exons)
Ejemplo n.º 40
0
# Choose the output destination: stdout when piping, otherwise a file that
# is created inside options.outdir (if given) or in the current directory.
if options.pipe:  ## if -p specified, pass file along pipe without writing
    outfile = sys.stdout
# NOTE(review): `is not None` is the idiomatic spelling of this test.
elif options.outdir != None:  ## otherwise, write to output file
    outfilepath = os.path.join(os.getcwd(), options.outdir + "/" + outfilename)
    directory = os.path.dirname(outfilepath)  ## get the directory full path

    if not os.path.exists(directory):  ## check if it exists
        os.makedirs(directory)  ## if not, create the directory
    # "w+" truncates an existing file and opens it for reading and writing.
    outfile = open(outfilepath, "w+")  ## write the file to the directory
else:
    outfile = open(outfilename, "w+")  ## otherwise, write to current dir
"""
Handle data
"""
# Running count of reads (the read loop that follows increments it).
total_reads = 0
# Despite the word "list" in the note below, this is an interval tree.
start_end_list = Intersecter(
)  ## list to hold start/end of each read in infile

init_pos = 0  ## for defining region if user does not
final_pos = 0  ## for defining region if user does not

read_start = 0  ## start of each read
read_end = 0  ## end of each read
other_read_end = 0
"""
Main functionality:
"""

for read in infile:
    total_reads += 1

    ## get start position of the first read
Ejemplo n.º 41
0
def generate(x):
    """Return a random ``(start, end)`` interval; the argument is ignored.

    ``start`` is drawn from [10000, SIZE].  The length is drawn from
    [1, limit], where ``limit`` is itself drawn from [1, 10**4].
    """
    start = randint(10000, SIZE)
    limit = randint(1, 10**4)
    length = randint(1, limit)
    return (start, start + length)

def generate_point(x):
    """Return a random zero-length interval ``(p, p)``; ``x`` is ignored.

    ``p`` is drawn from [10000, SIZE].
    """
    position = randint(10000, SIZE)
    return (position, position)

# use this to force both examples to generate the same data
seed(10)

# generate N random intervals (ten thousand according to the original note).
# A list comprehension over range() replaces map()/xrange(): it works on
# Python 2 and 3 and keeps the data fully generated before the queries are
# drawn, so the seeded random sequence is consumed in the same order.
data = [generate(i) for i in range(N)]

# generate the 1000 point intervals to query with
query = [generate_point(i) for i in range(1000)]

# create the interval tree
tree = Intersecter()

# build an interval tree from the rest of the data
for start, end in data:
    tree.add_interval(Interval(start, end))

# perform the query; both members of a point interval are equal, so only
# the first one is needed (the original bound both to the same name)
for q, _same in query:
    overlap = tree.find(q, q)
    out = [(x.start, x.end) for x in overlap]
    # Single-string form prints identically under Python 2 and Python 3.
    print('(%s) -> %s' % (q, out))