def get_clusters(self):
    """Yield clusters of nearby intervals built from ``self.vals``.

    ``self.vals`` holds ``(start, end, ind, val)`` tuples.  The list is
    sorted in place by start position and then swept once: an interval
    joins the current cluster unless its start lies more than
    ``self.max_dist`` beyond the cluster's current end, in which case the
    current cluster is emitted and a new one is started at that interval.

    Yields:
        ``Interval(start, end, chrom=self.rname, strand=self.strand,
        value=d)`` objects, where ``d`` is a ``defaultdict`` mapping each
        ``ind`` to the list of ``val`` items that fell in the cluster.
        Nothing is yielded when ``self.vals`` is empty.
    """
    if not self.vals:
        return
    # sort values in order of increasing start position
    self.vals.sort(key=operator.itemgetter(0))
    # seed the first cluster with the left-most interval; the next()
    # builtin is used (rather than the .next() method) so the code runs
    # unchanged on Python 2.6+ and Python 3
    valiter = iter(self.vals)
    start, end, ind, val = next(valiter)
    ind_val_dict = collections.defaultdict(list)
    ind_val_dict[ind].append(val)
    # sweep the remaining intervals
    for next_start, next_end, ind, val in valiter:
        if next_start > (end + self.max_dist):
            # this interval is outside bounds of current cluster
            yield Interval(start,
                           end,
                           chrom=self.rname,
                           strand=self.strand,
                           value=ind_val_dict)
            ind_val_dict = collections.defaultdict(list)
            start = next_start
            # NOTE(review): 'end' is not reset here, so the new cluster
            # keeps the previous end until a larger next_end is seen.
            # Intervals with end >= start always satisfy that immediately.
        # extend the cluster's right edge and record the value
        if next_end > end:
            end = next_end
        ind_val_dict[ind].append(val)
    # emit the trailing cluster (always holds at least one value here)
    if ind_val_dict:
        yield Interval(start,
                       end,
                       chrom=self.rname,
                       strand=self.strand,
                       value=ind_val_dict)
def build_exon_trees(genes):
    """Index the exons of ``genes`` in per-chromosome interval trees.

    Each ``(start, end)`` pair in a gene's ``exons`` is inserted as an
    ``Interval`` carrying the gene's strand.

    Returns:
        A ``defaultdict`` keyed by ``g.chrom`` whose values are
        ``IntervalTree`` objects.
    """
    exon_index = collections.defaultdict(IntervalTree)
    for gene in genes:
        for exon_start, exon_end in gene.exons:
            # the tree is looked up per exon so that a gene without exons
            # never creates an empty tree for its chromosome
            exon_index[gene.chrom].insert_interval(
                Interval(exon_start, exon_end, strand=gene.strand))
    return exon_index
예제 #3
0
def build_gene_maps(samfh, genefile):
    """Build gene lookup structures keyed by SAM reference index.

    Args:
        samfh: object exposing ``references``, the ordered reference names
            of a SAM/BAM file.
        genefile: path to a gene annotation file readable by
            ``GeneFeature.parse``.

    Returns:
        ``(gene_genome_map, gene_trees)`` where

        * ``gene_genome_map`` is a list with one slot per reference; the
          slot of reference ``config.GENE_REF_PREFIX + g.tx_name`` holds the
          gene ``g`` and all other slots are ``None``;
        * ``gene_trees`` is a ``defaultdict`` mapping the reference index of
          a chromosome to an ``IntervalTree`` of transcript intervals whose
          ``value`` is the transcript name.

    Genes whose prefixed transcript name or whose chromosome is not a
    reference of ``samfh`` are skipped.
    """
    rname_tid_map = dict(
        (rname, i) for i, rname in enumerate(samfh.references))
    gene_genome_map = [None] * len(samfh.references)
    gene_trees = collections.defaultdict(lambda: IntervalTree())
    # the annotation file is closed deterministically once parsing is done
    # (the original left the handle open)
    with open(genefile) as gene_fh:
        for g in GeneFeature.parse(gene_fh):
            name = config.GENE_REF_PREFIX + g.tx_name
            if name not in rname_tid_map or g.chrom not in rname_tid_map:
                continue
            # reference indexes of the transcript and of its chromosome
            gene_tid = rname_tid_map[name]
            chrom_tid = rname_tid_map[g.chrom]
            # store gene by reference id in sam file
            gene_genome_map[gene_tid] = g
            # add gene to the interval tree of its chromosome
            gene_interval = Interval(g.tx_start,
                                     g.tx_end,
                                     chrom=g.chrom,
                                     strand=g.strand,
                                     value=g.tx_name)
            gene_trees[chrom_tid].insert_interval(gene_interval)
    return gene_genome_map, gene_trees
def find_split_read_clusters(partition_splits, max_indel_size, max_multihits):
    """Cluster the reads of each split by reference and strand.

    Args:
        partition_splits: sequence whose i-th element is the iterable of
            reads belonging to split index ``i``.
        max_indel_size: passed to ``RefCluster`` as its distance parameter.
        max_multihits: passed to ``get_mapping_code`` for every read.

    Returns:
        ``(split_mapping_codes, concordant_clusters)``.
        ``split_mapping_codes`` is a list holding, per split, the set of
        mapping codes seen among its reads.  ``concordant_clusters`` is a
        list of tuples of ``ReadCluster`` objects; when no split contains
        the ``MAPPING`` code it is a single 1-tuple with a placeholder
        "unmapped" cluster.
    """
    clusters_by_ref = {}
    codes_per_split = []
    unmapped_read_dict = {}
    for split_ind, split_reads in enumerate(partition_splits):
        seen_codes = set()
        for read in split_reads:
            # keep track of mapping results for reads in this split
            seen_codes.add(get_mapping_code(read, max_multihits))
            if read.is_unmapped:
                unmapped_read_dict.setdefault(split_ind, []).append(read)
                continue
            # mapped reads are grouped by (reference, strand)
            strand = int(read.is_reverse)
            ref_key = (read.rname, strand)
            ref_cluster = clusters_by_ref.get(ref_key)
            if ref_cluster is None:
                ref_cluster = RefCluster(read.rname, strand, max_indel_size)
                clusters_by_ref[ref_key] = ref_cluster
            ref_cluster.add(read.pos, read.aend, split_ind, read)
        codes_per_split.append(seen_codes)
    if not any(MAPPING in split_codes for split_codes in codes_per_split):
        # nothing mapped: hand back one placeholder cluster spanning every
        # split, wrapped in a singleton tuple to match the normal shape
        placeholder = ReadCluster(
            Interval(0, 0, chrom=-1, strand=0, value={}),
            start_ind=0,
            end_ind=len(partition_splits),
            mapped_inds=[],
            unmapped_inds=set(unmapped_read_dict),
            unmapped_read_dict=unmapped_read_dict)
        return codes_per_split, [(placeholder, )]
    # search reference maps by index to find split points
    cluster_intervals, concordant_clust_ids = find_split_points(
        clusters_by_ref, len(partition_splits))
    # turn each group of cluster ids into a tuple of ReadCluster objects
    concordant_clusters = []
    for start_ind, end_ind, mapped_inds, clust_ids in concordant_clust_ids:
        # split indexes inside [start_ind, end_ind) that have no mapping
        unmapped_inds = set(xrange(start_ind, end_ind)).difference(mapped_inds)
        # every ReadCluster carries the full start/end range together with
        # the mapped and unmapped index sets
        concordant_clusters.append(tuple(
            ReadCluster(cluster_intervals[clust_id], start_ind, end_ind,
                        mapped_inds, unmapped_inds, unmapped_read_dict)
            for clust_id in clust_ids))
    return codes_per_split, concordant_clusters
예제 #5
0
def build_genome_tx_trees(genefile):
    """Build per-chromosome interval trees of transcripts.

    Args:
        genefile: path to a gene annotation file readable by
            ``GeneFeature.parse``.

    Returns:
        A ``defaultdict`` mapping ``g.chrom`` to an ``IntervalTree`` that
        holds one ``Interval(g.tx_start, g.tx_end)`` per parsed gene, with
        the gene's strand and the gene object itself as ``value``.
    """
    genome_tx_trees = collections.defaultdict(lambda: IntervalTree())
    # close the annotation file as soon as parsing finishes (the original
    # never closed the handle)
    with open(genefile) as gene_fh:
        for g in GeneFeature.parse(gene_fh):
            interval = Interval(g.tx_start, g.tx_end, strand=g.strand, value=g)
            genome_tx_trees[g.chrom].insert_interval(interval)
    return genome_tx_trees
예제 #6
0
def build_genome_transcript_trees(transcripts):
    """Index transcripts by id and their exons by genomic position.

    Returns:
        ``(transcript_dict, genome_tx_trees)``: a dict mapping ``tx_id`` to
        the transcript object, and a ``defaultdict`` mapping chromosome to
        an ``IntervalTree`` with one interval per exon whose ``value`` is
        the owning transcript's ``tx_id``.
    """
    tx_by_id = {}
    exon_trees = collections.defaultdict(IntervalTree)
    for tx in transcripts:
        tx_by_id[tx.tx_id] = tx
        for exon_start, exon_end in tx.exons:
            exon = Interval(exon_start, exon_end,
                            strand=tx.strand, value=tx.tx_id)
            exon_trees[tx.chrom].insert_interval(exon)
    return tx_by_id, exon_trees
예제 #7
0
def build_gene_interval_trees(genefile):
    """Build per-chromosome interval trees of genes, merging isoforms.

    Genes that share the same ``(chrom, tx_start, tx_end)`` are stored under
    a single tree interval whose ``value`` is the list of those genes in
    file order.  The interval's strand is that of the first gene seen with
    that key.

    Args:
        genefile: path to a gene annotation file readable by
            ``GeneFeature.parse``.

    Returns:
        A ``defaultdict`` mapping chromosome to ``IntervalTree``.
    """
    trees = collections.defaultdict(lambda: IntervalTree())
    intervals = {}
    # the file handle is released when parsing ends (it was previously
    # left open)
    with open(genefile) as gene_fh:
        for g in GeneFeature.parse(gene_fh):
            k = (g.chrom, g.tx_start, g.tx_end)
            txlist = intervals.get(k)
            if txlist is None:
                # first isoform with these coordinates: create the shared
                # list and register it in the tree
                txlist = intervals[k] = []
                interval = Interval(g.tx_start, g.tx_end,
                                    strand=g.strand, value=txlist)
                trees[g.chrom].insert_interval(interval)
            # add isoform to value (list of isoforms that share start/end)
            txlist.append(g)
    return trees
예제 #8
0
def build_exon_interval_trees(genefile):
    """Build per-chromosome interval trees of exons, merging duplicates.

    Exons that share the same ``(chrom, start, end)`` are stored under a
    single tree interval whose ``value`` is a list of ``(gene, exon_index)``
    tuples, one per transcript containing that exon.  The interval's strand
    is that of the first gene seen with those coordinates.

    Args:
        genefile: path to a gene annotation file readable by
            ``GeneFeature.parse``.

    Returns:
        A ``defaultdict`` mapping chromosome to ``IntervalTree``.
    """
    exon_trees = collections.defaultdict(lambda: IntervalTree())
    exon_intervals = {}
    # the file handle is released when parsing ends (it was previously
    # left open)
    with open(genefile) as gene_fh:
        for g in GeneFeature.parse(gene_fh):
            for i, e in enumerate(g.exons):
                k = (g.chrom, e[0], e[1])
                txlist = exon_intervals.get(k)
                if txlist is None:
                    # first time these exon coordinates are seen: create
                    # the shared list and register it in the tree
                    txlist = exon_intervals[k] = []
                    interval = Interval(e[0], e[1],
                                        strand=g.strand, value=txlist)
                    exon_trees[g.chrom].insert_interval(interval)
                # record the transcript isoform and the exon's position in it
                txlist.append((g, i))
    return exon_trees
def build_exon_trees(samfh, genefile):
    """Build exon interval trees keyed by SAM reference index.

    Args:
        samfh: object exposing ``references``, the ordered reference names
            of a SAM/BAM file.
        genefile: path to a gene annotation file readable by
            ``GeneFeature.parse``.

    Returns:
        A plain ``dict`` mapping the reference index of a chromosome to an
        ``IntervalTree`` of exon intervals.  Each interval carries the
        chromosome's reference index as ``chrom``, the gene strand, and the
        reference index of ``config.GENE_REF_PREFIX + g.tx_name`` as
        ``value``.

    Genes whose prefixed transcript name or whose chromosome is not a
    reference of ``samfh`` are skipped.
    """
    rname_tid_map = dict(
        (rname, i) for i, rname in enumerate(samfh.references))
    exon_trees = collections.defaultdict(lambda: IntervalTree())
    # close the annotation file once parsing is done (the original leaked
    # the handle)
    with open(genefile) as gene_fh:
        for g in GeneFeature.parse(gene_fh):
            name = config.GENE_REF_PREFIX + g.tx_name
            if name not in rname_tid_map or g.chrom not in rname_tid_map:
                continue
            # reference indexes of the transcript and of its chromosome
            gene_tid = rname_tid_map[name]
            chrom_tid = rname_tid_map[g.chrom]
            # NOTE(review): the slice [1::-1] yields only exons[1] and
            # exons[0] (in that order), i.e. at most the first two exons.
            # If every exon was meant to be indexed this should probably be
            # g.exons or g.exons[::-1] -- confirm intent before changing.
            for start, end in g.exons[1::-1]:
                exon_interval = Interval(start,
                                         end,
                                         chrom=chrom_tid,
                                         strand=g.strand,
                                         value=gene_tid)
                exon_trees[chrom_tid].insert_interval(exon_interval)
    return dict(exon_trees)
예제 #10
0
def get_chimera_type(fiveprime_gene, threeprime_gene, gene_trees):
    """Classify a chimera by the genomic arrangement of its two genes.

    Args:
        fiveprime_gene: 5' partner; its ``chrom``, ``tx_start``, ``tx_end``
            and ``strand`` attributes are read.
        threeprime_gene: 3' partner; the same attributes are read.
        gene_trees: mapping from chromosome to an interval tree.  The tree
            of the shared chromosome is queried with ``find(start, end)``
            and each hit's ``start``, ``end`` and ``strand`` are inspected.

    Returns:
        A ``(chimera_type, distance)`` tuple.  ``distance`` is ``None`` for
        genes on different chromosomes, ``0`` for overlapping genes, and
        otherwise the non-negative size of the gap between the two genes.
        ``CHIMERA_UNKNOWN`` is returned when adjacent opposite-strand genes
        match neither the converging nor the diverging pattern (for example
        when the strand is neither "+" nor "-").
    """
    # get gene information
    chrom1, start5p, end5p, strand1 = (fiveprime_gene.chrom,
                                       fiveprime_gene.tx_start,
                                       fiveprime_gene.tx_end,
                                       fiveprime_gene.strand)
    chrom2, start3p, end3p, strand2 = (threeprime_gene.chrom,
                                       threeprime_gene.tx_start,
                                       threeprime_gene.tx_end,
                                       threeprime_gene.strand)
    # interchromosomal
    if chrom1 != chrom2:
        return CHIMERA_INTERCHROMOSOMAL, None
    # orientation
    same_strand = strand1 == strand2
    # 5' gene transcribed toward the 3' gene's start position
    toward_3p = ((start5p <= start3p and strand1 == "+") or
                 (start5p > start3p and strand1 == "-"))
    # 5' gene transcribed away from the 3' gene's start position
    away_from_3p = ((start5p >= start3p and strand1 == "+") or
                    (start5p < start3p and strand1 == "-"))
    # genes on same chromosome so check overlap
    is_overlapping = (start5p < end3p) and (start3p < end5p)
    if is_overlapping:
        if not same_strand:
            if toward_3p:
                return (CHIMERA_OVERLAP_CONVERGE, 0)
            return (CHIMERA_OVERLAP_DIVERGE, 0)
        if ((start5p <= start3p and strand1 == "+") or
                (end5p >= end3p and strand1 == "-")):
            return (CHIMERA_OVERLAP_SAME, 0)
        return (CHIMERA_OVERLAP_COMPLEX, 0)
    # if code gets here then the genes are on the same chromosome but do not
    # overlap.  first calculate distance (minimum distance between genes)
    if start5p <= start3p:
        distance = start3p - end5p
        between_interval = Interval(end5p, start3p)
    else:
        # the 3' gene lies upstream of the 5' gene, so the gap runs from
        # end3p to start5p.  The previous expression (end3p - start5p)
        # produced a non-positive value here.
        distance = start5p - end3p
        between_interval = Interval(end3p, start5p)
    # check whether there are genes intervening between the
    # chimera candidates (only genes strictly inside the gap count)
    genes_between = []
    genes_between_same_strand = []
    for hit in gene_trees[chrom1].find(between_interval.start,
                                       between_interval.end):
        if (hit.start > between_interval.start and
                hit.end < between_interval.end):
            if hit.strand == strand1:
                genes_between_same_strand.append(hit)
            genes_between.append(hit)

    if same_strand:
        if len(genes_between_same_strand) == 0:
            return CHIMERA_READTHROUGH, distance
        return CHIMERA_INTRACHROMOSOMAL, distance
    if len(genes_between) > 0:
        return CHIMERA_INTRA_COMPLEX, distance
    # adjacent genes on opposite strands.  The original chain had two more
    # arms returning CHIMERA_ADJ_SAME / CHIMERA_ADJ_COMPLEX, but their
    # conditions duplicated the two below and could never be reached.
    if toward_3p:
        return (CHIMERA_ADJ_CONVERGE, distance)
    if away_from_3p:
        return (CHIMERA_ADJ_DIVERGE, distance)
    return CHIMERA_UNKNOWN, distance
예제 #11
0
def filter_homologous_genes(input_file,
                            output_file,
                            index_dir,
                            homolog_segment_length,
                            min_isize,
                            max_isize,
                            maxhits,
                            num_processors,
                            tmp_dir):
    """Remove chimeras whose 5' sequence also aligns to their 3' partner.

    For every chimera in ``input_file`` the 5' region returned by
    ``get_mapped_read_intervals`` is cut into windows of
    ``homolog_segment_length`` bases, the windows are aligned to the
    transcriptome index with bowtie2, and a chimera is dropped when any of
    its windows aligns inside the chimera's own 3' interval.

    Args:
        input_file: path of the chimera file to filter.
        output_file: path the surviving chimeras are written to, one
            tab-separated ``to_list()`` record per line.
        index_dir: directory holding the transcriptome FASTA and bowtie2
            index named ``config.TRANSCRIPTOME_INDEX``.
        homolog_segment_length: window length in bases.
        min_isize, max_isize: forwarded to ``get_mapped_read_intervals``.
        maxhits: value for bowtie2's ``-k`` option.
        num_processors: value for bowtie2's ``-p`` option.
        tmp_dir: directory for the intermediate FASTA and SAM files.

    Returns:
        ``config.JOB_ERROR`` when bowtie2 exits with a non-zero status
        (the intermediate files are left in place in that case), otherwise
        ``config.JOB_SUCCESS``.
    """
    logging.debug("Parameters")
    logging.debug("\thomolog segment length: %d" % (homolog_segment_length))
    logging.debug("\tmin fragment size: %d" % (min_isize))
    logging.debug("\tmax fragment size: %d" % (max_isize))
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.TRANSCRIPTOME_INDEX + ".fa")
    ref_fa = pysam.Fastafile(ref_fasta_file)
    interval_trees_3p = collections.defaultdict(lambda: IntervalTree())
    # generate FASTA file of sequences to use in mapping
    logging.debug("Generating homologous sequences to test")
    fasta5p = os.path.join(tmp_dir, "homologous_5p.fa")
    try:
        # nested 'with' blocks guarantee both handles are closed even when
        # parsing or sequence extraction raises
        with open(fasta5p, "w") as f:
            with open(input_file) as chimera_fh:
                for c in Chimera.parse(chimera_fh):
                    start5p, end5p, start3p, end3p = get_mapped_read_intervals(
                        c, min_isize, max_isize, homolog_segment_length)
                    # add 3' gene to interval trees
                    interval_trees_3p[c.tx_name_3p].insert_interval(
                        Interval(start3p, end3p, value=c.name))
                    # extract sequence of 5' gene
                    seq5p = ref_fa.fetch(c.tx_name_5p, start5p, end5p)
                    # NOTE(review): this range stops one short of the last
                    # full window (start len(seq5p) - homolog_segment_length
                    # is never emitted) -- confirm whether that is intended.
                    for i in xrange(0, len(seq5p) - homolog_segment_length):
                        f.write(">%s,%s:%d-%d\n%s\n" %
                                (c.name, c.tx_name_5p,
                                 start5p + i,
                                 start5p + i + homolog_segment_length,
                                 seq5p[i:i + homolog_segment_length]))
    finally:
        ref_fa.close()
    # map 5' sequences to reference using bowtie
    logging.debug("Mapping homologous sequences")
    bowtie2_index = os.path.join(index_dir, config.TRANSCRIPTOME_INDEX)
    sam5p = os.path.join(tmp_dir, "homologous_5p.sam")
    args = [config.BOWTIE2_BIN,
            '-p', num_processors, '--phred33',
            '--end-to-end', '--very-sensitive', '--reorder',
            '-f', '-k', maxhits,
            '-x', bowtie2_index,
            '-U', fasta5p,
            "-S", sam5p]
    retcode = subprocess.call([str(a) for a in args])
    if retcode != 0:
        return config.JOB_ERROR
    # analyze results for homologous genes
    logging.debug("Analyzing mapping results")
    homologous_chimeras = set()
    # a single handle serves both the header lookup and the read iteration
    # (the original opened the SAM file twice and closed neither handle)
    samfh = pysam.Samfile(sam5p, "r")
    try:
        tid_rname_map = dict(enumerate(samfh.references))
        for r in samfh:
            if r.is_unmapped:
                continue
            # reference name must be in list of 3' chimeras; membership is
            # tested first so the defaultdict does not grow on lookups
            rname = tid_rname_map[r.tid]
            if rname not in interval_trees_3p:
                continue
            # the chimera name is the part of 'qname' before the first comma
            chimera_name = r.qname.split(",")[0]
            for hit in interval_trees_3p[rname].find(r.pos, r.aend):
                if hit.value == chimera_name:
                    homologous_chimeras.add(chimera_name)
    finally:
        samfh.close()
    # write output
    logging.debug("Writing output")
    with open(output_file, "w") as f:
        with open(input_file) as chimera_fh:
            for c in Chimera.parse(chimera_fh):
                if c.name in homologous_chimeras:
                    logging.debug("Removing homologous chimera %s between %s and %s" %
                                  (c.name, c.gene_name_5p, c.gene_name_3p))
                    continue
                f.write('\t'.join(map(str, c.to_list())) + "\n")
    # cleanup
    if os.path.exists(fasta5p):
        os.remove(fasta5p)
    if os.path.exists(sam5p):
        os.remove(sam5p)
    return config.JOB_SUCCESS