def get_clusters(self):
    """Yield clusters of nearby intervals stored in ``self.vals``.

    Each entry of ``self.vals`` is a ``(start, end, ind, val)`` tuple.  The
    entries are sorted in place by start position and merged greedily: an
    entry is added to the current cluster unless its start lies more than
    ``self.max_dist`` beyond the cluster's current end, in which case the
    current cluster is emitted and a new one is started.

    Yields:
        One ``Interval`` per cluster, built from the cluster's start/end,
        ``chrom=self.rname`` and ``strand=self.strand``.  Its ``value`` is a
        ``defaultdict(list)`` mapping each ``ind`` to the list of ``val``
        objects collected in that cluster.  Nothing is yielded when
        ``self.vals`` is empty.
    """
    if not self.vals:
        return
    # sort values in order of increasing start position (mutates self.vals)
    self.vals.sort(key=operator.itemgetter(0))
    # seed the first cluster from the left-most entry
    valiter = iter(self.vals)
    start, end, ind, val = next(valiter)
    ind_val_dict = collections.defaultdict(list)
    ind_val_dict[ind].append(val)
    # find clusters
    for next_start, next_end, ind, val in valiter:
        if next_start > (end + self.max_dist):
            # this interval is outside bounds of current cluster
            yield Interval(start, end, chrom=self.rname,
                           strand=self.strand, value=ind_val_dict)
            ind_val_dict = collections.defaultdict(list)
            # restart BOTH bounds; keeping the previous cluster's end would
            # report a stale end whenever next_end <= end (for example with
            # a negative max_dist)
            start, end = next_start, next_end
        elif next_end > end:
            # extend the current cluster
            end = next_end
        ind_val_dict[ind].append(val)
    # every pass through the loop (and the seed) appends a value, so the
    # trailing cluster is never empty
    yield Interval(start, end, chrom=self.rname,
                   strand=self.strand, value=ind_val_dict)
def build_exon_trees(genes):
    """Index the exons of *genes* in per-chromosome interval trees.

    Args:
        genes: iterable of objects exposing ``chrom``, ``strand`` and
            ``exons`` (an iterable of ``(start, end)`` pairs).

    Returns:
        A ``defaultdict`` keyed by ``chrom`` whose values are
        ``IntervalTree`` objects holding one strand-tagged ``Interval`` per
        exon.  A chromosome only gets an entry once an exon is inserted.
    """
    exon_trees = collections.defaultdict(IntervalTree)
    for gene in genes:
        for exon_start, exon_end in gene.exons:
            exon = Interval(exon_start, exon_end, strand=gene.strand)
            exon_trees[gene.chrom].insert_interval(exon)
    return exon_trees
def build_gene_maps(samfh, genefile):
    """Build lookup structures linking transcript references to the genome.

    Args:
        samfh: object exposing ``references``, the ordered reference names;
            a name's position in that sequence is used as its reference id.
        genefile: path of the annotation file fed to ``GeneFeature.parse``.

    Returns:
        A ``(gene_genome_map, gene_trees)`` tuple:

        * ``gene_genome_map`` is a list with one slot per reference; the
          slot at a transcript reference's id holds its gene feature and
          all other slots stay ``None``.
        * ``gene_trees`` is a ``defaultdict`` keyed by the chromosome's
          reference id, holding an ``IntervalTree`` of
          ``Interval(tx_start, tx_end)`` objects whose ``value`` is the
          transcript name.

    Genes whose prefixed transcript name or chromosome is not among the
    references are skipped.
    """
    rname_tid_map = dict((rname, tid)
                         for tid, rname in enumerate(samfh.references))
    gene_genome_map = [None] * len(samfh.references)
    gene_trees = collections.defaultdict(IntervalTree)
    # build gene and genome data structures for fast lookup; the context
    # manager closes the annotation file, which used to be leaked
    with open(genefile) as genefh:
        for g in GeneFeature.parse(genefh):
            name = config.GENE_REF_PREFIX + g.tx_name
            if name not in rname_tid_map or g.chrom not in rname_tid_map:
                continue
            # get reference indexes in sam file
            gene_tid = rname_tid_map[name]
            chrom_tid = rname_tid_map[g.chrom]
            # store gene by reference id in sam file
            gene_genome_map[gene_tid] = g
            # add gene to interval tree
            gene_interval = Interval(g.tx_start, g.tx_end, chrom=g.chrom,
                                     strand=g.strand, value=g.tx_name)
            gene_trees[chrom_tid].insert_interval(gene_interval)
    return gene_genome_map, gene_trees
def find_split_read_clusters(partition_splits, max_indel_size, max_multihits):
    """Group the reads of each split into clusters of concordant mappings.

    Args:
        partition_splits: sequence whose element ``i`` is the iterable of
            reads belonging to split index ``i``.
        max_indel_size: passed to ``RefCluster`` as its clustering distance.
        max_multihits: passed to ``get_mapping_code`` for every read.

    Returns:
        A ``(split_mapping_codes, concordant_clusters)`` tuple.
        ``split_mapping_codes`` holds one set of mapping codes per split.
        ``concordant_clusters`` is a list of tuples of ``ReadCluster``
        objects, each tuple being a set of reads that cluster together.
    """
    ref_clusters = {}
    split_mapping_codes = []
    unmapped_by_split = collections.defaultdict(list)
    for split_ind, split_reads in enumerate(partition_splits):
        codes = set()
        for read in split_reads:
            # keep track of mapping results for reads in this split
            codes.add(get_mapping_code(read, max_multihits))
            if read.is_unmapped:
                unmapped_by_split[split_ind].append(read)
                continue
            # cluster mapped reads by reference name and strand
            strand = int(read.is_reverse)
            key = (read.rname, strand)
            if key not in ref_clusters:
                ref_clusters[key] = RefCluster(read.rname, strand,
                                               max_indel_size)
            ref_clusters[key].add(read.pos, read.aend, split_ind, read)
        split_mapping_codes.append(codes)
    # convert unmapped reads index dict to a static dict
    unmapped_read_dict = dict(unmapped_by_split)
    any_mapped = any(MAPPING in codes for codes in split_mapping_codes)
    if not any_mapped:
        # no mapped reads at all: create a dummy "unmapped" ReadCluster and
        # return it as a singleton tuple, matching the normal result shape
        dummy_interval = Interval(0, 0, chrom=-1, strand=0, value={})
        dummy_cluster = ReadCluster(dummy_interval,
                                    start_ind=0,
                                    end_ind=len(partition_splits),
                                    mapped_inds=[],
                                    unmapped_inds=set(unmapped_read_dict),
                                    unmapped_read_dict=unmapped_read_dict)
        return split_mapping_codes, [(dummy_cluster, )]
    # search reference maps by index to find split points
    cluster_intervals, concordant_clust_ids = find_split_points(
        ref_clusters, len(partition_splits))
    # convert from cluster ids to read cluster intervals
    concordant_clusters = []
    for start_ind, end_ind, mapped_inds, clust_ids in concordant_clust_ids:
        # indexes in [start_ind, end_ind) that have no mapped read
        unmapped_inds = set(xrange(start_ind, end_ind)).difference(mapped_inds)
        # build ReadCluster objects carrying the full start/end range and
        # which indexes are mapping/nonmapping, so that clusters become
        # contiguous runs of split indexes with padding at the start/end
        group = tuple(ReadCluster(cluster_intervals[clust_id],
                                  start_ind, end_ind,
                                  mapped_inds, unmapped_inds,
                                  unmapped_read_dict)
                      for clust_id in clust_ids)
        concordant_clusters.append(group)
    return split_mapping_codes, concordant_clusters
def build_genome_tx_trees(genefile):
    """Build per-chromosome interval trees of whole transcripts.

    Args:
        genefile: path of the annotation file fed to ``GeneFeature.parse``.

    Returns:
        A ``defaultdict`` keyed by ``g.chrom``; each value is an
        ``IntervalTree`` holding one ``Interval(tx_start, tx_end)`` per
        parsed gene, tagged with the gene's strand and carrying the gene
        object itself as ``value``.
    """
    genome_tx_trees = collections.defaultdict(IntervalTree)
    # the context manager closes the annotation file, which used to be leaked
    with open(genefile) as genefh:
        for g in GeneFeature.parse(genefh):
            # add gene to interval tree
            interval = Interval(g.tx_start, g.tx_end, strand=g.strand,
                                value=g)
            genome_tx_trees[g.chrom].insert_interval(interval)
    return genome_tx_trees
def build_genome_transcript_trees(transcripts):
    """Index transcripts by id and their exons by genomic position.

    Args:
        transcripts: iterable of objects exposing ``tx_id``, ``chrom``,
            ``strand`` and ``exons`` (an iterable of ``(start, end)`` pairs).

    Returns:
        A ``(transcript_dict, genome_tx_trees)`` tuple: a dict mapping
        ``tx_id`` to its transcript, and a ``defaultdict`` keyed by
        ``chrom`` of ``IntervalTree`` objects holding one ``Interval`` per
        exon whose ``value`` is the owning ``tx_id``.
    """
    transcript_dict = {}
    genome_tx_trees = collections.defaultdict(IntervalTree)
    for tx in transcripts:
        # register the transcript under its id
        transcript_dict[tx.tx_id] = tx
        # add each exon to the tree of the transcript's chromosome
        for exon_start, exon_end in tx.exons:
            exon = Interval(exon_start, exon_end, strand=tx.strand,
                            value=tx.tx_id)
            genome_tx_trees[tx.chrom].insert_interval(exon)
    return transcript_dict, genome_tx_trees
def build_gene_interval_trees(genefile):
    """Build per-chromosome trees of distinct transcript spans.

    Transcripts sharing the same ``(chrom, tx_start, tx_end)`` are collapsed
    into a single ``Interval`` whose ``value`` is the list of all such
    transcripts.  The interval's strand is taken from the first transcript
    seen for that span (strand is not part of the grouping key).

    Args:
        genefile: path of the annotation file fed to ``GeneFeature.parse``.

    Returns:
        A ``defaultdict`` keyed by ``g.chrom`` of ``IntervalTree`` objects.
    """
    trees = collections.defaultdict(IntervalTree)
    # span key -> isoform list stored as the value of the inserted Interval
    intervals = {}
    # the context manager closes the annotation file, which used to be leaked
    with open(genefile) as genefh:
        for g in GeneFeature.parse(genefh):
            k = (g.chrom, g.tx_start, g.tx_end)
            txlist = intervals.get(k)
            if txlist is None:
                # first transcript with this span: add it to the tree
                txlist = intervals[k] = []
                interval = Interval(g.tx_start, g.tx_end, strand=g.strand,
                                    value=txlist)
                trees[g.chrom].insert_interval(interval)
            # add isoform to value (list of isoforms that share start/end)
            txlist.append(g)
    return trees
def build_exon_interval_trees(genefile):
    """Build per-chromosome trees of distinct exons.

    Exons sharing the same ``(chrom, start, end)`` are collapsed into one
    ``Interval`` whose ``value`` is a list of ``(gene, exon_index)`` pairs,
    one per transcript containing that exon.  The interval's strand is taken
    from the first transcript seen for that exon (strand is not part of the
    grouping key).

    Args:
        genefile: path of the annotation file fed to ``GeneFeature.parse``.

    Returns:
        A ``defaultdict`` keyed by ``g.chrom`` of ``IntervalTree`` objects.
    """
    exon_trees = collections.defaultdict(IntervalTree)
    # exon key -> isoform list stored as the value of the inserted Interval
    exon_intervals = {}
    # the context manager closes the annotation file, which used to be leaked
    with open(genefile) as genefh:
        for g in GeneFeature.parse(genefh):
            for i, e in enumerate(g.exons):
                k = (g.chrom, e[0], e[1])
                txlist = exon_intervals.get(k)
                if txlist is None:
                    # first time this exon is seen: add it to the tree
                    txlist = exon_intervals[k] = []
                    interval = Interval(e[0], e[1], strand=g.strand,
                                        value=txlist)
                    exon_trees[g.chrom].insert_interval(interval)
                # add transcript isoform together with the exon's position
                txlist.append((g, i))
    return exon_trees
def build_exon_trees(samfh, genefile):
    """Build exon interval trees keyed by chromosome reference id.

    Args:
        samfh: object exposing ``references``, the ordered reference names;
            a name's position in that sequence is used as its reference id.
        genefile: path of the annotation file fed to ``GeneFeature.parse``.

    Returns:
        A plain ``dict`` mapping a chromosome's reference id to an
        ``IntervalTree`` of exon ``Interval`` objects whose ``value`` is the
        reference id of the owning transcript.

    Genes whose prefixed transcript name or chromosome is not among the
    references are skipped.
    """
    rname_tid_map = dict((rname, tid)
                         for tid, rname in enumerate(samfh.references))
    exon_trees = collections.defaultdict(IntervalTree)
    # the context manager closes the annotation file, which used to be leaked
    with open(genefile) as genefh:
        for g in GeneFeature.parse(genefh):
            name = config.GENE_REF_PREFIX + g.tx_name
            if name not in rname_tid_map or g.chrom not in rname_tid_map:
                continue
            # get reference indexes in sam file
            gene_tid = rname_tid_map[name]
            chrom_tid = rname_tid_map[g.chrom]
            # NOTE(review): the slice [1::-1] selects at most the first two
            # exons (index 1, then index 0), so later exons never reach the
            # tree.  Kept unchanged because the intent is not visible here;
            # confirm whether all exons (or e.g. [1:-1]) was meant.
            for start, end in g.exons[1::-1]:
                exon_interval = Interval(start, end, chrom=chrom_tid,
                                         strand=g.strand, value=gene_tid)
                exon_trees[chrom_tid].insert_interval(exon_interval)
    return dict(exon_trees)
def get_chimera_type(fiveprime_gene, threeprime_gene, gene_trees):
    """Classify a chimera by the genomic arrangement of its partner genes.

    Args:
        fiveprime_gene: 5' partner; must expose ``chrom``, ``tx_start``,
            ``tx_end`` and ``strand``.
        threeprime_gene: 3' partner with the same attributes.
        gene_trees: mapping from chromosome to an object whose
            ``find(start, end)`` returns hits exposing ``start``, ``end``
            and ``strand``.  Only consulted for non-overlapping genes on the
            same chromosome.

    Returns:
        A ``(chimera_type, distance)`` tuple.  ``distance`` is ``None`` for
        interchromosomal pairs, ``0`` for overlapping genes, and otherwise
        the gap computed between the two genes.
    """
    # get gene information
    chrom1, strand1 = fiveprime_gene.chrom, fiveprime_gene.strand
    start5p, end5p = fiveprime_gene.tx_start, fiveprime_gene.tx_end
    chrom2, strand2 = threeprime_gene.chrom, threeprime_gene.strand
    start3p, end3p = threeprime_gene.tx_start, threeprime_gene.tx_end
    # interchromosomal
    if chrom1 != chrom2:
        return CHIMERA_INTERCHROMOSOMAL, None
    # orientation
    same_strand = strand1 == strand2
    # 5' gene starts first on "+" or starts last on "-"
    fivep_first = ((start5p <= start3p and strand1 == "+") or
                   (start5p > start3p and strand1 == "-"))
    # genes on same chromosome so check overlap
    if (start5p < end3p) and (start3p < end5p):
        if not same_strand:
            if fivep_first:
                return (CHIMERA_OVERLAP_CONVERGE, 0)
            return (CHIMERA_OVERLAP_DIVERGE, 0)
        # same strand: note the "-" case compares gene ends, not starts
        if ((start5p <= start3p and strand1 == "+") or
                (end5p >= end3p and strand1 == "-")):
            return (CHIMERA_OVERLAP_SAME, 0)
        return (CHIMERA_OVERLAP_COMPLEX, 0)
    # if code gets here then the genes are on the same chromosome but do not
    # overlap.  first calculate distance (minimum distance between genes)
    if start5p <= start3p:
        distance = start3p - end5p
        between_interval = Interval(end5p, start3p)
    else:
        # NOTE(review): for well-formed, non-overlapping genes this value is
        # zero or negative (end3p <= start5p here).  Kept as-is because
        # callers may rely on the sign; confirm whether start5p - end3p was
        # intended.
        distance = end3p - start5p
        between_interval = Interval(end3p, start5p)
    # check whether there are genes intervening between the chimera
    # candidates; only hits strictly inside the gap are counted
    genes_between = []
    genes_between_same_strand = []
    for hit in gene_trees[chrom1].find(between_interval.start,
                                       between_interval.end):
        if (hit.start > between_interval.start and
                hit.end < between_interval.end):
            if hit.strand == strand1:
                genes_between_same_strand.append(hit)
            genes_between.append(hit)
    if same_strand:
        if not genes_between_same_strand:
            return CHIMERA_READTHROUGH, distance
        return CHIMERA_INTRACHROMOSOMAL, distance
    if genes_between:
        return CHIMERA_INTRA_COMPLEX, distance
    # opposite strands with nothing in between: adjacent genes.  The former
    # ADJ_SAME / ADJ_COMPLEX branches repeated the two conditions below and
    # could never be reached, so they were removed.
    if fivep_first:
        return (CHIMERA_ADJ_CONVERGE, distance)
    if ((start5p >= start3p and strand1 == "+") or
            (start5p < start3p and strand1 == "-")):
        return (CHIMERA_ADJ_DIVERGE, distance)
    # reached when strand1 is neither "+" nor "-", or when strand1 is "-"
    # and both genes start at the same position
    return CHIMERA_UNKNOWN, distance
def filter_homologous_genes(input_file, output_file, index_dir,
                            homolog_segment_length, min_isize, max_isize,
                            maxhits, num_processors, tmp_dir):
    """Drop chimeras whose 5' partner sequence also maps to the 3' partner.

    For every chimera in ``input_file`` the 5' transcript region is cut into
    overlapping windows of ``homolog_segment_length`` bases, which are
    aligned with bowtie2 against the transcriptome index in ``index_dir``.
    A chimera is treated as homologous, and left out of ``output_file``,
    when one of its windows aligns to its own 3' transcript inside the 3'
    region of interest.

    Args:
        input_file: path of the chimera file read by ``Chimera.parse``.
        output_file: path that receives the surviving chimeras, one
            tab-separated record per line.
        index_dir: directory holding the transcriptome FASTA and index.
        homolog_segment_length: window length in bases.
        min_isize, max_isize: forwarded to ``get_mapped_read_intervals``.
        maxhits: value for bowtie2's ``-k`` option.
        num_processors: value for bowtie2's ``-p`` option.
        tmp_dir: directory for the temporary FASTA and SAM files.

    Returns:
        ``config.JOB_ERROR`` if bowtie2 exits non-zero (temporary files are
        left in place in that case), otherwise ``config.JOB_SUCCESS``.
    """
    logging.debug("Parameters")
    logging.debug("\thomolog segment length: %d", homolog_segment_length)
    logging.debug("\tmin fragment size: %d", min_isize)
    logging.debug("\tmax fragment size: %d", max_isize)
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir,
                                  config.TRANSCRIPTOME_INDEX + ".fa")
    ref_fa = pysam.Fastafile(ref_fasta_file)
    interval_trees_3p = collections.defaultdict(IntervalTree)
    # generate FASTA file of sequences to use in mapping
    logging.debug("Generating homologous sequences to test")
    fasta5p = os.path.join(tmp_dir, "homologous_5p.fa")
    try:
        with open(fasta5p, "w") as fastafh:
            with open(input_file) as infh:
                for c in Chimera.parse(infh):
                    start5p, end5p, start3p, end3p = \
                        get_mapped_read_intervals(c, min_isize, max_isize,
                                                  homolog_segment_length)
                    # add 3' gene to interval trees
                    interval_trees_3p[c.tx_name_3p].insert_interval(
                        Interval(start3p, end3p, value=c.name))
                    # extract sequence of 5' gene
                    seq5p = ref_fa.fetch(c.tx_name_5p, start5p, end5p)
                    # "+ 1" keeps the final full-length window, which the
                    # previous exclusive bound silently dropped
                    num_windows = len(seq5p) - homolog_segment_length + 1
                    for i in xrange(num_windows):
                        fastafh.write(">%s,%s:%d-%d\n%s\n" % (
                            c.name, c.tx_name_5p,
                            start5p + i,
                            start5p + i + homolog_segment_length,
                            seq5p[i:i + homolog_segment_length]))
    finally:
        ref_fa.close()
    # map 5' sequences to reference using bowtie
    logging.debug("Mapping homologous sequences")
    bowtie2_index = os.path.join(index_dir, config.TRANSCRIPTOME_INDEX)
    sam5p = os.path.join(tmp_dir, "homologous_5p.sam")
    args = [config.BOWTIE2_BIN, '-p', num_processors, '--phred33',
            '--end-to-end', '--very-sensitive', '--reorder',
            '-f', '-k', maxhits,
            '-x', bowtie2_index,
            '-U', fasta5p,
            "-S", sam5p]
    retcode = subprocess.call([str(arg) for arg in args])
    if retcode != 0:
        return config.JOB_ERROR
    # analyze results for homologous genes
    logging.debug("Analyzing mapping results")
    homologous_chimeras = set()
    # a single handle serves both the header lookup and the iteration, and
    # is closed before the file is removed below
    samfh = pysam.Samfile(sam5p, "r")
    try:
        tid_rname_map = dict(enumerate(samfh.references))
        for r in samfh:
            if r.is_unmapped:
                continue
            # reference name must be in list of 3' chimeras
            rname = tid_rname_map[r.tid]
            if rname not in interval_trees_3p:
                continue
            # the query name starts with the chimera name (see FASTA header)
            chimera_name = r.qname.split(",")[0]
            for hit in interval_trees_3p[rname].find(r.pos, r.aend):
                if hit.value == chimera_name:
                    homologous_chimeras.add(chimera_name)
    finally:
        samfh.close()
    # write output
    logging.debug("Writing output")
    with open(output_file, "w") as outfh:
        with open(input_file) as infh:
            for c in Chimera.parse(infh):
                if c.name in homologous_chimeras:
                    logging.debug("Removing homologous chimera %s "
                                  "between %s and %s",
                                  c.name, c.gene_name_5p, c.gene_name_3p)
                    continue
                outfh.write('\t'.join(map(str, c.to_list())) + "\n")
    # cleanup
    if os.path.exists(fasta5p):
        os.remove(fasta5p)
    if os.path.exists(sam5p):
        os.remove(sam5p)
    return config.JOB_SUCCESS