def non_constitutive_case(self):
    '''
    Handle a target exon that is itself alternatively spliced.

    Transcript paths through the target's component are enumerated and
    their counts estimated, and psi is estimated for the target. If both
    flanking exons are already set, psi is estimated for each of them as
    given. Otherwise the exons on either side of the target within the
    component are passed to find_closest_exon_above_cutoff to pick the
    flanking exons.
    '''
    print('non-constitutive case')
    target_ix = self.component.index(self.target)

    # get tx path information
    self.all_paths = algs.AllPaths(self.splice_graph, self.component,
                                   self.target, self.splice_graph.chr)
    self.all_paths.trim_tx_paths()
    self.all_paths.set_all_path_coordinates()
    self.paths, self.counts = self.all_paths.estimate_counts()

    if self.upstream and self.downstream:
        # known flanking exon case
        self.psi_upstream = mem.estimate_psi(self.upstream, self.paths,
                                             self.counts)
        self.psi_downstream = mem.estimate_psi(self.downstream, self.paths,
                                               self.counts)
    else:
        # exons before the target are reversed so the one next to the
        # target comes first in the candidate list
        exons_before = list(reversed(self.component[:target_ix]))
        exons_after = self.component[target_ix + 1:]

        # on the '-' strand the two sides of the component swap roles
        if self.strand == '-':
            up_candidates, down_candidates = exons_after, exons_before
        else:
            up_candidates, down_candidates = exons_before, exons_after

        self.upstream, self.psi_upstream = self.find_closest_exon_above_cutoff(
            self.paths, self.counts, up_candidates)
        self.downstream, self.psi_downstream = self.find_closest_exon_above_cutoff(
            self.paths, self.counts, down_candidates)

    utils.save_path_info(self.id, self.paths, self.counts)
    self.psi_target = mem.estimate_psi(self.target, self.paths, self.counts)
def predefined_exons_case(id, target, sGraph, genome, upstream_exon, downstream_exon):
    """
    Handle an event whose flanking exons were supplied by the user.

    Strategy:
    1. Use All Paths over the exons between the flanking exons (then trim)
    2. Estimate counts/psi and save counts/paths to file
    3. Get sequence information for the upstream/target/downstream exons

    Returns a list of: strand, target coordinate string, target psi,
    upstream coordinate string, -1, downstream coordinate string, -1,
    the AllPaths object, upstream sequence, target sequence, downstream
    sequence. The two -1 entries fill the flanking psi slots, which are
    not estimated for user-defined exons.
    """
    strand = sGraph.strand
    chrom = sGraph.chr

    # get possible exons for primer amplification: all graph nodes, in
    # coordinate order, from one flanking exon through the other
    ordered_exons = sorted(copy.deepcopy(sGraph.get_graph().nodes()),
                           key=lambda exon: (exon[0], exon[1]))
    if strand == '+':
        first_exon, last_exon = upstream_exon, downstream_exon
    else:
        first_exon, last_exon = downstream_exon, upstream_exon
    first_ix = ordered_exons.index(first_exon)
    last_ix = ordered_exons.index(last_exon)
    my_exons = ordered_exons[first_ix:last_ix + 1]

    # use correct tx's and estimate counts/psi
    all_paths = algs.AllPaths(sGraph, my_exons, target, chr=chrom, strand=strand)
    all_paths.trim_tx_paths_using_flanking_exons_and_target(
        strand, target, upstream_exon, downstream_exon)
    all_paths.set_all_path_coordinates()
    paths, counts = all_paths.estimate_counts()  # run EM algorithm
    psi_target = mem.estimate_psi(target, paths, counts)
    utils.save_path_info(id, paths, counts)  # save paths/counts in tmp/isoforms/id.json

    # get sequence of upstream/target/downstream combo
    genome_chr = genome[chrom]  # chr object from pygr
    sequences = [genome_chr[exon[0]:exon[1]]
                 for exon in (upstream_exon, target, downstream_exon)]
    if strand == '-':
        # get reverse-complement if necessary
        sequences = [-seq for seq in sequences]
    upstream_seq, target_seq, downstream_seq = sequences

    target_label = '%s:%d-%d' % (chrom, target[0], target[1])
    upstream_label = chrom + ':' + '-'.join(map(str, upstream_exon))
    downstream_label = chrom + ':' + '-'.join(map(str, downstream_exon))

    result = [strand, target_label, psi_target]
    result += [upstream_label, -1]    # user defined exon, don't estimate psi
    result += [downstream_label, -1]  # user defined exon, don't estimate psi
    result += [all_paths, upstream_seq, target_seq, downstream_seq]
    return result
def two_biconnected_case(self):
    '''
    Handle a constitutive target exon flanked by two biconnected
    components, which means psi has to be estimated for both the upstream
    and the downstream exon.

    self.component is expected to hold exactly two components. The one
    whose last exon is the target is treated as lying before the target
    (before/after are defined by chromosome position).
    '''
    print('two biconnected case')

    target_ends_first = self.component[0][-1] == self.target
    first_component, second_component = self.component
    if target_ends_first:
        before_component, after_component = first_component, second_component
    else:
        before_component, after_component = second_component, first_component

    def estimate_component(component):
        # enumerate, trim, and count the transcript paths of one component
        component_paths = algs.AllPaths(self.splice_graph, component,
                                        self.target, self.splice_graph.chr)
        component_paths.trim_tx_paths()
        return component_paths.estimate_counts()

    # since there are two components, two sets of paths are needed: one
    # before and one after the target exon
    before_paths, before_counts = estimate_component(before_component)
    after_paths, after_counts = estimate_component(after_component)

    if self.upstream and self.downstream:
        # flanking exons already set: only their psi values are estimated
        if self.strand == '+':
            self.psi_upstream = mem.estimate_psi(
                self.upstream, before_paths, before_counts)
            self.psi_downstream = mem.estimate_psi(
                self.downstream, after_paths, after_counts)
        elif self.strand == '-':
            self.psi_upstream = mem.estimate_psi(
                self.upstream, after_paths, after_counts)
            self.psi_downstream = mem.estimate_psi(
                self.downstream, before_paths, before_counts)
    elif self.strand == '+':
        self.upstream, self.psi_upstream = self.find_closest_exon_above_cutoff(
            before_paths, before_counts, list(reversed(before_component[:-1])))
        self.downstream, self.psi_downstream = self.find_closest_exon_above_cutoff(
            after_paths, after_counts, after_component[1:])
    else:
        self.upstream, self.psi_upstream = self.find_closest_exon_above_cutoff(
            after_paths, after_counts, after_component[1:])
        self.downstream, self.psi_downstream = self.find_closest_exon_above_cutoff(
            before_paths, before_counts, list(reversed(before_component[:-1])))

    # merge the components; the target ends before_component and is dropped
    # from it here so that it is not listed twice
    self.total_components = before_component[:-1] + after_component
    self.psi_target = 1.0

    # handle the combined components
    if self.splice_graph.strand == '+':
        start_exon, end_exon = self.upstream, self.downstream
    else:
        start_exon, end_exon = self.downstream, self.upstream
    start_ix = self.total_components.index(start_exon)
    end_ix = self.total_components.index(end_exon)

    # NOTE(review): the slice end is exclusive, so the exon at end_ix is
    # not handed to AllPaths -- confirm whether end_ix + 1 was intended
    self.all_paths = algs.AllPaths(self.splice_graph,
                                   self.total_components[start_ix:end_ix],
                                   self.target, self.splice_graph.chr)
    self.all_paths.trim_tx_paths()
    self.all_paths.set_all_path_coordinates()
    self.paths, self.counts = self.all_paths.estimate_counts()
    utils.save_path_info(self.id, self.paths, self.counts)
def save_isforms_and_counts(line, options):
    """
    Save isoform paths and counts for one event, separately per BAM file.

    line -- one row of event information. Its first two items are the
        event ID and the target coordinate (a strand character followed by
        the chromosome coordinate); the utils.PSI_UP and utils.PSI_DOWN
        columns equal '-1' when the flanking exons were user-defined.
    options -- dict read for the keys 'rnaseq', 'read_threshold',
        'min_jct_count', 'both_flag' and 'psi'.

    Results are written with utils.save_path_info under
    'tmp/indiv_isoforms/', named '<ID>.<index of BAM file>'.
    """
    # get information about each row
    ID, target_coordinate = line[:2]
    strand = target_coordinate[0]
    chrom = utils.get_chr(target_coordinate[1:])
    tmp_start, tmp_end = utils.get_pos(target_coordinate)
    logging.debug('Saving isoform and count information for event %s . . .' % ID)

    # get information from GTF annotation
    gene_dict, gene_name = retrieve_gene_information(
        options, strand, chrom, tmp_start, tmp_end)

    # get edge weights
    edge_weights_list = []
    for sam_obj in options['rnaseq']:
        region = sam_obj.extractSamRegion(chrom, gene_dict['start'],
                                          gene_dict['end'])
        edge_weights_list.append(region)

    # construct splice graph for each BAM file
    bam_splice_graphs = sg.construct_splice_graph(
        edge_weights_list, gene_dict, chrom, strand,
        options['read_threshold'], options['min_jct_count'],
        output_type='list', both=options['both_flag'])

    for bam_ix, my_splice_graph in enumerate(bam_splice_graphs):
        user_defined = (line[utils.PSI_UP] == '-1'
                        and line[utils.PSI_DOWN] == '-1')
        if user_defined:
            # user-defined flanking exons: find path and count information
            paths, counts = user_defined_exons(my_splice_graph, line)
        else:
            # automatic choice of flanking exons
            paths, counts = primerseq_defined_exons(my_splice_graph, line,
                                                    options['psi'])
        utils.save_path_info('%s.%d' % (ID, bam_ix), paths, counts,
                             save_dir='tmp/indiv_isoforms/')

    logging.debug('Finished saving isoform and count information for event %s.' % ID)
def no_biconnected_case(self):
    '''
    Case where the target, upstream, and downstream exons are all
    constitutive. Thus just use the immediate upstream and downstream exon
    along with the original target, each with an inclusion of 1.0.

    If upstream/downstream were already set (user-defined flanking exons)
    they must be exactly the exons adjacent to the target; otherwise
    utils.PrimerSeqError is raised.
    '''
    print('no biconnected case')

    # add information to log file
    logging.debug('It appears %s has two imediate flanking constitutive exons' % str(self.target))
    successors = self.graph.successors(self.target)
    if len(successors) > 1:
        logging.debug('Conflict between biconnected components and successors')
    predecessors = self.graph.predecessors(self.target)
    if len(predecessors) > 1:
        logging.debug('Conflict between biconnected components and predecessors')

    # define adjacent exons as flanking constitutive since all three (the
    # target exon, upstream exon, and downstream exon) are constitutive.
    # On the '-' strand the successor is the upstream exon.
    if self.strand == '+':
        tmp_upstream, tmp_downstream = predecessors[0], successors[0]
    else:
        tmp_upstream, tmp_downstream = successors[0], predecessors[0]

    if self.upstream and self.downstream:
        if self.upstream != tmp_upstream or self.downstream != tmp_downstream:
            # raise error if the user defined exon does not match expectation
            raise utils.PrimerSeqError('Error: Flanking exon choice too far from target exon')

    # assign flanking exons only after the user-defined exon check
    self.upstream = tmp_upstream
    self.downstream = tmp_downstream

    # defining two attributes as the same thing seems silly but in a
    # different case with two biconnected components the two components
    # need to be merged into a single self.total_components
    self.total_components = [self.upstream, self.target, self.downstream]
    self.component = self.total_components

    # create a dummy all paths variable even though there is only one path
    self.all_paths = algs.AllPaths(self.splice_graph, self.component,
                                   self.target, self.splice_graph.chr)
    self.all_paths.trim_tx_paths()
    self.all_paths.set_all_path_coordinates()

    # only one isoform, so read counts do not really matter
    self.paths, self.counts = self.all_paths.estimate_counts()
    utils.save_path_info(self.id, self.paths, self.counts)

    # since the upstream, target, and downstream exon are constitutive then
    # they all have inclusion of 1.0
    self.psi_target, self.psi_upstream, self.psi_downstream = 1.0, 1.0, 1.0
def save_isforms_and_counts(line, options):
    """
    Save isoform paths and counts for one event, separately per BAM file.

    line -- one row of event information. Its first two items are the
        event ID and the target coordinate (a strand character followed by
        the chromosome coordinate); the utils.PSI_UP and utils.PSI_DOWN
        columns equal '-1' when the flanking exons were user-defined.
    options -- dict read for the keys 'rnaseq', 'read_threshold',
        'min_jct_count', 'both_flag' and 'psi'.

    Results are written with utils.save_path_info under
    'tmp/indiv_isoforms/', named '<ID>.<index of BAM file>'.
    """
    # get information about each row
    ID, target_coordinate = line[:2]
    strand = target_coordinate[0]
    chrom = utils.get_chr(target_coordinate[1:])
    tmp_start, tmp_end = utils.get_pos(target_coordinate)
    logging.debug('Saving isoform and count information for event %s . . .' % ID)

    # get information from GTF annotation
    gene_dict, gene_name = retrieve_gene_information(
        options, strand, chrom, tmp_start, tmp_end)

    # get edge weights
    edge_weights_list = []
    for sam_obj in options['rnaseq']:
        region = sam_obj.extractSamRegion(chrom, gene_dict['start'],
                                          gene_dict['end'])
        edge_weights_list.append(region)

    # construct splice graph for each BAM file
    bam_splice_graphs = sg.construct_splice_graph(
        edge_weights_list, gene_dict, chrom, strand,
        options['read_threshold'], options['min_jct_count'],
        output_type='list', both=options['both_flag'])

    for bam_ix, my_splice_graph in enumerate(bam_splice_graphs):
        user_defined = (line[utils.PSI_UP] == '-1'
                        and line[utils.PSI_DOWN] == '-1')
        if user_defined:
            # user-defined flanking exons: find path and count information
            paths, counts = user_defined_exons(my_splice_graph, line)
        else:
            # automatic choice of flanking exons
            paths, counts = primerseq_defined_exons(my_splice_graph, line,
                                                    options['psi'])
        utils.save_path_info('%s.%d' % (ID, bam_ix), paths, counts,
                             save_dir='tmp/indiv_isoforms/')

    logging.debug('Finished saving isoform and count information for event %s.' % ID)
def predefined_exons_case(id, target, sGraph, genome, upstream_exon, downstream_exon):
    """
    Handle an event whose flanking exons were supplied by the user.

    Strategy:
    1. Use All Paths over the exons between the flanking exons (then trim)
    2. Estimate counts/psi and save counts/paths to file
    3. Get sequence information for the upstream/target/downstream exons

    Returns a list of: strand, target coordinate string, target psi,
    upstream coordinate string, -1, downstream coordinate string, -1,
    the AllPaths object, upstream sequence, target sequence, downstream
    sequence. The two -1 entries fill the flanking psi slots, which are
    not estimated for user-defined exons.
    """
    strand = sGraph.strand
    chrom = sGraph.chr

    # get possible exons for primer amplification: all graph nodes, in
    # coordinate order, from one flanking exon through the other
    ordered_exons = sorted(copy.deepcopy(sGraph.get_graph().nodes()),
                           key=lambda exon: (exon[0], exon[1]))
    if strand == '+':
        first_exon, last_exon = upstream_exon, downstream_exon
    else:
        first_exon, last_exon = downstream_exon, upstream_exon
    first_ix = ordered_exons.index(first_exon)
    last_ix = ordered_exons.index(last_exon)
    my_exons = ordered_exons[first_ix:last_ix + 1]

    # use correct tx's and estimate counts/psi
    all_paths = algs.AllPaths(sGraph, my_exons, target, chr=chrom, strand=strand)
    all_paths.trim_tx_paths_using_flanking_exons_and_target(
        strand, target, upstream_exon, downstream_exon)
    all_paths.set_all_path_coordinates()
    paths, counts = all_paths.estimate_counts()  # run EM algorithm
    psi_target = mem.estimate_psi(target, paths, counts)
    utils.save_path_info(id, paths, counts)  # save paths/counts in tmp/isoforms/id.json

    # get sequence of upstream/target/downstream combo
    genome_chr = genome[chrom]  # chr object from pygr
    sequences = [genome_chr[exon[0]:exon[1]]
                 for exon in (upstream_exon, target, downstream_exon)]
    if strand == '-':
        # get reverse-complement if necessary
        sequences = [-seq for seq in sequences]
    upstream_seq, target_seq, downstream_seq = sequences

    target_label = '%s:%d-%d' % (chrom, target[0], target[1])
    upstream_label = chrom + ':' + '-'.join(map(str, upstream_exon))
    downstream_label = chrom + ':' + '-'.join(map(str, downstream_exon))

    result = [strand, target_label, psi_target]
    result += [upstream_label, -1]    # user defined exon, don't estimate psi
    result += [downstream_label, -1]  # user defined exon, don't estimate psi
    result += [all_paths, upstream_seq, target_seq, downstream_seq]
    return result
def last_exon_case(self):
    '''
    Case where the target and one flanking exon are constitutive.

    The target is the last exon of its component and the exon that follows
    it in the graph is taken as constitutive (psi of 1.0). The flanking
    exon on the other side is either the user-defined one, whose psi is
    estimated, or the closest exon within the component picked by
    find_closest_exon_above_cutoff.
    '''
    print('last exon case')
    successors = self.graph.successors(self.target)
    if len(successors) > 1:
        logging.debug('Conflict between biconnected components and successors')

    possible_const = self.component[:-1]
    possible_const.reverse()  # reverse the order since closer exons should be looked at first

    # get tx path information
    self.all_paths = algs.AllPaths(self.splice_graph, self.component,
                                   self.target, self.splice_graph.chr)
    self.all_paths.trim_tx_paths()
    self.all_paths.set_all_path_coordinates()
    self.paths, self.counts = self.all_paths.estimate_counts()

    if self.upstream and self.downstream:
        # user defined flanking exon case: keep the user's exons and only
        # estimate psi for the one inside the component
        if self.strand == '+':
            self.psi_upstream = mem.estimate_psi(self.upstream, self.paths, self.counts)
            self.psi_downstream = 1.0
        elif self.strand == '-':
            self.psi_upstream = 1.0
            self.psi_downstream = mem.estimate_psi(self.downstream, self.paths, self.counts)
    # This was a plain `if`, so the automatic choice below always ran and
    # silently replaced user-defined flanking exons and their psi values.
    elif self.strand == '+':
        self.upstream, self.psi_upstream = self.find_closest_exon_above_cutoff(
            self.paths, self.counts, possible_const)
        self.downstream = successors[0]
        self.psi_downstream = 1.0
    else:
        self.upstream = successors[0]
        self.psi_upstream = 1.0
        self.downstream, self.psi_downstream = self.find_closest_exon_above_cutoff(
            self.paths, self.counts, possible_const)

    # Path info is saved in every case, as before. The constitutive exon
    # following the target is added to all paths: it is the downstream exon
    # on the '+' strand and the upstream exon otherwise.
    if self.strand == '+':
        saved_paths = [p + [self.downstream] for p in self.paths]
    else:
        saved_paths = [[self.upstream] + p for p in self.paths]
    utils.save_path_info(self.id, saved_paths, self.counts)

    self.psi_target = 1.0  # the target is constitutive in this case
def first_exon_case(self):
    '''
    Case where the target and one flanking exon are constitutive.

    The target is the first exon of its component and the exon preceding
    it in the graph is taken as constitutive (psi of 1.0). The flanking
    exon on the other side is either the user-defined one, whose psi is
    estimated, or the closest exon within the component picked by
    find_closest_exon_above_cutoff.

    Raises utils.PrimerSeqError when user-defined flanking exons are set
    but the one expected next to the target is not the target's first
    predecessor.
    '''
    print('first exon case')
    predecessors = self.graph.predecessors(self.target)
    if len(predecessors) > 1:
        logging.debug('Error: Conflict between biconnected components and predecessors')

    # get tx path information
    self.all_paths = algs.AllPaths(self.splice_graph, self.component,
                                   self.target, self.splice_graph.chr)
    self.all_paths.trim_tx_paths()
    self.all_paths.set_all_path_coordinates()
    self.paths, self.counts = self.all_paths.estimate_counts()

    if self.upstream and self.downstream:
        # user defined flanking exon case
        # NOTE(review): unlike the automatic branches, no path info is
        # saved here -- confirm that is intended
        if self.strand == '+' and predecessors[0] == self.upstream:
            self.psi_upstream = 1.0
            # fixed typo: this used to assign to `psi_downsteam`, leaving
            # psi_downstream unset in this branch
            self.psi_downstream = mem.estimate_psi(self.downstream, self.paths, self.counts)
        elif self.strand == '-' and predecessors[0] == self.downstream:
            self.psi_downstream = 1.0
            self.psi_upstream = mem.estimate_psi(self.upstream, self.paths, self.counts)
        else:
            raise utils.PrimerSeqError('Error: Flanking exon choice too far from target exon')
    elif self.strand == '+':
        self.upstream = predecessors[0]
        self.psi_upstream = 1.0  # defined by biconnected component alg as constitutive
        self.downstream, self.psi_downstream = self.find_closest_exon_above_cutoff(
            self.paths, self.counts, self.component[1:])
        # add const. upstream exon to all paths
        utils.save_path_info(self.id, [[self.upstream] + p for p in self.paths], self.counts)
    else:
        self.upstream, self.psi_upstream = self.find_closest_exon_above_cutoff(
            self.paths, self.counts, self.component[1:])
        self.downstream = predecessors[0]
        self.psi_downstream = 1.0
        # add const. downstream exon to all paths
        utils.save_path_info(self.id, [p + [self.downstream] for p in self.paths], self.counts)

    self.psi_target = 1.0  # the target is constitutive in this case