Exemple #1
0
    def non_constitutive_case(self):
        '''
        Handle a target exon that is itself alternatively spliced.

        Psi is estimated for the target as well as for both flanking exons.
        If the flanking exons are not already set, the exons on each side of
        the target inside the component are searched for the closest
        sufficiently included exon.
        '''
        print('non-constitutive case')
        index = self.component.index(self.target)

        # build the transcript paths for this component and estimate counts
        self.all_paths = algs.AllPaths(self.splice_graph, self.component, self.target, self.splice_graph.chr)
        self.all_paths.trim_tx_paths()
        self.all_paths.set_all_path_coordinates()
        self.paths, self.counts = self.all_paths.estimate_counts()

        if self.upstream and self.downstream:
            # flanking exons are already known, so only their psi is needed
            self.psi_upstream = mem.estimate_psi(self.upstream, self.paths, self.counts)
            self.psi_downstream = mem.estimate_psi(self.downstream, self.paths, self.counts)
        else:
            # Candidate exons on either side of the target. The part that
            # precedes the target is reversed so that, assuming the component
            # is ordered by position, exons nearer the target come first.
            before_target = list(reversed(self.component[:index]))
            after_target = self.component[index + 1:]

            # on the '-' strand upstream/downstream swap sides
            if self.strand == '-':
                upstream_pool, downstream_pool = after_target, before_target
            else:
                upstream_pool, downstream_pool = before_target, after_target

            self.upstream, self.psi_upstream = self.find_closest_exon_above_cutoff(
                self.paths, self.counts, upstream_pool)
            self.downstream, self.psi_downstream = self.find_closest_exon_above_cutoff(
                self.paths, self.counts, downstream_pool)

        utils.save_path_info(self.id, self.paths, self.counts)
        self.psi_target = mem.estimate_psi(self.target, self.paths, self.counts)
def predefined_exons_case(id, target, sGraph, genome, upstream_exon,
                          downstream_exon):
    """
    Estimate target psi and fetch sequences for user-defined flanking exons.

    Strategy:
    1. Use All Paths (then trim to the flanking exons and target)
    2. Save counts/paths to file
    3. Get sequence information

    Returns a list of:
    [strand, target coordinate string, psi_target,
     upstream coordinate string, -1, downstream coordinate string, -1,
     all_paths, upstream_seq, target_seq, downstream_seq]
    where the two -1 entries stand in for the psi of the user-defined
    flanking exons, which is not estimated.
    """
    strand = sGraph.strand

    # exons that may be amplified: every graph node from one flanking exon
    # to the other (inclusive), in (start, end) order
    ordered_exons = sorted(copy.deepcopy(sGraph.get_graph().nodes()),
                           key=lambda exon: (exon[0], exon[1]))
    if strand == '+':
        first_flank, last_flank = upstream_exon, downstream_exon
    else:
        first_flank, last_flank = downstream_exon, upstream_exon
    my_exons = ordered_exons[ordered_exons.index(first_flank):
                             ordered_exons.index(last_flank) + 1]

    # restrict transcripts to the chosen exons and estimate counts/psi
    all_paths = algs.AllPaths(sGraph,
                              my_exons,
                              target,
                              chr=sGraph.chr,
                              strand=strand)
    all_paths.trim_tx_paths_using_flanking_exons_and_target(
        strand, target, upstream_exon, downstream_exon)
    all_paths.set_all_path_coordinates()
    paths, counts = all_paths.estimate_counts()  # EM algorithm per the original note
    psi_target = mem.estimate_psi(target, paths, counts)
    utils.save_path_info(id, paths, counts)

    # sequence of the upstream/target/downstream exons
    genome_chr = genome[sGraph.chr]  # chromosome object (pygr, per the original note)

    def exon_sequence(exon):
        # slice the chromosome by the exon's (start, end) pair
        return genome_chr[exon[0]:exon[1]]

    upstream_seq = exon_sequence(upstream_exon)
    target_seq = exon_sequence(target)
    downstream_seq = exon_sequence(downstream_exon)
    if strand == '-':
        # unary minus yields the reverse-complement (per the original note)
        upstream_seq = -upstream_seq
        target_seq = -target_seq
        downstream_seq = -downstream_seq

    target_label = '%s:%d-%d' % (sGraph.chr, target[0], target[1])
    upstream_label = sGraph.chr + ':' + '-'.join(map(str, upstream_exon))  # e.g. chr1:1000-2000
    downstream_label = sGraph.chr + ':' + '-'.join(map(str, downstream_exon))  # e.g. chr1:1000-2000

    return [
        strand,
        target_label,
        psi_target,
        upstream_label,
        -1,  # user defined exon, don't estimate psi
        downstream_label,
        -1,  # user defined exon, don't estimate psi
        all_paths,
        upstream_seq,
        target_seq,
        downstream_seq,
    ]
Exemple #3
0
    def two_biconnected_case(self):
        '''
        Handle a constitutive target exon that is flanked by two biconnected
        components, so psi has to be estimated for both the upstream and the
        downstream exon, each from its own component.
        '''
        print('two biconnected case')
        # the component whose last exon is the target lies before it
        # (before/after are defined by chromosome position)
        if self.component[0][-1] == self.target:
            before_component, after_component = self.component
        else:
            after_component, before_component = self.component

        # one set of paths/counts per component
        before_all_paths = algs.AllPaths(self.splice_graph, before_component, self.target, self.splice_graph.chr)
        before_all_paths.trim_tx_paths()
        before_paths, before_counts = before_all_paths.estimate_counts()
        after_all_paths = algs.AllPaths(self.splice_graph, after_component, self.target, self.splice_graph.chr)
        after_all_paths.trim_tx_paths()
        after_paths, after_counts = after_all_paths.estimate_counts()

        if self.upstream and self.downstream:
            # flanking exons already chosen: only estimate their psi
            if self.strand == '+':
                self.psi_upstream = mem.estimate_psi(self.upstream, before_paths, before_counts)
                self.psi_downstream = mem.estimate_psi(self.downstream, after_paths, after_counts)
            elif self.strand == '-':
                self.psi_upstream = mem.estimate_psi(self.upstream, after_paths, after_counts)
                self.psi_downstream = mem.estimate_psi(self.downstream, before_paths, before_counts)
        else:
            # candidate flanking exons, excluding the target itself; the
            # "before" side is reversed so the search starts next to the target
            before_candidates = list(reversed(before_component[:-1]))
            after_candidates = after_component[1:]
            if self.strand == '+':
                self.upstream, self.psi_upstream = self.find_closest_exon_above_cutoff(
                    before_paths, before_counts, before_candidates)
                self.downstream, self.psi_downstream = self.find_closest_exon_above_cutoff(
                    after_paths, after_counts, after_candidates)
            else:
                self.upstream, self.psi_upstream = self.find_closest_exon_above_cutoff(
                    after_paths, after_counts, after_candidates)
                self.downstream, self.psi_downstream = self.find_closest_exon_above_cutoff(
                    before_paths, before_counts, before_candidates)

        # merge both components, keeping the shared target exon only once
        self.total_components = before_component[:-1] + after_component
        self.psi_target = 1.0

        # paths over the merged components, bounded by the flanking exons
        if self.splice_graph.strand == '+':
            leading_exon, trailing_exon = self.upstream, self.downstream
        else:
            leading_exon, trailing_exon = self.downstream, self.upstream
        tmp_start_ix = self.total_components.index(leading_exon)
        tmp_end_ix = self.total_components.index(trailing_exon)
        # NOTE(review): this slice stops before the trailing flanking exon,
        # whereas predefined_exons_case slices with "+ 1" -- confirm whether
        # the exclusive end is intended.
        merged_exons = self.total_components[tmp_start_ix:tmp_end_ix]
        self.all_paths = algs.AllPaths(self.splice_graph, merged_exons, self.target, self.splice_graph.chr)
        self.all_paths.trim_tx_paths()
        self.all_paths.set_all_path_coordinates()
        self.paths, self.counts = self.all_paths.estimate_counts()
        utils.save_path_info(self.id, self.paths, self.counts)
def save_isforms_and_counts(line, options):
    """
    Save isoform paths and counts for one event, once per RNA-Seq input.

    `line` is a row whose first two fields are the event ID and the target
    coordinate (the strand is the coordinate's first character). `options`
    supplies the 'rnaseq' objects and the 'read_threshold', 'min_jct_count',
    'both_flag' and 'psi' settings read below.
    """
    # unpack the row
    ID, target_coordinate = line[:2]
    strand = target_coordinate[0]
    chrom = utils.get_chr(target_coordinate[1:])
    tmp_start, tmp_end = utils.get_pos(target_coordinate)
    logging.debug('Saving isoform and count information for event %s . . .' %
                  ID)

    # gene information from the annotation (the gene name is not needed here)
    gene_dict, _ = retrieve_gene_information(options, strand, chrom,
                                             tmp_start, tmp_end)

    # edge weights over the gene region, one entry per RNA-Seq object
    edge_weights_list = []
    for sam_obj in options['rnaseq']:
        edge_weights_list.append(
            sam_obj.extractSamRegion(chrom, gene_dict['start'], gene_dict['end']))

    # one splice graph per entry of edge_weights_list
    bam_splice_graphs = sg.construct_splice_graph(edge_weights_list,
                                                  gene_dict,
                                                  chrom,
                                                  strand,
                                                  options['read_threshold'],
                                                  options['min_jct_count'],
                                                  output_type='list',
                                                  both=options['both_flag'])

    for bam_ix, my_splice_graph in enumerate(bam_splice_graphs):
        if line[utils.PSI_UP] != '-1' or line[utils.PSI_DOWN] != '-1':
            # flanking exons were chosen automatically
            paths, counts = primerseq_defined_exons(my_splice_graph, line,
                                                    options['psi'])
        else:
            # both psi fields are '-1': flanking exons are user-defined
            paths, counts = user_defined_exons(my_splice_graph, line)
        output_name = '%s.%d' % (ID, bam_ix)
        utils.save_path_info(output_name,
                             paths,
                             counts,
                             save_dir='tmp/indiv_isoforms/')
    logging.debug(
        'Finished saving isoform and count information for event %s.' % ID)
Exemple #5
0
    def no_biconnected_case(self):
        '''
        Case where the target, upstream, and downstream exons are all
        constitutive. The exons immediately adjacent to the target in the
        graph are used as the flanking exons, and every psi is set to 1.0.

        Raises utils.PrimerSeqError if flanking exons were already set and
        they differ from the exons immediately adjacent to the target.
        '''
        print('no biconnected case')
        # add information to log file
        logging.debug('It appears %s has two imediate flanking constitutive exons' % str(self.target))

        # query each neighbour list once and reuse it below
        successors = self.graph.successors(self.target)
        if len(successors) > 1:
            logging.debug('Conflict between biconnected components and successors')
        predecessors = self.graph.predecessors(self.target)
        if len(predecessors) > 1:
            logging.debug('Conflict between biconnected components and predecessors')

        # define adjacent exons as flanking constitutive since all three (the
        # target exon, upstream exon, and downstream exon) are constitutive;
        # on the '-' strand the successor is the upstream exon
        if self.strand == '+':
            tmp_upstream, tmp_downstream = predecessors[0], successors[0]
        else:
            tmp_upstream, tmp_downstream = successors[0], predecessors[0]

        if self.upstream and self.downstream:
            if self.upstream != tmp_upstream or self.downstream != tmp_downstream:
                # raise error if the user defined exon does not match expectation
                raise utils.PrimerSeqError('Error: Flanking exon choice too far from target exon')
        self.upstream = tmp_upstream  # assign upstream after user-defined exon check
        self.downstream = tmp_downstream  # assign downstream after user-defined exon check

        # defining two attributes as the same thing seems silly but in a
        # different case with two biconnected components the two components
        # need to be merged into a single self.total_components
        self.total_components = [self.upstream, self.target, self.downstream]
        self.component = self.total_components

        # create a dummy all paths variable even though there is only one path
        self.all_paths = algs.AllPaths(self.splice_graph, self.component, self.target, self.splice_graph.chr)
        self.all_paths.trim_tx_paths()
        self.all_paths.set_all_path_coordinates()

        # only one isoform, so read counts do not really matter
        self.paths, self.counts = self.all_paths.estimate_counts()
        utils.save_path_info(self.id, self.paths, self.counts)

        # since the upstream, target, and downstream exon are constitutive then
        # they all have inclusion of 1.0
        self.psi_target, self.psi_upstream, self.psi_downstream = 1.0, 1.0, 1.0
Exemple #6
0
def save_isforms_and_counts(line, options):
    """
    Save isoform paths and counts for one event, once per RNA-Seq input.

    `line` is a row whose first two fields are the event ID and the target
    coordinate (the strand is the coordinate's first character). `options`
    supplies the 'rnaseq' objects and the 'read_threshold', 'min_jct_count',
    'both_flag' and 'psi' settings read below.
    """
    # unpack the row
    ID, target_coordinate = line[:2]
    strand = target_coordinate[0]
    chrom = utils.get_chr(target_coordinate[1:])
    tmp_start, tmp_end = utils.get_pos(target_coordinate)
    logging.debug('Saving isoform and count information for event %s . . .' % ID)

    # gene information from the annotation (the gene name is not needed here)
    gene_dict, _ = retrieve_gene_information(options,
                                             strand, chrom, tmp_start, tmp_end)

    # edge weights over the gene region, one entry per RNA-Seq object
    edge_weights_list = []
    for sam_obj in options['rnaseq']:
        edge_weights_list.append(
            sam_obj.extractSamRegion(chrom, gene_dict['start'], gene_dict['end']))

    # one splice graph per entry of edge_weights_list
    bam_splice_graphs = sg.construct_splice_graph(edge_weights_list,
                                                  gene_dict,
                                                  chrom,
                                                  strand,
                                                  options['read_threshold'],
                                                  options['min_jct_count'],
                                                  output_type='list',
                                                  both=options['both_flag'])

    for bam_ix, my_splice_graph in enumerate(bam_splice_graphs):
        if line[utils.PSI_UP] != '-1' or line[utils.PSI_DOWN] != '-1':
            # flanking exons were chosen automatically
            paths, counts = primerseq_defined_exons(my_splice_graph, line, options['psi'])
        else:
            # both psi fields are '-1': flanking exons are user-defined
            paths, counts = user_defined_exons(my_splice_graph, line)
        output_name = '%s.%d' % (ID, bam_ix)
        utils.save_path_info(output_name,
                             paths, counts,
                             save_dir='tmp/indiv_isoforms/')
    logging.debug('Finished saving isoform and count information for event %s.' % ID)
Exemple #7
0
def predefined_exons_case(id, target, sGraph, genome, upstream_exon, downstream_exon):
    """
    Estimate target psi and fetch sequences for user-defined flanking exons.

    Strategy:
    1. Use All Paths (then trim to the flanking exons and target)
    2. Save counts/paths to file
    3. Get sequence information

    Returns a list of:
    [strand, target coordinate string, psi_target,
     upstream coordinate string, -1, downstream coordinate string, -1,
     all_paths, upstream_seq, target_seq, downstream_seq]
    where the two -1 entries stand in for the psi of the user-defined
    flanking exons, which is not estimated.
    """
    strand = sGraph.strand

    # exons that may be amplified: every graph node from one flanking exon
    # to the other (inclusive), in (start, end) order
    ordered_exons = sorted(copy.deepcopy(sGraph.get_graph().nodes()),
                           key=lambda exon: (exon[0], exon[1]))
    if strand == '+':
        first_flank, last_flank = upstream_exon, downstream_exon
    else:
        first_flank, last_flank = downstream_exon, upstream_exon
    my_exons = ordered_exons[ordered_exons.index(first_flank):
                             ordered_exons.index(last_flank) + 1]

    # restrict transcripts to the chosen exons and estimate counts/psi
    all_paths = algs.AllPaths(sGraph, my_exons, target, chr=sGraph.chr, strand=strand)
    all_paths.trim_tx_paths_using_flanking_exons_and_target(strand, target, upstream_exon, downstream_exon)
    all_paths.set_all_path_coordinates()
    paths, counts = all_paths.estimate_counts()  # EM algorithm per the original note
    psi_target = mem.estimate_psi(target, paths, counts)
    utils.save_path_info(id, paths, counts)

    # sequence of the upstream/target/downstream exons
    genome_chr = genome[sGraph.chr]  # chromosome object (pygr, per the original note)

    def exon_sequence(exon):
        # slice the chromosome by the exon's (start, end) pair
        return genome_chr[exon[0]:exon[1]]

    upstream_seq = exon_sequence(upstream_exon)
    target_seq = exon_sequence(target)
    downstream_seq = exon_sequence(downstream_exon)
    if strand == '-':
        # unary minus yields the reverse-complement (per the original note)
        upstream_seq = -upstream_seq
        target_seq = -target_seq
        downstream_seq = -downstream_seq

    target_label = '%s:%d-%d' % (sGraph.chr, target[0], target[1])
    upstream_label = sGraph.chr + ':' + '-'.join(map(str, upstream_exon))  # e.g. chr1:1000-2000
    downstream_label = sGraph.chr + ':' + '-'.join(map(str, downstream_exon))  # e.g. chr1:1000-2000

    return [strand, target_label, psi_target,
            upstream_label,
            -1,  # user defined exon, don't estimate psi
            downstream_label,
            -1,  # user defined exon, don't estimate psi
            all_paths, upstream_seq,
            target_seq, downstream_seq]
Exemple #8
0
    def last_exon_case(self):
        '''
        Case where the target and one flanking exon are constitutive.

        The target is the last exon of the component, so its graph successor
        is taken as the constitutive flank (downstream on '+', upstream on
        '-') with psi 1.0. The other flank is searched for among the
        remaining component exons, nearest first, unless flanking exons are
        already set, in which case they are kept and only psi is estimated.
        '''
        print('last exon case')
        if len(self.graph.successors(self.target)) > 1:
            logging.debug('Conflict between biconnected components and successors')

        possible_const = self.component[:-1]
        possible_const.reverse()  # reverse the order since closer exons should be looked at first

        # get tx path information
        self.all_paths = algs.AllPaths(self.splice_graph, self.component, self.target, self.splice_graph.chr)
        self.all_paths.trim_tx_paths()
        self.all_paths.set_all_path_coordinates()
        self.paths, self.counts = self.all_paths.estimate_counts()

        if self.upstream and self.downstream:
            # user defined flanking exon case: keep the chosen exons and only
            # estimate psi. Previously the automatic branches below always
            # ran as well and overwrote both the exons and these psi values.
            if self.strand == '+':
                self.psi_upstream = mem.estimate_psi(self.upstream, self.paths, self.counts)
                self.psi_downstream = 1.0
            else:
                self.psi_upstream = 1.0
                self.psi_downstream = mem.estimate_psi(self.downstream, self.paths, self.counts)
        elif self.strand == '+':
            self.upstream, self.psi_upstream = self.find_closest_exon_above_cutoff(self.paths,
                                                                                   self.counts, possible_const)
            self.downstream = self.graph.successors(self.target)[0]
            self.psi_downstream = 1.0
        else:
            self.upstream = self.graph.successors(self.target)[0]
            self.psi_upstream = 1.0
            self.downstream, self.psi_downstream = self.find_closest_exon_above_cutoff(self.paths,
                                                                                       self.counts, possible_const)

        # The path file is written in every branch, as before. The
        # constitutive flank lies outside the component, so it is attached to
        # every path before saving.
        if self.strand == '+':
            utils.save_path_info(self.id, [p + [self.downstream] for p in self.paths], self.counts)  # add const. downstream exon to all paths
        else:
            utils.save_path_info(self.id, [[self.upstream] + p for p in self.paths], self.counts)  # add const. upstream exon to all paths
        self.psi_target = 1.0  # the target is constitutive in this case
Exemple #9
0
    def first_exon_case(self):
        '''
        Case where the target and one flanking exon is constitutive.

        The target's graph predecessor is the constitutive flank (upstream
        on '+', downstream on '-') with psi 1.0. The other flank is searched
        for among the component exons after the first one, unless flanking
        exons are already set.

        Raises utils.PrimerSeqError if flanking exons are already set and the
        one on the constitutive side is not the target's predecessor.
        '''
        print('first exon case')
        if len(self.graph.predecessors(self.target)) > 1:
            logging.debug('Error: Conflict between biconnected components and predecessors')

        # get tx path information
        self.all_paths = algs.AllPaths(self.splice_graph, self.component, self.target, self.splice_graph.chr)
        self.all_paths.trim_tx_paths()
        self.all_paths.set_all_path_coordinates()
        self.paths, self.counts = self.all_paths.estimate_counts()

        if self.upstream and self.downstream:
            # user defined flanking exon case
            # NOTE(review): unlike the automatic branches below, this branch
            # never calls utils.save_path_info -- confirm that is intended.
            if self.strand == '+' and self.graph.predecessors(self.target)[0] == self.upstream:
                self.psi_upstream = 1.0
                # was assigned to the misspelled attribute `psi_downsteam`,
                # which left self.psi_downstream unset in this branch
                self.psi_downstream = mem.estimate_psi(self.downstream, self.paths, self.counts)
            elif self.strand == '-' and self.graph.predecessors(self.target)[0] == self.downstream:
                self.psi_downstream = 1.0
                self.psi_upstream = mem.estimate_psi(self.upstream, self.paths, self.counts)
            else:
                raise utils.PrimerSeqError('Error: Flanking exon choice too far from target exon')
        elif self.strand == '+':
            self.upstream = self.graph.predecessors(self.target)[0]
            self.psi_upstream = 1.0  # defined by biconnected component alg as constitutive
            self.downstream, self.psi_downstream = self.find_closest_exon_above_cutoff(self.paths,
                                                                                       self.counts, self.component[1:])
            utils.save_path_info(self.id, [[self.upstream] + p for p in self.paths], self.counts)  # add const. upstream exon to all self.paths
        else:
            self.upstream, self.psi_upstream = self.find_closest_exon_above_cutoff(self.paths,
                                                                                   self.counts, self.component[1:])
            self.downstream = self.graph.predecessors(self.target)[0]
            self.psi_downstream = 1.0
            utils.save_path_info(self.id, [p + [self.downstream] for p in self.paths], self.counts)  # add const. downstream exon to all paths
        self.psi_target = 1.0