Ejemplo n.º 1
0
    def maybe_make_db(self):
        """Open a gffutils database, building one from the GTF if needed

        If ``self.gffutils_db`` is set, that file is copied into
        ``self.gtf_folder`` and the copy is opened. Otherwise the method tries
        to open ``<basename of self.gtf_filename>.db`` inside
        ``self.gtf_folder``; when that raises ``ValueError`` or ``TypeError``
        the database is created from ``self.gtf_filename`` with
        ``gtf.create_db``.

        Returns
        -------
        db
            The object returned by ``gffutils.FeatureDB`` or by
            ``gtf.create_db``
        """
        if self.gffutils_db is None:
            gtf_basename = os.path.basename(self.gtf_filename)
            db_path = os.path.join(self.gtf_folder,
                                   '{}.db'.format(gtf_basename))
            util.progress("Found GTF file in {}".format(self.gtf_filename))
            try:
                db = gffutils.FeatureDB(db_path)
                util.progress(
                    "Found existing built outrigger-built gffutils database "
                    "file in {}".format(db_path))
            except (ValueError, TypeError):
                # Opening failed, so build the database from the GTF instead
                util.progress(
                    'Creating a "gffutils" '
                    'database {} ...'.format(db_path))
                db = gtf.create_db(self.gtf_filename, db_path)
                util.done()
            return db

        # A database file was supplied: work on a copy inside the GTF folder
        local_copy = os.path.join(self.gtf_folder,
                                  os.path.basename(self.gffutils_db))
        util.progress('Copying gffutils database from {} to {} '
                      '...'.format(self.gffutils_db, local_copy))
        shutil.copyfile(self.gffutils_db, local_copy)
        util.done()

        util.progress('Reading gffutils database from {} ...'.format(
            local_copy))
        db = gffutils.FeatureDB(local_copy)
        util.done()
        return db
Ejemplo n.º 2
0
 def write_new_gtf(self, db):
     """Write every feature of ``db`` to a GTF file in ``self.gtf_folder``

     The output file is named after the basename of ``self.gtf_filename``.
     Features are requested with ``order_by=common.ORDER_BY`` and written
     one per line using their ``str()`` form.

     Parameters
     ----------
     db
         Object exposing ``all_features(order_by=...)``
     """
     out_path = os.path.join(self.gtf_folder,
                             os.path.basename(self.gtf_filename))
     util.progress('Write new GTF to {} ...'.format(out_path))
     with open(out_path, 'w') as handle:
         handle.writelines(
             str(feature) + '\n'
             for feature in db.all_features(order_by=common.ORDER_BY))
     util.done()
Ejemplo n.º 3
0
    def make_graph(junction_exon_triples, db):
        """Build an ``events.EventMaker`` from junction-exon triples

        Parameters
        ----------
        junction_exon_triples
            Passed as the first argument to ``events.EventMaker``
        db
            Passed as the second argument to ``events.EventMaker``

        Returns
        -------
        event_maker
            The constructed ``events.EventMaker`` instance
        """
        util.progress('Populating graph database of the '
                      'junction-direction-exon triples ...')
        graph = events.EventMaker(junction_exon_triples, db)
        util.done()

        return graph
Ejemplo n.º 4
0
    def csv(self):
        """Return the compiled splice junction reads table

        If the file at ``self.junction_reads`` exists it is read with
        ``pd.read_csv``; otherwise the table is produced by
        ``self.make_junction_reads_file()``.

        Returns
        -------
        splice_junctions
            The table read from disk, or whatever
            ``self.make_junction_reads_file()`` returned
        """
        if os.path.exists(self.junction_reads):
            util.progress('Found compiled junction reads file in {} and '
                          'reading it in ...'.format(self.junction_reads))
            table = pd.read_csv(self.junction_reads,
                                low_memory=self.low_memory)
            util.done()
            return table

        # Nothing on disk yet, so compile the reads now
        return self.make_junction_reads_file()
Ejemplo n.º 5
0
    def junction_metadata(spliced_reads, csv):
        """Derive junction metadata and save it if not already on disk

        Parameters
        ----------
        spliced_reads
            Passed to ``star.make_metadata``
        csv : str
            Path of the metadata file; it is only written when no file
            exists at this path

        Returns
        -------
        metadata
            The object returned by ``star.make_metadata``
        """
        util.progress('Creating splice junction metadata of merely where '
                      'junctions start and stop')
        junctions = star.make_metadata(spliced_reads)
        util.done()

        if os.path.exists(csv):
            # An existing metadata file is left untouched
            return junctions

        util.progress('Writing metadata of junctions to {csv}'
                      ' ...'.format(csv=csv))
        junctions.to_csv(csv, index=False)
        return junctions
Ejemplo n.º 6
0
 def maybe_read_junction_reads(self):
     """Read the junction reads table from ``self.junction_reads``

     The column named by ``self.reads_col`` is read as ``np.float32``.

     Returns
     -------
     junction_reads
         The table returned by ``pd.read_csv``

     Raises
     ------
     IOError
         If reading raises ``OSError``; the message names the expected
         file location
     """
     try:
         column_types = {self.reads_col: np.float32}
         util.progress(
             'Reading splice junction reads from {} ...'.format(
                 self.junction_reads))
         reads = pd.read_csv(
             self.junction_reads, dtype=column_types,
             low_memory=self.low_memory)
         util.done()
     except OSError:
         raise IOError(
             "There is no junction reads file at the expected location"
             " ({csv}). Are you in the correct directory?".format(
                 csv=self.junction_reads))
     else:
         return reads
Ejemplo n.º 7
0
    def maybe_overwrite(self, filename):
        """Ensures that filename is not overwritten unless user-specified

        - If the file doesn't exist, return "True", as in "yes, please
          overwrite" even though there wasn't really a file there, but the
          action is still to create the file in the next step
        - If the file exists and the user specified nothing, raise an error
          complaining that either --force or --resume must be specified
        - If the file exists and the user specified --force, then return True
        - If the file exists and the user specified --resume, then return False
          so that the file is not overwritten

        Parameters
        ----------
        filename : str
            Path to a file that you may want to overwrite

        Returns
        -------
        if_overwrite : bool
            If True, then the next step in the program has the go-ahead to
            "overwrite" or create the file. If False, the file exists and the
            user doesn't want to overwrite it

        Raises
        ------
        ValueError
            If the file exists and neither ``self.force`` nor ``self.resume``
            is set
        """
        if not os.path.exists(filename):
            return True

        # From here on the file exists; --force takes precedence over --resume
        if self.force:
            util.progress("Found existing {filename}, overwriting with "
                          "--force flag".format(filename=filename))
            return True
        if self.resume:
            util.progress(
                "With the flag '--resume', Found an existing file "
                "containing novel exons,"
                "{filename}, not re-calculating. To force overwriting, "
                "use the flag ''--force'.".format(filename=filename))
            return False

        # Previously this raise hung off an ``else`` that could never run, so
        # the method silently returned None in this situation
        raise ValueError("Found existing {filename} "
                         "but don't "
                         "know whether you want me to continue where I "
                         "stopped ('--resume') or force overwrite "
                         "and restart from "
                         "scratch ('--force')! Exiting."
                         ".".format(filename=filename))
Ejemplo n.º 8
0
 def filter_junctions_on_reads(self, spliced_reads):
     """Keep only rows with at least ``self.min_reads`` reads

     Rows are kept when their ``self.reads_col`` value is greater than or
     equal to ``self.min_reads``. The number of ``common.JUNCTION_ID``
     groups before and after filtering is reported.

     Parameters
     ----------
     spliced_reads
         Table with the ``self.reads_col`` and ``common.JUNCTION_ID``
         columns

     Returns
     -------
     spliced_reads
         The rows of the input that passed the threshold
     """
     threshold = self.min_reads
     util.progress('Filtering for only junctions with minimum {} reads '
                   '...'.format(threshold))

     n_before = len(spliced_reads.groupby(common.JUNCTION_ID))
     passing = spliced_reads[self.reads_col] >= threshold
     kept = spliced_reads.loc[passing]
     n_after = len(kept.groupby(common.JUNCTION_ID))

     util.progress('\t{enough}/{original} junctions remain after '
                   'filtering out {filtered} junctions with < '
                   '{min_reads} '
                   'reads.'.format(filtered=n_before - n_after,
                                   enough=n_after,
                                   original=n_before,
                                   min_reads=threshold))
     util.done(2)
     return kept
Ejemplo n.º 9
0
    def make_events_by_traversing_graph(self, event_maker, db):
        """Search the splice graph for alternative exons

        If an events file already exists for every abbreviation in
        ``common.SPLICE_ABBREVS`` and ``self.force`` is not set, nothing is
        searched. Otherwise ``event_maker.find_events()`` is called and, for
        each splice type with at least one event,
        ``self.get_event_attributes`` is invoked.

        Parameters
        ----------
        event_maker
            Object exposing ``find_events()``, which returns a mapping of
            splice type abbreviation to a table of events
        db
            Passed through to ``self.get_event_attributes``
        """
        existing_events = [os.path.exists(
            os.path.join(self.index_folder, splice_abbrev.lower(),
                         EVENTS_CSV)
            ) for splice_abbrev in common.SPLICE_ABBREVS]
        if all(existing_events) and not self.force:
            util.progress('Found existing splicing events files for all splice'
                          ' types, so not searching. To force'
                          ' re-finding these splicing events, use the flag'
                          ' "--force".')
            return

        event_dfs = event_maker.find_events()

        for splice_abbrev, event_df in event_dfs.items():
            csv = os.path.join(self.index_folder, splice_abbrev.lower(),
                               EVENTS_CSV)
            dirname = os.path.dirname(csv)
            if not os.path.exists(dirname):
                os.makedirs(dirname)

            # Count distinct values of the first index level. ``axis`` is
            # left at its default (0): passing it explicitly is deprecated in
            # recent pandas releases.
            n_events = len(event_df.groupby(level=0))
            if n_events > 0:
                util.progress(
                    'Found {n} {abbrev} events.'.format(
                        n=n_events, abbrev=splice_abbrev.upper()))
                self.get_event_attributes(db, event_df, splice_abbrev)
            else:
                util.progress(
                    'No {abbrev} events found in the junction and exon '
                    'data.'.format(abbrev=splice_abbrev.upper()))
Ejemplo n.º 10
0
    def make_exon_junction_adjacencies(self, metadata, db):
        """Get annotated exon_cols next to junctions in data

        Detects de novo exons (when ``self.maybe_overwrite`` allows it),
        writes the novel exons to ``novel_exons.gtf`` in ``self.gtf_folder``,
        and then computes or reloads the junction-exon-direction triples
        stored in ``junction_exon_direction_triples.csv`` in
        ``self.index_folder``.

        Parameters
        ----------
        metadata
            Passed to ``adjacencies.ExonJunctionAdjacencies``
        db
            Passed to ``adjacencies.ExonJunctionAdjacencies``

        Returns
        -------
        junction_exon_triples
            Freshly computed triples, or the table read back from the
            existing csv when ``self.resume`` is set

        Raises
        ------
        ValueError
            If the triples csv exists and neither ``self.force`` nor
            ``self.resume`` is set
        """
        adjacent = adjacencies.ExonJunctionAdjacencies(
            metadata, db, max_de_novo_exon_length=self.max_de_novo_exon_length)

        novel_exons_gtf = os.path.join(self.gtf_folder, 'novel_exons.gtf')
        if self.maybe_overwrite(novel_exons_gtf):
            util.progress('Detecting de novo exons based on gaps between '
                          'junctions ...')
            adjacent.detect_exons_from_junctions()
            util.done()

        # Count the novel exons by exhausting the returned iterable
        n_novel_exons = sum(
            1 for _ in adjacent.db.features_of_type(adjacencies.NOVEL_EXON))
        util.progress('Writing {n} novel exons to {gtf} ...'.format(
            n=n_novel_exons, gtf=novel_exons_gtf))
        adjacent.write_de_novo_exons(novel_exons_gtf)
        util.done()

        triples_csv = os.path.join(self.index_folder,
                                   'junction_exon_direction_triples.csv')
        if not os.path.exists(triples_csv) or self.force:
            util.progress('Getting junction-direction-exon triples for graph '
                          'database ...')
            triples = adjacent.upstream_downstream_exons()
            util.done()

            util.progress('Writing junction-exon-direction triples'
                          ' to {}...'.format(triples_csv))
            triples.to_csv(triples_csv, index=False)
            util.done()
            return triples

        if self.resume:
            return pd.read_csv(triples_csv, low_memory=self.low_memory)

        raise ValueError("Found existing junction-exon-triples file "
                         "({csv}) but don't "
                         "know whether you want me to continue where I "
                         "stopped ('--resume') or force restart from "
                         "scratch ('--force')! Exiting."
                         ".".format(csv=triples_csv))
Ejemplo n.º 11
0
 def make_junction_reads_file(self):
     """Compile junction reads and write them to ``self.junction_reads``

     Reads come from ``bam.read_multiple_bams`` when ``self.bams`` is set,
     otherwise from ``star.read_multiple_sj_out_tab`` on
     ``self.sj_out_tab``. The parent folder of ``self.junction_reads`` is
     created if it does not exist.

     Returns
     -------
     splice_junctions
         The compiled table that was written to disk
     """
     if self.bams is not None:
         util.progress('Reading bam files and creating a big splice '
                       'junction table of reads spanning exon-exon '
                       'junctions')
         reads = bam.read_multiple_bams(
             self.bams, self.ignore_multimapping, self.n_jobs)
     else:
         util.progress(
             'Reading SJ.out.files and creating a big splice junction'
             ' table of reads spanning exon-exon junctions...')
         reads = star.read_multiple_sj_out_tab(
             self.sj_out_tab,
             ignore_multimapping=self.ignore_multimapping)

     parent = os.path.dirname(self.junction_reads)
     if not os.path.exists(parent):
         os.makedirs(parent)

     util.progress('Writing {} ...\n'.format(self.junction_reads))
     reads.to_csv(self.junction_reads, index=False)
     util.done()
     return reads
Ejemplo n.º 12
0
    def get_event_attributes(self, db, event_df, splice_type):
        """Annotate the events of one splice type and write them to disk

        Builds a ``gtf.SplicingAnnotator``, has it write exon bed files
        into ``<self.index_folder>/<splice_type>``, and saves its
        ``attributes()`` table as ``EVENTS_CSV`` in that same folder.

        Parameters
        ----------
        db
            Passed to ``gtf.SplicingAnnotator``
        event_df
            Passed to ``gtf.SplicingAnnotator``
        splice_type : str
            Splice type abbreviation; used as given for the folder name and
            upper-cased for the annotator and the progress messages
        """
        label = splice_type.upper()
        util.progress(
            'Making metadata file of {splice_type} events, '
            'annotating them with GTF attributes ...'.format(
                splice_type=label))

        annotator = gtf.SplicingAnnotator(db, event_df, label)
        util.progress('Making ".bed" files for exons in each event ...')
        out_folder = os.path.join(self.index_folder, splice_type)
        annotator.exon_bedfiles(folder=out_folder)
        util.done()

        event_attributes = annotator.attributes()
        util.done()

        # Save the annotated events next to the bed files
        events_csv = os.path.join(out_folder, EVENTS_CSV)
        util.progress('Writing {splice_type} events to {csv} '
                      '...'.format(splice_type=label, csv=events_csv))
        event_attributes.to_csv(events_csv, index=True,
                                index_label=outrigger.common.EVENT_ID)
        util.done()
Ejemplo n.º 13
0
    def execute(self):
        """Calculate percent spliced in (psi) of splicing events

        Reads the junction reads, pivots them into a samples x junctions
        integer table, and for every entry of
        ``outrigger.common.SPLICE_TYPES`` whose events file exists computes
        Psi with ``compute.calculate_psi``. Per-type ``psi.csv`` and
        ``summary.csv`` files are written under ``self.psi_folder``, followed
        by the combined ``outrigger_psi.csv``.
        """
        logger = logging.getLogger('outrigger.psi')
        if self.debug:
            logger.setLevel(logging.DEBUG)

        junction_reads = self.csv()

        self.junction_metadata(
            junction_reads, os.path.join(self.junctions_folder, METADATA_CSV))

        # Missing sample/junction combinations count as zero reads
        junction_reads_2d = junction_reads.pivot(
            index=self.sample_id_col, columns=self.junction_id_col,
            values=self.reads_col).fillna(0).astype(int)

        logger.debug('\n--- Splice Junction reads ---')
        logger.debug(repr(junction_reads.head()))

        psis = []
        for splice_name, splice_abbrev in outrigger.common.SPLICE_TYPES:
            events_path = self.maybe_get_validated_events(splice_abbrev)
            if not os.path.exists(events_path):
                util.progress('No {name} ({abbrev}) events found, '
                              'skipping.'.format(name=splice_name,
                                                 abbrev=splice_abbrev))
                continue

            util.progress('Reading {name} ({abbrev}) events from {filename}'
                          ' ...'.format(name=splice_name, abbrev=splice_abbrev,
                                        filename=events_path))
            event_annotation = pd.read_csv(events_path, index_col=0,
                                           low_memory=self.low_memory)
            util.done()

            junction_kwargs = outrigger.common.ISOFORM_JUNCTIONS[
                splice_abbrev]
            logger.debug('\n--- Splicing event annotation ---')
            logger.debug(repr(event_annotation.head()))

            util.progress(
                'Calculating percent spliced-in (Psi) scores on '
                '{name} ({abbrev}) events ...'.format(
                    name=splice_name, abbrev=splice_abbrev))
            type_psi, summary = compute.calculate_psi(
                event_annotation, junction_reads_2d,
                min_reads=self.min_reads, n_jobs=self.n_jobs,
                method=self.method,
                uneven_coverage_multiplier=self.uneven_coverage_multiplier,
                **junction_kwargs)

            # Both per-type outputs live in the same folder
            type_folder = os.path.join(self.psi_folder, splice_abbrev)

            psi_csv = os.path.join(type_folder, 'psi.csv')
            util.progress('Writing {name} ({abbrev}) Psi values to {filename}'
                          ' ...'.format(name=splice_name, abbrev=splice_abbrev,
                                        filename=psi_csv))
            self.maybe_make_folder(os.path.dirname(psi_csv))
            type_psi.to_csv(psi_csv, na_rep='NA')

            summary_csv = os.path.join(type_folder, 'summary.csv')
            util.progress('Writing {name} ({abbrev}) event summaries (e.g. '
                          'number of reads, why an event does not have a Psi '
                          'score) to {filename} ...'
                          ''.format(name=splice_name, abbrev=splice_abbrev,
                                    filename=summary_csv))
            self.maybe_make_folder(os.path.dirname(summary_csv))
            summary.to_csv(summary_csv, na_rep='NA', index=False)
            psis.append(type_psi)
            util.done()

        util.progress('Concatenating all calculated psi scores '
                      'into one big matrix...')
        # NOTE(review): pd.concat raises ValueError on an empty list, which
        # happens here if every splice type was skipped - confirm intended
        combined = pd.concat(psis, axis=1)
        util.done()

        combined = combined.T
        combined_csv = os.path.join(self.psi_folder, 'outrigger_psi.csv')
        util.progress('Writing a samples x features matrix of Psi '
                      'scores to {} ...'.format(combined_csv))
        combined.to_csv(combined_csv, na_rep='NA')
        util.done()
Ejemplo n.º 14
0
    def execute(self):
        """Keep only the events whose splice sites are all valid

        For every entry of ``common.SPLICE_TYPES`` this collects the splice
        sites of each consecutive exon pair of each isoform (via
        ``self.exon_pair_splice_sites``), writes them to ``splice_sites.csv``
        in ``<self.index_folder>/<splice_abbrev>``, and copies to the
        ``validated`` subfolder the header plus those rows of the original
        events file (read from ``self.input_index``) whose first
        comma-separated field is in the index of the validated splice sites.
        """
        valid_splice_sites = check_splice_sites.splice_site_str_to_tuple(
            self.valid_splice_sites)
        # Only used in progress messages and identical for every isoform, so
        # build it once instead of once per isoform
        valid_str = ' or '.join(valid_splice_sites)

        for splice_name, splice_abbrev in common.SPLICE_TYPES:
            splice_name_spaces = splice_name.replace('_', ' ').title()
            util.progress('Finding valid splice sites in {} ({}) '
                          'splice type ...'.format(splice_name_spaces,
                                                   splice_abbrev.upper()))
            isoform_exons = common.SPLICE_TYPE_ISOFORM_EXONS[splice_abbrev]

            validated_folder = os.path.join(self.index_folder, splice_abbrev,
                                            'validated')
            self.maybe_make_folder(validated_folder)

            splice_sites_seriess = []

            for isoform, exons in isoform_exons.items():
                util.progress('\tFinding valid splice sites for {isoform} of'
                              ' {splice_name} events which match '
                              '{valid_splice_sites}'
                              '...'.format(isoform=isoform,
                                           splice_name=splice_name_spaces,
                                           valid_splice_sites=valid_str))
                # Walk consecutive exon pairs, i.e. the introns between them
                for exonA, exonB in zip(exons, exons[1:]):
                    util.progress('\t\tFinding splice sites for {exonA} and '
                                  '{exonB} ...'.format(exonA=exonA,
                                                       exonB=exonB))
                    intron_splice_site = self.exon_pair_splice_sites(
                        exonA, exonB, splice_abbrev)
                    splice_sites_seriess.append(intron_splice_site)
                    util.done(4)
                util.done(3)
            splice_sites = pd.concat(splice_sites_seriess, axis=1)

            csv = os.path.join(self.index_folder, splice_abbrev,
                               'splice_sites.csv')
            util.progress('\tWriting splice sites to {csv} ...'.format(
                csv=csv))
            splice_sites.to_csv(csv)
            util.done(3)

            # ``axis`` is left at its default (0) in the groupby calls:
            # passing it explicitly is deprecated in recent pandas releases
            n_total = len(splice_sites.groupby(level=0))
            splice_sites_is_valid = splice_sites.isin(valid_splice_sites)
            # An event is valid only if every one of its splice sites is
            valid_events_rows = splice_sites_is_valid.all(axis=1)
            splice_sites_validated = splice_sites.loc[valid_events_rows]
            n_valid = len(splice_sites_validated.groupby(level=0))

            util.progress("\tValidated {valid}/{total} {splice_name} "
                          "({splice_abbrev}) events. "
                          "".format(valid=n_valid, total=n_total,
                                    splice_name=splice_name_spaces,
                                    splice_abbrev=splice_abbrev.upper()))

            original_events_csv = os.path.join(self.input_index,
                                               splice_abbrev, EVENTS_CSV)
            validated_events_csv = os.path.join(validated_folder, EVENTS_CSV)
            util.progress('\tWriting validated events to {csv} ...'.format(
                csv=validated_events_csv))

            valid_ids = splice_sites_validated.index
            with open(validated_events_csv, 'w') as f_validated, \
                    open(original_events_csv) as f_original:
                for i, line in enumerate(f_original):
                    # Always keep the header; keep data rows only when their
                    # first field is a validated event
                    if i == 0 or line.split(',')[0] in valid_ids:
                        f_validated.write(line)
            util.done(3)
Ejemplo n.º 15
0
 def maybe_make_folder(self, folder):
     """Create ``folder`` (with any missing parents) unless it exists

     The progress message is emitted whether or not the folder already
     exists.

     Parameters
     ----------
     folder : str
         Path of the folder to create
     """
     util.progress("Creating folder {} ...".format(folder))
     already_there = os.path.exists(folder)
     if not already_there:
         os.makedirs(folder)
     util.done()