def write_new_gtf(self, db):
    """Dump every feature in ``db`` to a fresh GTF file.

    The file lands in ``self.gtf_folder`` under the same basename as
    the original ``self.gtf_filename``.
    """
    new_gtf = os.path.join(self.gtf_folder,
                           os.path.basename(self.gtf_filename))
    util.progress('Write new GTF to {} ...'.format(new_gtf))
    with open(new_gtf, 'w') as handle:
        for feature in db.all_features(order_by=common.ORDER_BY):
            handle.write('{}\n'.format(feature))
    util.done()
def make_graph(junction_exon_triples, db):
    """Create graph database of exon-junction adjacencies"""
    util.progress('Populating graph database of the '
                  'junction-direction-exon triples ...')
    maker = events.EventMaker(junction_exon_triples, db)
    util.done()
    return maker
def csv(self):
    """Create a csv file of compiled splice junctions"""
    if os.path.exists(self.junction_reads):
        # A compiled table is already on disk -- just load it
        util.progress('Found compiled junction reads file in {} and '
                      'reading it in ...'.format(self.junction_reads))
        splice_junctions = pd.read_csv(self.junction_reads,
                                       low_memory=self.low_memory)
        util.done()
    else:
        splice_junctions = self.make_junction_reads_file()
    return splice_junctions
def junction_metadata(spliced_reads, csv):
    """Get just the junction info from the concatenated read files"""
    util.progress('Creating splice junction metadata of merely where '
                  'junctions start and stop')
    metadata = star.make_metadata(spliced_reads)
    util.done()

    # Only write the csv on the first run; an existing file is kept
    if os.path.exists(csv):
        return metadata
    util.progress('Writing metadata of junctions to {csv}'
                  ' ...'.format(csv=csv))
    metadata.to_csv(csv, index=False)
    return metadata
def maybe_read_junction_reads(self):
    """Read the compiled junction-reads table from ``self.junction_reads``.

    Returns
    -------
    junction_reads : pandas.DataFrame
        Table of reads per junction, with ``self.reads_col`` loaded as
        float32.

    Raises
    ------
    IOError
        If there is no readable file at ``self.junction_reads``.
    """
    dtype = {self.reads_col: np.float32}
    util.progress(
        'Reading splice junction reads from {} ...'.format(
            self.junction_reads))
    try:
        # Keep the try body minimal: only the read can raise OSError
        junction_reads = pd.read_csv(
            self.junction_reads, dtype=dtype, low_memory=self.low_memory)
    except OSError as err:
        # Chain the original error so the real cause is not hidden
        raise IOError(
            "There is no junction reads file at the expected location"
            " ({csv}). Are you in the correct directory?".format(
                csv=self.junction_reads)) from err
    util.done()
    return junction_reads
def filter_junctions_on_reads(self, spliced_reads):
    """Keep only junctions with at least ``self.min_reads`` reads.

    Reports how many junctions survived the filter and returns the
    filtered table.
    """
    util.progress('Filtering for only junctions with minimum {} reads '
                  '...'.format(self.min_reads))
    n_before = len(spliced_reads.groupby(common.JUNCTION_ID))

    has_enough_reads = spliced_reads[self.reads_col] >= self.min_reads
    spliced_reads = spliced_reads.loc[has_enough_reads]

    n_after = len(spliced_reads.groupby(common.JUNCTION_ID))
    n_removed = n_before - n_after
    util.progress('\t{enough}/{original} junctions remain after '
                  'filtering out {filtered} junctions with < '
                  '{min_reads} '
                  'reads.'.format(filtered=n_removed, enough=n_after,
                                  original=n_before,
                                  min_reads=self.min_reads))
    util.done(2)
    return spliced_reads
def make_junction_reads_file(self):
    """Compile a single junction-reads table and write it to disk.

    Reads either STAR ``SJ.out.tab`` files or bam files, depending on
    whether ``self.bams`` was provided, then saves the combined table to
    ``self.junction_reads`` and returns it.
    """
    if self.bams is None:
        util.progress(
            'Reading SJ.out.files and creating a big splice junction'
            ' table of reads spanning exon-exon junctions...')
        splice_junctions = star.read_multiple_sj_out_tab(
            self.sj_out_tab,
            ignore_multimapping=self.ignore_multimapping)
    else:
        util.progress('Reading bam files and creating a big splice '
                      'junction table of reads spanning exon-exon '
                      'junctions')
        splice_junctions = bam.read_multiple_bams(
            self.bams, self.ignore_multimapping, self.n_jobs)

    # Guard against a bare filename: os.makedirs('') would raise
    dirname = os.path.dirname(self.junction_reads)
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)

    util.progress('Writing {} ...\n'.format(self.junction_reads))
    splice_junctions.to_csv(self.junction_reads, index=False)
    util.done()
    return splice_junctions
def maybe_make_db(self):
    """Get GFFutils database from file or create from a gtf"""
    if self.gffutils_db is not None:
        # A pre-built database was supplied: copy it into the gtf folder
        # so the index is self-contained, then open the copy.
        copied_db = os.path.join(self.gtf_folder,
                                 os.path.basename(self.gffutils_db))
        util.progress('Copying gffutils database from {} to {} '
                      '...'.format(self.gffutils_db, copied_db))
        shutil.copyfile(self.gffutils_db, copied_db)
        util.done()
        util.progress('Reading gffutils database from {} ...'.format(
            copied_db))
        db = gffutils.FeatureDB(copied_db)
        util.done()
    else:
        # No database given: derive a .db path next to the GTF basename
        basename = os.path.basename(self.gtf_filename)
        db_filename = os.path.join(self.gtf_folder,
                                   '{}.db'.format(basename))
        util.progress("Found GTF file in {}".format(self.gtf_filename))
        try:
            # Reuse a database built by a previous outrigger run, if any.
            # gffutils.FeatureDB raises ValueError/TypeError when the
            # file is missing or not a valid database, which sends us to
            # the build-from-GTF branch below.
            db = gffutils.FeatureDB(db_filename)
            util.progress(
                "Found existing built outrigger-built gffutils database "
                "file in {}".format(db_filename))
        except (ValueError, TypeError):
            # No usable database yet -- build one from the GTF file
            util.progress(
                'Creating a "gffutils" '
                'database {} ...'.format(db_filename))
            db = gtf.create_db(self.gtf_filename, db_filename)
            util.done()
    return db
def make_exon_junction_adjacencies(self, metadata, db):
    """Get annotated exon_cols next to junctions in data"""
    exon_junction_adjacencies = adjacencies.ExonJunctionAdjacencies(
        metadata, db,
        max_de_novo_exon_length=self.max_de_novo_exon_length)

    novel_exons_gtf = os.path.join(self.gtf_folder, 'novel_exons.gtf')
    # maybe_overwrite decides whether the novel-exon detection should
    # (re)run for this output file
    if self.maybe_overwrite(novel_exons_gtf):
        util.progress('Detecting de novo exons based on gaps between '
                      'junctions ...')
        exon_junction_adjacencies.detect_exons_from_junctions()
        util.done()

        # NOTE(review): re-computes the same path assigned above
        novel_exons_gtf = os.path.join(self.gtf_folder,
                                       'novel_exons.gtf')
        novel_exons = exon_junction_adjacencies.db.features_of_type(
            adjacencies.NOVEL_EXON)
        # Count by exhausting the iterator; only the total is needed
        n_novel_exons = sum(1 for _ in novel_exons)
        util.progress('Writing {n} novel exons to {gtf} ...'.format(
            n=n_novel_exons, gtf=novel_exons_gtf))
        exon_junction_adjacencies.write_de_novo_exons(novel_exons_gtf)
        util.done()

    csv = os.path.join(self.index_folder,
                       'junction_exon_direction_triples.csv')
    if not os.path.exists(csv) or self.force:
        # First run, or user asked to rebuild from scratch
        util.progress('Getting junction-direction-exon triples for graph '
                      'database ...')
        junction_exon_triples = \
            exon_junction_adjacencies.upstream_downstream_exons()
        util.done()

        util.progress('Writing junction-exon-direction triples'
                      ' to {}...'.format(csv))
        junction_exon_triples.to_csv(csv, index=False)
        util.done()
    elif self.resume:
        # File exists and user asked to continue: reuse the saved triples
        junction_exon_triples = pd.read_csv(csv,
                                            low_memory=self.low_memory)
    else:
        # File exists but neither --force nor --resume was given; refuse
        # to guess which the user wants
        raise ValueError("Found existing junction-exon-triples file "
                         "({csv}) but don't "
                         "know whether you want me to continue where I "
                         "stopped ('--resume') or force restart from "
                         "scratch ('--force')! Exiting."
                         ".".format(csv=csv))
    return junction_exon_triples
def get_event_attributes(self, db, event_df, splice_type):
    """Annotate ``splice_type`` events with GTF attributes.

    Writes per-exon ".bed" files and an events csv into the index
    folder for this splice type.
    """
    splice_upper = splice_type.upper()
    util.progress(
        'Making metadata file of {splice_type} events, '
        'annotating them with GTF attributes ...'.format(
            splice_type=splice_upper))
    annotator = gtf.SplicingAnnotator(db, event_df, splice_upper)

    util.progress('Making ".bed" files for exons in each event ...')
    annotator.exon_bedfiles(
        folder=os.path.join(self.index_folder, splice_type))
    util.done()

    attributes = annotator.attributes()
    util.done()

    # Write to a file
    csv = os.path.join(self.index_folder, splice_type, EVENTS_CSV)
    util.progress('Writing {splice_type} events to {csv} '
                  '...'.format(splice_type=splice_upper, csv=csv))
    attributes.to_csv(csv, index=True,
                      index_label=outrigger.common.EVENT_ID)
    util.done()
def execute(self):
    """Calculate percent spliced in (psi) of splicing events"""
    logger = logging.getLogger('outrigger.psi')
    if self.debug:
        # 10 == logging.DEBUG
        logger.setLevel(10)

    junction_reads = self.csv()
    metadata_csv = os.path.join(self.junctions_folder, METADATA_CSV)
    self.junction_metadata(junction_reads, metadata_csv)

    # Reshape the long-form read table into a samples x junctions
    # matrix; missing junctions count as zero reads
    junction_reads_2d = junction_reads.pivot(
        index=self.sample_id_col, columns=self.junction_id_col,
        values=self.reads_col)
    junction_reads_2d.fillna(0, inplace=True)
    junction_reads_2d = junction_reads_2d.astype(int)

    logger.debug('\n--- Splice Junction reads ---')
    logger.debug(repr(junction_reads.head()))

    psis = []
    for splice_name, splice_abbrev in outrigger.common.SPLICE_TYPES:
        filename = self.maybe_get_validated_events(splice_abbrev)
        if not os.path.exists(filename):
            util.progress('No {name} ({abbrev}) events found, '
                          'skipping.'.format(name=splice_name,
                                             abbrev=splice_abbrev))
            continue
        # Restored garbled "{filename}" placeholder (was rendered as a
        # literal "(unknown)" even though filename= was passed)
        util.progress('Reading {name} ({abbrev}) events from {filename}'
                      ' ...'.format(name=splice_name,
                                    abbrev=splice_abbrev,
                                    filename=filename))
        event_annotation = pd.read_csv(filename, index_col=0,
                                       low_memory=self.low_memory)
        util.done()

        isoform_junctions = outrigger.common.ISOFORM_JUNCTIONS[
            splice_abbrev]
        logger.debug('\n--- Splicing event annotation ---')
        logger.debug(repr(event_annotation.head()))

        util.progress(
            'Calculating percent spliced-in (Psi) scores on '
            '{name} ({abbrev}) events ...'.format(
                name=splice_name, abbrev=splice_abbrev))

        # Splice type percent spliced-in (psi) and summary
        type_psi, summary = compute.calculate_psi(
            event_annotation, junction_reads_2d,
            min_reads=self.min_reads, n_jobs=self.n_jobs,
            method=self.method,
            uneven_coverage_multiplier=self.uneven_coverage_multiplier,
            **isoform_junctions)

        # Write this event's percent spliced-in matrix.  (The filename
        # is fixed; the no-op 'psi.csv'.format(...) call was removed.)
        csv = os.path.join(self.psi_folder, splice_abbrev, 'psi.csv')
        util.progress('Writing {name} ({abbrev}) Psi values to '
                      '{filename} ...'.format(name=splice_name,
                                              abbrev=splice_abbrev,
                                              filename=csv))
        self.maybe_make_folder(os.path.dirname(csv))
        type_psi.to_csv(csv, na_rep='NA')

        # Write this event's summary of events and why they weren't or
        # were calculated Psi on
        csv = os.path.join(self.psi_folder, splice_abbrev, 'summary.csv')
        util.progress('Writing {name} ({abbrev}) event summaries (e.g. '
                      'number of reads, why an event does not have a Psi '
                      'score) to {filename} ...'
                      ''.format(name=splice_name, abbrev=splice_abbrev,
                                filename=csv))
        self.maybe_make_folder(os.path.dirname(csv))
        summary.to_csv(csv, na_rep='NA', index=False)

        psis.append(type_psi)
        util.done()

    util.progress('Concatenating all calculated psi scores '
                  'into one big matrix...')
    splicing = pd.concat(psis, axis=1)
    util.done()

    # Transpose so the final matrix is samples x features
    splicing = splicing.T
    csv = os.path.join(self.psi_folder, 'outrigger_psi.csv')
    util.progress('Writing a samples x features matrix of Psi '
                  'scores to {} ...'.format(csv))
    splicing.to_csv(csv, na_rep='NA')
    util.done()
def execute(self):
    """Validate splice sites of each event and write the survivors.

    For every splice type, concatenates the splice-site dinucleotides of
    each event's introns, keeps only events whose sites all match the
    allowed motifs, and copies those rows of the original events csv
    into a ``validated`` folder.
    """
    valid_splice_sites = check_splice_sites.splice_site_str_to_tuple(
        self.valid_splice_sites)

    for splice_name, splice_abbrev in common.SPLICE_TYPES:
        splice_name_spaces = splice_name.replace('_', ' ').title()
        util.progress('Finding valid splice sites in {} ({}) '
                      'splice type ...'.format(splice_name_spaces,
                                               splice_abbrev.upper()))
        isoform_exons = common.SPLICE_TYPE_ISOFORM_EXONS[splice_abbrev]
        validated_folder = os.path.join(self.index_folder, splice_abbrev,
                                        'validated')
        self.maybe_make_folder(validated_folder)

        splice_sites_seriess = []
        for isoform, exons in isoform_exons.items():
            valid_str = ' or '.join(valid_splice_sites)
            util.progress('\tFinding valid splice sites for {isoform} of'
                          ' {splice_name} events which match '
                          '{valid_splice_sites}'
                          '...'.format(isoform=isoform,
                                       splice_name=splice_name_spaces,
                                       valid_splice_sites=valid_str))
            # Consecutive exon pairs each flank one intron
            exon_pairs = zip(exons, exons[1:])
            for exonA, exonB in exon_pairs:
                util.progress('\t\tFinding splice sites for {exonA} and '
                              '{exonB} ...'.format(exonA=exonA,
                                                   exonB=exonB))
                intron_splice_site = self.exon_pair_splice_sites(
                    exonA, exonB, splice_abbrev)
                splice_sites_seriess.append(intron_splice_site)
                util.done(4)
            util.done(3)

        splice_sites = pd.concat(splice_sites_seriess, axis=1)
        csv = os.path.join(self.index_folder, splice_abbrev,
                           'splice_sites.csv')
        util.progress('\tWriting splice sites to {csv} ...'.format(
            csv=csv))
        splice_sites.to_csv(csv)
        util.done(3)

        # An event is valid only if every one of its splice sites
        # matches an allowed motif
        n_total = len(splice_sites.groupby(level=0, axis=0))
        splice_sites_is_valid = splice_sites.isin(valid_splice_sites)
        valid_events_rows = splice_sites_is_valid.all(axis=1)
        splice_sites_validated = splice_sites.loc[valid_events_rows]
        n_valid = len(splice_sites_validated.groupby(level=0, axis=0))
        # Reconstructed string literal that was broken mid-token in the
        # mangled source
        util.progress("\tValidated {valid}/{total} {splice_name} "
                      "({splice_abbrev}) events."
                      "".format(valid=n_valid, total=n_total,
                                splice_name=splice_name_spaces,
                                splice_abbrev=splice_abbrev.upper()))

        original_events_csv = os.path.join(self.input_index,
                                           splice_abbrev, EVENTS_CSV)
        validated_events_csv = os.path.join(validated_folder, EVENTS_CSV)
        util.progress('\tWriting validated events to {csv} ...'.format(
            csv=validated_events_csv))
        # Stream the original events csv, keeping the header row and
        # only the rows whose event id (first column) was validated
        with open(validated_events_csv, 'w') as f_validated:
            with open(original_events_csv) as f_original:
                for i, line in enumerate(f_original):
                    if i == 0:
                        f_validated.write(line)
                        continue
                    if line.split(',')[0] in \
                            splice_sites_validated.index:
                        f_validated.write(line)
        util.done(3)
def maybe_make_folder(self, folder):
    """Create ``folder`` (and any missing parents) if it doesn't exist.

    Uses ``exist_ok=True`` to avoid the check-then-create race: a folder
    created between the existence check and ``makedirs`` (e.g. by a
    parallel job) is no longer an error.
    """
    util.progress("Creating folder {} ...".format(folder))
    os.makedirs(folder, exist_ok=True)
    util.done()