def get_log(self):
    """
    Return the path of this sample's log file.

    The path is ``<results dir>/logs/<sample_name>.log``, where the results
    directory comes from ``self.experiment_settings.get_rdir()``. The
    ``logs`` directory is handed to ``ribo_utils.make_dir`` before the path
    is built.
    """
    logs_dir = os.path.join(self.experiment_settings.get_rdir(), 'logs')
    ribo_utils.make_dir(logs_dir)
    return os.path.join(
        self.experiment_settings.get_rdir(), 'logs',
        '%(sample_name)s.log' % {'sample_name': self.sample_name})
def make_genome_mapping_index(self):
    """
    Build the STAR index in 'star_genome_dir' unless an existing one is reused.

    The index is built when the 'star_genome_dir' path does not exist yet
    (it is then passed to ``ribo_utils.make_dir``) or when the
    'rebuild_star_index' setting is true. With 'transcriptome_mapping_only'
    the transcriptome FASTA is indexed; otherwise every ``*.fa`` file matched
    under the genome sequence directory is indexed.

    :raises RuntimeError: if the STAR command exits with a non-zero status
    """
    star_genome_dir = self.settings.get_property('star_genome_dir')
    if ribo_utils.file_exists(star_genome_dir):
        self.settings.write_to_log(
            'STAR index exists at %s' % star_genome_dir)
        make_index = bool(self.settings.get_property('rebuild_star_index'))
        if make_index:
            self.settings.write_to_log('STAR index rebuild forced')
        else:
            self.settings.write_to_log('using existing STAR index')
    else:
        make_index = True
        ribo_utils.make_dir(star_genome_dir)

    if make_index:
        self.settings.write_to_log('building STAR index')
        if self.settings.get_property('transcriptome_mapping_only'):
            fasta_files = self.settings.get_transcriptome_FASTA()
        else:
            # NOTE(review): the glob is appended directly to the directory
            # string, so this relies on get_genome_sequence_dir() ending in
            # a path separator - confirm against the settings class.
            fasta_files = '%s*.fa' % self.settings.get_genome_sequence_dir()
        log_file = self.settings.get_log()
        command_to_run = (
            'STAR --runThreadN %d --runMode genomeGenerate --genomeDir %s '
            '--genomeFastaFiles %s --genomeSAsparseD %d 1>>%s 2>>%s' % (
                self.threads, star_genome_dir, fasta_files,
                self.settings.get_property('star_index_sparsity'),
                log_file, log_file))
        self.settings.write_to_log(command_to_run)
        # shell=True is needed for the '>>' redirections and the '*.fa' glob.
        exit_status = subprocess.Popen(command_to_run, shell=True).wait()
        if exit_status != 0:
            # Do not report the index as ready when STAR failed.
            self.settings.write_to_log(
                'STAR index build failed with exit status %d' % exit_status)
            raise RuntimeError(
                'STAR index build failed with exit status %d' % exit_status)
    self.settings.write_to_log('STAR index ready')
def make_tables(self):
    """
    Write all summary tables for this experiment.

    Passes the 'tables' results path to ``ribo_utils.make_dir`` and then
    calls each ``ribo_tables`` writer in turn with this object.
    """
    tables_dir = self.rdir_path('tables')
    ribo_utils.make_dir(tables_dir)
    table_writers = (
        ribo_tables.make_readthrough_table,
        ribo_tables.make_detailed_readthrough_table,
        ribo_tables.transcriptome_features_table,
        ribo_tables.make_cds_rpkm_table,
        ribo_tables.make_cds_counts_table,
    )
    for write_table in table_writers:
        write_table(self)
def map_reads_to_ncrna(self):
    """
    Map the reads of every library to noncoding RNA.

    Unless 'force_remapping' is set, nothing is done when every library
    already reports ``ncrna_mapped_reads_exist()``. Otherwise each library
    whose ``ncrna_mapping_finished()`` is false is handed to
    ``map_one_library_to_ncrna`` through ``ribo_utils.parmap``.
    :return: None
    """
    if self.settings.get_property('force_remapping'):
        self.settings.write_to_log('remapping forced')
    elif all(lib_settings.ncrna_mapped_reads_exist()
             for lib_settings in self.settings.iter_lib_settings()):
        # every library already has its output, nothing to redo
        self.settings.write_to_log(
            'using existing noncoding RNA mapped reads')
        return

    self.settings.write_to_log('mapping reads')
    ribo_utils.make_dir(self.rdir_path('ncrna_mapped_reads'))

    def map_library(lib_setting):
        return self.map_one_library_to_ncrna(
            lib_setting, self.genome_threads_per_instance)

    unfinished_libs = [
        lib_setting for lib_setting in self.settings.iter_lib_settings()
        if not lib_setting.ncrna_mapping_finished()
    ]
    ribo_utils.parmap(map_library, unfinished_libs,
                      nprocs=self.genome_num_instances)
    self.settings.write_to_log('finished mapping reads to noncoding RNA')
def initialize_libs(self):
    """
    Count transcript reads for every library, then initialize each library.

    Resets ``self.libs`` to an empty list, runs
    ``ribo_lib.assign_tx_reads`` for all libraries through
    ``ribo_utils.parmap`` (``self.threads`` processes), and finally calls
    ``self.initialize_lib`` once per library, in iteration order.
    """
    self.settings.write_to_log('initializing libraries, counting reads')
    ribo_utils.make_dir(self.rdir_path('transcript_counts'))
    self.libs = []
    ribo_utils.parmap(
        lambda lib_settings: ribo_lib.assign_tx_reads(
            self, self.settings, lib_settings),
        self.settings.iter_lib_settings(),
        nprocs=self.threads)
    # Explicit loop instead of map(): the calls are made only for their side
    # effects, and a lazy map() (Python 3) would never run them.
    for lib_settings in self.settings.iter_lib_settings():
        self.initialize_lib(lib_settings)
    self.settings.write_to_log(
        'initializing libraries, counting reads, done')
def deduplicate_reads(self):
    """
    Deduplicate the reads of every library.

    Nothing is done when every library already reports
    ``deduplicated_reads_exist()``. Otherwise all libraries are passed to
    ``deduplicate_reads_one_lib`` through ``ribo_utils.parmap``.
    """
    all_deduplicated = all(
        lib_settings.deduplicated_reads_exist()
        for lib_settings in self.settings.iter_lib_settings())
    if all_deduplicated:
        self.settings.write_to_log('using existing deduplicated reads')
        return

    self.settings.write_to_log('deduplicating reads with tally')
    ribo_utils.make_dir(self.rdir_path('deduplicated'))

    def deduplicate_library(lib_setting):
        return self.deduplicate_reads_one_lib(lib_setting)

    ribo_utils.parmap(deduplicate_library,
                      self.settings.iter_lib_settings(),
                      nprocs=self.num_instances)
    self.settings.write_to_log('done deduplicating reads with tally')
def __init__(self, settings, threads):
    """
    Set up the experiment and run the whole read-processing pipeline.

    In order: works out how many parallel instances and threads per instance
    to use, builds the ncRNA index, loads GTF annotations and the genome
    sequence, optionally writes and loads a transcriptome FASTA
    ('transcriptome_mapping_only'), builds the genome index, then
    deduplicates, trims, removes adaptors, maps to ncRNA, maps to the genome
    and initializes the libraries.

    :param settings: experiment settings object (provides get_property,
        iter_lib_settings, write_to_log and the file-location getters)
    :param threads: total number of threads available to the pipeline
    """
    self.threads = threads
    self.settings = settings
    self.all_settings = list(self.settings.iter_lib_settings())
    self.num_datasets = len(self.all_settings)
    # Clamp to at least one instance so that an empty library list or
    # threads == 0 does not raise ZeroDivisionError below.
    self.num_instances = max(min(self.num_datasets, self.threads), 1)
    # '//' keeps the thread counts integral on Python 2 and Python 3 alike.
    self.threads_per_instance = max(
        self.threads // self.num_instances - 1, 1)
    self.genome_num_instances = min(self.num_instances, 5)
    self.genome_threads_per_instance = max(
        self.threads // self.genome_num_instances, 1)
    self.settings.write_to_log(
        'Initializing experiment %s' %
        self.settings.get_property('experiment_name'))
    # Same count as num_datasets; kept as a separate attribute for callers.
    self.num_libs = self.num_datasets
    self.make_ncRNA_mapping_index()
    self.settings.write_to_log('loading GTF annotations')
    self.GTF_annotations = ribo_utils.gtf_data(
        self.settings.get_annotation_GTF_file(),
        add_3_for_stop=self.settings.get_property('add_3_for_stop'))
    self.settings.write_to_log('loading genome sequence')
    self.genome = ribo_utils.genome_sequence(
        self.settings.get_genome_sequence_files())
    if self.settings.get_property('transcriptome_mapping_only'):
        # only mapping to the transcriptome, not the genome, so the
        # transcript sequences must be written out as the "genome" to map to
        ribo_utils.make_dir(self.settings.get_transcriptome_dir())
        self.GTF_annotations.write_transcript_sequences_to_FASTA(
            self.settings.get_transcriptome_FASTA(), self.genome)
        self.settings.write_to_log('loading transcriptome sequence')
        self.transcriptome_sequence = ribo_utils.genome_sequence(
            self.settings.get_transcriptome_FASTA())
    self.make_genome_mapping_index()
    self.deduplicate_reads()
    self.trim_reads()
    self.remove_adaptor()
    self.map_reads_to_ncrna()
    self.map_reads_to_genome()
    self.initialize_libs()
    self.settings.write_to_log(
        'Finished initializing experiment %s\n' %
        self.settings.get_property('experiment_name'))
def remove_adaptor(self):
    """
    Remove adaptors from the reads of every library.

    Unless 'force_retrim' is set, nothing is done when every library already
    reports ``adaptorless_reads_exist()``. Otherwise all libraries are passed
    to ``remove_adaptor_one_lib`` through ``ribo_utils.parmap``.
    """
    if self.settings.get_property('force_retrim'):
        self.settings.write_to_log('adaptor removal forced')
    elif all(lib_settings.adaptorless_reads_exist()
             for lib_settings in self.settings.iter_lib_settings()):
        # every library already has adaptorless reads
        self.settings.write_to_log('using existing adaptorless reads')
        return

    self.settings.write_to_log('removing adaptors')
    ribo_utils.make_dir(self.rdir_path('adaptor_removed'))

    def strip_library(lib_setting):
        return self.remove_adaptor_one_lib(
            lib_setting, self.threads_per_instance)

    ribo_utils.parmap(strip_library,
                      self.settings.iter_lib_settings(),
                      nprocs=self.num_instances)
    self.settings.write_to_log('done removing adaptors')
def trim_reads(self):
    """
    Trim the reads of every library.

    Unless 'force_retrim' is set, nothing is done when every library already
    reports ``trimmed_reads_exist()``. Otherwise all libraries are passed to
    ``trim_reads_one_lib`` through ``ribo_utils.parmap``.
    """
    if self.settings.get_property('force_retrim'):
        self.settings.write_to_log('read trimming forced')
    elif all(lib_settings.trimmed_reads_exist()
             for lib_settings in self.settings.iter_lib_settings()):
        # every library already has trimmed reads
        self.settings.write_to_log('using existing trimmed reads')
        return

    self.settings.write_to_log('trimming reads with seqtk')
    ribo_utils.make_dir(self.rdir_path('trimmed'))

    def trim_library(lib_setting):
        return self.trim_reads_one_lib(
            lib_setting, self.threads_per_instance)

    ribo_utils.parmap(trim_library,
                      self.settings.iter_lib_settings(),
                      nprocs=self.num_instances)
    self.settings.write_to_log('done trimming reads with seqtk')
def __init__(self, experiment, experiment_settings, threads):
    """
    Constructor for the QC engine.

    Stores the experiment and its settings, passes the 'QC' results path to
    ``ribo_utils.make_dir``, loads the QC GTF annotations, and builds one QC
    library object per library via ``self.initialize_qc_lib``.

    :param experiment: experiment object (provides rdir_path and genome)
    :param experiment_settings: settings object of that experiment
    :param threads: thread count stored on the instance
    """
    self.threads = threads
    self.experiment = experiment
    self.experiment_settings = experiment_settings
    self.experiment_settings.write_to_log('initializing QC engine')
    # shortcuts to frequently used settings accessors
    self.get_property = self.experiment_settings.get_property
    self.get_rdir = experiment_settings.get_rdir

    qc_dir = self.experiment.rdir_path('QC')
    ribo_utils.make_dir(qc_dir)
    self.genome = self.experiment.genome

    qc_gtf_file = self.experiment_settings.get_QC_annotation_GTF_file()
    add_3_for_stop = self.experiment_settings.get_property('add_3_for_stop')
    self.full_QC_GTF_annotations = ribo_utils.gtf_data(
        qc_gtf_file, add_3_for_stop=add_3_for_stop)

    qc_libs = []
    for lib_settings in self.experiment_settings.iter_lib_settings():
        qc_libs.append(self.initialize_qc_lib(lib_settings))
    self.lib_QCs = qc_libs
def make_plots(self):
    """
    Produce the standard plots for this experiment.

    Passes the 'plots' results path to ``ribo_utils.make_dir``, then draws
    the start- and stop-codon averages (100 positions up and down, 3' read
    ends, all read lengths, 'comparison_read_cutoff' as the minimum CDS read
    count), followed by the fragment length and frame distributions.
    """
    ribo_utils.make_dir(self.rdir_path('plots'))
    codon_average_plotters = (
        ribo_plotting.plot_start_codon_average,
        ribo_plotting.plot_stop_codon_average,
    )
    for plot_codon_average in codon_average_plotters:
        plot_codon_average(
            self,
            up=100,
            down=100,
            min_cds_reads=self.settings.get_property(
                'comparison_read_cutoff'),
            read_end='3p',
            read_lengths='all')
    ribo_plotting.plot_fragment_length_distributions(self)
    ribo_plotting.plot_frame_distributions(self)
def process_settings(self, settings_file):
    """
    - reads the settings file and converts str to int, float, bool, list
    - stores result in self.settings as a dict()
    - CRITICAL NOTE: All keys must be lower case

    Also sets self.fqdir, self.sample_names, self.fastq_gz_files,
    self.fastq_gz_file_handles (the fastq file names joined onto fastq_dir)
    and self.rdir, passes the results directory to ribo_utils.make_dir, and
    copies the settings file into it.

    :param settings_file: path of the settings file to read
    :raises KeyError: if a required key is missing from the settings file
    :raises ValueError: if an int/float value cannot be converted or a
        boolean value is not "true"/"false" (case-insensitive)
    :raises SystemExit: (status 1, message on stderr) if a fastq file or one
        of the required input paths does not exist
    """
    # TODO: Add new parameters and comments to settings files
    int_keys = [
        'comparison_read_cutoff', 'min_post_trimming_length',
        'max_post_trimming_length', 'sequence_quality_cutoff', 'trim_5p',
        'star_index_sparsity', 'outfiltermultimapnmax',
        'outfiltermismatchnmax', 'alignsjdboverhangmin',
        'alignsjoverhangmin', 'alignintronmax', 'five_prime_p_offset'
    ]
    float_keys = ['atail_purity_cutoff']
    str_keys = [
        'adaptor_3p_sequence', 'star_genome_dir', 'star_ncrna_dir',
        'genome_sequence_dir', 'ncrna_sequence_dir', 'annotation_gtf_file',
        'qc_annotation_gtf_file', 'alignendstype'
    ]
    # if transcriptome_mapping_only is true, will generate a
    # transcriptome-only genome to map to
    boolean_keys = [
        'transcriptome_mapping_only', 'deduplicate_reads', 'force_remapping',
        'force_recount', 'rebuild_star_index', 'force_retrim',
        'make_interactive_plots', 'reads_reversed', 'add_3_for_stop',
        'forbid_non_polya_soft_clip', 'unique_mapping_only'
    ]
    list_str_keys = ['fastq_gz_files', 'sample_names']
    extant_files = ['genome_sequence_dir', 'annotation_gtf_file']

    config = ConfigParser.ConfigParser()
    config.read(settings_file)
    settings = {}
    for section in config.sections():
        for option in config.options(section):
            settings[option] = config.get(section, option)
        settings[section] = True

    for k in int_keys:
        settings[k] = int(settings[k])
    for k in str_keys:
        # String values need no conversion, but the key must be present.
        if k not in settings:
            raise KeyError(k)
    for k in float_keys:
        settings[k] = float(settings[k])
    for k in boolean_keys:
        lowered = settings[k].lower()
        if lowered not in ('true', 'false'):
            raise ValueError(
                'Boolean value %s must be "true" or "false"' % k)
        settings[k] = lowered == 'true'
    for k in list_str_keys:
        settings[k] = simplejson.loads(settings[k])

    self.fqdir = settings['fastq_dir']
    self.sample_names = settings['sample_names']
    self.fastq_gz_files = settings['fastq_gz_files']
    self.fastq_gz_file_handles = [
        os.path.join(self.fqdir, fastq_gz_file)
        for fastq_gz_file in self.fastq_gz_files
    ]
    # Explicit checks instead of assert: asserts are stripped under -O, and
    # sys.exit(message) reports on stderr with a non-zero exit status.
    for file_handle in self.fastq_gz_file_handles:
        if not ribo_utils.file_exists(file_handle):
            sys.exit('ERROR: nonexistent file %s' % file_handle)
    for k in extant_files:
        if not ribo_utils.file_exists(settings[k]):
            sys.exit('file %s does not exist' % settings[k])

    self.settings = settings
    self.rdir = settings['results_dir']
    ribo_utils.make_dir(self.rdir)
    shutil.copy(settings_file, self.rdir)
def get_rdir(self):
    """
    Return the results directory path.

    The path (``self.rdir``) is handed to ``ribo_utils.make_dir`` first.
    """
    results_dir = self.rdir
    ribo_utils.make_dir(results_dir)
    return results_dir