Beispiel #1
0
 def get_log(self):
     """Return the path of this sample's log file.

     The path is ``<rdir>/logs/<sample_name>.log``, where ``<rdir>`` is
     whatever ``self.experiment_settings.get_rdir()`` returns.
     ``ribo_utils.make_dir`` is called on the ``logs`` directory first.
     """
     settings = self.experiment_settings
     # Ask for the log directory before handing out a path inside it.
     ribo_utils.make_dir(os.path.join(settings.get_rdir(), 'logs'))
     log_name = '%(sample_name)s.log' % {'sample_name': self.sample_name}
     return os.path.join(settings.get_rdir(), 'logs', log_name)
 def make_genome_mapping_index(self):
     """Build the STAR index in ``star_genome_dir`` unless one can be reused.

     The index is (re)built when ``star_genome_dir`` does not exist, or when
     it exists and the ``rebuild_star_index`` setting is true. With
     ``transcriptome_mapping_only`` set, the transcriptome FASTA is indexed;
     otherwise the shell glob ``<genome_sequence_dir>*.fa`` is indexed (the
     directory value is concatenated as-is, without adding a separator).
     STAR's stdout and stderr are appended to the settings log file.

     If STAR exits with a non-zero status, the failure is written to the log
     and the method returns without reporting the index as ready.
     """
     star_genome_dir = self.settings.get_property('star_genome_dir')
     if ribo_utils.file_exists(star_genome_dir):
         self.settings.write_to_log(
             'STAR index exists at %s' % star_genome_dir)
         make_index = bool(self.settings.get_property('rebuild_star_index'))
         if make_index:
             self.settings.write_to_log('STAR index rebuild forced')
         else:
             self.settings.write_to_log('using existing STAR index')
     else:
         make_index = True
         ribo_utils.make_dir(star_genome_dir)
     if make_index:
         self.settings.write_to_log('building STAR index')
         if self.settings.get_property('transcriptome_mapping_only'):
             fasta_files = self.settings.get_transcriptome_FASTA()
         else:
             # The '*.fa' glob is expanded by the shell, which is why the
             # command is run with shell=True below.
             fasta_files = '%s*.fa' % self.settings.get_genome_sequence_dir()
         sparsity = self.settings.get_property('star_index_sparsity')
         log_file = self.settings.get_log()
         command_to_run = (
             'STAR --runThreadN %d --runMode genomeGenerate '
             '--genomeDir %s --genomeFastaFiles %s '
             '--genomeSAsparseD %d 1>>%s 2>>%s') % (
                 self.threads, star_genome_dir, fasta_files, sparsity,
                 log_file, log_file)
         self.settings.write_to_log(command_to_run)
         return_code = subprocess.Popen(command_to_run, shell=True).wait()
         if return_code != 0:
             # Do not claim the index is ready when STAR reported a failure.
             self.settings.write_to_log(
                 'STAR index build failed with exit code %d' % return_code)
             return
     self.settings.write_to_log('STAR index ready')
 def make_tables(self):
     """Create the 'tables' results directory and run each table builder.

     Every builder lives in ``ribo_tables`` and is called with this object
     as its only argument, in the order listed below.
     """
     ribo_utils.make_dir(self.rdir_path('tables'))
     table_builders = (
         'make_readthrough_table',
         'make_detailed_readthrough_table',
         'transcriptome_features_table',
         'make_cds_rpkm_table',
         'make_cds_counts_table',
     )
     for builder_name in table_builders:
         # Resolve each builder right before calling it, one after another.
         getattr(ribo_tables, builder_name)(self)
 def map_reads_to_ncrna(self):
     """
     Map the reads of the libraries against noncoding RNA.

     Unless the 'force_remapping' setting is true, nothing is mapped when
     every library reports that its ncRNA-mapped reads already exist.
     Otherwise the libraries whose ncRNA mapping is not finished are handed
     to map_one_library_to_ncrna in parallel.
     :return: None
     """
     if self.settings.get_property('force_remapping'):
         self.settings.write_to_log('remapping forced')
     elif all(lib.ncrna_mapped_reads_exist()
              for lib in self.settings.iter_lib_settings()):
         self.settings.write_to_log(
             'using existing noncoding RNA mapped reads')
         return
     self.settings.write_to_log('mapping reads')
     ribo_utils.make_dir(self.rdir_path('ncrna_mapped_reads'))

     def map_one(lib_setting):
         return self.map_one_library_to_ncrna(
             lib_setting, self.genome_threads_per_instance)

     # NOTE(review): libraries whose mapping is already finished are skipped
     # here even when remapping is forced -- confirm this is intended.
     unfinished = [
         lib_setting
         for lib_setting in self.settings.iter_lib_settings()
         if not lib_setting.ncrna_mapping_finished()
     ]
     ribo_utils.parmap(map_one, unfinished,
                       nprocs=self.genome_num_instances)
     self.settings.write_to_log('finished mapping reads to noncoding RNA')
    def initialize_libs(self):
        """Assign transcript reads for every library, then initialize each one.

        Resets ``self.libs`` to an empty list, runs
        ``ribo_lib.assign_tx_reads`` for all libraries in parallel (using
        ``self.threads`` processes), and finally calls
        ``self.initialize_lib`` once per library, in iteration order.
        """
        self.settings.write_to_log('initializing libraries, counting reads')
        ribo_utils.make_dir(self.rdir_path('transcript_counts'))
        self.libs = []

        ribo_utils.parmap(lambda lib_settings: ribo_lib.assign_tx_reads(
            self, self.settings, lib_settings),
                          self.settings.iter_lib_settings(),
                          nprocs=self.threads)
        # Explicit loop instead of map(): the calls are made purely for their
        # side effects, and a lazy map() (Python 3) would never run them.
        for lib_settings in self.settings.iter_lib_settings():
            self.initialize_lib(lib_settings)
        self.settings.write_to_log(
            'initializing libraries, counting reads, done')
 def deduplicate_reads(self):
     """Deduplicate the reads of every library unless already done.

     Returns early when all libraries report existing deduplicated reads;
     otherwise runs deduplicate_reads_one_lib on every library in parallel.
     """
     already_done = all(
         lib.deduplicated_reads_exist()
         for lib in self.settings.iter_lib_settings())
     if already_done:
         self.settings.write_to_log('using existing deduplicated reads')
         return
     self.settings.write_to_log('deduplicating reads with tally')
     ribo_utils.make_dir(self.rdir_path('deduplicated'))

     def dedup_one(lib_setting):
         return self.deduplicate_reads_one_lib(lib_setting)

     ribo_utils.parmap(dedup_one,
                       self.settings.iter_lib_settings(),
                       nprocs=self.num_instances)
     self.settings.write_to_log('done deduplicating reads with tally')
    def __init__(self, settings, threads):
        """Set up the experiment and run its preprocessing steps.

        :param settings: experiment settings object; supplies the library
            settings, properties, file locations and the log.
        :param threads: total number of threads available to the experiment.

        Construction runs, in order: ncRNA index creation, loading of the GTF
        annotations and genome sequence, optional transcriptome FASTA
        generation, genome index creation, read deduplication, trimming,
        adaptor removal, ncRNA mapping, genome mapping and library
        initialization.

        Raises ZeroDivisionError when the instance count evaluates to 0,
        e.g. when the settings provide no libraries.
        """
        self.threads = threads
        self.settings = settings
        self.all_settings = list(self.settings.iter_lib_settings())
        self.num_datasets = len(self.all_settings)
        self.num_instances = min(self.num_datasets, self.threads)
        # Floor division keeps the thread counts integral on Python 2 and 3.
        # Each instance gets one thread less than its even share, but at
        # least one.
        self.threads_per_instance = max(
            self.threads // self.num_instances - 1, 1)

        # Genome mapping runs at most 5 instances at a time.
        self.genome_num_instances = min(self.num_datasets, self.threads, 5)
        self.genome_threads_per_instance = max(
            self.threads // self.genome_num_instances, 1)

        self.settings.write_to_log(
            'Initializing experiment %s' %
            self.settings.get_property('experiment_name'))
        # Same count as num_datasets; no need to walk the settings again.
        self.num_libs = self.num_datasets
        self.make_ncRNA_mapping_index()
        self.settings.write_to_log('loading GTF annotations')
        self.GTF_annotations = ribo_utils.gtf_data(
            self.settings.get_annotation_GTF_file(),
            add_3_for_stop=self.settings.get_property('add_3_for_stop'))
        self.settings.write_to_log('loading genome sequence')
        self.genome = ribo_utils.genome_sequence(
            self.settings.get_genome_sequence_files())
        if self.settings.get_property('transcriptome_mapping_only'):
            # Only mapping to the transcriptome, not the genome, so the
            # transcriptome FASTA to map against has to be generated first.
            ribo_utils.make_dir(self.settings.get_transcriptome_dir())
            self.GTF_annotations.write_transcript_sequences_to_FASTA(
                self.settings.get_transcriptome_FASTA(), self.genome)
            self.settings.write_to_log('loading transcriptome sequence')
            self.transcriptome_sequence = ribo_utils.genome_sequence(
                self.settings.get_transcriptome_FASTA())
        self.make_genome_mapping_index()
        self.deduplicate_reads()
        self.trim_reads()
        self.remove_adaptor()
        self.map_reads_to_ncrna()
        self.map_reads_to_genome()

        self.initialize_libs()
        self.settings.write_to_log(
            'Finished initializing experiment %s\n' %
            self.settings.get_property('experiment_name'))
 def remove_adaptor(self):
     """Remove adaptors from the reads of every library.

     Unless the 'force_retrim' setting is true, nothing is done when all
     libraries report existing adaptorless reads. Otherwise
     remove_adaptor_one_lib is run on every library in parallel.
     """
     if self.settings.get_property('force_retrim'):
         self.settings.write_to_log('adaptor removal forced')
     elif all(lib.adaptorless_reads_exist()
              for lib in self.settings.iter_lib_settings()):
         self.settings.write_to_log('using existing adaptorless reads')
         return
     self.settings.write_to_log('removing adaptors')
     ribo_utils.make_dir(self.rdir_path('adaptor_removed'))

     def strip_one(lib_setting):
         return self.remove_adaptor_one_lib(lib_setting,
                                            self.threads_per_instance)

     ribo_utils.parmap(strip_one,
                       self.settings.iter_lib_settings(),
                       nprocs=self.num_instances)
     self.settings.write_to_log('done removing adaptors')
 def trim_reads(self):
     """Trim the reads of every library.

     Unless the 'force_retrim' setting is true, nothing is done when all
     libraries report existing trimmed reads. Otherwise trim_reads_one_lib
     is run on every library in parallel.
     """
     if self.settings.get_property('force_retrim'):
         self.settings.write_to_log('read trimming forced')
     elif all(lib.trimmed_reads_exist()
              for lib in self.settings.iter_lib_settings()):
         self.settings.write_to_log('using existing trimmed reads')
         return
     self.settings.write_to_log('trimming reads with seqtk')
     ribo_utils.make_dir(self.rdir_path('trimmed'))

     def trim_one(lib_setting):
         return self.trim_reads_one_lib(lib_setting,
                                        self.threads_per_instance)

     ribo_utils.parmap(trim_one,
                       self.settings.iter_lib_settings(),
                       nprocs=self.num_instances)
     self.settings.write_to_log('done trimming reads with seqtk')
Beispiel #10
0
 def __init__(self, experiment, experiment_settings, threads):
     """
     Constructor for the QC engine.

     Stores the experiment, its settings and the thread count, calls
     ribo_utils.make_dir on the experiment's 'QC' results path, loads the
     QC GTF annotations and collects the result of initialize_qc_lib for
     every library in self.lib_QCs.
     """
     self.threads = threads
     self.experiment = experiment
     self.experiment_settings = experiment_settings
     experiment_settings.write_to_log('initializing QC engine')
     # Shortcuts to the settings accessors.
     self.get_property = experiment_settings.get_property
     self.get_rdir = experiment_settings.get_rdir
     ribo_utils.make_dir(experiment.rdir_path('QC'))
     self.genome = experiment.genome
     qc_gtf_file = experiment_settings.get_QC_annotation_GTF_file()
     add_3_for_stop = experiment_settings.get_property('add_3_for_stop')
     self.full_QC_GTF_annotations = ribo_utils.gtf_data(
         qc_gtf_file, add_3_for_stop=add_3_for_stop)
     # Build the full list first so self.lib_QCs is only set once complete.
     lib_qcs = []
     for lib_settings in experiment_settings.iter_lib_settings():
         lib_qcs.append(self.initialize_qc_lib(lib_settings))
     self.lib_QCs = lib_qcs
    def make_plots(self):
        """Create the 'plots' results directory and draw the standard plots.

        Draws the start- and stop-codon averages, then the fragment length
        and frame distributions, all through ``ribo_plotting``.
        """
        ribo_utils.make_dir(self.rdir_path('plots'))

        # Both codon averages use the same window and read filters; the read
        # cutoff is looked up separately for each plot.
        for average_plot in ('plot_start_codon_average',
                             'plot_stop_codon_average'):
            getattr(ribo_plotting, average_plot)(
                self,
                up=100,
                down=100,
                min_cds_reads=self.settings.get_property(
                    'comparison_read_cutoff'),
                read_end='3p',
                read_lengths='all')

        ribo_plotting.plot_fragment_length_distributions(self)
        ribo_plotting.plot_frame_distributions(self)
Beispiel #12
0
    def process_settings(self, settings_file):
        """
        Read the settings file and convert each value to its proper type.

        - reads the settings file and converts str to int, float, bool, list
        - stores result in self.settings as a dict()
        - also sets self.fqdir, self.sample_names, self.fastq_gz_files,
          self.fastq_gz_file_handles and self.rdir
        - calls ribo_utils.make_dir on the results directory and copies the
          settings file into it
        - CRITICAL NOTE: All keys must be lower case

        :param settings_file: path of the settings file to read
        :raises KeyError: if a required key is missing from the file
        :raises ValueError: if an int, float or boolean value cannot be
            converted
        :raises SystemExit: with status 1 if a FASTQ input file, the genome
            sequence directory or the annotation GTF file does not exist
        """
        # TODO: Add new parameters and comments to settings files

        int_keys = [
            'comparison_read_cutoff', 'min_post_trimming_length',
            'max_post_trimming_length', 'sequence_quality_cutoff', 'trim_5p',
            'star_index_sparsity', 'outfiltermultimapnmax',
            'outfiltermismatchnmax', 'alignsjdboverhangmin',
            'alignsjoverhangmin', 'alignintronmax', 'five_prime_p_offset'
        ]
        float_keys = ['atail_purity_cutoff']
        str_keys = [
            'adaptor_3p_sequence', 'star_genome_dir', 'star_ncrna_dir',
            'genome_sequence_dir', 'ncrna_sequence_dir', 'annotation_gtf_file',
            'qc_annotation_gtf_file', 'alignendstype'
        ]
        # If transcriptome_mapping_only is true, a transcriptome-only genome
        # is generated to map to.
        boolean_keys = [
            'transcriptome_mapping_only', 'deduplicate_reads',
            'force_remapping', 'force_recount', 'rebuild_star_index',
            'force_retrim', 'make_interactive_plots', 'reads_reversed',
            'add_3_for_stop', 'forbid_non_polya_soft_clip',
            'unique_mapping_only'
        ]
        list_str_keys = ['fastq_gz_files', 'sample_names']
        #list_float_keys = ['concentrations', 'input_rna']
        extant_files = ['genome_sequence_dir', 'annotation_gtf_file']
        config = ConfigParser.ConfigParser()
        config.read(settings_file)
        settings = {}
        for section in config.sections():
            for option in config.options(section):
                settings[option] = config.get(section, option)
                # Marks the section as present (only when it has options).
                settings[section] = True
        for k in int_keys:
            settings[k] = int(settings[k])
        for k in str_keys:
            # Values are already strings; only their presence is enforced.
            if k not in settings:
                raise KeyError(k)
        for k in float_keys:
            settings[k] = float(settings[k])
        for k in boolean_keys:
            if settings[k].lower() not in ('true', 'false'):
                raise ValueError('Boolean value %s must be "true" or "false"' %
                                 k)
            settings[k] = settings[k].lower() == 'true'
        #for k in list_float_keys:
        #    settings[k] = map(float, simplejson.loads(settings[k]))
        #for k in list_int_keys:
        #    settings[k] = map(int, simplejson.loads(settings[k]))
        for k in list_str_keys:
            settings[k] = simplejson.loads(settings[k])
        self.fqdir = settings['fastq_dir']
        self.sample_names = settings['sample_names']
        self.fastq_gz_files = settings['fastq_gz_files']
        self.fastq_gz_file_handles = [
            os.path.join(self.fqdir, fastq_gz_file)
            for fastq_gz_file in self.fastq_gz_files
        ]
        # Explicit checks instead of assert: asserts are stripped under -O,
        # which would silently skip the validation.
        for fastq_path in self.fastq_gz_file_handles:
            if not ribo_utils.file_exists(fastq_path):
                print('ERROR: nonexistent file  %s' % fastq_path)
                sys.exit(1)
        for k in extant_files:
            if not ribo_utils.file_exists(settings[k]):
                print('file %s does not exist' % settings[k])
                sys.exit(1)

        self.settings = settings
        self.rdir = settings['results_dir']
        ribo_utils.make_dir(self.rdir)
        shutil.copy(settings_file, self.rdir)
Beispiel #13
0
 def get_rdir(self):
     """Return the results directory path after calling make_dir on it."""
     results_dir = self.rdir
     ribo_utils.make_dir(results_dir)
     return results_dir