Ejemplo n.º 1
0
 def make_genome_mapping_index(self):
     """
     Build the STAR index in the 'star_genome_dir' setting when needed.

     The index is (re)built when that directory does not exist yet (it is
     created first) or when the 'rebuild_star_index' setting is true;
     otherwise the existing index is reused. With the
     'transcriptome_mapping_only' setting, the index is generated from the
     transcriptome FASTA; otherwise from the files matching the shell glob
     '<genome sequence dir>*.fa'. STAR's stdout and stderr are appended to
     the settings log.

     :raises RuntimeError: if the STAR command exits with a non-zero status
     """
     genome_dir = self.settings.get_property('star_genome_dir')
     if ribo_utils.file_exists(genome_dir):
         self.settings.write_to_log('STAR index exists at %s' % genome_dir)
         make_index = bool(self.settings.get_property('rebuild_star_index'))
         if make_index:
             self.settings.write_to_log('STAR index rebuild forced')
         else:
             self.settings.write_to_log('using existing STAR index')
     else:
         make_index = True
         ribo_utils.make_dir(genome_dir)
     if make_index:
         self.settings.write_to_log('building STAR index')
         # The two modes differ only in the --genomeFastaFiles argument.
         if self.settings.get_property('transcriptome_mapping_only'):
             fasta_files = self.settings.get_transcriptome_FASTA()
         else:
             # The glob is expanded by the shell, hence shell=True below.
             fasta_files = '%s*.fa' % self.settings.get_genome_sequence_dir()
         log_file = self.settings.get_log()
         # NOTE(review): paths are interpolated unquoted, so a path
         # containing spaces or shell metacharacters breaks the command.
         command_to_run = (
             'STAR --runThreadN %d --runMode genomeGenerate --genomeDir %s '
             '--genomeFastaFiles %s --genomeSAsparseD %d 1>>%s 2>>%s' % (
                 self.threads,
                 genome_dir,
                 fasta_files,
                 self.settings.get_property('star_index_sparsity'),
                 log_file, log_file))
         self.settings.write_to_log(command_to_run)
         return_code = subprocess.Popen(command_to_run, shell=True).wait()
         if return_code != 0:
             # Do not report the index as ready when STAR failed: a partial
             # index directory would otherwise be silently reused later.
             self.settings.write_to_log(
                 'STAR index build failed with exit code %d' % return_code)
             raise RuntimeError(
                 'STAR index build failed with exit code %d' % return_code)
     self.settings.write_to_log('STAR index ready')
Ejemplo n.º 2
0
 def sequence_counts_exist(self):
     """
     Tell whether the transcript counts output is already present.

     :return: result of ribo_utils.file_exists() applied to the location
         returned by self.get_transcript_counts()
     """
     return ribo_utils.file_exists(self.get_transcript_counts())
Ejemplo n.º 3
0
    def process_settings(self, settings_file):
        """
        Parse the settings file, convert its values and validate input paths.

        - reads the settings file and converts str to int, float, bool, list
        - stores result in self.settings as a dict()
        - CRITICAL NOTE: All keys must be lower case

        Also sets self.fqdir, self.sample_names, self.fastq_gz_files,
        self.fastq_gz_file_handles and self.rdir, creates the results
        directory through ribo_utils.make_dir and copies the settings file
        into it.

        :param settings_file: settings file read with ConfigParser
        :raises KeyError: if a required key is missing from the settings
        :raises ValueError: if a boolean key is not "true" or "false"
            (case-insensitive), or an int/float value cannot be converted
        :raises SystemExit: with status 1 if a FASTQ file or one of the
            required input paths does not exist
        """
        # TODO: Add new parameters and comments to settings files

        int_keys = [
            'comparison_read_cutoff', 'min_post_trimming_length',
            'max_post_trimming_length', 'sequence_quality_cutoff', 'trim_5p',
            'star_index_sparsity', 'outfiltermultimapnmax',
            'outfiltermismatchnmax', 'alignsjdboverhangmin',
            'alignsjoverhangmin', 'alignintronmax', 'five_prime_p_offset'
        ]
        float_keys = ['atail_purity_cutoff']
        str_keys = [
            'adaptor_3p_sequence', 'star_genome_dir', 'star_ncrna_dir',
            'genome_sequence_dir', 'ncrna_sequence_dir', 'annotation_gtf_file',
            'qc_annotation_gtf_file', 'alignendstype'
        ]
        # if transcriptome_mapping_only is true, will generate a
        # transcriptome-only genome to map to
        boolean_keys = [
            'transcriptome_mapping_only', 'deduplicate_reads',
            'force_remapping', 'force_recount', 'rebuild_star_index',
            'force_retrim', 'make_interactive_plots', 'reads_reversed',
            'add_3_for_stop', 'forbid_non_polya_soft_clip',
            'unique_mapping_only'
        ]
        list_str_keys = ['fastq_gz_files', 'sample_names']
        #list_float_keys = ['concentrations', 'input_rna']
        extant_files = ['genome_sequence_dir', 'annotation_gtf_file']
        config = ConfigParser.ConfigParser()
        config.read(settings_file)
        settings = {}
        for section in config.sections():
            for option in config.options(section):
                settings[option] = config.get(section, option)
                # Every section that has at least one option is also
                # recorded as a key of its own, flagged True.
                settings[section] = True
        for k in int_keys:
            settings[k] = int(settings[k])
        for k in str_keys:
            # Values are already strings; only enforce that the key exists.
            if k not in settings:
                raise KeyError(k)
        for k in float_keys:
            settings[k] = float(settings[k])
        for k in boolean_keys:
            value = settings[k].lower()
            if value not in ('true', 'false'):
                raise ValueError('Boolean value %s must be "true" or "false"' %
                                 k)
            settings[k] = value == 'true'
        #for k in list_float_keys:
        #    settings[k] = map(float, simplejson.loads(settings[k]))
        #for k in list_int_keys:
        #    settings[k] = map(int, simplejson.loads(settings[k]))
        for k in list_str_keys:
            settings[k] = simplejson.loads(settings[k])
        self.fqdir = settings['fastq_dir']
        self.sample_names = settings['sample_names']
        self.fastq_gz_files = settings['fastq_gz_files']
        # Despite the attribute name these are joined paths, not open files.
        self.fastq_gz_file_handles = [
            os.path.join(self.fqdir, fastq_gz_file)
            for fastq_gz_file in self.fastq_gz_files
        ]
        # Explicit checks instead of assert: asserts are stripped under
        # "python -O", which would silently skip this validation.
        # Single-argument print(...) parses under both Python 2 and 3.
        for file_handle in self.fastq_gz_file_handles:
            if not ribo_utils.file_exists(file_handle):
                print('ERROR: nonexistent file  %s' % file_handle)
                # Non-zero status so calling scripts can detect the failure.
                sys.exit(1)
        for k in extant_files:
            if not ribo_utils.file_exists(settings[k]):
                print('file %s does not exist' % settings[k])
                sys.exit(1)

        self.settings = settings
        self.rdir = settings['results_dir']
        ribo_utils.make_dir(self.rdir)
        # Keep a copy of the settings used next to the results.
        shutil.copy(settings_file, self.rdir)
Ejemplo n.º 4
0
 def genome_mapped_reads_exist(self):
     """
     Tell whether the genome-mapped reads output is already present.

     :return: result of ribo_utils.file_exists() applied to the location
         returned by self.get_genome_mapped_reads()
     """
     return ribo_utils.file_exists(self.get_genome_mapped_reads())
Ejemplo n.º 5
0
 def qc_pickle_exists(self):
     """
     Tell whether the QC pickle is already present.

     :return: result of ribo_utils.file_exists() applied to the location
         returned by self.get_qc_pickle()
     """
     return ribo_utils.file_exists(self.get_qc_pickle())
Ejemplo n.º 6
0
 def ncrna_unmapped_reads_exist(self):
     """
     Tell whether the ncRNA-unmapped reads output is already present.

     :return: result of ribo_utils.file_exists() applied to the location
         returned by self.get_ncrna_unmapped_reads()
     """
     return ribo_utils.file_exists(self.get_ncrna_unmapped_reads())
Ejemplo n.º 7
0
    def ncrna_mapping_finished(self):
        """
        Tell whether the ncRNA mapping summary for this sample is present.

        The file checked is '<sample_name>Log.final.out' inside the
        'ncrna_mapped_reads' subdirectory of the experiment results
        directory.

        :return: result of ribo_utils.file_exists() for that path
        """
        results_dir = self.experiment_settings.get_rdir()
        summary_name = '%(sample_name)sLog.final.out' % {
            'sample_name': self.sample_name
        }
        summary_path = os.path.join(results_dir, 'ncrna_mapped_reads',
                                    summary_name)
        return ribo_utils.file_exists(summary_path)
Ejemplo n.º 8
0
 def trimmed_reads_exist(self):
     """
     Tell whether the trimmed reads output is already present.

     :return: result of ribo_utils.file_exists() applied to the location
         returned by self.get_trimmed_reads()
     """
     return ribo_utils.file_exists(self.get_trimmed_reads())
Ejemplo n.º 9
0
 def deduplicated_reads_exist(self):
     """
     Tell whether the deduplicated reads output is already present.

     :return: result of ribo_utils.file_exists() applied to the location
         returned by self.get_deduplicated_reads()
     """
     return ribo_utils.file_exists(self.get_deduplicated_reads())
Ejemplo n.º 10
0
 def adaptorless_reads_exist(self):
     """
     Tell whether the adaptor-trimmed reads output is already present.

     :return: result of ribo_utils.file_exists() applied to the location
         returned by self.get_adaptor_trimmed_reads()
     """
     return ribo_utils.file_exists(self.get_adaptor_trimmed_reads())