def map_for_contaminating_sequences_one_lib(self, lib_settings):
    """
    Map one library's leftover reads to rRNA, then to the rest of the genome.

    Two shell pipelines of the form ``bowtie2 ... | samtools view -bS - > out``
    are run one after the other, each blocking until it exits:

    1. the library's unmappable reads against the rRNA bowtie index;
    2. the reads left unmapped by step 1 against the genome bowtie index.

    A stage is skipped when tps_utils.file_exists() already reports its
    unmapped-reads output. bowtie2's stderr is appended to the stage's
    mapping-stats file and samtools' stderr to the library log. The exit
    status of each pipeline is not inspected.

    :param lib_settings: per-library settings object providing the input
        and output paths used by both stages.
    """
    # Both stages share the aligner options; only the seven substituted
    # values (index, threads, input, unmapped out, stats, mapped out, log)
    # differ.
    pipeline_template = (
        'bowtie2 -f -D 20 -R 3 -N 1 -L 15 -i S,1,0.50 -x %s -p %d -U %s '
        '--un-gz %s 2>>%s | samtools view -bS - > %s 2>>%s '
    )

    # Stage 1: take unmapped sequences and map them to yeast rRNA,
    # counting mapping stats.
    rrna_stage_done = tps_utils.file_exists(
        lib_settings.get_rRNA_unmapped_reads())
    if not rrna_stage_done:
        rrna_command = pipeline_template % (
            self.experiment_settings.get_rRNA_bowtie_index(),
            self.threads,
            lib_settings.get_unmappable_reads(),
            lib_settings.get_rRNA_unmapped_reads(),
            lib_settings.get_rRNA_mapping_stats(),
            lib_settings.get_rRNA_mapped_reads(),
            lib_settings.get_log(),
        )
        rrna_process = subprocess.Popen(rrna_command, shell=True)
        rrna_process.wait()

    # Stage 2: take still unmapped sequences and map them to the rest of
    # the yeast genome, counting mapping stats.
    genome_stage_done = tps_utils.file_exists(
        lib_settings.get_genome_unmapped_reads())
    if not genome_stage_done:
        genome_command = pipeline_template % (
            self.experiment_settings.get_genome_bowtie_index(),
            self.threads,
            lib_settings.get_rRNA_unmapped_reads(),
            lib_settings.get_genome_unmapped_reads(),
            lib_settings.get_genome_mapping_stats(),
            lib_settings.get_genome_mapped_reads(),
            lib_settings.get_log(),
        )
        genome_process = subprocess.Popen(genome_command, shell=True)
        genome_process.wait()
def map_for_contaminating_sequences_one_lib(self, lib_settings):
    """
    Align a library's unmappable reads to rRNA and then to the genome.

    Each of the two stages shells out to ``bowtie2`` piped into
    ``samtools view -bS -`` and waits for the pipeline to finish. A stage
    runs only if tps_utils.file_exists() does not already report the
    stage's unmapped-reads output file. The second stage consumes the
    unmapped reads written by the first.

    The pipelines' exit statuses are discarded; nothing is returned.

    :param lib_settings: per-library settings object that supplies every
        input, output, stats and log path substituted into the commands.
    """
    command_format = (
        "bowtie2 -f -D 20 -R 3 -N 1 -L 15 -i S,1,0.50 -x %s -p %d -U %s --un-gz %s 2>>%s"
        " | samtools view -bS - > %s 2>>%s "
    )

    def run_stage(format_args):
        # Blocks until the shell pipeline exits; its status is not checked.
        subprocess.Popen(command_format % format_args, shell=True).wait()

    # first, take unmapped sequences and map them to yeast rRNA, counting
    # mapping stats
    if not tps_utils.file_exists(lib_settings.get_rRNA_unmapped_reads()):
        run_stage((
            self.experiment_settings.get_rRNA_bowtie_index(),
            self.threads,
            lib_settings.get_unmappable_reads(),
            lib_settings.get_rRNA_unmapped_reads(),
            lib_settings.get_rRNA_mapping_stats(),
            lib_settings.get_rRNA_mapped_reads(),
            lib_settings.get_log(),
        ))

    # take still unmapped sequences and map them to the rest of the yeast
    # genome, counting mapping stats
    if not tps_utils.file_exists(lib_settings.get_genome_unmapped_reads()):
        run_stage((
            self.experiment_settings.get_genome_bowtie_index(),
            self.threads,
            lib_settings.get_rRNA_unmapped_reads(),
            lib_settings.get_genome_unmapped_reads(),
            lib_settings.get_genome_mapping_stats(),
            lib_settings.get_genome_mapped_reads(),
            lib_settings.get_log(),
        ))
def process_settings(self, settings_file):
    """
    Read the settings file and store the typed result on the instance.

    - reads every option of every section into one flat dict (an option
      that appears in several sections keeps the value read last), and
      records each section name itself as a key mapped to True
    - converts str values to int, bool or list according to the key lists
      defined below
    - stores result in self.settings as a dict()
    - also sets self.fqdir, self.sample_names, self.fastq_gz_file_handles,
      self.wdir and self.rdir
    - copies the settings file to self.rdir

    :param settings_file: path of the configuration file to read.
    :raises KeyError: if a required key is absent from the settings file.
    :raises ValueError: if an integer key cannot be parsed by int(), or a
        boolean key is neither "true" nor "false" (case-insensitive).
    :raises AssertionError: if a listed fastq.gz file or the pool fasta
        is not reported present by tps_utils.file_exists().
    """
    int_keys = ['first_base_to_keep', 'last_base_to_keep',
                'max_reads_to_split', 'minimum_reads_for_inclusion',
                'pool_5trim', 'pool_3trim', 'min_post_adaptor_length']
    str_keys = ['adaptor_sequence', 'rrna_index', 'genome_index',
                'pool_append', 'pool_prepend', 'primer_sequence']
    boolean_keys = ['collapse_identical_reads', 'force_read_resplit',
                    'force_remapping', 'force_recollapse', 'force_recount',
                    'force_index_rebuild', 'force_retrim', 'trim_adaptor']
    list_str_keys = ['fastq_gz_files', 'sample_names']
    extant_files = ['pool_fasta']

    config = ConfigParser.ConfigParser()
    config.read(settings_file)

    # Flatten all sections into a single namespace.
    settings = {}
    for section in config.sections():
        for option in config.options(section):
            settings[option] = config.get(section, option)
        settings[section] = True

    for k in int_keys:
        settings[k] = int(settings[k])

    # String values need no conversion, but they are still required:
    # fail here with KeyError rather than later at first use.
    for k in str_keys:
        if k not in settings:
            raise KeyError(k)

    for k in boolean_keys:
        lowered = settings[k].lower()
        if lowered not in ('true', 'false'):
            raise ValueError(
                'Boolean value %s must be "true" or "false"' % k)
        settings[k] = lowered == 'true'

    # List-valued settings are written as JSON arrays in the file.
    for k in list_str_keys:
        settings[k] = simplejson.loads(settings[k])

    self.fqdir = settings['fastq_dir']
    self.sample_names = settings['sample_names']
    # Despite the attribute name, these are path strings, not open handles.
    self.fastq_gz_file_handles = [
        os.path.join(self.fqdir, fastq_gz_file)
        for fastq_gz_file in settings['fastq_gz_files']]

    # Explicit raises instead of `assert` so the checks still run under
    # `python -O`; AssertionError is kept so the failure type seen by
    # callers is unchanged.
    for file_handle in self.fastq_gz_file_handles:
        if not tps_utils.file_exists(file_handle):
            raise AssertionError(
                'fastq.gz file does not exist: %s' % file_handle)
    for k in extant_files:
        if not tps_utils.file_exists(settings[k]):
            raise AssertionError(
                'file for setting %s does not exist: %s' % (k, settings[k]))

    self.settings = settings
    self.wdir = settings['working_dir']
    self.rdir = settings['results_dir']
    shutil.copy(settings_file, self.rdir)
def get_collapsed_read_fractions(self, lib_settings):
    """
    Return the fraction of total reads carried by each collapsed sequence.

    The counts are parsed from the header lines (those starting with '>')
    of the library's gzipped collapsed-reads file: the integer after the
    first '-' in each header is taken as that sequence's read count. The
    resulting fractions are cached as a pickle under
    <rdir>/QC/collapsed_fracs/.

    The cache is used only when it exists and the 'force_recollapse'
    property is false; otherwise the fractions are recomputed and the
    pickle is rewritten.

    :param lib_settings: per-library settings object providing
        sample_name and get_collapsed_reads().
    :returns: tuple (sample_name, read_fractions), where read_fractions is
        a numpy array when freshly computed, or whatever
        bzUtils.unPickle() returns for the cached file.
    :raises ValueError, IndexError: if a header line does not have an
        integer after its first '-'.
    """
    out_name = os.path.join(
        self.experiment_settings.get_rdir(), 'QC', 'collapsed_fracs',
        '%(sample_name)s.collapsed_read_fractions.pkl' %
        {'sample_name': lib_settings.sample_name})

    # Recompute when forced OR when nothing is cached yet. Previously a
    # forced run fell through to the unpickle branch, returning stale
    # data or failing on a missing file.
    force_recompute = self.experiment_settings.get_property(
        'force_recollapse')
    if force_recompute or not tps_utils.file_exists(out_name):
        read_counts = []
        f = gzip.open(lib_settings.get_collapsed_reads())
        try:
            for line in f:
                # Only header lines carry a count; sequence, blank and
                # '#' comment lines never start with '>'.
                if line.startswith('>'):
                    read_counts.append(
                        int(line[1:].strip().split('-')[1]))
        finally:
            # Close the handle even if a header fails to parse.
            f.close()
        read_fractions = np.array(read_counts) / float(sum(read_counts))
        bzUtils.makePickle(read_fractions, out_name)
    else:
        read_fractions = bzUtils.unPickle(out_name)
    return (lib_settings.sample_name, read_fractions)
def get_collapsed_read_fractions(self, lib_settings):
    """
    Compute (or load from cache) each collapsed sequence's share of reads.

    Header lines (starting with ">") of the gzipped collapsed-reads file
    are parsed for the integer that follows the first "-"; those integers
    are the per-sequence read counts. Dividing by their sum gives the
    fractions, which are pickled under <rdir>/QC/collapsed_fracs/.

    The pickle is reused only if it exists and the "force_recollapse"
    property is false; in every other case the fractions are recomputed
    and the pickle is written again.

    :param lib_settings: per-library settings object providing
        sample_name and get_collapsed_reads().
    :returns: tuple (sample_name, read_fractions); read_fractions is a
        numpy array when freshly computed, otherwise the object returned
        by bzUtils.unPickle().
    :raises ValueError, IndexError: if a header line lacks an integer
        after its first "-".
    """
    out_name = os.path.join(
        self.experiment_settings.get_rdir(),
        "QC",
        "collapsed_fracs",
        "%(sample_name)s.collapsed_read_fractions.pkl" % {"sample_name": lib_settings.sample_name},
    )

    # The original condition required "not cached AND not forced" to
    # recompute, so a forced run read the old pickle (or crashed when it
    # was absent). Forcing must recompute.
    use_cache = tps_utils.file_exists(out_name) and not self.experiment_settings.get_property(
        "force_recollapse"
    )
    if use_cache:
        read_fractions = bzUtils.unPickle(out_name)
    else:
        read_counts = []
        f = gzip.open(lib_settings.get_collapsed_reads())
        try:
            for line in f:
                # Blank lines, "#" comments and sequence lines never start
                # with ">", so this single test selects the headers.
                if line.startswith(">"):
                    read_counts.append(int(line[1:].strip().split("-")[1]))
        finally:
            # Release the gzip handle even when parsing raises.
            f.close()
        read_fractions = np.array(read_counts) / float(sum(read_counts))
        bzUtils.makePickle(read_fractions, out_name)
    return (lib_settings.sample_name, read_fractions)
def sequence_counts_exist(self):
    """Return tps_utils.file_exists() for the path from get_sequence_counts()."""
    return tps_utils.file_exists(self.get_sequence_counts())
def mapped_reads_exist(self):
    """Return tps_utils.file_exists() for the path from get_mapped_reads()."""
    return tps_utils.file_exists(self.get_mapped_reads())
def trimmed_reads_exist(self):
    """Return tps_utils.file_exists() for the path from get_trimmed_reads()."""
    return tps_utils.file_exists(self.get_trimmed_reads())
def primerless_reads_exist(self):
    """Return tps_utils.file_exists() for the path from get_primer_trimmed_reads()."""
    return tps_utils.file_exists(self.get_primer_trimmed_reads())
def adaptorless_reads_exist(self):
    """Return tps_utils.file_exists() for the path from get_adaptor_trimmed_reads()."""
    return tps_utils.file_exists(self.get_adaptor_trimmed_reads())
def collapsed_reads_exist(self):
    """Return tps_utils.file_exists() for the path from get_collapsed_reads()."""
    return tps_utils.file_exists(self.get_collapsed_reads())
def split_reads_exist(self):
    """Return tps_utils.file_exists() for the path from get_split_reads()."""
    return tps_utils.file_exists(self.get_split_reads())
def bowtie_index_exists(self):
    """
    Return tps_utils.file_exists() for the bowtie index's '.1.bt2' file.

    The path checked is the prefix returned by get_bowtie_index() with
    '.1.bt2' appended; no other index file is looked at.
    """
    first_index_file = self.get_bowtie_index() + '.1.bt2'
    return tps_utils.file_exists(first_index_file)