def process(self):
    """
    Run the demultiplexing step and record the sample id of each output.

    The method:
      * reduces ``self.input_dir`` to its first element when it is a list
        or tuple and points it at ``Data/Intensities/BaseCalls/``;
      * unwraps ``self.sample_sheet`` when it is a one-element list;
      * builds ``self.use_base_mask`` from the mask length and the
        double-index flag returned by ``SampleSheet.get_mask_length()``;
      * delegates the actual run to the parent class ``process()``;
      * stores every ``*.fastq.gz`` found under
        ``<output_dir>/Project_<project_name>`` in ``self.output_files``;
      * fills ``self.meta['job']['sample_id']`` with the sample id whose
        ``<sample_id>_`` prefix matches each output file name.

    Raises:
        Exception: if more than one sample sheet file is given.
    """
    # Reduce inputs to only first element.  An explicit sequence check is
    # used because strings also expose ``__iter__`` on Python 3, which would
    # truncate a plain path to its first character.
    if isinstance(self.input_dir, (list, tuple)):
        self.input_dir = self.input_dir[0]
    self.input_dir = os.path.join(self.input_dir, "Data/Intensities/BaseCalls/")

    if isinstance(self.sample_sheet, list):
        if len(self.sample_sheet) > 1:
            raise Exception('Too many sample sheet files: %s' % ','.join(self.sample_sheet))
        self.sample_sheet = self.sample_sheet[0]

    ss = SampleSheet(self.sample_sheet)
    mask_length, double_idx = ss.get_mask_length()
    if double_idx:
        self.use_base_mask = "y*,I{0},I{0},Y*".format(mask_length)
    else:
        self.use_base_mask = "y*,I{0},Y*".format(mask_length)
    self.use_base_mask = str(self.use_base_mask)

    super(CasavaDemux, self).process()

    prj_dir = os.path.join(self.output_dir, 'Project_' + self.meta['pipeline']['project_name'])
    self.output_files = utils.find(prj_dir, "*.fastq.gz")

    # set the metadata
    self.meta['job']['sample_id'] = []
    # Try the longest prefix first so that an id which is itself a prefix of
    # another id (e.g. "S1" vs "S1_A") does not claim the other one's files.
    # The sort is stable, so ids of equal length keep the sample sheet order.
    prefixes = [("%s_" % sample_id, sample_id) for sample_id in ss.get_sample_ids()]
    prefixes.sort(key=lambda item: len(item[0]), reverse=True)
    for output_file in self.output_files:
        file_name = os.path.basename(output_file)
        # NOTE(review): a file matching no sample id adds no entry, so this
        # list can end up shorter than self.output_files -- confirm intended.
        for prefix, sample_id in prefixes:
            if file_name.startswith(prefix):
                self.meta['job']['sample_id'].append(sample_id)
                break
def create(sample_sheet, input_dir, output_dir=None, output_file_name=None):
    """
    Create a file of file names (fofn) and return the path to it.

    For every sample id returned by the sample sheet, each directory under
    ``input_dir`` is inspected: files whose name contains ``<sample_id>_``
    are split into R1 and R2 files with ``Fofn.r1_regex`` and
    ``Fofn.r2_regex``.  One record is written per R1 file, paired with the
    first R2 file containing the R1 base (or with an empty R2 when there is
    none).  A directory holding only R2 files gets one record per R2 file
    with an empty R1.

    Args:
        sample_sheet: full path to the sample sheet
        input_dir: path to the directory containing the input files
        output_dir: directory the fofn is written to. If it is not
            specified the directory of the sample sheet is used
        output_file_name: name of the output fofn. If it is not specified
            the sample sheet name without its last extension, followed by
            ``_fofn.csv``, is used

    Returns:
        The path of the written fofn.

    Raises:
        Exception: if ``sample_sheet`` or ``input_dir`` does not exist.
    """
    if not os.path.exists(sample_sheet):
        raise Exception(
            "input error: parameter `sample_sheet` %s does not exist" % sample_sheet)
    if not os.path.exists(input_dir):
        # Report the path that is actually missing.
        raise Exception(
            "input error: parameter `input_dir` %s does not exist" % input_dir)

    print("*********************************")
    print("sample_sheet: %s" % os.path.abspath(sample_sheet))
    print("input_dir: %s" % os.path.abspath(input_dir))
    print("*********************************")

    # set default name of the output fofn
    if not output_file_name:
        output_file_name = os.path.basename(sample_sheet).rsplit(".", 1)[0] + "_fofn.csv"
    if not output_dir:
        output_dir = os.path.dirname(sample_sheet)
    output_file = os.path.join(output_dir, output_file_name)

    with open(output_file, 'w') as f_fofn:
        ss = SampleSheet(sample_sheet)
        sample_id_list = ss.get_sample_ids()

        # The directory tree is the same for every sample: walk it once
        # (after the fofn has been created, as before) and reuse the listing.
        walked = list(os.walk(input_dir))

        for sample_id in sample_id_list:
            print("*********************************")
            # Single-argument call form prints the same on Python 2 and 3.
            print("sample_id : %s" % sample_id)
            sample_tag = '%s_' % sample_id

            for root, dirs, file_list in walked:
                # group the files by sample id and read number
                sample_files = [name for name in file_list if sample_tag in name]
                r1_files = [os.path.join(root, name) for name in sample_files
                            if Fofn.r1_regex.search(name)]
                r2_files = [os.path.join(root, name) for name in sample_files
                            if Fofn.r2_regex.search(name)]

                if r1_files:
                    for r1_file in r1_files:
                        # filter the R2 files that match the R1 file base;
                        # the base is only needed when there are R2 candidates
                        r2_matchs = []
                        if r2_files:
                            r1_base = Fofn.r1_regex.search(r1_file).group(1)
                            r2_matchs = [r2_file for r2_file in r2_files
                                         if r1_base in r2_file]
                        if r2_matchs:
                            Fofn._write_record(f_fofn, r1_file, r2_matchs[0], sample_id)
                        else:
                            if r2_files:
                                print("No R2 found for sample Id %s" % sample_id)
                            Fofn._write_record(f_fofn, r1_file, '', sample_id)
                else:
                    # no R1 in this directory: record each R2 with an empty R1
                    for r2_file in r2_files:
                        Fofn._write_record(f_fofn, '', r2_file, sample_id)

    return output_file
def create(sample_sheet, input_dir, output_dir=None, output_file_name=None):
    """
    Create a file of file names (fofn) and return the path to it.

    For every sample id returned by the sample sheet, each directory under
    ``input_dir`` is inspected: files whose name contains ``<sample_id>_``
    are split into R1 and R2 files with ``Fofn.r1_regex`` and
    ``Fofn.r2_regex``.  One record is written per R1 file, paired with the
    first R2 file containing the R1 base (or with an empty R2 when there is
    none).  A directory holding only R2 files gets one record per R2 file
    with an empty R1.

    Args:
        sample_sheet: full path to the sample sheet
        input_dir: path to the directory containing the input files
        output_dir: directory the fofn is written to. If it is not
            specified the directory of the sample sheet is used
        output_file_name: name of the output fofn. If it is not specified
            the sample sheet name without its last extension, followed by
            ``_fofn.csv``, is used

    Returns:
        The path of the written fofn.

    Raises:
        Exception: if ``sample_sheet`` or ``input_dir`` does not exist.
    """
    if not os.path.exists(sample_sheet):
        raise Exception(
            "input error: parameter `sample_sheet` %s does not exist" %
            sample_sheet)
    if not os.path.exists(input_dir):
        # Report the path that is actually missing.
        raise Exception(
            "input error: parameter `input_dir` %s does not exist" %
            input_dir)

    print("*********************************")
    print("sample_sheet: %s" % os.path.abspath(sample_sheet))
    print("input_dir: %s" % os.path.abspath(input_dir))
    print("*********************************")

    # set default name of the output fofn
    if not output_file_name:
        output_file_name = os.path.basename(sample_sheet).rsplit(
            ".", 1)[0] + "_fofn.csv"
    if not output_dir:
        output_dir = os.path.dirname(sample_sheet)
    output_file = os.path.join(output_dir, output_file_name)

    with open(output_file, 'w') as f_fofn:
        ss = SampleSheet(sample_sheet)
        sample_id_list = ss.get_sample_ids()

        # The directory tree is the same for every sample: walk it once
        # (after the fofn has been created, as before) and reuse the listing.
        walked = list(os.walk(input_dir))

        for sample_id in sample_id_list:
            print("*********************************")
            # Single-argument call form prints the same on Python 2 and 3.
            print("sample_id : %s" % sample_id)
            sample_tag = '%s_' % sample_id

            for root, dirs, file_list in walked:
                # group the files by sample id and read number
                sample_files = [
                    name for name in file_list if sample_tag in name
                ]
                r1_files = [
                    os.path.join(root, name) for name in sample_files
                    if Fofn.r1_regex.search(name)
                ]
                r2_files = [
                    os.path.join(root, name) for name in sample_files
                    if Fofn.r2_regex.search(name)
                ]

                if r1_files:
                    for r1_file in r1_files:
                        # filter the R2 files that match the R1 file base;
                        # the base is only needed when there are R2 candidates
                        r2_matchs = []
                        if r2_files:
                            r1_base = Fofn.r1_regex.search(r1_file).group(1)
                            r2_matchs = [
                                r2_file for r2_file in r2_files
                                if r1_base in r2_file
                            ]
                        if r2_matchs:
                            Fofn._write_record(f_fofn, r1_file, r2_matchs[0],
                                               sample_id)
                        else:
                            if r2_files:
                                print("No R2 found for sample Id %s" %
                                      sample_id)
                            Fofn._write_record(f_fofn, r1_file, '', sample_id)
                else:
                    # no R1 in this directory: record each R2 with an empty R1
                    for r2_file in r2_files:
                        Fofn._write_record(f_fofn, '', r2_file, sample_id)

    return output_file