def __init__(self, config, key=int(-1), pipeline_config=None, prev_step=None, process_name='bwa_mem', pipeline=None, **kwargs):
    """
    Initialize a bwa mem alignment step chained after a ZcatMultiple step.

    The object only configures itself (including the parent
    SampleQsubProcess state) when ``prev_step`` is a ZcatMultiple instance;
    otherwise nothing is initialized, mirroring the original control flow.

    Per-task values are stored as colon-joined strings, one entry per task:
      * input_fastq1 / input_fastq2 -- uncompressed fastq paths derived from
        the previous step's ``r1_copied`` / ``r2_copied`` lists
      * lane_number / flowcell_key  -- read from the multi-fastq YAML file
      * output_sam                  -- per-task SAM paths under output_dir

    ``bwa_threads`` and ``ref_fa`` come from ``pipeline_config`` and
    ``project`` from ``pipeline``; both must be provided when ``prev_step``
    is a ZcatMultiple (the original code has the same requirement).
    """
    if prev_step is not None:
        if prev_step.__class__.__name__ == "ZcatMultiple":
            SampleQsubProcess.__init__(self, config, key=key, process_name=process_name,
                                       input_dir=prev_step.output_dir,
                                       output_dir=os.path.join(prev_step.output_dir, "align"),
                                       number_tasks=prev_step.number_tasks, **kwargs)
            multi_fastq_file = prev_step.multi_fastq_file
            if multi_fastq_file is not None:
                self.multi_fastq_file = multi_fastq_file
                multi_fastq = grab_yaml(self.multi_fastq_file)
                lane_numbers = list_from_multi_fastq_object(multi_fastq, "lane")
                flowcells = list_from_multi_fastq_object(multi_fastq, "flowcell")
                # Strip a trailing ".gz": the zcat step uncompresses the
                # copies.  Bug fix: the original pattern ".gz$" left the dot
                # unescaped, so it could also strip e.g. "agz"/"xgz".
                input_r1_fastqs = [re.sub(r"\.gz$", "", fastq)
                                   for fastq in prev_step.r1_copied.split(":")]
                input_r2_fastqs = [re.sub(r"\.gz$", "", fastq)
                                   for fastq in prev_step.r2_copied.split(":")]
                self.lane_number = ":".join(lane_numbers)
                self.flowcell_key = ":".join(flowcells)
                self.input_fastq1 = ":".join(input_r1_fastqs)
                self.input_fastq2 = ":".join(input_r2_fastqs)
                output_sams = []
                for input_r1_fastq in input_r1_fastqs:
                    # Bug fix: use literal string replacement.  The original
                    # used re.sub with an unescaped ".fastq" pattern and with
                    # prev_step.output_dir as a regex pattern, which breaks
                    # whenever the directory path contains regex
                    # metacharacters (e.g. "+", ".", parentheses).
                    output_sam = input_r1_fastq.replace(".fastq", ".sam")
                    output_sam = output_sam.replace(prev_step.output_dir, self.output_dir)
                    output_sam = output_sam.replace("_R1", "")
                    output_sams.append(output_sam)
                self.output_sam = ":".join(output_sams)
                self.bwa_threads = pipeline_config.get('Program specific parameters', 'bwa_threads')
                self.ref_fa = pipeline_config.get('References', 'genome_fasta')
                self.project = pipeline.project
def __init__(self, config, key=int(-1), process_name='zcat_multiple', multi_fastq_file=None, **kwargs):
    """
    Initialize a zcat step that copies and uncompresses multiple paired
    fastq files listed in a multi-fastq YAML file.

    For each read side (R1/R2) the following colon-joined attributes are
    built, one entry per input fastq:
      * r{1,2}_input        -- source paths from the YAML
      * r{1,2}_copied       -- destination paths in output_dir (keep ".gz"
                               when the source is gzipped)
      * r{1,2}_uncompressed -- destination paths with the ".gz" removed

    One task is created per R1/R2 pair; per-task tmp dirs and ".complete"
    marker paths are prepared (tmp dirs are actually created unless the
    output dir looks like a dummy path).

    Raises:
        Exception: when the number of R1 and R2 files differ.
    """
    SampleQsubProcess.__init__(self, config, key=key, process_name=process_name, **kwargs)

    def _copy_targets(in_list, read_label):
        # For each source fastq, build the post-copy path (keeps a ".gz"
        # suffix when the source is gzipped) and the final uncompressed path.
        copied = []
        uncompressed = []
        for i, in_path in enumerate(in_list):
            filename = self.sample_key + "_" + str(i) + "_" + read_label + ".fastq"
            uncompressed.append(os.path.join(self.output_dir, filename))
            if in_path.endswith('.gz'):
                filename += ".gz"
            copied.append(os.path.join(self.output_dir, filename))
        return copied, uncompressed

    # Grab the first read files.
    r1_in_list = []
    if multi_fastq_file is not None:
        self.multi_fastq_file = multi_fastq_file
        multi_fastq = grab_yaml(self.multi_fastq_file)
        r1_in_list = list_from_multi_fastq_object(multi_fastq, "r1_filename")
        self.r1_input = ":".join(r1_in_list)
    r1_out_list, r1_uncompressed_list = _copy_targets(r1_in_list, "R1")
    # For the copy process
    self.r1_copied = ":".join(r1_out_list)
    self.r1_uncompressed = ":".join(r1_uncompressed_list)
    # Grab the paired read files.
    r2_in_list = []
    if multi_fastq_file is not None:
        r2_in_list = list_from_multi_fastq_object(multi_fastq, "r2_filename")
        self.r2_input = ":".join(r2_in_list)
    r2_out_list, r2_uncompressed_list = _copy_targets(r2_in_list, "R2")
    self.r2_copied = ":".join(r2_out_list)
    self.r2_uncompressed = ":".join(r2_uncompressed_list)
    if len(r1_in_list) == len(r2_in_list):
        self.number_tasks = len(r1_in_list)
        tmp_dirs = []
        complete_files = []
        for i in range(len(r1_in_list)):
            task_number = i + 1
            complete_file = os.path.join(self.output_dir, self.process_name + '.' + str(task_number) + '.complete')
            complete_files.append(complete_file)
            tmp_dir = os.path.join(self.output_dir, 'tmp.' + str(task_number))
            # Skip directory creation for dummy/placeholder output dirs.
            if not os.path.isdir(tmp_dir) and not re.search('dummy', self.output_dir):
                os.makedirs(tmp_dir)
            tmp_dirs.append(tmp_dir)
        self.tmp_dir = ":".join(tmp_dirs)
        self.complete_file = ":".join(complete_files)
    else:
        # Bug fix: the original referenced an undefined name ``sample``
        # here, so a mismatch raised NameError instead of the intended
        # Exception with a useful message.
        raise Exception("The number of read and matched-pair read files for sample " + self.sample_key + " are not the same")
def __init__(self, config, key=int(-1), sample=None, process_name='bwa_sampe', multi_fastq_file=None, ref_fa='/mnt/speed/qc/sequencing/biodata/genomes/Hsapiens/GRCh37/bwa/GRCh37.fa', prev_step=None, pipeline=None, **kwargs):
    """
    Initialize a bwa sampe step chained after a BwaAln step.

    Only configures itself when ``prev_step`` is a BwaAln instance
    (mirroring the original control flow).  The aln step ran one task per
    fastq file (all R1 files first, then all R2 files), so sampe has half
    as many tasks and splits the previous step's colon-joined
    ``input_fastq`` and ``output_sai`` strings into R1/R2 halves.

    Raises:
        Exception: when ``sample`` is provided but is not a Sample.
    """
    if prev_step is not None:
        if prev_step.__class__.__name__ == "BwaAln":
            if sample is None:
                sample = Sample(config, key="dummy_sample_key")
            if sample.__class__.__name__ != "Sample":
                raise Exception("Trying to start a qcpipeline process on a non-sample.")
            # Bug fix: use floor division.  Under Python 3 the original
            # "number_tasks/2" yields a float, which later makes the
            # list slices below raise TypeError; "//" is identical for
            # Python 2 ints.
            SampleQsubProcess.__init__(self, config, key=key, sample=sample, process_name=process_name,
                                       input_dir=prev_step.output_dir, output_dir=prev_step.output_dir,
                                       number_tasks=prev_step.number_tasks // 2, **kwargs)
            self.project = pipeline.project
            self.sample_key = sample.key
            self.ref_fa = prev_step.ref_fa
            if multi_fastq_file is not None:
                self.multi_fastq_file = multi_fastq_file
                multi_fastq = grab_yaml(self.multi_fastq_file)
                lane_numbers = list_from_multi_fastq_object(multi_fastq, "lane")
                flowcells = list_from_multi_fastq_object(multi_fastq, "flowcell")
                self.lane_number = ":".join(lane_numbers)
                self.flowcell_key = ":".join(flowcells)
                # First half of the aln inputs are R1 fastqs, second half R2.
                input_fastqs = prev_step.input_fastq.split(":")
                input_r1_fastqs = input_fastqs[:self.number_tasks]
                input_r2_fastqs = input_fastqs[self.number_tasks:]
                self.input_fastq1 = ":".join(input_r1_fastqs)
                self.input_fastq2 = ":".join(input_r2_fastqs)
                input_sais = prev_step.output_sai.split(":")
                input_r1_sais = input_sais[:self.number_tasks]
                input_r2_sais = input_sais[self.number_tasks:]
                self.input_sai1 = ":".join(input_r1_sais)
                self.input_sai2 = ":".join(input_r2_sais)
                # One SAM per pair, named after the R1 fastq.  Bug fix: use
                # literal replacement -- the original re.sub pattern
                # "_R1.fastq" left the dot unescaped.
                output_sams = []
                for input_r1_fastq in input_r1_fastqs:
                    output_sams.append(input_r1_fastq.replace("_R1.fastq", ".sam"))
                self.output_sam = ":".join(output_sams)
def __init__(self,config,key=int(-1),sample=None,flowcell=None,base_output_dir=None,r1_path=None,r2_path=None,description=None,upload_dir=None,process_name='bcbio',capture_target_bed=None,**kwargs):
    """ Initializes the bcbio process object.

    Sets up file paths for a bcbio-nextgen run (system/sample YAML, SNP
    VCF, combined variant VCF, analysis-ready BAM, dedup BAM, summary and
    restats files) and initializes the per-run QC stat attributes to None.

    The expected SNP/BAM filenames depend on whether a capture (hybrid
    bait) BED is in play: whole-genome runs use the
    "-sort-dup-gatkrecal-realign-variants-snp.vcf" naming, capture runs use
    the "-sort-variants-ploidyfix-*" naming plus a bamprep BAM.
    """
    if flowcell is None:
        flowcell = Flowcell(config,key="dummy_flowcell_key")
    if flowcell.__class__.__name__ != "Flowcell":
        raise Exception("Trying to start a bcbio process on a non-flowcell.")
    # NOTE(review): self.output_dir, self.sample_key and self.date_begin
    # used below are presumably set by SampleQsubProcess.__init__ -- confirm
    # against the parent class.
    SampleQsubProcess.__init__(self,config,key=key,sample=sample,base_output_dir=base_output_dir,process_name=process_name,**kwargs)
    self.input_dir = self.output_dir
    self.r1_path = r1_path
    self.r2_path = r2_path
    self.systems_file = os.path.join(self.input_dir,'system.yaml')
    self.sample_file = os.path.join(self.input_dir,'sample.yaml')
    self.upload_dir = upload_dir
    self.flowcell_key = flowcell.key
    self.description = description
    # If a sample.yaml already exists, read it to discover the bcbio lane
    # name and (optionally) the hybrid bait BED.
    if os.path.isfile(self.sample_file):
        sample_yaml = grab_yaml(self.sample_file)
    else:
        sample_yaml = {}
    #snp_filename = self.sample_key + "_R_" + str(self.date_begin) + "_" + self.flowcell_key + "-sort-dup-gatkrecal-realign-variants-snp.vcf"
    try:
        bcbio_lane_name = sample_yaml["details"][0]["lane"]
    except KeyError:
        bcbio_lane_name = None
    # An explicitly passed capture_target_bed wins over the one recorded in
    # sample.yaml.
    if not capture_target_bed is None:
        bait_bed = capture_target_bed
    else:
        try:
            bait_bed = sample_yaml["details"][0]["algorithm"]["hybrid_bait"]
        except KeyError:
            bait_bed = None
    #exit(str(bcbio_lane_name)+"\n")
    if bait_bed is None:
        # Whole-genome naming; lane "1" is the fallback when sample.yaml
        # did not provide a lane name.
        if bcbio_lane_name is None:
            snp_filename = "gatk/1_" + str(self.date_begin) + "_" + self.flowcell_key + "-sort-dup-gatkrecal-realign-variants-snp.vcf"
        else:
            snp_filename = "gatk/"+bcbio_lane_name+"_" + str(self.date_begin) + "_" + self.flowcell_key + "-sort-dup-gatkrecal-realign-variants-snp.vcf"
        self.analysis_ready_bam_path = None
    else:
        # Capture (hybrid bait) naming.
        # NOTE(review): this branch reads sample.key, so it assumes a real
        # sample was passed (sample=None would raise AttributeError here) --
        # confirm callers always provide one for capture runs.
        snp_filename = "gatk/" + str(sample.key) + "_" + str(self.date_begin) + "_" + self.flowcell_key + "-sort-variants-ploidyfix-snp.vcf"
        combined_variant_filename = "gatk/" + str(sample.key) + "_" + str(self.date_begin) + "_" + self.flowcell_key + "-sort-variants-ploidyfix-combined.vcf"
        self.combined_variant_path = os.path.join(self.output_dir,combined_variant_filename)
        bam_filename = "bamprep/" + str(description) + "/" + str(sample.key) + "_" + str(self.date_begin) + "_" + self.flowcell_key + "-sort-prep.bam"
        self.analysis_ready_bam_path = os.path.join(self.output_dir, bam_filename)
    self.snp_path = os.path.join(self.output_dir,snp_filename)
    sort_dup_bam = self.sample_key + "_R_" + str(self.date_begin) + "_" + self.flowcell_key + "-sort-dup.bam"
    self.sort_dup_bam = os.path.join(self.output_dir,sort_dup_bam)
    self.project_summary_file = os.path.join(self.output_dir,config.get('Filenames','project_summary'))
    self.restats_file = os.path.join(self.output_dir,config.get('Filenames','restats'))
    # Stats for this process -- filled in later from the bcbio run's
    # summary/restats output; None until then.
    self.total_reads = None
    self.percent_aligned = None
    self.percentage_duplicates = None
    self.insert_size = None
    self.gc_content = None
    self.percentage_on_target_bases = None
    self.mean_target_coverage = None
    self.percentage_with_at_least_10x_coverage = None
    self.percentage_0x_coverage = None
    self.total_variations = None
    self.percentage_in_db_snp = None
    self.titv_all = None
    self.titv_dbsnp = None
    self.titv_novel = None