def __init__(self,config,key=int(-1),sample=None,bcbio=None,concordance_filename=None,hethom_filename=None,indbsnp_filename=None,process_name='snp_stats',**kwargs):
    """ Initializes the snp stats process. """
    if bcbio is None:
        bcbio = Bcbio(config,key=int(-1))
    if bcbio.__class__.__name__ != "Bcbio":
        raise Exception("Trying to start a snp_stats process on a non-bcbio pipeline.")
    input_dir = bcbio.output_dir
    output_dir = input_dir
    SampleQsubProcess.__init__(self,config,key=key,sample=sample,input_dir=input_dir,output_dir=output_dir,process_name=process_name,**kwargs)
    self.snp_path = bcbio.snp_path
    if concordance_filename is None:
        concordance_filename = self.sample_key + ".con"
    self.concordance_path = os.path.join(self.output_dir,concordance_filename)
    if hethom_filename is None:
        hethom_filename = self.sample_key + ".hethom"
    self.hethom_path = os.path.join(self.output_dir,hethom_filename)
    if indbsnp_filename is None:
        indbsnp_filename = self.sample_key + ".indbsnp"
    self.indbsnp_path = os.path.join(self.output_dir,indbsnp_filename)
    #Stats for this process
    self.concordance_calls = None
    self.percentage_concordance = None
    self.hom = None
    self.het = None
    self.variants_total = None
    self.hethom_ratio = None
    self.in_dbsnp = None
    self.search_key = None
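#Example (not part of the pipeline source): a minimal sketch of the default
#stats-filename convention above, where every stats file hangs off the sample
#key.  The sample key and output directory are hypothetical values.
def _demo_snp_stats_paths():
    import os
    sample_key = "Sample_123"           #hypothetical
    output_dir = "/tmp/snp_stats_demo"  #hypothetical
    for ext in (".con",".hethom",".indbsnp"):
        print(os.path.join(output_dir,sample_key + ext))
    #Prints:
    #/tmp/snp_stats_demo/Sample_123.con
    #/tmp/snp_stats_demo/Sample_123.hethom
    #/tmp/snp_stats_demo/Sample_123.indbsnp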
def __init__(self,config,key=int(-1),pipeline_config=None,prev_step=None,process_name='cp',pipeline=None,**kwargs):
    if not prev_step is None:
        if pipeline_config is None:
            pipeline_config = MyConfigParser()
            pipeline_config.read(config.get('Pipeline',pipeline.obj_type))
        cp_input_dir_name = pipeline_config.safe_get('Common_directories','cp_subdir')
        if cp_input_dir_name is None:
            cp_input_dir_name = ""
        cp_input_dir = os.path.join(pipeline.output_dir,cp_input_dir_name)
        if prev_step.obj_type == "CleanBcbio":
            #Copy from the directory that holds the cleaned vcf instead.
            for root, dirs, files in os.walk(prev_step.output_dir,topdown=False):
                for filename in files:
                    if filename.endswith(".vcf"):
                        full_path = os.path.join(root,filename)
                        cp_input_dir = os.path.dirname(full_path)
        output_subdir_name = pipeline_config.safe_get('Common_directories','output_subdir','ngv3')
        cp_dir = os.path.join(pipeline.input_dir,output_subdir_name)
        if not os.path.exists(cp_dir):
            os.makedirs(cp_dir)
        self.cp_dir = cp_dir
        SampleQsubProcess.__init__(self,config,key=key,input_dir=cp_input_dir,output_dir=pipeline.output_dir,process_name=process_name,**kwargs)
        if self.sample_key is not None:
            self.md5_file = os.path.join(cp_dir,self.sample_key + "_exome_md5checksums.txt")
        else:
            self.md5_file = "exome_md5checksums.txt"
def __init__(self,config,key=int(-1),pipeline_config=None,prev_step=None,process_name='bwa_mem',pipeline=None,**kwargs):
    """ Initializes the process object. """
    if not prev_step is None:
        if prev_step.__class__.__name__ == "ZcatMultiple":
            SampleQsubProcess.__init__(self,config,key=key,process_name=process_name,input_dir=prev_step.output_dir,output_dir=os.path.join(prev_step.output_dir,"align"),number_tasks=prev_step.number_tasks,**kwargs)
            multi_fastq_file = prev_step.multi_fastq_file
            if not multi_fastq_file is None:
                self.multi_fastq_file = multi_fastq_file
                multi_fastq = grab_yaml(self.multi_fastq_file)
                lane_numbers = list_from_multi_fastq_object(multi_fastq,"lane")
                flowcells = list_from_multi_fastq_object(multi_fastq,"flowcell")
                input_r1_fastqs = prev_step.r1_copied.split(":")
                input_r2_fastqs = prev_step.r2_copied.split(":")
                input_r1_fastqs = [re.sub(r"\.gz$","",fastq) for fastq in input_r1_fastqs]
                input_r2_fastqs = [re.sub(r"\.gz$","",fastq) for fastq in input_r2_fastqs]
                self.lane_number = ":".join(lane_numbers)
                self.flowcell_key = ":".join(flowcells)
                self.input_fastq1 = ":".join(input_r1_fastqs)
                self.input_fastq2 = ":".join(input_r2_fastqs)
                output_sams = []
                for input_r1_fastq in input_r1_fastqs:
                    output_sam = re.sub(prev_step.output_dir,self.output_dir,re.sub(r"\.fastq",".sam",input_r1_fastq))
                    output_sam = re.sub("_R1","",output_sam)
                    output_sams.append(output_sam)
                self.output_sam = ":".join(output_sams)
            self.bwa_threads = pipeline_config.get('Program specific parameters','bwa_threads')
            self.ref_fa = pipeline_config.get('References','genome_fasta')
            self.project = pipeline.project
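#Example (not part of the pipeline source): the output SAM names above are
#derived from the R1 fastq names by swapping in the align directory and the
#".sam" extension and dropping the "_R1" read tag.  All paths here are
#hypothetical, illustrative values.
def _demo_bwa_mem_output_sams():
    import re
    prev_output_dir = "/work/zcat"
    align_dir = "/work/zcat/align"
    input_r1_fastqs = ["/work/zcat/Sample_1_0_R1.fastq","/work/zcat/Sample_1_1_R1.fastq"]
    output_sams = []
    for input_r1_fastq in input_r1_fastqs:
        output_sam = re.sub(prev_output_dir,align_dir,re.sub(r"\.fastq",".sam",input_r1_fastq))
        output_sam = re.sub("_R1","",output_sam)
        output_sams.append(output_sam)
    print(":".join(output_sams))
    #Prints: /work/zcat/align/Sample_1_0.sam:/work/zcat/align/Sample_1_1.sam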
def __init__(self,config,key=int(-1),sample=None,bcbio=None,input_dir=None,base_output_dir=None,output_dir=None,date=strftime("%Y%m%d",localtime()),time=strftime("%H:%M:%S",localtime()),process_name='clean',complete_file=None,**kwargs):
    if bcbio is None:
        bcbio = Bcbio(config,key=int(-1))
    if bcbio.__class__.__name__ != "Bcbio":
        raise Exception("Trying to start a clean process on a non-bcbio pipeline.")
    SampleQsubProcess.__init__(self,config,key=key,sample=sample,input_dir=input_dir,output_dir=output_dir,process_name=process_name,complete_file=complete_file,**kwargs)
    self.sample_file = bcbio.sample_file
def __init__(self,config,key=int(-1),sample=None,snp_stats=None,output_filename=None,process_name='concord_search',**kwargs):
    """ Initializes the concordance search process. """
    if snp_stats is None:
        snp_stats = SnpStats(config,key=int(-1))
    if snp_stats.__class__.__name__ != "SnpStats":
        raise Exception("Trying to start a concordance search process for a non-snp_stats process.")
    input_dir = snp_stats.output_dir
    output_dir = input_dir
    SampleQsubProcess.__init__(self,config,key=key,sample=sample,input_dir=input_dir,output_dir=output_dir,process_name=process_name,**kwargs)
    self.snp_path = snp_stats.snp_path
    self.first_match = self.sample_key
    self.first_concordance = snp_stats.percentage_concordance
    self.second_match = None
    self.second_concordance = None
    self.third_match = None
    self.third_concordance = None
    self.fourth_match = None
    self.fourth_concordance = None
    self.fifth_match = None
    self.fifth_concordance = None
    self.sub_qsub_file_front = os.path.join(self.output_dir,"against_")
    if output_filename is None:
        output_filename = self.sample_key + "_all.con"
    self.output_path = os.path.join(self.output_dir,output_filename)
def __init__(self,config,key=int(-1),prev_step=None,process_name='fastqc',input_dir=None,output_dir=None,**kwargs):
    """ Initializes the fastqc process object. """
    if not prev_step is None:
        SampleQsubProcess.__init__(self,config,key=key,input_dir=prev_step.output_dir,output_dir=prev_step.output_dir,process_name=process_name,**kwargs)
    elif not input_dir is None and not output_dir is None:
        SampleQsubProcess.__init__(self,config,key=key,input_dir=input_dir,output_dir=output_dir,process_name=process_name,**kwargs)
def __init__(self,config,key=int(-1),process_name='sam_conversion',prev_step=None,**kwargs):
    """ Initializes the process object. """
    if not prev_step is None:
        if prev_step.__class__.__name__ == "BwaSampe" or prev_step.__class__.__name__ == "BwaMem":
            SampleQsubProcess.__init__(self,config,key=key,process_name=process_name,input_dir=prev_step.output_dir,output_dir=prev_step.output_dir,number_tasks=prev_step.number_tasks,**kwargs)
            self.input_sam = prev_step.output_sam
            self.output_bam, num = re.subn(r"\.sam",".bam",self.input_sam)
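#Example (not part of the pipeline source): because the per-task paths are
#kept as one ":"-joined string, a single re.subn call converts every task's
#SAM path to its BAM counterpart at once.  Paths are hypothetical.
def _demo_sam_to_bam():
    import re
    input_sam = "/work/align/Sample_1_0.sam:/work/align/Sample_1_1.sam"
    output_bam, num = re.subn(r"\.sam",".bam",input_sam)
    print(output_bam)  #/work/align/Sample_1_0.bam:/work/align/Sample_1_1.bam
    print(num)         #2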
def __init__(self,config,key=int(-1),sample=None,base_output_dir=None,location=None,process_name='backup',**kwargs):
    """ Initializes the backup process object.  Requires config details. """
    if base_output_dir is None:
        base_output_dir = config.get('Backup','output_dir')
    SampleQsubProcess.__init__(self,config,key=key,sample=sample,base_output_dir=base_output_dir,process_name=process_name,**kwargs)
    self.retry = 0
    if location is None:
        location = config.get('Backup','dir_name')
    self.location = location
def __init__(self,config,process_name='zcat',**kwargs):
    """ Initializes the zcat process object. """
    SampleQsubProcess.__init__(self,config,process_name=process_name,**kwargs)
    extension = ''
    r1_fname = self.sample_key + '_R1.fastq' + extension
    r2_fname = self.sample_key + '_R2.fastq' + extension
    self.r1_path = os.path.join(self.output_dir,r1_fname)
    self.r2_path = os.path.join(self.output_dir,r2_fname)
def __init__(self,config,key=int(-1),process_name='zcat_multiple',multi_fastq_file=None,**kwargs):
    """ Initializes the zcat multiple process object. """
    SampleQsubProcess.__init__(self,config,key=key,process_name=process_name,**kwargs)
    #Grab the first read files.
    r1_in_list = []
    if not multi_fastq_file is None:
        self.multi_fastq_file = multi_fastq_file
        multi_fastq = grab_yaml(self.multi_fastq_file)
        r1_in_list = list_from_multi_fastq_object(multi_fastq,"r1_filename")
    self.r1_input = ":".join(r1_in_list)
    r1_out_list = []
    r1_uncompressed_list = []
    for i in range(len(r1_in_list)):
        filename = self.sample_key + "_" + str(i) + "_R1.fastq"
        r1_uncompressed_list.append(os.path.join(self.output_dir,filename))
        if r1_in_list[i][-3:] == '.gz':
            filename += ".gz"
        r1_out_list.append(os.path.join(self.output_dir,filename))
    ##For the copy process
    self.r1_copied = ":".join(r1_out_list)
    self.r1_uncompressed = ":".join(r1_uncompressed_list)
    #Grab the paired read files.
    r2_in_list = []
    if not multi_fastq_file is None:
        r2_in_list = list_from_multi_fastq_object(multi_fastq,"r2_filename")
    self.r2_input = ":".join(r2_in_list)
    r2_out_list = []
    r2_uncompressed_list = []
    for i in range(len(r2_in_list)):
        filename = self.sample_key + "_" + str(i) + "_R2.fastq"
        r2_uncompressed_list.append(os.path.join(self.output_dir,filename))
        if r2_in_list[i][-3:] == '.gz':
            filename += ".gz"
        r2_out_list.append(os.path.join(self.output_dir,filename))
    self.r2_copied = ":".join(r2_out_list)
    self.r2_uncompressed = ":".join(r2_uncompressed_list)
    if len(r1_in_list) == len(r2_in_list):
        self.number_tasks = len(r1_in_list)
        tmp_dirs = []
        complete_files = []
        for i in range(len(r1_in_list)):
            task_number = i + 1
            complete_file = os.path.join(self.output_dir,self.process_name + '.' + str(task_number) + '.complete')
            complete_files.append(complete_file)
            tmp_dir = os.path.join(self.output_dir,'tmp.' + str(task_number))
            if not os.path.isdir(tmp_dir) and not re.search('dummy',self.output_dir):
                os.makedirs(tmp_dir)
            tmp_dirs.append(tmp_dir)
        self.tmp_dir = ":".join(tmp_dirs)
        self.complete_file = ":".join(complete_files)
    else:
        raise Exception("The number of read and matched-pair read files for sample " + self.sample_key + " are not the same")
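#Example (not part of the pipeline source): a minimal sketch of the per-task
#bookkeeping above, where each input fastq gets its own complete file and tmp
#directory, and both lists are stored ":"-joined.  Paths are hypothetical and
#os.makedirs is omitted so the sketch has no side effects.
def _demo_zcat_multiple_bookkeeping():
    import os
    output_dir = "/work/zcat"
    process_name = "zcat_multiple"
    number_tasks = 3
    complete_files = []
    tmp_dirs = []
    for i in range(number_tasks):
        task_number = i + 1
        complete_files.append(os.path.join(output_dir,process_name + '.' + str(task_number) + '.complete'))
        tmp_dirs.append(os.path.join(output_dir,'tmp.' + str(task_number)))
    print(":".join(complete_files))
    #/work/zcat/zcat_multiple.1.complete:/work/zcat/zcat_multiple.2.complete:/work/zcat/zcat_multiple.3.complete
    print(":".join(tmp_dirs))
    #/work/zcat/tmp.1:/work/zcat/tmp.2:/work/zcat/tmp.3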
def __launch__(self,config,node_list=None):
    """ Sends the job to SGE and records pertinent information. """
    if node_list is None:
        node_list = config.get('Zcat','nodes')
    SampleQsubProcess.__launch__(self,config)
    #SampleQsubProcess.__launch__(self,config,node_list=node_list,queue_name='single')
    return True
def __init__(self,config,key=int(-1),process_name='unified_genotyper',pipeline_config=None,prev_step=None,**kwargs):
    """ Initializes the process object. """
    if not prev_step is None:
        output_dir = re.sub(r"align$","gatk_ug",prev_step.output_dir)
        self.input_bam = prev_step.output_bam
        SampleQsubProcess.__init__(self,config,key=key,new_bam_description="recal",process_name=process_name,input_dir=prev_step.output_dir,output_dir=output_dir,**kwargs)
        self.output_vcf = os.path.join(output_dir,self.sample_key + ".vcf")
        self.ref_fa = pipeline_config.get('References','genome_fasta')
        self.dbsnp_vcf = pipeline_config.get('References','dbsnp_vcf')
        qc_dir = re.sub(r"gatk_ug$","qc",self.output_dir)
        if not os.path.isdir(qc_dir) and not re.search('dummy',self.output_dir):
            os.makedirs(qc_dir)
        self.output_metrics = os.path.join(qc_dir,self.sample_key + ".ug_metrics")
def __launch__(self,config,command=None,**kwargs):
    """ Sends the job to SGE and records pertinent information. """
    if command is None:
        command = ['sleep 30;','qsub']
    return SampleQsubProcess.__launch__(self,config,command=command,**kwargs)
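#Example (not part of the pipeline source): a hedged sketch of how the default
#command pieces might compose, assuming the base __launch__ joins them with
#the qsub script path into one shell line.  The script name is hypothetical
#and the base class's actual behavior may differ.
def _demo_launch_command():
    command = ['sleep 30;','qsub']
    qsub_file = "script.sh"  #hypothetical
    print(" ".join(command + [qsub_file]))
    #Prints: sleep 30; qsub script.sh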
def __init__(self,config,key=int(-1),process_name='bwa_aln',prev_step=None,bwa_threads=1,pipeline_config=None,**kwargs):
    """ Initializes the process object. """
    if not prev_step is None:
        if prev_step.__class__.__name__ == "ZcatMultiple":
            self.input_fastq = prev_step.r1_uncompressed + ":" + prev_step.r2_uncompressed
            self.ref_fa = pipeline_config.get('References','genome_fasta')
            self.bwa_threads = bwa_threads
            input_fastqs = self.input_fastq.split(":")
            SampleQsubProcess.__init__(self,config,key=key,process_name=process_name,input_dir=prev_step.input_dir,output_dir=os.path.join(prev_step.output_dir,"align"),number_tasks=2*prev_step.number_tasks,**kwargs)
            output_sais = []
            for input_fastq in input_fastqs:
                output_sai = re.sub(prev_step.output_dir,self.output_dir,input_fastq)
                output_sai = re.sub(r"\.fastq",".sai",output_sai)
                output_sais.append(output_sai)
            self.output_sai = ":".join(output_sais)
def __init__(self,config,key=int(-1),sample=None,process_name='bwa_sampe',multi_fastq_file=None,ref_fa='/mnt/speed/qc/sequencing/biodata/genomes/Hsapiens/GRCh37/bwa/GRCh37.fa',prev_step=None,pipeline=None,**kwargs):
    """ Initializes the process object. """
    if not prev_step is None:
        if prev_step.__class__.__name__ == "BwaAln":
            if sample is None:
                sample = Sample(config,key="dummy_sample_key")
            if sample.__class__.__name__ != "Sample":
                raise Exception("Trying to start a qcpipeline process on a non-sample.")
            SampleQsubProcess.__init__(self,config,key=key,sample=sample,process_name=process_name,input_dir=prev_step.output_dir,output_dir=prev_step.output_dir,number_tasks=prev_step.number_tasks/2,**kwargs)
            self.project = pipeline.project
            self.sample_key = sample.key
            self.ref_fa = prev_step.ref_fa
            if not multi_fastq_file is None:
                self.multi_fastq_file = multi_fastq_file
                multi_fastq = grab_yaml(self.multi_fastq_file)
                lane_numbers = list_from_multi_fastq_object(multi_fastq,"lane")
                flowcells = list_from_multi_fastq_object(multi_fastq,"flowcell")
                self.lane_number = ":".join(lane_numbers)
                self.flowcell_key = ":".join(flowcells)
                #BwaAln lists all R1 inputs first, then all R2: split the halves apart.
                input_fastqs = prev_step.input_fastq.split(":")
                input_r1_fastqs = input_fastqs[:self.number_tasks]
                input_r2_fastqs = input_fastqs[self.number_tasks:]
                self.input_fastq1 = ":".join(input_r1_fastqs)
                self.input_fastq2 = ":".join(input_r2_fastqs)
                input_sais = prev_step.output_sai.split(":")
                input_r1_sais = input_sais[:self.number_tasks]
                input_r2_sais = input_sais[self.number_tasks:]
                self.input_sai1 = ":".join(input_r1_sais)
                self.input_sai2 = ":".join(input_r2_sais)
                output_sams = []
                for input_r1_fastq in input_r1_fastqs:
                    output_sam = re.sub(r"_R1\.fastq",".sam",input_r1_fastq)
                    output_sams.append(output_sam)
                self.output_sam = ":".join(output_sams)
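#Example (not part of the pipeline source): BwaAln doubles number_tasks to
#cover both R1 and R2 fastqs, listing all R1 outputs before all R2 outputs,
#so the sampe step halves the count and splits the ":"-joined lists down the
#middle.  Paths are hypothetical.
def _demo_bwa_sampe_pairing():
    number_tasks = 2  #half of BwaAln's doubled task count
    input_sais = "/w/s_0_R1.sai:/w/s_1_R1.sai:/w/s_0_R2.sai:/w/s_1_R2.sai".split(":")
    input_r1_sais = input_sais[:number_tasks]
    input_r2_sais = input_sais[number_tasks:]
    print(":".join(input_r1_sais))  #/w/s_0_R1.sai:/w/s_1_R1.sai
    print(":".join(input_r2_sais))  #/w/s_0_R2.sai:/w/s_1_R2.sai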
def __launch__(self,configs,storage_device,node_list=None):
    """ Checks to make sure there is enough storage.  If not, sends email.  If so, sends the job to SGE and records pertinent information. """
    #If the storage device is full, send a notification and abort.
    if storage_device.__is_full__(configs['pipeline'].get('Storage','required_fastq_size')):
        send_email(self.__generate_full_error_text__(configs,storage_device))
        return False
    #This differs from the previous check in that the previous one does not
    #account for jobs that are currently being copied.  This error is less
    #restrictive because required_fastq_size should be larger than the actual
    #fastq size, leaving additional storage once the copy completes.
    if not storage_device.__is_available__(configs['pipeline'].get('Storage','required_fastq_size')) and self.fail_reported == False:
        send_email(self.__generate_storage_error_text__(configs,storage_device))
        self.fail_reported = True
        return False
    if node_list is None:
        node_list = configs['pipeline'].get('Backup','nodes')
    SampleQsubProcess.__launch__(self,configs['system'],node_list=node_list,queue_name='single')
    return True
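#Example (not part of the pipeline source): a toy sketch of the two-tier
#storage check above, assuming __is_full__ compares free space against the
#required size directly while __is_available__ also reserves space claimed by
#copies already in flight.  The class, attributes, and numbers here are
#hypothetical stand-ins for the real storage_device object.
class _ToyStorageDevice(object):
    def __init__(self,free,reserved):
        self.free = free          #bytes currently free
        self.reserved = reserved  #bytes promised to running copies
    def __is_full__(self,required):
        return self.free < int(required)
    def __is_available__(self,required):
        return self.free - self.reserved >= int(required)

def _demo_storage_checks():
    device = _ToyStorageDevice(free=500,reserved=200)
    print(device.__is_full__(400))       #False: the disk itself has room
    print(device.__is_available__(400))  #False: in-flight copies claim too much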
def __init__(self,config,key=int(-1),pipeline_config=None,prev_step=None,process_name='clean',pipeline=None,**kwargs):
    if not prev_step is None:
        SampleQsubProcess.__init__(self,config,key=key,input_dir=prev_step.output_dir,output_dir=pipeline.output_dir,process_name=process_name,**kwargs)
def __init__(self,config,key=int(-1),sample=None,flowcell=None,base_output_dir=None,r1_path=None,r2_path=None,description=None,upload_dir=None,process_name='bcbio',capture_target_bed=None,**kwargs):
    """ Initializes the bcbio process object. """
    if flowcell is None:
        flowcell = Flowcell(config,key="dummy_flowcell_key")
    if flowcell.__class__.__name__ != "Flowcell":
        raise Exception("Trying to start a bcbio process on a non-flowcell.")
    SampleQsubProcess.__init__(self,config,key=key,sample=sample,base_output_dir=base_output_dir,process_name=process_name,**kwargs)
    self.input_dir = self.output_dir
    self.r1_path = r1_path
    self.r2_path = r2_path
    self.systems_file = os.path.join(self.input_dir,'system.yaml')
    self.sample_file = os.path.join(self.input_dir,'sample.yaml')
    self.upload_dir = upload_dir
    self.flowcell_key = flowcell.key
    self.description = description
    if os.path.isfile(self.sample_file):
        sample_yaml = grab_yaml(self.sample_file)
    else:
        sample_yaml = {}
    #snp_filename = self.sample_key + "_R_" + str(self.date_begin) + "_" + self.flowcell_key + "-sort-dup-gatkrecal-realign-variants-snp.vcf"
    try:
        bcbio_lane_name = sample_yaml["details"][0]["lane"]
    except KeyError:
        bcbio_lane_name = None
    if not capture_target_bed is None:
        bait_bed = capture_target_bed
    else:
        try:
            bait_bed = sample_yaml["details"][0]["algorithm"]["hybrid_bait"]
        except KeyError:
            bait_bed = None
    #exit(str(bcbio_lane_name)+"\n")
    if bait_bed is None:
        if bcbio_lane_name is None:
            snp_filename = "gatk/1_" + str(self.date_begin) + "_" + self.flowcell_key + "-sort-dup-gatkrecal-realign-variants-snp.vcf"
        else:
            snp_filename = "gatk/" + bcbio_lane_name + "_" + str(self.date_begin) + "_" + self.flowcell_key + "-sort-dup-gatkrecal-realign-variants-snp.vcf"
        self.analysis_ready_bam_path = None
    else:
        snp_filename = "gatk/" + str(sample.key) + "_" + str(self.date_begin) + "_" + self.flowcell_key + "-sort-variants-ploidyfix-snp.vcf"
        combined_variant_filename = "gatk/" + str(sample.key) + "_" + str(self.date_begin) + "_" + self.flowcell_key + "-sort-variants-ploidyfix-combined.vcf"
        self.combined_variant_path = os.path.join(self.output_dir,combined_variant_filename)
        bam_filename = "bamprep/" + str(description) + "/" + str(sample.key) + "_" + str(self.date_begin) + "_" + self.flowcell_key + "-sort-prep.bam"
        self.analysis_ready_bam_path = os.path.join(self.output_dir,bam_filename)
    self.snp_path = os.path.join(self.output_dir,snp_filename)
    sort_dup_bam = self.sample_key + "_R_" + str(self.date_begin) + "_" + self.flowcell_key + "-sort-dup.bam"
    self.sort_dup_bam = os.path.join(self.output_dir,sort_dup_bam)
    self.project_summary_file = os.path.join(self.output_dir,config.get('Filenames','project_summary'))
    self.restats_file = os.path.join(self.output_dir,config.get('Filenames','restats'))
    #Stats for this process
    self.total_reads = None
    self.percent_aligned = None
    self.percentage_duplicates = None
    self.insert_size = None
    self.gc_content = None
    self.percentage_on_target_bases = None
    self.mean_target_coverage = None
    self.percentage_with_at_least_10x_coverage = None
    self.percentage_0x_coverage = None
    self.total_variations = None
    self.percentage_in_db_snp = None
    self.titv_all = None
    self.titv_dbsnp = None
    self.titv_novel = None
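#Example (not part of the pipeline source): a minimal sketch of the snp-vcf
#naming branches above, which depend on whether a bait bed is in play and on
#the lane name found in sample.yaml.  The lane, date, flowcell, and sample
#values are hypothetical.
def _demo_bcbio_snp_filename():
    date_begin = "20130101"
    flowcell_key = "C1234ACXX"
    sample_key = "Sample_1"
    bcbio_lane_name = "2"  #from sample.yaml details, when present
    bait_bed = None        #set when a capture/hybrid bait bed is configured
    if bait_bed is None:
        lane = bcbio_lane_name if bcbio_lane_name is not None else "1"
        snp_filename = "gatk/" + lane + "_" + date_begin + "_" + flowcell_key + "-sort-dup-gatkrecal-realign-variants-snp.vcf"
    else:
        snp_filename = "gatk/" + sample_key + "_" + date_begin + "_" + flowcell_key + "-sort-variants-ploidyfix-snp.vcf"
    print(snp_filename)
    #Prints: gatk/2_20130101_C1234ACXX-sort-dup-gatkrecal-realign-variants-snp.vcf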
def __init__(self,config,key=int(-1),sample=None,bcbio=None,capture_target_bed=None,process_name='summary_stats',**kwargs):
    """ Initializes the summary stats process. """
    if bcbio is None:
        bcbio = Bcbio(config,key=int(-1))
    if bcbio.__class__.__name__ != "Bcbio":
        raise Exception("Trying to start a summary_stats process on a non-bcbio pipeline.")
    self.capture_target_bed = capture_target_bed
    input_dir = bcbio.output_dir
    output_dir = os.path.join(bcbio.output_dir,"qc/" + str(bcbio.description))
    SampleQsubProcess.__init__(self,config,key=key,sample=sample,input_dir=input_dir,output_dir=output_dir,process_name=process_name,**kwargs)
    self.snp_path = bcbio.snp_path
    if not os.path.isfile(self.snp_path) and bcbio.key != -1:
        snp_dir = os.path.dirname(self.snp_path)
        if os.path.isdir(snp_dir):
            for file in os.listdir(snp_dir):
                if file.endswith("combined-effects.vcf"):
                    self.snp_path = os.path.join(snp_dir,file)
    self.bam_path = bcbio.analysis_ready_bam_path
    if self.bam_path is None and not bcbio.description is None:
        bam_path = os.path.join(bcbio.output_dir,"bamprep/" + bcbio.description)
        for file in os.listdir(bam_path):
            if file.endswith("-sort-prep.bam"):
                self.bam_path = os.path.join(bam_path,file)
                break
    if self.bam_path is None:
        raise Exception("The previous process didn't finish correctly.")
    self.systems_file = bcbio.systems_file
    self.ref_path = get_genome_ref(bcbio.sample_file,bcbio.systems_file)
    if self.ref_path is None:
        self.ref_path = "Not_found"
    if config.has_section("Summary stats") and config.has_option("Summary stats","hethom_ext"):
        hethom_filename = self.sample_key + config.get("Summary stats","hethom_ext")
    else:
        hethom_filename = self.sample_key + ".hethom"
    self.hethom_path = os.path.join(self.output_dir,hethom_filename)
    if config.has_section("Summary stats") and config.has_option("Summary stats","indbsnp_ext"):
        indbsnp_filename = self.sample_key + config.get("Summary stats","indbsnp_ext")
    else:
        indbsnp_filename = self.sample_key + ".indbsnp"
    self.indbsnp_path = os.path.join(self.output_dir,indbsnp_filename)
    if config.has_section("Summary stats") and config.has_option("Summary stats","hs_metrics_ext"):
        hs_metrics_filename = self.sample_key + config.get("Summary stats","hs_metrics_ext")
    else:
        hs_metrics_filename = self.sample_key + ".hs_metrics"
    self.hs_metrics_path = os.path.join(self.output_dir,hs_metrics_filename)
    if config.has_section("Summary stats") and config.has_option("Summary stats","bamtools_stats_ext"):
        bamtools_stats_filename = self.sample_key + config.get("Summary stats","bamtools_stats_ext")
    else:
        bamtools_stats_filename = self.sample_key + ".bamtools_stats"
    self.bamtools_stats_path = os.path.join(self.output_dir,bamtools_stats_filename)
    if config.has_section("Summary stats") and config.has_option("Summary stats","vcfstats_stats_ext"):
        vcfstats_stats_filename = self.sample_key + config.get("Summary stats","vcfstats_stats_ext")
    else:
        vcfstats_stats_filename = self.sample_key + ".vcfstats_stats"
    self.vcfstats_stats_path = os.path.join(self.output_dir,vcfstats_stats_filename)
    self.summary_stats_path = os.path.join(self.output_dir,"summary-stats.csv")
    #Stats for this process
    self.hom = None
    self.het = None
    self.variants_total = None
    self.hethom_ratio = None
    self.total_reads = None
    self.percent_aligned = None
    self.percentage_duplicates = None
    self.insert_size = None
    self.percentage_on_target_bases = None
    self.percentage_near_target_bases = None
    self.mean_target_coverage = None
    self.percentage_with_at_least_10x_coverage = None
    self.percentage_0x_coverage = None
    self.percentage_in_db_snp = None
    self.ts_tv_ratio = None
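#Example (not part of the pipeline source): every stats extension above
#follows the same lookup, honoring a "Summary stats" config override and
#otherwise falling back to a built-in default.  The in-memory config and the
#override value here are hypothetical.
def _demo_summary_stats_ext():
    try:
        from configparser import ConfigParser   #Python 3
    except ImportError:
        from ConfigParser import ConfigParser   #Python 2
    config = ConfigParser()
    config.add_section("Summary stats")
    config.set("Summary stats","hethom_ext",".het_hom.txt")
    def stats_ext(option,default):
        if config.has_section("Summary stats") and config.has_option("Summary stats",option):
            return config.get("Summary stats",option)
        return default
    print("Sample_1" + stats_ext("hethom_ext",".hethom"))          #Sample_1.het_hom.txt (overridden)
    print("Sample_1" + stats_ext("hs_metrics_ext",".hs_metrics"))  #Sample_1.hs_metrics (default)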