Example 1
    def __init__(self,config,key=-1,pipeline_config=None,prev_step=None,process_name='bwa_mem',pipeline=None,**kwargs):
        """
        Initializes the bwa_mem process object.
        """
        if prev_step is not None:
            if prev_step.__class__.__name__ == "ZcatMultiple":
                SampleQsubProcess.__init__(self,config,key=key,process_name=process_name,input_dir=prev_step.output_dir,output_dir=os.path.join(prev_step.output_dir,"align"),number_tasks=prev_step.number_tasks,**kwargs)
                multi_fastq_file = prev_step.multi_fastq_file
                if multi_fastq_file is not None:
                    self.multi_fastq_file = multi_fastq_file
                    multi_fastq = grab_yaml(self.multi_fastq_file)
                    lane_numbers = list_from_multi_fastq_object(multi_fastq,"lane")
                    flowcells = list_from_multi_fastq_object(multi_fastq,"flowcell")
                    #The previous step records its per-lane outputs as colon-delimited strings.
                    input_r1_fastqs = prev_step.r1_copied.split(":")
                    input_r2_fastqs = prev_step.r2_copied.split(":")
                    input_r1_fastqs = [re.sub(r"\.gz$","",fastq) for fastq in input_r1_fastqs]
                    input_r2_fastqs = [re.sub(r"\.gz$","",fastq) for fastq in input_r2_fastqs]
                    self.lane_number = ":".join(lane_numbers)
                    self.flowcell_key = ":".join(flowcells)
                    self.input_fastq1 = ":".join(input_r1_fastqs)
                    self.input_fastq2 = ":".join(input_r2_fastqs)
                    #Derive one output SAM path per R1 fastq, relocated into the align directory.
                    output_sams = []
                    for input_r1_fastq in input_r1_fastqs:
                        output_sam = re.sub(r"\.fastq$",".sam",input_r1_fastq).replace(prev_step.output_dir,self.output_dir)
                        output_sam = re.sub("_R1","",output_sam)
                        output_sams.append(output_sam)
                    self.output_sam = ":".join(output_sams)
                self.bwa_threads = pipeline_config.get('Program specific parameters','bwa_threads')
                self.ref_fa = pipeline_config.get('References','genome_fasta')
                self.project = pipeline.project
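
A minimal usage sketch for this constructor, assuming the enclosing class is named BwaMem (a guess from process_name), that ZcatMultiple is the class shown in Example 2, and that config, pipeline_config (ConfigParser-style objects), and a pipeline object with a project attribute already exist; all names here are illustrative, not taken from the source:

    #Hypothetical wiring: chain the bwa_mem step onto a finished ZcatMultiple step.
    zcat_step = ZcatMultiple(config,key=11,multi_fastq_file="multi_fastq.yaml")
    bwa_step = BwaMem(config,key=12,pipeline_config=pipeline_config,prev_step=zcat_step,pipeline=pipeline)
    print(bwa_step.output_sam)  #colon-delimited SAM paths, one per lane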
Example 2
    def __init__(self,config,key=-1,process_name='zcat_multiple',multi_fastq_file=None,**kwargs):
        """
        Initializes the zcat multiple process object.
        """
        SampleQsubProcess.__init__(self,config,key=key,process_name=process_name,**kwargs)
        #Grab the first read files.
        r1_in_list = []
        if multi_fastq_file is not None:
            self.multi_fastq_file = multi_fastq_file
            multi_fastq = grab_yaml(self.multi_fastq_file)
            r1_in_list = list_from_multi_fastq_object(multi_fastq,"r1_filename")
        self.r1_input = ":".join(r1_in_list)
        r1_out_list = []
        r1_uncompressed_list = []
        for i,r1_in in enumerate(r1_in_list):
            filename = self.sample_key + "_" + str(i) + "_R1.fastq"
            r1_uncompressed_list.append(os.path.join(self.output_dir,filename))
            #Keep the .gz suffix on the copied file when the input is compressed.
            if r1_in.endswith('.gz'):
                filename += ".gz"
            r1_out_list.append(os.path.join(self.output_dir,filename)) ##For the copy process
        self.r1_copied = ":".join(r1_out_list)
        self.r1_uncompressed = ":".join(r1_uncompressed_list)

        #Grab the paired read files.
        r2_in_list = []
        if multi_fastq_file is not None:
            r2_in_list = list_from_multi_fastq_object(multi_fastq,"r2_filename")
        self.r2_input = ":".join(r2_in_list)
        r2_out_list = []
        r2_uncompressed_list = []
        for i,r2_in in enumerate(r2_in_list):
            filename = self.sample_key + "_" + str(i) + "_R2.fastq"
            r2_uncompressed_list.append(os.path.join(self.output_dir,filename))
            if r2_in.endswith('.gz'):
                filename += ".gz"
            r2_out_list.append(os.path.join(self.output_dir,filename))
        self.r2_copied = ":".join(r2_out_list)
        self.r2_uncompressed = ":".join(r2_uncompressed_list)

        #R1 and R2 fastqs must pair up one-to-one; each pair becomes one qsub task.
        if len(r1_in_list) == len(r2_in_list):
            self.number_tasks = len(r1_in_list)
            tmp_dirs = []
            complete_files = []
            for i in range(len(r1_in_list)):
                task_number = i + 1
                complete_file = os.path.join(self.output_dir, self.process_name + '.' + str(task_number) + '.complete')
                complete_files.append(complete_file)
                tmp_dir = os.path.join(self.output_dir, 'tmp.' + str(task_number))
                if not os.path.isdir(tmp_dir) and not re.search('dummy',self.output_dir):
                    os.makedirs(tmp_dir)
                tmp_dirs.append(tmp_dir)
            self.tmp_dir = ":".join(tmp_dirs)
            self.complete_file = ":".join(complete_files)
        else:
            raise Exception("The number of read and matched-pair read files for sample " + sample.key + " are not the same")
Example 3
    def __init__(self,config,key=-1,sample=None,process_name='bwa_sampe',multi_fastq_file=None,ref_fa='/mnt/speed/qc/sequencing/biodata/genomes/Hsapiens/GRCh37/bwa/GRCh37.fa',prev_step=None,pipeline=None,**kwargs):
        """
        Initializes the bwa_sampe process object.
        """
        if prev_step is not None:
            if prev_step.__class__.__name__ == "BwaAln":
                if sample is None:
                    sample = Sample(config,key="dummy_sample_key")
                if sample.__class__.__name__ != "Sample":
                    raise Exception("Trying to start a qcpipeline process on a non-sample.")
                #bwa aln ran one task per fastq (R1 and R2 separately), so sampe runs half as many tasks, one per pair.
                SampleQsubProcess.__init__(self,config,key=key,sample=sample,process_name=process_name,input_dir=prev_step.output_dir,output_dir=prev_step.output_dir,number_tasks=prev_step.number_tasks//2,**kwargs)
                self.project = pipeline.project
                self.sample_key = sample.key
                self.ref_fa = prev_step.ref_fa
                if multi_fastq_file is not None:
                    self.multi_fastq_file = multi_fastq_file
                    multi_fastq = grab_yaml(self.multi_fastq_file)
                    lane_numbers = list_from_multi_fastq_object(multi_fastq,"lane")
                    flowcells = list_from_multi_fastq_object(multi_fastq,"flowcell")
                    self.lane_number = ":".join(lane_numbers)
                    self.flowcell_key = ":".join(flowcells)

                #The BwaAln step lists all R1 fastqs first and all R2 fastqs second, so split the list in half.
                input_fastqs = prev_step.input_fastq.split(":")
                input_r1_fastqs = input_fastqs[:self.number_tasks]
                input_r2_fastqs = input_fastqs[self.number_tasks:]
                self.input_fastq1 = ":".join(input_r1_fastqs)
                self.input_fastq2 = ":".join(input_r2_fastqs)

                input_sais = prev_step.output_sai.split(":")
                input_r1_sais = input_sais[:self.number_tasks]
                input_r2_sais = input_sais[self.number_tasks:]
                self.input_sai1 = ":".join(input_r1_sais)
                self.input_sai2 = ":".join(input_r2_sais)

                output_sams = []
                for input_r1_fastq in input_r1_fastqs:
                    output_sam = re.sub("_R1.fastq",".sam",input_r1_fastq)
                    output_sams.append(output_sam)
                self.output_sam = ":".join(output_sams)
Example 4
    def __init__(self,config,key=-1,sample=None,flowcell=None,base_output_dir=None,r1_path=None,r2_path=None,description=None,upload_dir=None,process_name='bcbio',capture_target_bed=None,**kwargs):
        """
        Initializes the bcbio process object.
        """
        if flowcell is None:
            flowcell = Flowcell(config,key="dummy_flowcell_key")
        if flowcell.__class__.__name__ != "Flowcell":
            raise Exception("Trying to start a bcbio process on a non-flowcell.")
        SampleQsubProcess.__init__(self,config,key=key,sample=sample,base_output_dir=base_output_dir,process_name=process_name,**kwargs)
        self.input_dir = self.output_dir
        self.r1_path = r1_path
        self.r2_path = r2_path
        self.systems_file = os.path.join(self.input_dir,'system.yaml')
        self.sample_file = os.path.join(self.input_dir,'sample.yaml')
        self.upload_dir = upload_dir
        self.flowcell_key = flowcell.key
        self.description = description
        if os.path.isfile(self.sample_file):
            sample_yaml = grab_yaml(self.sample_file)
        else:
            sample_yaml = {}
        #snp_filename = self.sample_key + "_R_" + str(self.date_begin) + "_" + self.flowcell_key + "-sort-dup-gatkrecal-realign-variants-snp.vcf"
        #Pull the lane name bcbio recorded in sample.yaml, if any.
        try:
            bcbio_lane_name = sample_yaml["details"][0]["lane"]
        except KeyError:
            bcbio_lane_name = None
        #Prefer an explicitly supplied capture target; otherwise fall back to the bait BED in sample.yaml.
        if capture_target_bed is not None:
            bait_bed = capture_target_bed
        else:
            try:
                bait_bed = sample_yaml["details"][0]["algorithm"]["hybrid_bait"]
            except KeyError:
                bait_bed = None
        #exit(str(bcbio_lane_name)+"\n")
        #Non-capture runs and capture runs name their bcbio output files differently.
        if bait_bed is None:
            if bcbio_lane_name is None:
                snp_filename = "gatk/1_" + str(self.date_begin) + "_" + self.flowcell_key + "-sort-dup-gatkrecal-realign-variants-snp.vcf"
            else:
                snp_filename = "gatk/" + bcbio_lane_name + "_" + str(self.date_begin) + "_" + self.flowcell_key + "-sort-dup-gatkrecal-realign-variants-snp.vcf"
            self.analysis_ready_bam_path = None
        else:
            snp_filename = "gatk/" + str(sample.key) + "_" + str(self.date_begin) + "_" + self.flowcell_key + "-sort-variants-ploidyfix-snp.vcf"
            combined_variant_filename = "gatk/" + str(sample.key) + "_" + str(self.date_begin) + "_" + self.flowcell_key + "-sort-variants-ploidyfix-combined.vcf"
            self.combined_variant_path = os.path.join(self.output_dir,combined_variant_filename)
            bam_filename = "bamprep/" + str(description) + "/" + str(sample.key) + "_" + str(self.date_begin) + "_" + self.flowcell_key + "-sort-prep.bam"
            self.analysis_ready_bam_path = os.path.join(self.output_dir,bam_filename)
        self.snp_path = os.path.join(self.output_dir,snp_filename)
        sort_dup_bam = self.sample_key + "_R_" + str(self.date_begin) + "_" + self.flowcell_key + "-sort-dup.bam"
        self.sort_dup_bam = os.path.join(self.output_dir,sort_dup_bam)
        self.project_summary_file = os.path.join(self.output_dir,config.get('Filenames','project_summary'))
        self.restats_file = os.path.join(self.output_dir,config.get('Filenames','restats'))
        #Stats for this process
        self.total_reads = None
        self.percent_aligned = None
        self.percentage_duplicates = None
        self.insert_size = None
        self.gc_content = None
        self.percentage_on_target_bases = None
        self.mean_target_coverage = None
        self.percentage_with_at_least_10x_coverage = None
        self.percentage_0x_coverage = None
        self.total_variations = None
        self.percentage_in_db_snp = None
        self.titv_all = None
        self.titv_dbsnp = None
        self.titv_novel = None
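
A minimal usage sketch, assuming the enclosing class is named Bcbio (a guess from process_name) and that config, sample, and flowcell objects already exist; every path below is a placeholder, not a value from the source:

    #Hypothetical call for a capture (targeted) sample; all paths are placeholders.
    bcbio_step = Bcbio(config,key=21,sample=sample,flowcell=flowcell,
                       base_output_dir="/path/to/output",description="exome",
                       upload_dir="/path/to/upload",capture_target_bed="/path/to/capture.bed")
    print(bcbio_step.snp_path)                 #expected SNP VCF path under the output directory
    print(bcbio_step.analysis_ready_bam_path)  #expected analysis-ready BAM path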