def __push_flowcells_into_relevant_pipelines__(self,configs,mockdb):
    """
    Provides the interface from which all post casava flowcell pipelines are run.
    """
    if configs["system"].get("Logging","debug") == "True":
        print "  Starting post casava flowcell pipelines for " + self.flowcell_key
    flowcell_dir_name = os.path.basename(self.output_dir)
    automation_parameters_config = MyConfigParser()
    automation_parameters_config.read(configs["system"].get("Filenames","automation_config"))
    running_location = "Speed"
    parsed = parse_sample_sheet(configs['system'],mockdb,self.output_dir)
    description = parsed['description'].replace(parsed['SampleID']+'_','')
    description_dict = parse_description_into_dictionary(description)
    if 'Pipeline' in description_dict:
        pipeline_key = description_dict['Pipeline']
    else:
        description_pieces = parsed['description'].split('-')
        pipeline_key = description_pieces[-1]
    if pipeline_key.startswith('CCGL'):
        pipeline_key = 'CCGL'
    pipeline_name = automation_parameters_config.safe_get("Flowcell pipeline",pipeline_key)
    if pipeline_name is None:
        return 1
    if configs["system"].get("Logging","debug") == "True":
        print "Starting " + pipeline_name
    pipeline_config = MyConfigParser()
    pipeline_config.read(configs["system"].get('Pipeline',pipeline_name))
    pipeline = mockdb[pipeline_name].__new__(configs['system'],input_dir=self.output_dir,pipeline_key=pipeline_key,seq_run_key=self.seq_run_key,project=parsed['project_name'],flowcell_dir_name=flowcell_dir_name,pipeline_config=pipeline_config,**parsed)
    return 1
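#A minimal sketch of the safe_get behavior the code above relies on.  This is
#an assumption inferred from the call sites, not the actual MyConfigParser
#source: it behaves like ConfigParser.get, but returns a default (None unless
#one is supplied) instead of raising when the section or option is missing.
import ConfigParser

class MyConfigParserSketch(ConfigParser.SafeConfigParser):
    def safe_get(self, section, option, default=None):
        try:
            return self.get(section, option)
        except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
            return default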
def __is_complete__(self,configs,mockdb,*args,**kwargs):
    """
    Checks to see if the pipeline is complete.  If not, and it is
    ready to advance, then the pipeline is advanced.
    """
    if GenericProcess.__is_complete__(self,*args,**kwargs):
        return True
    if not hasattr(self,"generic_copy_key") or self.generic_copy_key is None:
        if configs["system"].get("Logging","debug") == "True":
            print "Copying bcls"
        self.__launch_copy_bcls__(configs,mockdb)
        return False
    current_configs = {}
    current_configs["system"] = configs["system"]
    pipeline_config = MyConfigParser()
    current_configs["pipeline"] = pipeline_config
    pipeline_config.read(configs["system"].get('Pipeline','BclToFastqPipeline'))
    if self.__handle_linear_steps__(current_configs,mockdb,skip_finish=True,*args,**kwargs):
        casava = mockdb['Casava'].__get__(configs['system'],self.casava_key)
        if configs["system"].get("Logging","debug") == "True":
            print "Checking the pipeline first step results"
        if casava.__do_all_relevant_pipelines_have_first_step_complete__(current_configs,mockdb):
            self.__finish__(*args,**kwargs)
            return True
    return False
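#Hypothetical usage of __is_complete__ (the caller and names below are
#assumptions, not from this repo): the automation loop is expected to call it
#repeatedly, so each poll both advances a ready pipeline one step and reports
#whether it has finished.
def poll_pipelines(active_pipelines, configs, mockdb):
    for pipeline in active_pipelines:
        if pipeline.__is_complete__(configs, mockdb):
            print "Pipeline " + str(pipeline.key) + " finished"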
def __push_samples_into_relevant_pipelines__(self,configs,mockdb):
    """
    Provides the interface from which all post casava sample pipelines are run.
    """
    if configs["system"].get("Logging","debug") == "True":
        print "  Starting post casava sample pipelines for " + self.flowcell_key
        print "  Determining Sample dirs"
    sample_dirs = list_project_sample_dirs(self.output_dir.split(":"))
    if configs["system"].get("Logging","debug") == "True":
        print "  Samples: " + str(sample_dirs)
    flowcell_dir_name = os.path.basename(self.output_dir)
    automation_parameters_config = MyConfigParser()
    automation_parameters_config.read(configs["system"].get("Filenames","automation_config"))
    fastqc_pipeline_config = MyConfigParser()
    fastqc_pipeline_config.read(configs["system"].get("Pipeline","FastQCPipeline"))
    for project in sample_dirs:
        for sample in sample_dirs[project]:
            #running_location = identify_running_location_with_most_currently_available(configs,storage_devices)
            running_location = "Speed"
            parsed = parse_sample_sheet(configs['system'],mockdb,sample_dirs[project][sample][0])
            if configs["system"].get("Logging","debug") == "True":
                print "  Pushing fastqc pipeline for " + sample
            fastqc_pipeline = mockdb["FastQCPipeline"].__new__(configs['system'],input_dir=sample_dirs[project][sample][0],flowcell_dir_name=flowcell_dir_name,project=parsed['project_name'],pipeline_config=fastqc_pipeline_config,seq_run_key=self.seq_run_key,**parsed)
            description_dict = parse_description_into_dictionary(parsed['description'])
            if 'Pipeline' in description_dict:
                pipeline_key = description_dict['Pipeline']
            else:
                description_pieces = parsed['description'].split('-')
                pipeline_key = description_pieces[-1]
            pipeline_name = automation_parameters_config.safe_get("Pipeline",pipeline_key)
            if pipeline_name is None:
                continue
            if configs["system"].get("Logging","debug") == "True":
                print "Starting " + pipeline_name + " for " + sample
            pipeline = mockdb[pipeline_name].__new__(configs['system'],input_dir=sample_dirs[project][sample][0],pipeline_key=pipeline_key,seq_run_key=self.seq_run_key,project=parsed['project_name'],flowcell_dir_name=flowcell_dir_name,**parsed)
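#The nested structure the loop above assumes list_project_sample_dirs returns
#(inferred from the iteration, not from the helper's source): a dict keyed by
#project, each holding a dict keyed by sample whose values are lists of
#directories, the first of which is used as the pipeline input.  Paths here
#are hypothetical.
sample_dirs_example = {
    "ProjectX": {
        "SampleA": ["/casava/output/FlowcellY/Project_ProjectX/Sample_SampleA"],
    },
}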
def things_to_do_if_initializing_flowcell_pipeline_with_input_directory(configs,storage_devices,mockdb,source_dir,pipeline_name=None,base_output_dir=None):
    """
    Starts pipelines that read the entire flowcell data.
    """
    if configs["system"].get("Logging","debug") == "True":
        print "  Starting post casava flowcell pipelines"
    flowcell_dir_name = os.path.basename(source_dir)
    automation_parameters_config = MyConfigParser()
    automation_parameters_config.read(configs["system"].get("Filenames","automation_config"))
    running_location = "Speed"
    parsed = parse_sample_sheet(configs['system'],mockdb,source_dir)
    description = parsed['description'].replace(parsed['SampleID']+'_','')
    description_dict = parse_description_into_dictionary(description)
    if configs["system"].get("Logging","debug") == "True":
        print "  Description = " + str(parsed['description'])
    if 'Pipeline' in description_dict:
        pipeline_key = description_dict['Pipeline']
    else:
        description_pieces = parsed['description'].split('_')
        pipeline_key = description_pieces[-1]
    if pipeline_key.startswith('CCGL'):
        pipeline_key = 'CCGL'
    pipeline_name_check = automation_parameters_config.safe_get("Flowcell pipeline",pipeline_key)
    if pipeline_name_check != pipeline_name:
        return 1
    if pipeline_name is None:
        return 1
    if configs["system"].get("Logging","debug") == "True":
        print "Starting " + pipeline_name
    pipeline = mockdb[pipeline_name].__new__(configs['system'],input_dir=source_dir,pipeline_key=pipeline_key,seq_run_key=None,project=parsed['project_name'],flowcell_dir_name=flowcell_dir_name,running_location='Speed',pipeline_config=configs["pipeline"],**parsed)
    return 1
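#Worked example (values hypothetical) of the pipeline-key resolution above:
#the SampleID prefix is stripped before the description is parsed into a
#dictionary; when no explicit 'Pipeline' entry is found, the key falls back to
#the last underscore-separated piece of the raw description.
parsed_example = {'SampleID': 'S001', 'description': 'S001_Exome_CCGL02'}
description_example = parsed_example['description'].replace(parsed_example['SampleID'] + '_', '')
#description_example == "Exome_CCGL02"
pipeline_key_example = parsed_example['description'].split('_')[-1]   # "CCGL02"
if pipeline_key_example.startswith('CCGL'):
    pipeline_key_example = 'CCGL'   # all CCGL keys map to a single flowcell pipeline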
def __init__(self,config,key=int(-1),pipeline_config=None,prev_step=None,process_name='cp',pipeline=None,**kwargs):
    if not prev_step is None:
        if pipeline_config is None:
            pipeline_config = MyConfigParser()
            pipeline_config.read(config.get('Pipeline',pipeline.obj_type))
        cp_input_dir_name = pipeline_config.safe_get('Common_directories','cp_subdir')
        if cp_input_dir_name is None:
            cp_input_dir_name = ""
        if prev_step.obj_type == "CleanBcbio":
            #Copy from the directory that holds the final vcf.
            for root, dirs, files in os.walk(prev_step.output_dir,topdown=False):
                for filename in files:
                    if filename.endswith(".vcf"):
                        full_path = os.path.join(root,filename)
                        cp_input_dir = os.path.dirname(full_path)
        else:
            cp_input_dir = os.path.join(pipeline.output_dir,cp_input_dir_name)
        output_subdir_name = pipeline_config.safe_get('Common_directories','output_subdir','ngv3')
        cp_dir = os.path.join(pipeline.input_dir,output_subdir_name)
        if not os.path.exists(cp_dir):
            os.makedirs(cp_dir)
        self.cp_dir = cp_dir
        SampleQsubProcess.__init__(self,config,key=key,input_dir=cp_input_dir,output_dir=pipeline.output_dir,process_name=process_name,**kwargs)
        if self.sample_key is not None:
            self.md5_file = os.path.join(cp_dir,self.sample_key + "_exome_md5checksums.txt")
        else:
            self.md5_file = os.path.join(cp_dir,"exome_md5checksums.txt")
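#The CleanBcbio walk above, isolated as a standalone helper for clarity (the
#name and the directory tree are hypothetical): because the walk is bottom-up
#and keeps reassigning the variable, the result is the directory of the last
#.vcf file encountered.
import os

def find_vcf_dir(top_dir):
    vcf_dir = None
    for root, dirs, files in os.walk(top_dir, topdown=False):
        for filename in files:
            if filename.endswith(".vcf"):
                vcf_dir = os.path.dirname(os.path.join(root, filename))
    return vcf_dir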
def __init__(self,config,key=int(-1),sample=None,flowcell=None,description=None,recipe=None,input_dir=None,pipeline_config=None,pipeline_key=None,process_name='qcpipeline',running_location='Speed',storage_needed=500000000,project=None,flowcell_dir_name=None,seq_run_key=None,date=strftime("%Y%m%d",localtime()),*args,**kwargs):
    if not pipeline_config is None or not pipeline_key is None:
        if sample is None:
            sample = Sample(config,key="dummy_sample_key")
        if sample.__class__.__name__ != "Sample":
            raise Exception("Trying to start a qcpipeline process on a non-sample.")
        if flowcell is None:
            flowcell = Flowcell(config,key="dummy_flowcell_key")
        if flowcell.__class__.__name__ != "Flowcell":
            raise Exception("Trying to start a qcpipeline process on a sample not from a flowcell.")
        automation_parameters_config = MyConfigParser()
        automation_parameters_config.read(config.get("Filenames","automation_config"))
        #Specific information about this pipeline
        self.description = description
        self.recipe = recipe
        self.storage_needed = storage_needed
        self.input_dir = input_dir
        self.running_location = running_location
        self.seq_run_key = seq_run_key
        capture_target_bed = automation_parameters_config.safe_get("Target",pipeline_key)
        if not capture_target_bed is None:
            self.capture_target_bed = capture_target_bed
        if pipeline_config is None:
            pipeline_name = automation_parameters_config.safe_get("Pipeline",pipeline_key)
            pipeline_config = MyConfigParser()
            pipeline_config.read(config.get('Pipeline',pipeline_name))
        pipeline_steps = pipeline_config.get('Pipeline','steps').split(',')
        for step in pipeline_steps:
            setattr(self,step+"_key",None)
        base_output_dir = pipeline_config.get('Common_directories','archive_directory')
        if flowcell_dir_name is None:
            self.client_dir = self.input_dir
        else:
            sample_dir_name = sample.key
            if not str(sample_dir_name).startswith("Sample_"):
                sample_dir_name = "Sample_" + sample_dir_name
            self.client_dir = os.path.join(config.get('Common_directories','casava_output'),flowcell_dir_name,"Project_"+str(project),sample_dir_name)
        self.flowcell_key = flowcell.key
        base_client_dir = config.get('Common_directories','casava_output')
        if project is None:
            if base_output_dir is None:
                base_output_dir = ""
            self.output_dir = os.path.join(base_output_dir,sample.key + '_' + str(date))
        else:
            project_out = re.sub('_','-',project)
            self.project = project_out
            if re.search("[0-9]",project_out[0:1]):
                project_out = "Project-" + project_out
            if base_output_dir is None:
                base_output_dir = ""
            self.output_dir = os.path.join(base_output_dir,project_out + "_" + sample.key + '_' + str(date))
        if not os.path.exists(self.output_dir) and not re.search('dummy',sample.key):
            os.makedirs(self.output_dir)
        GenericProcess.__init__(self,config,key=key,process_name=process_name,**kwargs)
        self.date = date
        self.sample_key = sample.key
        self.altered_parameters = None
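#Worked example (values hypothetical) of the output-directory naming above:
#underscores in the project become hyphens, a leading digit gets a "Project-"
#prefix, and the result is joined with the sample key and date.
import os, re
project_example = "3001_exomes"
project_out_example = re.sub('_', '-', project_example)        # "3001-exomes"
if re.search("[0-9]", project_out_example[0:1]):
    project_out_example = "Project-" + project_out_example     # "Project-3001-exomes"
print os.path.join("/archive", project_out_example + "_" + "SampleA" + "_" + "20130501")
#-> /archive/Project-3001-exomes_SampleA_20130501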
def __launch_copy_bcls__(self,configs,mockdb):
    """
    This launches the process that will archive the fastq directories.
    """
    input_dir = os.path.join(self.input_dir,"Data/Intensities")
    output_dir = os.path.join(self.output_dir,"Data/Intensities")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    copy_all_xml(self.input_dir,self.output_dir)
    copy_all_xml(os.path.join(self.input_dir,"Data"),os.path.join(self.output_dir,"Data"))
    current_configs = {}
    current_configs["system"] = configs["system"]
    pipeline_config = MyConfigParser()
    current_configs["pipeline"] = pipeline_config
    pipeline_config.read(configs["system"].get('Pipeline','BclToFastqPipeline'))
    copy_bcls = mockdb['GenericCopy'].__new__(configs['system'],input_dir=input_dir,output_dir=output_dir)
    self.generic_copy_key = copy_bcls.key
    copy_bcls.__fill_qsub_file__(current_configs)
    copy_bcls.__launch__(configs['system'])
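#A minimal sketch of what copy_all_xml is assumed to do, inferred from its
#name and call sites rather than its source: copy the top-level .xml files of
#one directory (RunInfo.xml, runParameters.xml, and the like) into another.
import glob, os, shutil

def copy_all_xml_sketch(src_dir, dst_dir):
    for path in glob.glob(os.path.join(src_dir, "*.xml")):
        shutil.copy(path, dst_dir)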
def things_to_do_if_initializing_pipeline_with_input_directory(configs,storage_devices,mockdb,source_dir,pipeline_name=None,base_output_dir=None,combine_projects=True):
    if combine_projects:
        sample_dirs = {}
        sample_dirs["dummy_project"] = list_sample_dirs(source_dir)
    else:
        sample_dirs = list_project_sample_dirs(source_dir)
    target_config = MyConfigParser()
    target_config.read(configs["system"].get("Filenames","target_config"))
    for project in sample_dirs:
        for sample in sample_dirs[project]:
            running_location = identify_running_location_with_most_currently_available(configs,storage_devices)
            parsed = parse_sample_sheet(configs['system'],mockdb,sample_dirs[project][sample][0])
            if base_output_dir is None:
                base_output_dir = configs['pipeline'].get('Common_directories','archive_directory')
            automation_parameters_config = MyConfigParser()
            automation_parameters_config.read(configs["system"].get("Filenames","automation_config"))
            description_dict = parse_description_into_dictionary(parsed['description'])
            if 'Pipeline' in description_dict:
                pipeline_key = description_dict['Pipeline']
            else:
                description_pieces = parsed['description'].split('_')
                pipeline_key = description_pieces[-1]
            pipeline_name_for_sample = automation_parameters_config.safe_get("Pipeline",pipeline_key)
            if pipeline_name_for_sample != pipeline_name:
                continue
            mockdb[pipeline_name].__new__(configs['system'],input_dir=sample_dirs[project][sample][0],pipeline_config=configs["pipeline"],project=parsed['project_name'],pipeline_key=pipeline_key,**parsed)
            flowcell_dict = mockdb['SequencingRun'].__attribute_value_to_object_dict__('flowcell_key')
            if parsed['flowcell'].key in flowcell_dict:
                seq_run = flowcell_dict[parsed['flowcell'].key]
            else:
                try:
                    base_dir = get_sequencing_run_base_dir(source_dir)
                    [date,machine_key,run_number,side,flowcell_key] = parse_sequencing_run_dir(base_dir)
                    machine = mockdb['HiSeqMachine'].__get__(configs['system'],machine_key)
                    run_type = determine_run_type(base_dir)
                    seq_run = mockdb['SequencingRun'].__new__(configs['system'],parsed['flowcell'],machine,date,run_number,output_dir=base_dir,side=side,run_type=run_type)
                    fill_demultiplex_stats(configs['system'],mockdb,seq_run.output_dir,parsed['flowcell'],machine)
                except:
                    pass
    return 1
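#Hypothetical illustration of the run-directory convention assumed by
#parse_sequencing_run_dir (Illumina-style naming; the exact format here is an
#assumption, not taken from the parser's source).
import os
base_dir_example = "/runs/130114_SN1234_0098_AC1B2D3ACXX"
date, machine_key, run_number, rest = os.path.basename(base_dir_example).split('_')
side, flowcell_key = rest[0], rest[1:]
#-> date="130114", machine_key="SN1234", run_number="0098",
#   side="A", flowcell_key="C1B2D3ACXX"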
def __do_all_relevant_pipelines_have_first_step_complete__(self,configs,mockdb):
    """
    Since the first step of post casava pipelines is to copy the data,
    moving the data after these pipelines are started must wait for this
    step to complete.  This is only a concern when everything is
    automated.  This checks that step and whether the FastQCPipeline is
    finished.
    """
    pipeline_names = configs["system"].get('Pipeline','post_casava_automated').split(',')
    for pipeline_name in pipeline_names:
        try:
            seq_run_key_dict = mockdb[pipeline_name].__attribute_value_to_object_dict__('seq_run_key')
            pipeline_config = MyConfigParser()
            pipeline_config.read(configs["system"].get('Pipeline',pipeline_name))
            for pipeline in seq_run_key_dict[self.seq_run_key]:
                if pipeline_name == "FastQCPipeline":
                    if not pipeline.__is_complete__():
                        return False
                if not pipeline.__check_first_step__(pipeline_config):
                    return False
        except:
            continue
    return True
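#The shape this method assumes __attribute_value_to_object_dict__ returns
#(inferred from the iteration above, not from the mockdb source): each
#distinct attribute value maps to the list of stored objects carrying it.
seq_run_key_dict_example = {
    "seq_run_1": [],   # would hold the pipeline objects whose seq_run_key == "seq_run_1"
}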
parser = argparse.ArgumentParser(description='Manages data and submits new jobs.')
parser.add_argument('-i', dest='source_dir', nargs='+', help='fastq source', default=None)
parser.add_argument('-o', dest='dest_dir', help='vcf destination', default=None)
parser.add_argument('-p', '--pipeline', dest='pipeline', help='The version of the pipeline', default='QualityControlPipeline')
parser.add_argument('--analyze_sequencing_run', dest='seq_run', action='store_true', help='Reanalyze the given sequencing run.', default=False)
parser.add_argument('--system_config', dest='system_config_file', help='The system configuration file', default='/home/sequencing/src/pipeline_project/pipeline/config/ihg_system.cfg')
parser.add_argument('-d', '--debug', dest='debug', action='store_true', help='Turn debugging on', default=False)
parser.add_argument('--sample_sheet', dest='sample_sheet', type=str, help='For use when re-initializing a sequencing run.  Specifies the sample sheet to be used for casava.', default=None)
options = parser.parse_args()
if options.debug is True:
    print "Options are " + str(options)
#Load configs
configs = {}
system_config = MyConfigParser()
system_config.read(options.system_config_file)
system_config.add_section("Logging")
if options.debug is True:
    system_config.set("Logging","debug","True")
else:
    system_config.set("Logging","debug","False")
configs.update({'system':system_config})
config_instance = MyConfigParser()
configs.update({"seq_run":config_instance})
config_instance.read(system_config.get('Pipeline',"seq_run"))
pipelines = system_config.get('Pipeline','opts').split(',')
pipeline_config = {}
for pipeline_name in pipelines:
    config_instance = MyConfigParser()
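#Example invocation (the script name and paths are hypothetical; the flags are
#the ones defined above):
#  python automate_pipeline.py -i /runs/130114_SN1234_0098_AC1B2D3ACXX/fastq \
#      -p QualityControlPipeline --debug
#Passing --debug sets the [Logging] debug option to "True", which enables the
#progress printing used throughout the functions above.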