def configFromFile(self, config_file_path):
    """Load run settings and per-run-key samples from an ini-style file.

    The ``general`` section supplies the run-wide settings. Every other
    section name is treated as a run key (for example ``1:ACACT``) and is
    turned into a ``Sample`` that is appended to ``self.run_keys`` /
    ``self.samples`` (both must already exist on ``self``).

    Args:
        config_file_path: path handed to ``ConfigParser.read``.

    Raises:
        ConfigParser.NoSectionError: if the ``general`` section is missing.
        ConfigParser.NoOptionError: if a required option is missing.
    """
    # The module was renamed in Python 3; accept either spelling.
    try:
        import ConfigParser
    except ImportError:
        import configparser as ConfigParser

    user_config = ConfigParser.ConfigParser()
    user_config.read(config_file_path)

    def optional_list(section, option):
        """Return a quoted, comma-separated option as a list ([] if unset)."""
        try:
            raw = user_config.get(section, option)
        except ConfigParser.Error:
            # Optional setting: absent (or unreadable) means "none given".
            return []
        return raw.strip("'").strip('"').split(',')

    # take care of the general section
    self.run_date = user_config.get('general', 'run_date')
    self.platform = user_config.get('general', 'platform')
    self.input_dir = user_config.get('general', 'input_dir')
    self.output_dir = user_config.get('general', 'output_dir')
    self.input_files = [
        name.strip()
        for name in user_config.get('general', 'input_files').split(',')
    ]
    self.input_file_type = user_config.get('general', 'input_file_type')

    # populate sample information for every run_key
    for run_key in user_config.sections():
        if run_key == 'general':
            continue

        sample = Sample(run_key)

        # has defaults - not required
        sample.proximal_primers = optional_list(run_key, 'forward_primers')
        sample.distal_primers = optional_list(run_key, 'reverse_primers')
        sample.stop_sequences = optional_list(run_key, 'stop_sequences')
        try:
            sample.anchor = user_config.get(run_key, 'anchor')
        except ConfigParser.Error:
            sample.anchor = ''

        # required
        sample.direction = user_config.get(run_key, 'direction')
        sample.project = user_config.get(run_key, 'project_name')
        sample.dataset = user_config.get(run_key, 'dataset_name')
        sample.dna_region = user_config.get(run_key, 'dna_region')
        sample.taxonomic_domain = user_config.get(run_key, 'taxonomic_domain')

        # Convert '1:ACACT' to '1_ACACT'.
        # NOTE(review): the slice replaces whatever sits at index 1, so it
        # assumes exactly one character precedes the ':' -- confirm that
        # lane prefixes are always a single character.
        key = run_key[:1] + '_' + run_key[2:]

        # a list of run_keys
        self.run_keys.append(key)
        # a dictionary of samples
        self.samples[key] = sample
def initializeFromDictionary(self, configDict):
    """Populate run-wide settings and per-sample records from ``configDict``.

    ``configDict['general']`` carries the run-wide settings. Every other
    top-level key is a lane/run-key section describing one ``Sample``; the
    resulting samples are stored in ``self.samples`` and their keys are
    appended to ``self.run_keys`` (both must already exist on ``self``).

    Args:
        configDict: mapping of section name to a mapping of option values.

    Raises:
        KeyError: if a required option is missing from a section.
        Exception: if the input file format is not listed in
            ``C.input_file_formats``.
    """

    def optional(section, name, default=''):
        """Return ``section[name]``, or ``default`` when the key is absent."""
        try:
            return section[name]
        except KeyError:
            return default

    def optional_list(section, name):
        """Return a comma-separated option as a list, or [] when unusable."""
        try:
            return section[name].split(',')
        except (KeyError, AttributeError):
            return []

    # get the general stuff
    general_config = configDict['general']

    self.run = general_config['run']
    self.platform = general_config.get('platform', "unknown")
    self.input_dir = general_config.get('input_dir', None)
    self.require_distal = general_config.get('require_distal', True)
    self.minimumLength = general_config.get('minimumLength', C.minimumLength)
    self.maximumLength = general_config.get('maximumLength', C.maximumLength)
    self.minAvgQual = general_config.get('minAvgQual', C.minAvgQual)
    self.force_runkey = general_config.get('force_runkey', None)
    self.use_cluster = general_config['use_cluster']
    self.idx_keys = optional(general_config, 'idx_keys', "")

    if self.platform == 'vamps':
        self.user = general_config['user']
        self.dna_region = general_config['dna_region']
        self.input_files = general_config['input_files']
        self.project = general_config['project']
        self.dataset = general_config['dataset']
        self.site = general_config['site']
        self.env_source_id = general_config['envsource']
        self.fasta_file = optional(general_config, 'fasta_file', None)

    if self.platform == 'illumina':
        self.compressed = general_config['compressed']
        self.database_name = general_config['database_name']
        self.database_host = general_config['database_host']

    # added gast_input_source for vamps uploads
    # so when users want to gast at a later time they will
    # look in the database and not the files (which may be missing)
    # see /xraid2-2/vampsweb/vampsdev/vamps_trim.py
    # 'files' is the default, used by the regular gast pipeline.
    self.gast_input_source = optional(general_config, 'gast_input_source', 'files')

    if 'files_list' in general_config:
        input_file_names = general_config['files_list']
        self.input_files = ','.join(general_config['files_list'])
    else:
        input_file_names = [
            name.strip() for name in general_config['input_files'].split(',')
        ]
        self.input_files = general_config['input_files']

    # NOTE: an earlier, disabled validation block accepted per-file lists of
    # formats and lanes. The live code applies the single
    # 'input_file_format' / 'input_file_lane' value to every input file.
    file_format = optional(general_config, 'input_file_format', 'fasta')
    file_lane = optional(general_config, 'input_file_lane', '')

    # The format is only validated when there is at least one input file.
    if input_file_names and file_format not in C.input_file_formats:
        raise Exception("Invalid sequence input file format: " + str(file_format))

    # make up a hash...they are allowed to not put in any input_file_lanes...
    # could be 3 mbl fasta files which would all have lane info encoded on
    # each id/description line of the sequence record
    self.input_file_info = {}
    for input_file in input_file_names:
        self.input_file_info[input_file] = {
            "name": input_file,
            "format": file_format,
            "lane": file_lane,
        }

    optional_text_fields = (
        'run_key', 'lane', 'adaptor', 'barcode', 'seq_operator',
        'amp_operator', 'primer_suite', 'tubelabel', 'dna_region',
    )
    required_fields = (
        'data_owner', 'first_name', 'last_name', 'email', 'institution',
        'project_title', 'project_description', 'funding',
        'env_sample_source', 'dataset_description',
    )
    illumina_fields = (
        'barcode_index', 'overlap', 'read_length', 'file_prefix', 'insert_size',
    )

    # now deal with each lane_runkey combo (Sample)
    # populate sample information for every run_key
    for lane_run_key in [s for s in configDict.keys() if s != 'general']:
        lane_run_dict = configDict[lane_run_key]
        sample = Sample(lane_run_key)

        # has defaults - not required
        sample.forward_primers = optional_list(lane_run_dict, 'forward_primers')
        sample.reverse_primers = optional_list(lane_run_dict, 'reverse_primers')
        sample.stop_sequences = optional_list(lane_run_dict, 'stop_sequences')
        sample.anchor = optional(lane_run_dict, 'anchor')
        # should we try to trim with mbl primers as well as custom ones
        sample.use_mbl_primers = optional(lane_run_dict, 'use_mbl_primers', 1)
        for name in optional_text_fields:
            setattr(sample, name, optional(lane_run_dict, name))

        # required for every platform
        for name in required_fields:
            setattr(sample, name, lane_run_dict[name])

        if self.platform == 'illumina':
            # req specifically for illumina
            for name in illumina_fields:
                setattr(sample, name, lane_run_dict[name])
            # concatenate: barcode_index and run_key and lane
            key = '_'.join((
                lane_run_dict['barcode_index'],
                lane_run_dict['run_key'],
                lane_run_dict['lane'],
            ))
            self.run_keys.append(key)
        elif self.platform == '454':
            # required for 454
            sample.direction = lane_run_dict['direction']
            sample.taxonomic_domain = lane_run_dict['domain']
            # convert: change ':' to '_' (replaces the character at index 1)
            key = lane_run_key[:1] + '_' + lane_run_key[2:]
            self.run_keys.append(key)

        sample.project = lane_run_dict['project']
        sample.dataset = lane_run_dict['dataset']

        # a dictionary of samples
        # NOTE(review): `key` is only assigned in the 'illumina' and '454'
        # branches above; any other platform reaches this line with `key`
        # unbound -- confirm which key such samples should be stored under.
        self.samples[key] = sample
def initializeFromDictionary(self, configDict):
    """Populate run-wide settings and per-sample records from ``configDict``.

    ``configDict['general']`` carries the run-wide settings. Every other
    top-level key is a lane/run-key section describing one ``Sample``.
    Which options are required depends on ``self.vamps_user_upload`` (read
    from ``self``, so it must be set beforehand) and on the platform.
    Sample keys are appended to ``self.run_keys`` and samples are stored in
    ``self.samples`` (both must already exist on ``self``).

    Args:
        configDict: mapping of section name to a mapping of option values.

    Raises:
        KeyError: if a required option is missing from a section.
        Exception: if the input file format is not listed in
            ``C.input_file_formats``.
    """

    def optional(section, name, default=''):
        """Return ``section[name]``, or ``default`` when the key is absent."""
        try:
            return section[name]
        except KeyError:
            return default

    def optional_list(section, name):
        """Return a comma-separated option as a list, or [] when unusable."""
        try:
            return section[name].split(',')
        except (KeyError, AttributeError):
            return []

    # get the general stuff
    general_config = configDict['general']
    print('General Config0:', general_config)

    self.run = general_config['run']
    self.platform = general_config.get('platform', "unknown")
    self.input_dir = general_config.get('input_dir', None)
    self.require_distal = general_config.get('require_distal', True)
    self.minimumLength = general_config.get('minimumLength', C.minimumLength)
    self.maximumLength = general_config.get('maximumLength', C.maximumLength)
    self.minAvgQual = general_config.get('minAvgQual', C.minAvgQual)
    self.force_runkey = general_config.get('force_runkey', None)
    self.idx_keys = optional(general_config, 'idx_keys', "")

    if self.vamps_user_upload:
        self.site = general_config['site']
        if self.site == 'new_vamps':
            self.project_dir = general_config['project_dir']
            self.node_db = general_config['node_db']
            self.process_dir = general_config['process_dir']
            self.hostname = general_config['hostname']
            self.ref_db_dir = general_config['ref_db_dir']
            self.config_file = general_config['config_file']
            self.project = general_config['project']
            self.env_source_id = general_config['env_source_id']
        self.user = general_config['user']
        self.input_files = general_config['input_files']
        self.dna_region = general_config['dna_region']
        self.domain = general_config['domain']
        self.load_vamps_database = general_config['load_vamps_database']

        # Upload configs spell the length limits 'minimum_length' /
        # 'maximum_length'; these override the values read above.
        self.require_distal = optional(general_config, 'require_distal', True)
        self.minimumLength = optional(general_config, 'minimum_length', C.minimumLength)
        self.maximumLength = optional(general_config, 'maximum_length', C.maximumLength)
        self.use_cluster = optional(general_config, 'use_cluster', False)
        self.use64bit = optional(general_config, 'use64bit', False)
        self.fasta_file = optional(general_config, 'fasta_file', None)
        self.mobedac = optional(general_config, 'mobedac', False)
        self.use_full_length = optional(general_config, 'use_full_length', False)
        self.classifier = optional(general_config, 'classifier', 'unknown')
    elif self.platform in C.illumina_list:
        self.compressed = general_config['compressed']
        self.database_name = general_config['database_name']
        self.database_host = general_config['database_host']
        self.site = general_config['site']
        self.load_vamps_database = general_config['load_vamps_database']
        if "archaea" in general_config:
            self.archaea = general_config['archaea']
        # The platform defaults are looked up only when the option is
        # absent, so a missing C.pipeline_run_items entry cannot break a
        # config that supplies the value itself.
        if "do_perfect" in general_config:
            self.do_perfect = general_config['do_perfect']
        else:
            self.do_perfect = C.pipeline_run_items[self.platform]['do_perfect']
        if "lane_name" in general_config:
            self.lane_name = general_config['lane_name']
        else:
            self.lane_name = C.pipeline_run_items[self.platform]['lane_name']
    elif self.platform == '454':
        self.compressed = general_config['compressed']
        self.database_name = general_config['database_name']
        self.database_host = general_config['database_host']
        self.site = general_config['site']
        self.load_vamps_database = general_config['load_vamps_database']

    # added gast_input_source for vamps uploads
    # so when users want to gast at a later time they will
    # look in the database and not the files (which may be missing)
    # see /xraid2-2/vampsweb/vampsdev/vamps_trim.py
    if 'gast_input_source' in general_config:
        self.gast_input_source = general_config['gast_input_source']

    print('General Config:', general_config)

    if 'files_list' in general_config:
        input_file_names = general_config['files_list']
    else:
        input_file_names = [
            name.strip() for name in general_config['input_files'].split(',')
        ]
    # Join the parsed names: joining the raw 'input_files' string would put
    # a comma between every single character.
    self.input_files = ','.join(input_file_names)
    self.files_list = input_file_names

    print(general_config)

    # One format / lane value applies to every input file.
    file_format = optional(general_config, 'input_file_format', 'fasta')
    file_lane = optional(general_config, 'input_file_lane', '')

    # The format is only validated when there is at least one input file.
    if input_file_names and file_format not in C.input_file_formats:
        raise Exception("Invalid sequence input file format: " + str(file_format))

    # make up a hash...they are allowed to not put in any input_file_lanes...
    # could be 3 mbl fasta files which would all have lane info encoded on
    # each id/description line of the sequence record
    self.input_file_info = {}
    for input_file in input_file_names:
        self.input_file_info[input_file] = {
            "name": input_file,
            "format": file_format,
            "lane": file_lane,
        }

    optional_text_fields = (
        'run_key', 'lane', 'adaptor', 'barcode', 'seq_operator',
        'amp_operator', 'primer_suite', 'tubelabel', 'dna_region',
    )
    required_fields = (
        'project_title', 'project_description', 'env_sample_source_id',
        'dataset_description', 'project', 'dataset',
    )
    owner_fields = (
        'data_owner', 'first_name', 'last_name', 'email', 'institution',
        'funding',
    )

    # now deal with each lane_runkey combo (Sample)
    # populate sample information for every run_key
    for lane_run_key in [s for s in configDict.keys() if s != 'general']:
        lane_run_dict = configDict[lane_run_key]
        sample = Sample(lane_run_key)

        # has defaults - not required
        sample.forward_primers = optional_list(lane_run_dict, 'forward_primers')
        sample.reverse_primers = optional_list(lane_run_dict, 'reverse_primers')
        sample.stop_sequences = optional_list(lane_run_dict, 'stop_sequences')
        sample.anchor = optional(lane_run_dict, 'anchor')
        # should we try to trim with mbl primers as well as custom ones
        sample.use_mbl_primers = optional(lane_run_dict, 'use_mbl_primers', 1)
        for name in optional_text_fields:
            setattr(sample, name, optional(lane_run_dict, name))

        # The domain is the first word of the primer suite; a missing or
        # blank suite yields 'unknown'.
        suite_words = sample.primer_suite.split() if sample.primer_suite else []
        sample.taxonomic_domain = suite_words[0] if suite_words else 'unknown'

        for name in required_fields:
            setattr(sample, name, lane_run_dict[name])

        if self.vamps_user_upload:
            sample.direction = lane_run_dict['direction']
            # convert: change ':' to '_' (replaces the character at index 1)
            key = lane_run_key[:1] + '_' + lane_run_key[2:]
            self.run_keys.append(key)
            # uploads store the sample under the converted key
            self.samples[key] = sample
        elif self.platform in C.illumina_list:
            # req specifically for illumina
            for name in owner_fields:
                setattr(sample, name, lane_run_dict[name])
            sample.barcode_index = lane_run_dict['barcode_index']
            sample.overlap = lane_run_dict['overlap']
            sample.read_length = lane_run_dict['read_length']
            sample.insert_size = lane_run_dict['insert_size']
            # concatenate: barcode_index and run_key and lane
            key = '_'.join((
                lane_run_dict['barcode_index'],
                lane_run_dict['run_key'],
                lane_run_dict['lane'],
            ))
            self.run_keys.append(key)
            # stored under the original section name, not the derived key
            self.samples[lane_run_key] = sample
        elif self.platform == '454':
            # required for 454
            sample.direction = lane_run_dict['direction']
            for name in owner_fields:
                setattr(sample, name, lane_run_dict[name])
            # convert: change ':' to '_' (replaces the character at index 1)
            key = lane_run_key[:1] + '_' + lane_run_key[2:]
            self.run_keys.append(key)
            # stored under the original section name, not the derived key
            self.samples[lane_run_key] = sample
def initializeFromDictionary(self, configDict):
    """Populate run-wide settings and per-sample records from ``configDict``.

    ``configDict['general']`` carries the run-wide settings. Every other
    top-level key is a lane/run-key section describing one ``Sample``.
    Which options are required depends on ``self.vamps_user_upload`` (read
    from ``self``, so it must be set beforehand) and on the platform.
    Sample keys are appended to ``self.run_keys`` and samples are stored in
    ``self.samples`` (both must already exist on ``self``).

    Args:
        configDict: mapping of section name to a mapping of option values.

    Raises:
        KeyError: if a required option is missing from a section.
        Exception: if the input file format is not listed in
            ``C.input_file_formats``.
    """

    def optional(section, name, default=''):
        """Return ``section[name]``, or ``default`` when the key is absent."""
        try:
            return section[name]
        except KeyError:
            return default

    def optional_list(section, name):
        """Return a comma-separated option as a list, or [] when unusable."""
        try:
            return section[name].split(',')
        except (KeyError, AttributeError):
            return []

    # get the general stuff
    general_config = configDict['general']

    self.run = general_config['run']
    self.platform = general_config.get('platform', "unknown")
    self.input_dir = general_config.get('input_dir', None)
    self.require_distal = general_config.get('require_distal', True)
    self.minimumLength = general_config.get('minimumLength', C.minimumLength)
    self.maximumLength = general_config.get('maximumLength', C.maximumLength)
    self.minAvgQual = general_config.get('minAvgQual', C.minAvgQual)
    self.force_runkey = general_config.get('force_runkey', None)
    self.use_cluster = general_config['use_cluster']
    self.idx_keys = optional(general_config, 'idx_keys', "")

    if self.vamps_user_upload:
        self.user = general_config['user']
        self.input_files = general_config['input_files']
        self.dna_region = general_config['dna_region']
        self.domain = general_config['domain']
        self.site = general_config['site']
        self.load_vamps_database = general_config['load_vamps_database']
        self.fasta_file = optional(general_config, 'fasta_file', None)
        self.mobedac = optional(general_config, 'mobedac', False)
        self.use_full_length = optional(general_config, 'use_full_length', False)
        self.classifier = optional(general_config, 'classifier', 'unknown')
    elif self.platform == 'illumina':
        self.compressed = general_config['compressed']
        self.database_name = general_config['database_name']
        self.database_host = general_config['database_host']
        self.site = general_config['site']
        self.load_vamps_database = general_config['load_vamps_database']
        # `in` replaces dict.has_key(), which no longer exists in Python 3.
        if "archaea" in general_config:
            self.archaea = general_config['archaea']
        # The illumina defaults are looked up only when the option is absent.
        if "do_perfect" in general_config:
            self.do_perfect = general_config['do_perfect']
        else:
            self.do_perfect = C.pipeline_run_items['illumina']['do_perfect']
        if "lane_name" in general_config:
            self.lane_name = general_config['lane_name']
        else:
            self.lane_name = C.pipeline_run_items['illumina']['lane_name']
    elif self.platform == '454':
        self.compressed = general_config['compressed']
        self.database_name = general_config['database_name']
        self.database_host = general_config['database_host']
        self.site = general_config['site']
        self.load_vamps_database = general_config['load_vamps_database']

    # added gast_input_source for vamps uploads
    # so when users want to gast at a later time they will
    # look in the database and not the files (which may be missing)
    # see /xraid2-2/vampsweb/vampsdev/vamps_trim.py
    if 'gast_input_source' in general_config:
        self.gast_input_source = general_config['gast_input_source']

    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('General Config: %s' % (general_config,))

    if 'files_list' in general_config:
        input_file_names = general_config['files_list']
    else:
        input_file_names = [
            name.strip() for name in general_config['input_files'].split(',')
        ]
    # Join the parsed names: joining the raw 'input_files' string would put
    # a comma between every single character.
    self.input_files = ','.join(input_file_names)
    self.files_list = input_file_names

    print(general_config)

    # One format / lane value applies to every input file.
    file_format = optional(general_config, 'input_file_format', 'fasta')
    file_lane = optional(general_config, 'input_file_lane', '')

    # The format is only validated when there is at least one input file.
    if input_file_names and file_format not in C.input_file_formats:
        raise Exception("Invalid sequence input file format: " + str(file_format))

    # make up a hash...they are allowed to not put in any input_file_lanes...
    # could be 3 mbl fasta files which would all have lane info encoded on
    # each id/description line of the sequence record
    self.input_file_info = {}
    for input_file in input_file_names:
        self.input_file_info[input_file] = {
            "name": input_file,
            "format": file_format,
            "lane": file_lane,
        }

    optional_text_fields = (
        'run_key', 'lane', 'adaptor', 'barcode', 'seq_operator',
        'amp_operator', 'primer_suite', 'tubelabel', 'dna_region',
    )
    required_fields = (
        'project_title', 'project_description', 'env_sample_source_id',
        'dataset_description', 'project', 'dataset',
    )
    owner_fields = (
        'data_owner', 'first_name', 'last_name', 'email', 'institution',
        'funding',
    )

    # now deal with each lane_runkey combo (Sample)
    # populate sample information for every run_key
    for lane_run_key in [s for s in configDict.keys() if s != 'general']:
        lane_run_dict = configDict[lane_run_key]
        sample = Sample(lane_run_key)

        # has defaults - not required
        sample.forward_primers = optional_list(lane_run_dict, 'forward_primers')
        sample.reverse_primers = optional_list(lane_run_dict, 'reverse_primers')
        sample.stop_sequences = optional_list(lane_run_dict, 'stop_sequences')
        sample.anchor = optional(lane_run_dict, 'anchor')
        # should we try to trim with mbl primers as well as custom ones
        sample.use_mbl_primers = optional(lane_run_dict, 'use_mbl_primers', 1)
        for name in optional_text_fields:
            setattr(sample, name, optional(lane_run_dict, name))

        # The domain is the first word of the primer suite; a missing or
        # blank suite yields 'unknown'.
        suite_words = sample.primer_suite.split() if sample.primer_suite else []
        sample.taxonomic_domain = suite_words[0] if suite_words else 'unknown'

        for name in required_fields:
            setattr(sample, name, lane_run_dict[name])

        print('lane_run_key ' + lane_run_key)

        if self.vamps_user_upload:
            sample.direction = lane_run_dict['direction']
            # convert: change ':' to '_' (replaces the character at index 1)
            key = lane_run_key[:1] + '_' + lane_run_key[2:]
            self.run_keys.append(key)
            # uploads store the sample under the converted key
            self.samples[key] = sample
        elif self.platform == 'illumina':
            # req specifically for illumina
            for name in owner_fields:
                setattr(sample, name, lane_run_dict[name])
            sample.barcode_index = lane_run_dict['barcode_index']
            sample.overlap = lane_run_dict['overlap']
            sample.read_length = lane_run_dict['read_length']
            sample.insert_size = lane_run_dict['insert_size']
            # concatenate: barcode_index and run_key and lane
            key = '_'.join((
                lane_run_dict['barcode_index'],
                lane_run_dict['run_key'],
                lane_run_dict['lane'],
            ))
            self.run_keys.append(key)
            # stored under the original section name, not the derived key
            self.samples[lane_run_key] = sample
        elif self.platform == '454':
            # required for 454
            sample.direction = lane_run_dict['direction']
            for name in owner_fields:
                setattr(sample, name, lane_run_dict[name])
            # convert: change ':' to '_' (replaces the character at index 1)
            key = lane_run_key[:1] + '_' + lane_run_key[2:]
            self.run_keys.append(key)
            # stored under the original section name, not the derived key
            self.samples[lane_run_key] = sample