def info(self, lane_keys):
    """ fill vamps_project_info table """
    logger.info("Starting vamps_upload: projects_info")

    if self.runobj.site == 'vamps':
        db_host = 'vampsdb'
        db_name = 'vamps'
    else:
        db_host = 'vampsdev'
        db_name = 'vamps'
    myconn = MyConnection(host=db_host, db=db_name)

    query = "SELECT last_name,first_name,email,institution from vamps_auth where user='******'" % (self.runobj.user)
    data = myconn.execute_fetch_select(query)

    fh = open(self.projects_info_file, 'w')
    title = "title"
    description = 'description'
    contact = data[0][1] + ' ' + data[0][0]
    email = data[0][2]
    institution = data[0][3]
    user = self.runobj.user
    fh.write("\t".join(["HEADER", "project", "title", "description", "contact",
                        "email", "institution", "user", "env_source_id"]) + "\n")
    fh.write("\t".join(["0", self.project, title, description, contact,
                        email, institution, user, self.runobj.env_source_id]) + "\n")
    # if this project already exists in the db???
    # the next step should update the table rather than add new to the db
    fh.close()
    logger.info("Finishing VAMPS info()")
def env_source_to_id(self, headers):
    logger.error("self.utils.is_local() LLL2 metadata")
    logger.error(self.utils.is_local())
    if self.utils.is_local():
        self.my_conn = MyConnection(host='localhost', db="test_env454")
    else:
        self.my_conn = MyConnection(host='bpcdb1', db="env454")
        # self.my_conn = MyConnection()
    my_sql = """SELECT * FROM env_sample_source"""
    self.env = self.my_conn.execute_fetch_select(my_sql)
    self.res_headers = ["env_sample_source_id" if x == "env_sample_source" else x for x in headers]
def get_my_conn(self):
    try:
        host = self.general_config_dict['database_host']
    except:
        raise
    try:
        db = self.general_config_dict['database_name']
    except:
        raise
    if self.utils.is_local():
        host = 'localhost'
        db = "test_env454"
    self.my_conn = MyConnection(host=host, db=db)
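# Hedged sketch (not the pipeline's actual implementation): the MyConnection
# helper used throughout this module is only ever called as
# MyConnection(host=..., db=...), execute_fetch_select(sql) and
# execute_no_fetch(sql).  Something along the following lines is assumed; the
# MySQLdb driver and the read_default_file credential handling are
# illustrative placeholders, the real class lives in pipeline.db_upload.
import MySQLdb


class MyConnectionSketch(object):
    """Thin wrapper around a MySQL connection (illustrative stand-in only)."""

    def __init__(self, host='localhost', db='test_env454'):
        # credentials are assumed to come from a ~/.my.cnf-style option file
        self.conn = MySQLdb.connect(host=host, db=db, read_default_file='~/.my.cnf')

    def execute_fetch_select(self, sql):
        # run a SELECT and hand back all rows
        cursor = self.conn.cursor()
        cursor.execute(sql)
        rows = cursor.fetchall()
        cursor.close()
        return rows

    def execute_no_fetch(self, sql):
        # run an INSERT/UPDATE and commit; nothing is returned
        cursor = self.conn.cursor()
        cursor.execute(sql)
        self.conn.commit()
        cursor.close()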
def check_projects_and_datasets(self, data):
    self.my_conn = MyConnection(host='newbpcdb2.jbpc-np.mbl.edu', db="env454")
    # self.my_conn = MyConnection()
    project_dataset = {}
    projects = {}
    datasets = {}
    error = False
    warn = False
    for item in data:
        if item != 'general':
            # project_dataset[data[item]['project']+'--'+data[item]['dataset']] = 1
            datasets[data[item]['dataset']] = data[item]['project']
            projects[data[item]['project']] = 1

    for p in projects:
        # print p
        my_sql = """SELECT project FROM project WHERE project = '%s'""" % (p)
        res = self.my_conn.execute_fetch_select(my_sql)
        if res:
            logger.warning("project '" + p + "' already exists in the database - is this okay?")
            warn = True
        else:
            logger.debug("project '" + p + "' is new")

        ds_found_count = 0
        for d in datasets:
            if datasets[d] == p:
                # print "\t%s" % (d)
                my_sql = """SELECT dataset FROM dataset WHERE dataset = '%s'""" % (d)
                res = self.my_conn.execute_fetch_select(my_sql)
                if res:
                    ds_found_count += 1
                    if ds_found_count > 3:
                        logger.warning("\t\tPossibly more .... - Exiting after just three")
                        break
                    logger.warning("\tdataset '" + d + "' already exists in the database - is this okay?")
                    warn = True
                else:
                    logger.debug("\tdataset '" + d + "' is new")

    logger.debug("\tDataset Count: " + str(len(datasets)))
    return (error, warn)
def start_gast(myobject):
    """
    Doc string
    """
    project = myobject['project']
    dataset = myobject['dataset']
    dna_region = myobject['dna_region']
    domain = myobject['domain']
    runcode = myobject['runcode']
    site = myobject['site']
    # user_cursor = myobject['user_cursor']
    datetime = myobject['datetime']
    user = myobject['user']
    from_fasta = myobject['from_fasta']
    load_db = myobject['load_db']
    env_source_id = myobject['env_source_id']
    steps = myobject['steps']
    fasta_file_from_cl = myobject['fasta_file']
    use_cluster = myobject['use_cluster']
    # myobject['baseoutputdir']
    seq_count = 0
    site_base = '/xraid2-2/vampsweb/' + site
    file_prefix = user + runcode
    output_dir = myobject['output_dir']
    # output_dir = os.path.join(site_base, 'tmp', user+"_"+runcode+'_gast')

    # use the files from file_base directory
    # but we get the primers and keys from the database
    # which were stored there during the loading phase
    # check for directory: user_runcode
    #   if present use the data from there
    #   if not: go to the database
    if os.path.exists(output_dir):
        print "files path exists:", output_dir
        # gast_input_source = 'files'
        # file_base = output_dir
        # This may be a mobedac upload and we should try to use the files here
        # rather than look to the database for data
    else:
        output_dir = os.path.join(site_base, 'tmp', user + "_" + runcode + '_gast')
        print "Files path doesn't exist: attempting to get data from database"
        print "Creating directory", output_dir
        os.mkdir(output_dir)

    from pipeline.run import Run
    from pipelineprocessor import process

    myRunDict = {}
    # this is a minimal run dictionary for the general stanza
    myRunDict['general'] = {'run_date': datetime,
                            'vamps_user_upload': True,
                            'gast_input_source': 'database',
                            'input_file_names': 'vamps_upload',
                            'input_file_lanes': '1',
                            'input_file_formats': 'fasta',
                            'run': runcode,
                            'use_cluster': use_cluster,
                            'platform': 'vamps',
                            'user': user,
                            'site': site,
                            'load_vamps_database': True,
                            'input_files': None,
                            'files_list': [],
                            'output_dir': output_dir,
                            'file_prefix': file_prefix}
    # print myRunDict
    #
    #
    run = Run(myRunDict, "/xraid2-2/vampsweb/" + site)
    #
    #
    # pack the things we'll need for GAST
    run.project = project
    run.dataset = dataset
    run.load_db = load_db
    run.env_source_id = env_source_id
    run.site = site
    run.from_fasta = from_fasta
    run.fasta_file_from_cl = fasta_file_from_cl
    run.runcode = runcode
    run.user = user
    run.samples = {}
    run.dna_region = dna_region
    # run.basedir = file_base

    # fastaunique_cmd = '/bioware/bin/fastaunique'
    fastaunique_cmd = 'fastaunique'

    if run.from_fasta:
        print run.from_fasta
        # copy file to
        fasta_file = os.path.join(output_dir, run.user + run.runcode + '.fa')
        shutil.copyfile(run.fasta_file_from_cl, fasta_file)
        grep_cmd = ['grep', '-c', '>', fasta_file]
        run.dataset_count = subprocess.check_output(grep_cmd).strip()
    else:
        # from database
        from pipeline.db_upload import MyConnection
        if site == 'vamps':
            db_host_user = '******'
            db_name_user = '******'
        else:
            db_host_user = '******'
            db_name_user = '******'
        myconn = MyConnection(host=db_host_user, db=db_name_user)

        # should create the fasta file and names file here and not in gast.py
        ds_list = []
        if dataset:
            ds_list = [dataset]
            query = "select read_id,sequence,dataset from vamps_upload_trimseq where project='" + project + "' and dataset='" + dataset + "' and user='******' "
            print query
            rows = myconn.execute_fetch_select(query)
            fasta_file = os.path.join(output_dir, 'fasta.fa')
            unique_file = os.path.join(output_dir, 'unique.fa')
            names_file = os.path.join(output_dir, 'names')
            fh = open(fasta_file, 'w')
            if not rows:
                print "No data found using query:", query
            for r in rows:
                id = r[0]
                seq = r[1]
                fh.write(">" + id + "\n" + seq + "\n")
            fh.close()
            fastaunique_cmd = fastaunique_cmd + " -x -i " + fasta_file + " -o " + unique_file + " -n " + names_file
            subprocess.call(fastaunique_cmd, shell=True)
        else:
            # looks for vamps_projects_datasets_pipe in vamps_user_uploads
            q0 = "select distinct dataset from vamps_projects_datasets_pipe where project='" + project + "' and dataset != '' and dataset != 'NoKey'"
            print q0
            dsrows = myconn.execute_fetch_select(q0)
            if not dsrows:
                print "No datasets found using query:", q0
                sys.exit()
            for ds in dsrows:
                ds = ds[0]
                ds_list.append(ds)
                query = "select read_id, sequence, dataset from vamps_upload_trimseq where project='" + project + "' and dataset='" + ds + "' and user='******' "
                print query
                rows = myconn.execute_fetch_select(query)
                ds_dir = os.path.join(output_dir, ds)
                if os.path.exists(ds_dir):
                    # Start with an empty directory
                    shutil.rmtree(ds_dir, True)
                    os.mkdir(ds_dir)
                else:
                    os.mkdir(ds_dir)
                fasta_file = os.path.join(output_dir, ds, 'fasta.fa')
                unique_file = os.path.join(output_dir, ds, 'unique.fa')
                names_file = os.path.join(output_dir, ds, 'names')
                # dataset_file = os.path.join(output_dir, 'datasets')
                fh = open(fasta_file, 'w')
                if not rows:
                    print "No data found using query:", query
                for r in rows:
                    id = r[0]
                    seq = r[1]
                    ds = r[2]
                    fh.write(">" + id + "\n" + seq + "\n")
                fh.close()
                fastaunique_call = fastaunique_cmd + " " + fasta_file + " -o " + unique_file + " -n " + names_file + " -f"
                subprocess.call(fastaunique_call, shell=True)

        run.datasets = ds_list

    ###############################################################
    # This starts the MBL GAST python pipeline at the GAST STEP
    #
    # now do all the work
    # possible steps: trim,chimera,gast,vampsupload
    process(run, steps)

    print "done with gast"
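# Illustrative only: a minimal myobject dictionary carrying every key that
# start_gast() reads above.  All values shown (user, runcode, paths, ids) are
# made-up placeholders, not real VAMPS data.
if __name__ == '__main__':
    example_gast_request = {
        'project': 'AB_XYZ_Bv6',
        'dataset': 'sample_01',
        'dna_region': 'v6',
        'domain': 'bacteria',
        'runcode': '20120613',
        'site': 'vampsdev',
        'datetime': '20120613',
        'user': 'some_user',
        'from_fasta': True,
        'load_db': False,
        'env_source_id': '100',
        'steps': 'gast,vampsupload',
        'fasta_file': '/path/to/upload.fa',
        'use_cluster': False,
        'output_dir': '/xraid2-2/vampsweb/vampsdev/tmp/some_user_20120613_gast',
    }
    start_gast(example_gast_request)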
def env_source_to_id(self, headers):
    self.my_conn = MyConnection(host='newbpcdb2.jbpc-np.mbl.edu', db="env454")
    # self.my_conn = MyConnection()
    my_sql = """SELECT * FROM env_sample_source"""
    self.env = self.my_conn.execute_fetch_select(my_sql)
    self.res_headers = ["env_sample_source_id" if x == "env_sample_source" else x for x in headers]
class MetadataUtils: """ Class to read metadata files (csv and ini style) validate and create a dictionary from them Two parts: 1) From pipeline-ui.py to validate the input args 2) From runconfig.py to write the final ini file and create the dictionary that is used to create the run object """ Name = "MetadataUtils" def __init__(self, command_line_args = None, configuration_dictionary = None): self.args = command_line_args self.general_config_dict = configuration_dictionary self.known_header_list = C.csv_header_list self.pipeline_run_items = C.pipeline_run_items self.primer_suites = self.convert_primer_suites(C.primer_suites) self.dna_regions = C.dna_regions self.data_object = {} self.data_object['general'] = {} self.warn_msg = """\n\tThe config File seems to be okay. If the items above look correct then press 'c' to continue the pipeline\n""" self.res_headers = [] self.env = {} def convert_and_save_ini(self, analysis_dir): new_ini_file = os.path.join(analysis_dir, self.general_config_dict['run'] + '.ini') #new_ini_file = os.path.join(self.general_config_dict['output_dir'],self.general_config_dict['run'],self.general_config_dict['run'] + '.ini') # converts csv to ini and saves to output_dir if self.general_config_dict['platform'] == 'vamps': self.save_ini_file(new_ini_file) else: self.convert_csv_to_ini(new_ini_file) 'TODO: Andy, what mean the next two lines?' # self.general_config_dict['configPath'] # self.general_config_dict['configPath_original'] = self.general_config_dict['configPath'] self.general_config_dict['configPath'] = new_ini_file # change path and type to new ini # regardless of what they were before def validate(self, analysis_dir): if self.general_config_dict['platform'] == 'illumina': self.warn_msg = self.validate_illumina_ini(analysis_dir) elif self.general_config_dict['platform'] == '454': data = self.validate_454_ini(analysis_dir) elif self.general_config_dict['platform'] == 'ion_torrent': pass elif self.general_config_dict['platform'] == 'vamps': data = self.validate_vamps_ini(analysis_dir) else: sys.exit("Unknown platform and configFile type for validation") return self.data_object def get_general_data(self): """ """ return self.data_object['general'] # def create_dictionary_from_ini(self): # """ # # read an ini config file and convert to a dictionary # """ # import ConfigParser # if os.path.exists(self.general_config_dict['configPath']): # data_object = {} # user_config = ConfigParser.ConfigParser() # user_config.read(self.general_config_dict['configPath']) # # for section in user_config.sections(): # # section_dict = data_object[section] = {} # for option in user_config.options(section): # section_dict[option] = user_config.get(section,option) # # else: # print "error could not open config file: ",self.general_config_dict['configPath'] # # return data_object # def get_command_line_items(self, general_data): # # # command line items take precedence over ini file items of the same name # # defaults should be here and NOT in argparse/commandline # if self.args.input_dir: # general_data['input_dir'] = self.args.input_dir # else: # if not general_data['input_dir']: # general_data['input_dir'] = './' # # if self.args.run: # general_data['run'] = self.args.run # general_data['run_date'] = self.args.run # else: # if 'run' in general_data: # general_data['run_date'] = general_data['run'] # elif 'run_date' in general_data: # general_data['run'] = general_data['run_date'] # else: # sys.exit("Cannot find the run or run_date on command line or in config file - Exiting") # # make 
sure RUN is before OUTPUT_DIR # try: # general_data['output_dir'] = os.path.join(self.args.baseoutputdir,self.args.run) # except: # if 'output_dir' not in general_data: # general_data['output_dir'] = os.path.join('.',self.args.run) # #getattr(args,'force_runkey', "") # # # if self.args.platform: # general_data['platform'] = self.args.platform # else: # if 'platform' not in general_data: # sys.exit("Cannot find the platform from command line or in config file - Exiting") # # # if self.args.input_file_format: # general_data['input_file_format'] = self.args.input_file_format # else: # if 'input_file_format' not in general_data: # general_data['input_file_format'] = '' # if self.args.input_file_suffix: # general_data['input_file_suffix'] = self.args.input_file_suffix # else: # if 'input_file_suffix' not in general_data: # general_data['input_file_suffix'] = '' # # return general_data # def validate_454_csv(self, args, my_csv): # print "TODO: write validate def for 454/csv" # data_object = self.populate_data_object_454(args, my_csv) def validate_vamps_ini(self, analysis_dir): # configPath is the new configPath 'todo: Andy, what should be here, just directory name or directory + number.ini?' self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath']) if 'fasta_file' in self.data_object and not os.path.exists(self.data_object['fasta_file']): sys.exit("Fasta file path doesn't exist: "+self.data_object['fasta_file'] ) elif 'fasta_file' in self.data_object['general'] and not os.path.exists(self.data_object['general']['fasta_file']): sys.exit("Fasta file path doesn't exist: "+self.data_object['general']['fasta_file'] ) def validate_454_ini(self, analysis_dir): print "TODO - write validation def for 454/ini" #self.data_object = self.create_dictionary_from_ini() # 454 ini file requirements: def validate_illumina_ini(self, analysis_dir): """ The csv headers are checked earlier """ print "Validating ini type Config File (may have been converted from csv)" new_ini_file = os.path.join(analysis_dir, self.general_config_dict['run'] + '.ini') print "New ini file location: "+new_ini_file return_code = False error_code = False warn_code = False msg = '' error=False warn=False #print 'configpath',self.general_config_dict['configPath'] # configPath here is the new configPath self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath']) (error_code,warn_code) = self.check_for_missing_values(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_for_datasets(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_domain_suite_region(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_project_name(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_dataset_name(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_projects_and_datasets(self.data_object) if error_code: error=True if warn_code: warn=True #print self.data_object['input_dir'] #print self.data_object['input_files'] if 'input_dir' not in self.data_object['general'] and 'input_files' not in self.data_object['general']: logger.warning("No input directory and no input files") warn=True elif not os.path.isdir(self.data_object['general']['input_dir']): logger.error("That is not a directory: "+self.data_object['general']['input_dir']) error=True elif 
self.data_object['general']['input_file_format'] == 'fastq' and self.data_object['general']['platform'] == 'illumina': file_exists = False # if 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir']: for dirname, dirnames, filenames in os.walk(self.data_object['general']['input_dir']): # if not filenames: for file_name in filenames: if os.path.isfile(os.path.join(dirname, file_name)): file_exists = True break if not file_exists: logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir']) error=True elif 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir'] and ('input_files' not in self.data_object['general'] or not self.data_object['general']['input_files']): logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir']) error=True if error: sys.exit( """\n\t\033[91mTHERE WERE SEVERE PROBLEMS WITH THE CSV and/or CONFIG FILE - EXITING PLEASE CORRECT THEM AND START OVER.\033[0m\n To view the errors add ' --loglevel info' to the command line.\n""") elif warn: msg = """\n\t\033[93mTHERE WERE NON-FATAL PROBLEMS WITH THE CSV and/or CONFIG FILE THAT MAY OR MAY NOT CAUSE PROBLEMS.\033[0m\n To view the warnings add ' --loglevel warning' to the command line.\n""" print "\033[92mCSV File Passed Vaidation! (with warnings)\033[0m" else: print "\033[92mCSV File Passed Vaidation!\033[0m" return msg def validate_dictionary(self, config_info): """ This is only used for data that comes in as a dictionary rather than a file such as with vamps user uploads """ print "TODO - Validating input dictionary" # must be a general section # should I create a dict here??? -That would render much code in # runconfig useless. # are we going to continue developing ini style config files if # no one uses them? 
configDict = config_info return configDict def populate_data_object_454(self, args): data = {} data['general'] = {} test_datasets = {} dataset_counter = {} headers = '' if self.runobj: infile = self.runobj.configPath else: infile = args.configPath data['general']['input_dir'] = args.input_dir #data['general']['output_dir'] = os.path.join(args.output_dir,args.run) data['general']['output_dir'] = args.output_dir data['general']['platform'] = args.platform data['general']['run'] = args.run #data['general']['run_date'] = args.run data['general']["input_file_format"] = args.input_file_format data['general']["input_file_suffix"] = args.input_file_suffix return data['general'] def get_input_files(self): files_list = [] if os.path.isdir(self.general_config_dict['input_dir']): for infile in glob.glob( os.path.join(self.general_config_dict['input_dir'], '*') ): if os.path.isdir(infile) == True: for infile2 in glob.glob( os.path.join( infile,'*') ): if os.path.isdir(infile2) == True: pass else: sub_dir = os.path.basename(infile) files_list.append(os.path.join(sub_dir,os.path.basename(infile2))) else: files_list.append(os.path.basename(infile)) # else: # if fasta_file: # pass # logger.warning("No input directory or directory permissions problem: "+self.general_config_dict['input_dir']) return files_list def check_for_input_files(self, data_object): file_count = 0 files_list = [] imports_list = [] lanes_list = [] #input_dir = os.path.join(data_object['general']['input_dir'],"fasta") input_dir = data_object['general']['input_dir'] if os.path.isdir(input_dir): p = data_object['general']['input_dir'], '*'+data_object['general']['input_file_suffix'] for infile in glob.glob( os.path.join(input_dir, '*'+data_object['general']['input_file_suffix']) ): files_list.append(os.path.basename(infile)) for x in data_object: if 'file_prefix' in data_object[x]: pass #print data_object[x]['file_prefix'] #if os.path.basename(infile).split('-')[0] == data_object[x]['file_prefix']: #lanes_list.append(data_object[x]['lane']) file_count += 1 else: logger.info("No input directory or directory permissions problem: "+input_dir) print "No input directory or directory permissions problem: "+input_dir if not file_count: #sys.exit("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'") logger.info("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'") data_object['general']['files_list'] = files_list data_object['general']['file_count'] = file_count # all the files in an illumina directory should be the same type #data_object['general']['file_formats_list'] = [data_object['general']["input_file_format"]] * file_count #data_object['general']['lanes_list'] = lanes_list #print "Files LIST",data_object['general']['files_list'] return data_object def check_for_missing_values(self, data): missing_key = '' error = False warn = False for item in data: if item == 'general': for k,v in data[item].iteritems(): if not k: #sys.exit("ERROR: key for: '"+v+"' is missing or corrupt - Exiting") logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing") warn=True if v == '': logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing") warn=True for item in data: if item != 'general': for k,v in data[item].iteritems(): if not k: #sys.exit("ERROR: key for: '"+v+"' is missing or corrupt - Exiting") logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing") 
warn=True if not v: if (k == 'barcode' or k == 'adaptor'): #these could be empty logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing") else: logger.error("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing") error=True return (error,warn) def check_for_datasets(self,data): error = False warn=False for item in data: if item != 'general': #print 'ds',data[item]['dataset'] if not data[item]['dataset']: #if 'dataset' not in data[item]: logger.error("Current dataset name is missing or corrupt - Exiting (key: "+item+")") error=True return (error,warn) def check_domain_suite_region(self,data): error = False warn=False for item in data: if item != 'general': primer_suite = self.convert_primer_suites(data[item]['primer_suite']) dna_region = self.convert_primer_suites(data[item]['dna_region']) # CHECK MUST MATCH: "Domain","Primer Suite","DNA Region" if primer_suite not in self.primer_suites: logger.error("Primer Suite not found: "+primer_suite+" - Exiting (key: "+item+")") error=True if dna_region not in self.dna_regions: logger.error("DNA Region not found: "+dna_region+" - Exiting (key: "+item+")") error=True if dna_region not in primer_suite: logger.error("DNA Region ("+dna_region+") not found in Primer Suite ("+primer_suite+") - Exiting (key: "+item+")") error=True return (error, warn) def convert_primer_suites(self, suite): if type(suite) is list: conv_suite = [item.lower().translate(None, '_- ') for item in suite] if type(suite) is str: conv_suite = suite.lower().translate(None, '_- ') return conv_suite def check_project_name(self, data): """ # CHECK: project name format: 3 parts; end with Bv6,Ev9,Av6 or something similar """ error =False warn =False for item in data: if item != 'general': try: (a,b,c) = data[item]['project'].split('_') except: logger.error("project not in correct format: "+data[item]['project']+" - Exiting (key: "+data[item]+")") error=True (a,b,c) = data[item]['project'].split('_') #if c[0] not in [i[0].upper() for i in domains]: # sys.exit("ERROR : Project suffix has incorrect/non-existant domain: "+c) if (c[1:] not in self.dna_regions) and (c.lower() not in self.dna_regions): logger.error("Project suffix has incorrect DNA region: "+c+" - Exiting (key: "+data[item]+")") error = True return (error,warn) def check_dataset_name(self,data): """ # CHECK: dataset name can be ONLY alphanumeric and underscore and cannot start with a number! 
""" error =False warn =False for item in data: if item != 'general': dataset_name = data[item]['dataset'] if not re.match("^[A-Za-z0-9_]*$", dataset_name): logger.error("Dataset name has illeagal character(s): "+dataset_name+" (must be alphanumeric and underscore only)") error = True #if re.match("^[0-9]", dataset_name): # logger.error("Dataset name cannot begin with a digit: "+dataset_name) # error = True return (error,warn) def check_projects_and_datasets(self,data): self.my_conn = MyConnection(host='newbpcdb2.jbpc-np.mbl.edu', db="env454") # self.my_conn = MyConnection() project_dataset = {} projects = {} datasets = {} error =False warn =False for item in data: if item != 'general': #project_dataset[data[item]['project']+'--'+data[item]['dataset']] = 1 datasets[data[item]['dataset']] = data[item]['project'] projects[data[item]['project']] = 1 for p in projects: #print p my_sql = """SELECT project FROM project WHERE project = '%s'""" % (p) res = self.my_conn.execute_fetch_select(my_sql) if res: logger.warning("project '"+p+"' already exists in the database - is this okay?") warn = True else: logger.debug("project '"+p+"' is new") ds_found_count = 0 for d in datasets: if datasets[d] == p: #print "\t%s" % (d) my_sql = """SELECT dataset FROM dataset WHERE dataset = '%s'""" % (d) res = self.my_conn.execute_fetch_select(my_sql) if res: ds_found_count += 1 if ds_found_count >3: logger.warning("\t\tPossibly more .... - Exiting after just three") break logger.warning("\tdataset '"+d+"' already exists in the database - is this okay?") warn=True else: logger.debug("\tdataset '"+d+"' is new") logger.debug("\tDataset Count: "+str(len(datasets))) return (error,warn) def get_confirmation(self, steps, general_data): print "\n" for item,value in general_data.iteritems(): #print len(value) if type(value) != bool and len(value) > 80: tmp = value.split(',') print "%-20s = %s .. %s" % (item,tmp[0],tmp[-1]) else: print "%-20s = %-20s" % (item,value) print "\nStep(s) to be performed: \033[1;36m",steps,'\033[0m' print "\n"+self.warn_msg+"\n" if 'validate' in steps.split(','): # print we are done sys.exit() if PipelneUtils().is_local: return 'c' else: return raw_input("\nDoes this look okay? 
(q to quit, v to view configFile, c to continue) ") def convert_csv_to_ini(self, new_ini_file): #print self.args from pipeline.get_ini import readCSV print 'CSV path', self.general_config_dict['csvPath'] my_csv = readCSV(file_path = self.general_config_dict['csvPath']) content = my_csv.read_csv() headers = content[1].keys() headers_clean = [x.strip('"').replace(" ", "_").lower() for x in headers] projects = {} #print #print content[1] #print # get list of keys keys_list = [] if self.check_headers(headers_clean): logger.info("CSV headers okay") for k,values in content.iteritems(): keys_list.append(values['barcode_index']+"_"+values['run_key']+"_"+values['lane']) fh = open(new_ini_file,'w') # general section fh.write("#\n#\tCreated by MBL Pipeline for run: "+self.general_config_dict['run']+" on "+self.general_config_dict['date']+"\n#\n\n") fh.write("[general]\n") fh.write("run = "+self.general_config_dict['run']+"\n") fh.write("configPath = "+new_ini_file+"\n") fh.write("configPath_orig = " + self.general_config_dict['configPath']+"\n") fh.write("platform = " + self.general_config_dict['platform']+"\n") fh.write("output_dir = " + os.path.dirname(new_ini_file)+"\n") #fh.write("output_dir = "+os.path.join(self.general_config_dict['baseoutputdir'],self.general_config_dict['run'])+"\n") if self.general_config_dict['platform'] == 'illumina': #fh.write("input_file_suffix = " + self.general_config_dict['input_file_suffix']+"\n") fh.write("input_file_format = " + self.general_config_dict['input_file_format']+"\n") fh.write("anchor_file = " + self.general_config_dict['anchor_file']+"\n") fh.write("primer_file = " + self.general_config_dict['primer_file']+"\n") fh.write("compressed = " + str(self.general_config_dict['compressed'])+"\n") fh.write("do_perfect = " + str(self.general_config_dict['do_perfect'])+"\n") fh.write("lane_name = " + str(self.general_config_dict['lane_name'])+"\n") fh.write("database_host = " + self.general_config_dict['database_host']+"\n") fh.write("database_name = " + self.general_config_dict['database_name']+"\n") fh.write("input_dir = " + self.general_config_dict['input_dir']+"\n") fh.write("require_distal = " + str(self.general_config_dict['require_distal'])+"\n") fh.write("use_cluster = " + str(self.general_config_dict['use_cluster'])+"\n") fh.write("date = " + str(datetime.date.today())+"\n") fh.write("site = " + self.general_config_dict['site']+"\n") fh.write("load_vamps_database = " + str(self.general_config_dict['load_vamps_database'])+"\n") fh.write("idx_keys = " +','.join(keys_list)+"\n") if 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != '': file_list = self.get_input_files() fh.write("input_files = " + ','.join(file_list)+"\n") else: fh.write("input_files = \n") #fh.write(getattr(args,'force_runkey', "")) for k, values in content.iteritems(): fh.write("\n") if self.general_config_dict['platform'] == 'illumina': fh.write("["+values['barcode_index']+"_"+values['run_key']+"_"+values['lane']+"]\n") elif self.general_config_dict['platform'] == '454': fh.write("["+values['lane']+"_"+values['run_key']+"]\n") for v in values: if v == "env_sample_source": try: new_val = [str(j[0]) for j in self.env if j[1] == values[v]][0] except: print """There was an error in env_sample_source. Please check your metadata. 
Possible values: ----------- air extreme habitat host associated human associated human-amniotic-fluid human-blood human-gut human-oral human-skin human-urine human-vaginal indoor microbial mat/biofilm miscellaneous_natural_or_artificial_environment plant associated sediment soil/sand unknown wastewater/sludge water-freshwater water-marine ----------- """ raise fh.write("env_sample_source_id = "+new_val+"\n") else: fh.write(v+" = "+values[v]+"\n") fh.close() return new_ini_file def save_ini_file(self,new_ini_file): # give it a new name out_fh = open(new_ini_file,'w') #for line in open(os.path.abspath(self.general_config_dict['configPath']),"r"): # out_fh.write(line) self.general_config_dict['configPath_original'] = self.general_config_dict['configPath'] self.general_config_dict['configPath'] = new_ini_file out_fh.write("#\n#\tCreated by MBL Pipeline for run: "+self.general_config_dict['run']+" on "+self.general_config_dict['date']+"\n#\n\n") out_fh.write("[general]\n") for item in self.general_config_dict: out_fh.write(item+" = "+str(self.general_config_dict[item]) + "\n") #out_fh.write("\n["+self.general_config_dict['platform']+"]\n") #for item in self.general_config_dict: # if item not in C.general_run_items: # out_fh.write(item+" = "+str(self.general_config_dict[item]) + "\n") if 'fasta_file' in self.general_config_dict and self.general_config_dict['fasta_file'] != '': (path,fasta) = os.path.split(self.general_config_dict['fasta_file']) if 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != path: sys.exit("Your input_dir and fasta_file directory don't agree - Exiting\n\t"+self.general_config_dict['input_dir']+" != "+self.general_config_dict['fasta_file']) out_fh.write("input_dir = "+path+"\n") out_fh.write("input_files = "+fasta+"\n") #out_fh.write("input_file_suffix = fasta\n") elif 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != '': file_list = self.get_input_files() out_fh.write("input_files = " + ','.join(file_list)+"\n") else: out_fh.write("input_files = \n") out_fh.close() def check_headers(self, headers): if self.general_config_dict['platform']=='illumina': known_header_list= self.known_header_list['illumina'] elif self.general_config_dict['platform'] == '454': known_header_list = self.known_header_list['454'] else: logger.error("in utils: check_headers - unknown platform") #print sorted(known_header_list) #print sorted(headers) self.res_headers = headers if "env_sample_source" in headers: self.env_source_to_id(headers) if sorted(known_header_list) != sorted(self.res_headers): print "=" * 40 print "csv file header problem" print "%-20s %-20s" % ("REQUIRED", "YOUR CSV") for i in sorted(known_header_list): if i in headers: print "%-20s%-20s" % (i,i) else: print "%-20s%-20s" % (i,"----------- <--- missing") for i in headers: if i not in known_header_list: print "%-20s%-20s" % (" ",i+" <--- extra") print "=" * 40 sys.exit("ERROR : unknown or missing headers\n") else: return True def env_source_to_id(self, headers): self.my_conn = MyConnection(host='newbpcdb2.jbpc-np.mbl.edu', db="env454") # self.my_conn = MyConnection() my_sql = """SELECT * FROM env_sample_source""" self.env = self.my_conn.execute_fetch_select(my_sql) self.res_headers = ["env_sample_source_id" if x=="env_sample_source" else x for x in headers] def configDictionaryFromFile_ini(self, config_file_path): import ConfigParser configDict = {} user_config = ConfigParser.ConfigParser() user_config.read(config_file_path) for section in 
user_config.sections(): section_dict = configDict[section] = {} for option in user_config.options(section): section_dict[option] = user_config.get(section,option) if section_dict[option] == 'True' or section_dict[option] == 'true': section_dict[option] = True elif section_dict[option] == 'False' or section_dict[option] == 'false': section_dict[option] = False return configDict def get_values(self, args, general_config_dict = {} ): collector={} for item in self.pipeline_run_items[args.platform]: # set collector[item] to the default first collector[item] = self.pipeline_run_items[args.platform][item] # now look for args (then ini) values to replace if item in args and getattr( args, item ) != None: collector[item] = getattr( args, item ) elif general_config_dict and item in general_config_dict[args.platform] and general_config_dict[args.platform][item] != '': collector[item] = general_config_dict[args.platform][item] # get all the items from general_config_dict['general'] if 'general' in general_config_dict: for item in general_config_dict['general']: collector[item] = general_config_dict['general'][item] return collector def validate_args(self): """ # THOUGHTS # vamps users # single project and dataset # Supply an ini file OR commandline (for web interface), but no csv file # # MBL pipeline # REQUIRE a csv file and a ini file """ collector={} if self.args.configPath: general_config_dict = self.configDictionaryFromFile_ini(self.args.configPath) if self.args.platform in general_config_dict and 'general' in general_config_dict: collector= self.get_values( self.args, general_config_dict) else: sys.exit("The ini file needs both a [general] and ["+ self.args.platform +"] section - Exiting.") else: # no configPath collector= self.get_values( self.args ) if self.args.platform == 'illumina': print "Starting Illumina Pipeline" if not self.args.csvPath: sys.exit("illumina requires a csv file - Exiting") elif self.args.platform == 'vamps': print "Starting VAMPS Pipeline:" if 'project' not in collector or collector['project'] == '': collector['project'] = collector['project'][:1].capitalize() + collector['project'][1:] else: logger.debug("No project found in vamps pipeline") if self.args.fasta_file: collector['project'] = self.args.fasta_file collector['from_fasta'] = True elif self.args.platform == '454': print "Starting 454 Pipeline" elif self.args.platform == 'ion_torrent': print "Starting Ion Torrent Pipeline" else: sys.exit("Validate args: Unknown Platform") if self.args.configPath: collector['configPath'] = self.args.configPath else: collector['configPath'] = "" # these are all the bool items in the collector # they need to be converted fron str to bool here for i in collector: if collector[i] == 'True' or collector[i] == 'true': collector[i] = True elif collector[i] == 'False' or collector[i] == 'false': collector[i] = False #collector['runcode'] = self.args.run collector['run'] = self.args.run #collector['run_date'] = self.args.run #collector['steps'] = self.args.steps collector['platform'] = self.args.platform if self.args.input_dir: collector['input_dir'] = self.args.input_dir collector['date'] = str(datetime.date.today()) #print collector return collector
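# Hedged sketch of how pipeline-ui.py is assumed to drive the MetadataUtils
# class defined above, based only on the methods it exposes (validate_args,
# convert_and_save_ini, validate, get_confirmation, get_general_data).  The
# argparse namespace ('args', assumed to carry platform/run/configPath/steps)
# and 'analysis_dir' are placeholders; the real driver may differ.
def run_metadata_validation(args, analysis_dir):
    # first pass: merge command-line args with any ini-file values
    metadata_utils = MetadataUtils(command_line_args=args)
    general_config_dict = metadata_utils.validate_args()

    # second pass: write the consolidated ini into the analysis directory,
    # then validate it for the chosen platform (illumina/454/vamps)
    metadata_utils = MetadataUtils(command_line_args=args,
                                   configuration_dictionary=general_config_dict)
    metadata_utils.convert_and_save_ini(analysis_dir)
    data_object = metadata_utils.validate(analysis_dir)

    # ask the user to confirm before the pipeline proper starts
    answer = metadata_utils.get_confirmation(args.steps,
                                             metadata_utils.get_general_data())
    if answer != 'c':
        sys.exit("Metadata not confirmed - exiting")
    return data_object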
def load_database(self, lane_keys):
    """
    """
    logger.info("Starting load VAMPS data")
    # self.taxes_file = os.path.join(self.outdir,'vamps_data_cube_uploads.txt')
    # self.summed_taxes_file = os.path.join(self.outdir,'vamps_junk_data_cube_pipe.txt')
    # self.distinct_taxes_file = os.path.join(self.outdir,'vamps_taxonomy_pipe.txt')
    # self.sequences_file = os.path.join(self.outdir,'vamps_sequences_pipe.txt')
    # self.export_file = os.path.join(self.outdir,'vamps_export_pipe.txt')
    # self.projects_datasets_file = os.path.join(self.outdir,'vamps_projects_datasets_pipe.txt')
    # self.projects_info_file = os.path.join(self.outdir,'vamps_projects_info_pipe.txt')

    # USER: vamps_db_tables
    data_cube_table = 'vamps_data_cube_uploads'
    summed_cube_table = 'vamps_junk_data_cube_pipe'
    taxonomy_table = 'vamps_taxonomy_pipe'
    sequences_table = 'vamps_sequences_pipe'
    exports_table = 'vamps_export_pipe'
    info_table_user = '******'
    info_table = 'vamps_projects_info'
    datasets_table = 'vamps_projects_datasets_pipe'
    users_table = 'vamps_users'

    # We only have a single project and dataset here:
    # if the project is new then we add the data to the upload_info and projects_datasets_pipe table
    # but if the project is not new:
    #   check if the existing project belongs to the user
    #     if it does then UPDATE the line in upload_info table and add line to projects_datasets_pipe table
    #       (maybe check if dataset already exists and die if yes)
    #     if the existing project doesn't belong to the owner then die with a warning to change project name
    #       (or maybe change the name by adding _user)
    if self.runobj.site == 'vamps':
        db_host = 'vampsdb'
        db_name = 'vamps'
    else:
        db_host = 'vampsdev'
        db_name = 'vamps'
    myconn = MyConnection(host=db_host, db=db_name)

    query = "SELECT project_name from %s where project_name='%s' \
             UNION \
             SELECT project_name from %s where project_name='%s' \
             " % (info_table_user, self.project, info_table, self.project)
    data = myconn.execute_fetch_select(query)
    if data:
        logger.info("found this project " + data[0][0] + " Exiting")
        sys.exit("Duplicate project name found; Canceling upload to database but your GASTed data are here: " + self.outdir)
    else:
        # project is unknown in database - continue
        #
        # DATA_CUBE
        #
        for line in open(self.taxes_file, 'r'):
            line = line.strip().split("\t")
            if line[0] == 'HEADER':
                continue
            # line = line[1:]  # remove leading empty tab
            qDataCube = "insert ignore into %s (project, dataset, taxon_string,superkingdom,phylum,class,\
                orderx,family,genus,species,strain,rank,knt,frequency,dataset_count,classifier)\
                VALUES('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" \
                % (data_cube_table,
                   line[0], line[1], line[2], line[3], line[4], line[5], line[6], line[7],
                   line[8], line[9], line[10], line[11], line[12], line[13], line[14], line[15])
            myconn.execute_no_fetch(qDataCube)
        #
        # SUMMED (JUNK) DATA_CUBE
        #
        for line in open(self.summed_taxes_file, 'r'):
            line = line.strip().split("\t")
            if line[0] == 'HEADER':
                continue
            # line = line[1:]  # remove leading empty tab
            # taxonomy sum_tax_counts frequency dataset_count rank project dataset project--dataset classifier
            qSummedCube = "insert ignore into %s (taxon_string,knt, frequency, dataset_count, rank, project, dataset, project_dataset, classifier)\
                VALUES('%s','%s','%s','%s','%s','%s','%s','%s','%s')" \
                % (summed_cube_table,
                   line[0], line[1], line[2], line[3], line[4], line[5], line[6], line[7], line[8])
            myconn.execute_no_fetch(qSummedCube)
        #
        # TAXONOMY
        #
        for line in open(self.distinct_taxes_file, 'r'):
            line = line.strip().split("\t")
            if line[0] == 'HEADER':
                continue
            # line = line[1:]  # remove leading empty tab
            qTaxonomy = "insert ignore into %s (taxon_string,rank,num_kids)\
                VALUES('%s','%s','%s')" \
                % (taxonomy_table, line[0], line[1], line[2])
            myconn.execute_no_fetch(qTaxonomy)
        #
        # SEQUENCES
        #
        for line in open(self.sequences_file, 'r'):
            line = line.strip().split("\t")
            if line[0] == 'HEADER':
                continue
            # line = line[1:]  # remove leading empty tab
            # project dataset taxonomy refhvr_ids rank seq_count frequency distance read_id project_dataset
            qSequences = "insert ignore into %s (sequence,project, dataset, taxonomy,refhvr_ids,rank,seq_count,frequency,distance,rep_id, project_dataset)\
                VALUES('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" \
                % (sequences_table,
                   line[0], line[1], line[2], line[3], line[4], line[5], line[6], line[7], line[8], line[9], line[10])
            myconn.execute_no_fetch(qSequences)
        #
        # PROJECTS_DATASETS
        #
        for line in open(self.projects_datasets_file, 'r'):
            line = line.strip().split("\t")  # [1:] # split and remove the leading 'zero'
            if line[0] == 'HEADER':
                continue
            qDatasets = "insert ignore into %s (project, dataset, dataset_count,has_tax,date_trimmed,dataset_info)\
                VALUES('%s','%s','%s','%s','%s','%s')" \
                % (datasets_table, line[0], line[1], line[2], line[3], line[4], line[5])
            myconn.execute_no_fetch(qDatasets)
        #
        # INFO
        #
        for line in open(self.projects_info_file, 'r'):
            line = line.strip().split("\t")  # [1:] # split on tab and remove the leading 'zero'
            if line[0] == 'HEADER':
                continue
            qInfo = "insert into %s (project_name, title, description, contact, email, institution, user, env_source_id)\
                VALUES('%s','%s','%s','%s','%s','%s','%s','%s')" \
                % (info_table_user, line[0], line[1], line[2], line[3], line[4], line[5], line[6], line[7])
            myconn.execute_no_fetch(qInfo)
        #
        # USERS
        #
        qUser = "******" \
            % (users_table, self.project, self.runobj.user)
        myconn.execute_no_fetch(qUser)

    logger.info("Finished load VAMPS data")
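# For reference, and assumed from the info() writer earlier in this section:
# vamps_projects_info_pipe.txt (read by the INFO loop above) is tab-delimited
# with one HEADER row followed by one data row per project, e.g. (all values
# below are placeholders):
#
#   HEADER  project     title  description  contact   email             institution  user  env_source_id
#   0       MyProj_Bv6  title  description  Jane Doe  jdoe@example.org  MBL          jdoe  100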
class MetadataUtils: """ Class to read metadata files (csv and ini style) validate and create a dictionary from them Two parts: 1) From pipeline-ui.py to validate the input args 2) From runconfig.py to write the final ini file and create the dictionary that is used to create the run object """ Name = "MetadataUtils" def __init__(self, command_line_args = None, configuration_dictionary = None): self.args = command_line_args self.general_config_dict = configuration_dictionary self.known_header_list = C.csv_header_list self.pipeline_run_items = C.pipeline_run_items self.primer_suites = C.primer_suites self.dna_regions = C.dna_regions self.data_object = {} self.data_object['general'] = {} self.warn_msg = """\n\tThe config File seems to be okay. If the items above look correct then press 'c' to continue the pipeline\n""" def convert_and_save_ini(self): new_ini_file = os.path.join(self.general_config_dict['output_dir'],self.general_config_dict['run'] + '.ini') #new_ini_file = os.path.join(self.general_config_dict['output_dir'],self.general_config_dict['run'],self.general_config_dict['run'] + '.ini') # converts csv to ini and saves to output_dir if self.general_config_dict['platform'] == 'vamps': self.save_ini_file(new_ini_file) else: self.convert_csv_to_ini(new_ini_file) self.general_config_dict['configPath'] self.general_config_dict['configPath_original'] = self.general_config_dict['configPath'] self.general_config_dict['configPath'] = new_ini_file # change path and type to new ini # regardless of what they were before def validate(self): if self.general_config_dict['platform'] == 'illumina': self.warn_msg = self.validate_illumina_ini() elif self.general_config_dict['platform'] == '454': data = self.validate_454_ini() elif self.general_config_dict['platform'] == 'ion_torrent': pass elif self.general_config_dict['platform'] == 'vamps': data = self.validate_vamps_ini() else: sys.exit("Unknown platform and configFile type for validation") return self.data_object def get_general_data(self): """ """ return self.data_object['general'] # def create_dictionary_from_ini(self): # """ # # read an ini config file and convert to a dictionary # """ # import ConfigParser # if os.path.exists(self.general_config_dict['configPath']): # data_object = {} # user_config = ConfigParser.ConfigParser() # user_config.read(self.general_config_dict['configPath']) # # for section in user_config.sections(): # # section_dict = data_object[section] = {} # for option in user_config.options(section): # section_dict[option] = user_config.get(section,option) # # else: # print "error could not open config file: ",self.general_config_dict['configPath'] # # return data_object # def get_command_line_items(self, general_data): # # # command line items take precedence over ini file items of the same name # # defaults should be here and NOT in argparse/commandline # if self.args.input_dir: # general_data['input_dir'] = self.args.input_dir # else: # if not general_data['input_dir']: # general_data['input_dir'] = './' # # if self.args.run: # general_data['run'] = self.args.run # general_data['run_date'] = self.args.run # else: # if 'run' in general_data: # general_data['run_date'] = general_data['run'] # elif 'run_date' in general_data: # general_data['run'] = general_data['run_date'] # else: # sys.exit("Cannot find the run or run_date on command line or in config file - Exiting") # # make sure RUN is before OUTPUT_DIR # try: # general_data['output_dir'] = os.path.join(self.args.baseoutputdir,self.args.run) # except: # if 'output_dir' not 
in general_data: # general_data['output_dir'] = os.path.join('.',self.args.run) # #getattr(args,'force_runkey', "") # # # if self.args.platform: # general_data['platform'] = self.args.platform # else: # if 'platform' not in general_data: # sys.exit("Cannot find the platform from command line or in config file - Exiting") # # # if self.args.input_file_format: # general_data['input_file_format'] = self.args.input_file_format # else: # if 'input_file_format' not in general_data: # general_data['input_file_format'] = '' # if self.args.input_file_suffix: # general_data['input_file_suffix'] = self.args.input_file_suffix # else: # if 'input_file_suffix' not in general_data: # general_data['input_file_suffix'] = '' # # return general_data # def validate_454_csv(self, args, my_csv): # print "TODO: write validate def for 454/csv" # data_object = self.populate_data_object_454(args, my_csv) def validate_vamps_ini(self): # configPath is the new configPath self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath']) def validate_454_ini(self): print "TODO - write validation def for 454/ini" #self.data_object = self.create_dictionary_from_ini() # 454 ini file requirements: def validate_illumina_ini(self): """ The csv headers are checked earlier """ print "Validating ini type Config File (may have been converted from csv)" return_code = False error_code = False warn_code = False msg = '' error=False warn=False #print 'configpath',self.general_config_dict['configPath'] # configPath here is the new configPath self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath']) (error_code,warn_code) = self.check_for_missing_values(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_for_datasets(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_domain_suite_region(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_project_name(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_projects_and_datasets(self.data_object) if error_code: error=True if warn_code: warn=True #print self.data_object['input_dir'] #print self.data_object['input_files'] if 'input_dir' not in self.data_object['general'] and 'input_files' not in self.data_object['general']: logger.warning("No input directory and no input files") warn=True elif not os.path.isdir(self.data_object['general']['input_dir']): logger.error("That is not a directory: "+self.data_object['general']['input_dir']) error=True elif self.data_object['general']['input_file_format'] == 'fastq' and self.data_object['general']['platform'] == 'illumina': file_exists = False # if 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir']: for dirname, dirnames, filenames in os.walk(self.data_object['general']['input_dir']): # if not filenames: for file_name in filenames: if os.path.isfile(os.path.join(dirname, file_name)): file_exists = True break if not file_exists: logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir']) error=True elif 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir'] and ('input_files' not in self.data_object['general'] or not self.data_object['general']['input_files']): logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir']) 
error=True if error: sys.exit( """\n\tTHERE WERE SEVERE PROBLEMS WITH THE CONFIG FILE - EXITING PLEASE CORRECT THEM AND START OVER.\n To view the errors add ' --loglevel info' to the command line.\n""") elif warn: msg = """\n\tTHERE WERE NON-FATAL PROBLEMS WITH THE CONFIG FILE THAT MAY OR MAY NOT CAUSE PROBLEMS.\n To view the warnings add ' --loglevel warning' to the command line.\n""" return msg def validate_dictionary(self, config_info): """ This is only used for data that comes in as a dictionary rather than a file such as with vamps user uploads """ print "TODO - Validating input dictionary" # must be a general section # should I create a dict here??? -That would render much code in # runconfig useless. # are we going to continue developing ini style config files if # no one uses them? configDict = config_info return configDict def populate_data_object_454(self, args): data = {} data['general'] = {} test_datasets = {} dataset_counter = {} headers = '' if self.runobj: infile = self.runobj.configPath else: infile = args.configPath data['general']['input_dir'] = args.input_dir #data['general']['output_dir'] = os.path.join(args.output_dir,args.run) data['general']['output_dir'] = args.output_dir data['general']['platform'] = args.platform data['general']['run'] = args.run #data['general']['run_date'] = args.run data['general']["input_file_format"] = args.input_file_format data['general']["input_file_suffix"] = args.input_file_suffix return data['general'] # def populate_data_object_illumina(self, args, my_csv): # data = {} # data['general'] = {} # test_datasets = {} # dataset_counter = {} # headers = '' # if self.run: # infile = self.run.configPath # data['general']['input_dir'] = self.run.input_dir # #megadata['general']['output_dir'] = self.args.output_dir # data['general']['platform'] = self.run.platform # data['general']['run'] = self.run.run_date # #data['general']['run_date'] = self.run.run_date # #megadata['general']['run'] = self.args.run # data['general']["input_file_format"] = self.run.input_file_format # #input_dir,"/xraid2-2/sequencing/Illumina/20120525_recalled/Project_Sandra_v6/analysis/" # data['general']["input_file_suffix"] = self.run.input_file_suffix # else: # infile = args.configPath # data['general']['input_dir'] = args.input_dir # #data['general']['output_dir'] = os.path.join(args.output_dir,args.run) # data['general']['output_dir'] = args.output_dir # data['general']['platform'] = args.platform # data['general']['run'] = args.run # #data['general']['run_date'] = args.run # #megadata['general']['run'] = self.args.run # data['general']["input_file_format"] = args.input_file_format # #input_dir,"/xraid2-2/sequencing/Illumina/20120525_recalled/Project_Sandra_v6/analysis/" # data['general']["input_file_suffix"] = args.input_file_suffix # # print "Validating csv type ConfigFile" # # # changes spaces to '_' and all lowercase # # temp = {} # # # # my_read_csv = readCSV(file_path = infile) # # my_read_csv.put_run_info() # # print "content[1].keys(): " # # print content[1].keys() # # # To see the list of statistics available for each line # # for k, v in content.items(): # # print k, v['dataset'], v # content = my_csv.read_csv() # headers = content[1].keys() # headers_clean = [x.strip('"').replace(" ", "_").lower() for x in headers] # projects = {} # if self.check_headers(headers_clean): # # # # # try: # # temp[headers[n]] = lst[n] # # except: # # sys.exit("ERROR:It looks like the header count and the data column count are different.") # for k, v in content.items(): # run_key 
= v['run_key'].replace('N','').upper() # temp['file_prefix'] = v['dataset']+'_'+ run_key # # print "v = %s\n" % v # # v = {'barcode_index': 'ATCACG', 'project': 'JCR_SPO_Bv6', 'lane': '3', 'run': '20120613', 'dna_region': 'v6', 'adaptor': '', # # 'barcode': '', 'seq_operator': 'JV', 'overlap': 'complete', 'dataset': 'H40', 'run_key': 'NNNNACGCA', 'read_length': '101', # # 'file_prefix': 'H40', 'data_owner': 'jreveillaud', 'primer_suite': 'Bacterial v6 Suite', 'tubelabel': 'H40', 'amp_operator': 'JR', 'insert_size': '230'}; # # temp['file_prefix'] = H40_ # unique_identifier = v['barcode_index']+'_'+run_key+'_'+v['lane'] # data[unique_identifier] = {} # if unique_identifier in test_datasets: # sys.exit("ERROR: duplicate run_key:barcode_index:lane: "+unique_identifier+" - Exiting") # else: # test_datasets[unique_identifier] = 1 # # print "test_datasets = %s;\ntemp['file_prefix'] = %s\nunique_identifier = %s" % (test_datasets,temp['file_prefix'], unique_identifier) # # data[unique_identifier]['dataset'] = v['dataset'] # data[unique_identifier]['project'] = v['project'] # # if v['project'] in dataset_counter: # dataset_counter[v['project']] += 1 # else: # dataset_counter[v['project']] = 1 # # #megadata[unique_identifier]['ds_count'] = 1 # data[unique_identifier]['project'] = v['project'] # data[unique_identifier]['run_key'] = v['run_key'] # data[unique_identifier]['lane'] = v['lane'] # data[unique_identifier]['tubelabel'] = v['tubelabel'] # data[unique_identifier]['barcode'] = v['barcode'] # data[unique_identifier]['adaptor'] = v['adaptor'] # data[unique_identifier]['dna_region'] = v['dna_region'] # data[unique_identifier]['amp_operator'] = v['amp_operator'] # data[unique_identifier]['seq_operator'] = v['seq_operator'] # data[unique_identifier]['barcode_index'] = v['barcode_index'] # data[unique_identifier]['overlap'] = v['overlap'] # data[unique_identifier]['insert_size'] = v['insert_size'] # data[unique_identifier]['file_prefix'] = v['file_prefix'] # data[unique_identifier]['read_length'] = v['read_length'] # data[unique_identifier]['primer_suite'] = v['primer_suite'] # data[unique_identifier]['first_name'] = v['first_name'] # data[unique_identifier]['last_name'] = v['last_name'] # data[unique_identifier]['email'] = v['email'] # data[unique_identifier]['institution'] = v['institution'] # data[unique_identifier]['project_title'] = v['project_title'] # data[unique_identifier]['project_description'] = v['project_description'] # data[unique_identifier]['funding'] = v['funding'] # data[unique_identifier]['env_sample_source'] = v['env_sample_source'] # data[unique_identifier]['dataset_description'] = v['dataset_description'] # for item in data: # if item != 'general': # data[item]['primer_suite'] = data[item]['primer_suite'].lower().replace(" ", "_") # data[item]['dna_region'] = data[item]['dna_region'].lower().replace(" ", "_") # data[item]['barcode'] = data[item]['barcode'].upper() # data[item]['barcode_index'] = data[item]['barcode_index'].upper() # data[item]['ds_count'] = str(dataset_counter[data[item]['project']]) # # # return data def get_input_files(self): files_list = [] print self.general_config_dict['input_dir'] if os.path.isdir(self.general_config_dict['input_dir']): for infile in glob.glob( os.path.join(self.general_config_dict['input_dir'], '*') ): if os.path.isdir(infile) == True: pass else: files_list.append(os.path.basename(infile)) else: if fasta_file: pass logger.warning("No input directory or directory permissions problem: "+self.general_config_dict['input_dir']) return 
files_list def check_for_input_files(self,data_object): file_count = 0 files_list = [] imports_list = [] lanes_list = [] #input_dir = os.path.join(data_object['general']['input_dir'],"fasta") input_dir = data_object['general']['input_dir'] if os.path.isdir(input_dir): p = data_object['general']['input_dir'], '*'+data_object['general']['input_file_suffix'] for infile in glob.glob( os.path.join(input_dir, '*'+data_object['general']['input_file_suffix']) ): files_list.append(os.path.basename(infile)) for x in data_object: if 'file_prefix' in data_object[x]: pass #print data_object[x]['file_prefix'] #if os.path.basename(infile).split('-')[0] == data_object[x]['file_prefix']: #lanes_list.append(data_object[x]['lane']) file_count += 1 else: logger.info("No input directory or directory permissions problem: "+input_dir) print "No input directory or directory permissions problem: "+input_dir if not file_count: #sys.exit("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'") logger.info("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'") data_object['general']['files_list'] = files_list data_object['general']['file_count'] = file_count # all the files in an illumina directory should be the same type #data_object['general']['file_formats_list'] = [data_object['general']["input_file_format"]] * file_count #data_object['general']['lanes_list'] = lanes_list #print "Files LIST",data_object['general']['files_list'] return data_object def check_for_missing_values(self, data): missing_key = '' error = False warn = False for item in data: if item == 'general': for k,v in data[item].iteritems(): if not k: #sys.exit("ERROR: key for: '"+v+"' is missing or corrupt - Exiting") logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing") warn=True if not v: logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing") warn=True for item in data: if item != 'general': for k,v in data[item].iteritems(): if not k: #sys.exit("ERROR: key for: '"+v+"' is missing or corrupt - Exiting") logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing") warn=True if not v: if (k == 'barcode' or k == 'adaptor'): #these could be empty logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing") else: logger.error("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing") error=True return (error,warn) def check_for_datasets(self,data): error = False warn=False for item in data: if item != 'general': #print 'ds',data[item]['dataset'] if not data[item]['dataset']: #if 'dataset' not in data[item]: logger.error("Current dataset name is missing or corrupt - Exiting (key: "+item+")") error=True return (error,warn) def check_domain_suite_region(self,data): error = False warn=False for item in data: if item != 'general': # CHECK MUST MATCH: "Domain","Primer Suite","DNA Region" if data[item]['primer_suite'] not in self.primer_suites: logger.error("Primer Suite not found: "+data[item]['primer_suite']+" - Exiting (key: "+item+")") error=True #if dataset_items['domain'] not in domains: # sys.exit("ERROR: Domain not found: "+dataset_items['domain']) if data[item]['dna_region'] not in self.dna_regions: logger.error("DNA Region not found: "+data[item]['dna_region']+" - Exiting (key: "+item+")") error=True # "Bacterial v6","BacterialV6Suite","v6" #if dataset_items['domain'][:6] != 
dataset_items['primer_suite'][:6]: # sys.exit("ERROR: Domain ("+dataset_items['domain']+") -- Primer Suite ("+dataset_items['primer_suite']+") mismatch.") #if dataset_items['domain'][-2:].lower() != dataset_items['dna_region'].lower(): # sys.exit("ERROR: DNA Region ("+dataset_items['dna_region']+") -- Domain ("+dataset_items['domain']+") mismatch.") if data[item]['dna_region'] not in data[item]['primer_suite']: logger.error("DNA Region ("+data[item]['dna_region']+") not found in Primer Suite ("+data[item]['primer_suite']+") - Exiting (key: "+item+")") error=True return (error,warn) def check_project_name(self,data): """ # CHECK: project name format: 3 parts; end with Bv6,Ev9,Av6 or something similar """ error =False warn =False for item in data: if item != 'general': try: (a,b,c) = data[item]['project'].split('_') except: logger.error("project not in correct format: "+data[item]['project']+" - Exiting (key: "+data[item]+")") error=True (a,b,c) = data[item]['project'].split('_') #if c[0] not in [i[0].upper() for i in domains]: # sys.exit("ERROR : Project suffix has incorrect/non-existant domain: "+c) if c[1:] not in self.dna_regions: logger.error("Project suffix has incorrect DNA region: "+c+" - Exiting (key: "+data[item]+")") error = True return (error,warn) def check_projects_and_datasets(self,data): self.my_conn = MyConnection(host='newbpcdb2', db="env454") project_dataset = {} projects = {} datasets = {} error =False warn =False for item in data: if item != 'general': #project_dataset[data[item]['project']+'--'+data[item]['dataset']] = 1 datasets[data[item]['dataset']] = data[item]['project'] projects[data[item]['project']] = 1 for p in projects: #print p my_sql = """SELECT project FROM project WHERE project = '%s'""" % (p) res = self.my_conn.execute_fetch_select(my_sql) if res: logger.warning("project '"+p+"' already exists in the database - is this okay?") warn = True else: logger.debug("project '"+p+"' is new") ds_found_count = 0 for d in datasets: if datasets[d] == p: #print "\t%s" % (d) my_sql = """SELECT dataset FROM dataset WHERE dataset = '%s'""" % (d) res = self.my_conn.execute_fetch_select(my_sql) if res: ds_found_count += 1 if ds_found_count >3: logger.warning("\t\tPossibly more .... - Exiting after just three") break logger.warning("\tdataset '"+d+"' already exists in the database - is this okay?") warn=True else: logger.debug("\tdataset '"+d+"' is new") logger.debug("\tDataset Count: "+str(len(datasets))) return (error,warn) def get_confirmation(self, steps, general_data): print "\n" for item,value in general_data.iteritems(): #print len(value) if type(value) != bool and len(value) > 80: tmp = value.split(',') print "%20s = %s .. %s" % (item,tmp[0],tmp[-1]) else: print "%20s = %-20s" % (item,value) print "\nStep(s) to be performed: ",steps print "\n"+self.warn_msg+"\n" if 'validate' in steps.split(','): # print we are done sys.exit() print os.uname() print os.uname()[1] if os.uname()[1] == 'ashipunova.mbl.edu' or os.uname()[1] == 'as-macbook.local': return "c" else: return raw_input("\nDoes this look okay? 
(q to quit, v to view configFile, c to continue) ") def convert_csv_to_ini(self,new_ini_file): #print self.args from pipeline.get_ini import readCSV print 'CSV path',self.general_config_dict['csvPath'] my_csv = readCSV(file_path = self.general_config_dict['csvPath']) content = my_csv.read_csv() headers = content[1].keys() headers_clean = [x.strip('"').replace(" ", "_").lower() for x in headers] projects = {} #print #print content[1] #print # get list of keys keys_list = [] if self.check_headers(headers_clean): logger.info("CSV headers okay") for k,values in content.iteritems(): keys_list.append(values['barcode_index']+"_"+values['run_key']+"_"+values['lane']) fh = open(new_ini_file,'w') # general section fh.write("#\n#\tCreated by MBL Pipeline for run: "+self.general_config_dict['run']+" on "+self.general_config_dict['date']+"\n#\n\n") fh.write("[general]\n") fh.write("run = "+self.general_config_dict['run']+"\n") fh.write("configPath = "+new_ini_file+"\n") fh.write("configPath_orig = "+self.general_config_dict['configPath']+"\n") fh.write("platform = "+self.general_config_dict['platform']+"\n") fh.write("output_dir = " + self.general_config_dict['output_dir']+"\n") #fh.write("output_dir = "+os.path.join(self.general_config_dict['baseoutputdir'],self.general_config_dict['run'])+"\n") if self.general_config_dict['platform'] == 'illumina': #fh.write("input_file_suffix = " + self.general_config_dict['input_file_suffix']+"\n") fh.write("input_file_format = " + self.general_config_dict['input_file_format']+"\n") fh.write("anchor_file = " + self.general_config_dict['anchor_file']+"\n") fh.write("primer_file = " + self.general_config_dict['primer_file']+"\n") fh.write("compressed = " + str(self.general_config_dict['compressed'])+"\n") fh.write("database_host = " + self.general_config_dict['database_host']+"\n") fh.write("database_name = " + self.general_config_dict['database_name']+"\n") fh.write("input_dir = " + self.general_config_dict['input_dir']+"\n") fh.write("require_distal = " + str(self.general_config_dict['require_distal'])+"\n") fh.write("use_cluster = " + str(self.general_config_dict['use_cluster'])+"\n") fh.write("date = " + str(datetime.date.today())+"\n") fh.write("idx_keys = " +','.join(keys_list)+"\n") if 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != '': file_list = self.get_input_files() fh.write("input_files = " + ','.join(file_list)+"\n") else: fh.write("input_files = \n") #fh.write(getattr(args,'force_runkey', "")) for k,values in content.iteritems(): fh.write("\n") if self.general_config_dict['platform'] == 'illumina': fh.write("["+values['barcode_index']+"_"+values['run_key']+"_"+values['lane']+"]\n") elif self.general_config_dict['platform'] == '454': fh.write("["+values['lane']+"_"+values['run_key']+"]\n") for v in values: fh.write(v+" = "+values[v]+"\n") fh.close() return new_ini_file def save_ini_file(self,new_ini_file): # give it a new name out_fh = open(new_ini_file,'w') #for line in open(os.path.abspath(self.general_config_dict['configPath']),"r"): # out_fh.write(line) self.general_config_dict['configPath_original'] = self.general_config_dict['configPath'] self.general_config_dict['configPath'] = new_ini_file out_fh.write("#\n#\tCreated by MBL Pipeline for run: "+self.general_config_dict['run']+" on "+self.general_config_dict['date']+"\n#\n\n") out_fh.write("[general]\n") for item in self.general_config_dict: out_fh.write(item+" = "+str(self.general_config_dict[item]) + "\n") 
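        # NOTE: every key currently in general_config_dict is written into the [general] section here;
        # the commented-out block below would instead have split platform-specific items
        # (those not in C.general_run_items) into their own section.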
#out_fh.write("\n["+self.general_config_dict['platform']+"]\n") #for item in self.general_config_dict: # if item not in C.general_run_items: # out_fh.write(item+" = "+str(self.general_config_dict[item]) + "\n") if 'fasta_file' in self.general_config_dict and self.general_config_dict['fasta_file'] != '': (path,fasta) = os.path.split(self.general_config_dict['fasta_file']) if 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != path: sys.exit("Your input_dir and fasta_file directory don't agree - Exiting\n\t"+self.general_config_dict['input_dir']+" != "+self.general_config_dict['fasta_file']) out_fh.write("input_dir = "+path+"\n") out_fh.write("input_files = "+fasta+"\n") #out_fh.write("input_file_suffix = fasta\n") elif 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != '': file_list = self.get_input_files() out_fh.write("input_files = " + ','.join(file_list)+"\n") else: out_fh.write("input_files = \n") out_fh.close() def check_headers(self,headers): if self.general_config_dict['platform']=='illumina': known_header_list= self.known_header_list['illumina'] elif self.general_config_dict['platform'] == '454': known_header_list = self.known_header_list['454'] else: logger.error("in utils: check_headers - unknown platform") #print sorted(known_header_list) #print sorted(headers) if sorted(known_header_list) != sorted(headers): print "="*40 print "csv file header problem" print "%-20s %-20s" % ("REQUIRED", "YOUR CSV") for i in sorted(known_header_list): if i in headers: print "%-20s%-20s" % (i,i) else: print "%-20s%-20s" % (i,"----------- <--- missing") for i in headers: if i not in known_header_list: print "%-20s%-20s" % (" ",i+" <--- extra") print "="*40 sys.exit("ERROR : unknown or missing headers\n") else: return True def configDictionaryFromFile_ini(self,config_file_path): import ConfigParser configDict = {} user_config = ConfigParser.ConfigParser() user_config.read(config_file_path) for section in user_config.sections(): section_dict = configDict[section] = {} for option in user_config.options(section): section_dict[option] = user_config.get(section,option) if section_dict[option] == 'True' or section_dict[option] == 'true': section_dict[option] = True elif section_dict[option] == 'False' or section_dict[option] == 'false': section_dict[option] = False return configDict def get_values(self, args, general_config_dict = {} ): collector={} for item in self.pipeline_run_items[args.platform]: # set collector[item] to the default first collector[item] = self.pipeline_run_items[args.platform][item] # now look for args (then ini) values to replace if item in args and getattr( args, item ) != None: collector[item] = getattr( args, item ) elif general_config_dict and item in general_config_dict[args.platform] and general_config_dict[args.platform][item] != '': collector[item] = general_config_dict[args.platform][item] # get all the items from general_config_dict['general'] if 'general' in general_config_dict: for item in general_config_dict['general']: collector[item] = general_config_dict['general'][item] return collector def validate_args(self): """ # THOUGHTS # vamps users # single project and dataset # Supply an ini file OR commandline (for web interface), but no csv file # # MBL pipeline # REQUIRE a csv file and a ini file """ collector={} if self.args.configPath: general_config_dict = self.configDictionaryFromFile_ini(self.args.configPath) if self.args.platform in general_config_dict and 'general' in general_config_dict: 
collector= self.get_values( self.args, general_config_dict) else: sys.exit("The ini file needs both a [general] and ["+ self.args.platform +"] section - Exiting.") else: # no configPath collector= self.get_values( self.args ) if self.args.platform == 'illumina': print "Illumina Pipeline" if not self.args.csvPath: sys.exit("illumina requires a csv file - Exiting") elif self.args.platform == 'vamps': print "VAMPS Pipeline:" if 'project' not in collector or collector['project'] == '': collector['project'] = collector['project'][:1].capitalize() + collector['project'][1:] else: logger.debug("No project found in vamps pipeline") if self.args.fasta_file: collector['project'] = self.args.fasta_file collector['from_fasta'] = True elif self.args.platform == '454': print "454 Pipeline" elif self.args.platform == 'ion_torrent': print "Ion Torrent Pipeline" else: sys.exit("Validate args: Unknown Platform") if self.args.configPath: collector['configPath'] = self.args.configPath else: collector['configPath'] = "" # these are all the bool items in the collector # they need to be converted fron str to bool here for i in collector: if collector[i] == 'True' or collector[i] == 'true': collector[i] = True elif collector[i] == 'False' or collector[i] == 'false': collector[i] = False #collector['runcode'] = self.args.run collector['run'] = self.args.run #collector['run_date'] = self.args.run #collector['steps'] = self.args.steps collector['platform'] = self.args.platform if self.args.input_dir: collector['input_dir'] = self.args.input_dir collector['date'] = str(datetime.date.today()) print collector return collector
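
# --- Illustrative sketch (not part of the pipeline API) -----------------------
# check_project_name() expects a project name made of exactly three
# underscore-separated parts whose suffix encodes a domain letter plus the
# DNA region (e.g. "JCR_SPO_Bv6" from the commented example row above).
# The helper below is a minimal, standalone restatement of that rule; the
# function name and the region list used here are illustrative assumptions only.
def _sketch_project_name_ok(project, dna_regions=("v3", "v4", "v6", "v9", "v4v5")):
    """Return True if `project` looks like '<group>_<code>_<DomainLetter><region>'."""
    parts = project.split('_')
    if len(parts) != 3:
        return False
    suffix = parts[2]
    # mirrors check_project_name(): accept "<letter><region>" (Bv6) or a bare region (v6)
    return suffix[1:].lower() in dna_regions or suffix.lower() in dna_regions

# e.g. _sketch_project_name_ok("JCR_SPO_Bv6") -> True
# -------------------------------------------------------------------------------
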
class MetadataUtils:
    """
    Class to read metadata files (csv and ini style),
    validate them and create a dictionary from them.
    Two parts:
    1) From pipeline-ui.py to validate the input args
    2) From runconfig.py to write the final ini file and create the dictionary
       that is used to create the run object
    """
    Name = "MetadataUtils"

    def __init__(self, command_line_args = None, configuration_dictionary = None):
        self.args = command_line_args
        self.general_config_dict = configuration_dictionary
        self.known_header_list = C.csv_header_list
        self.pipeline_run_items = C.pipeline_run_items
        self.primer_suites = self.convert_primer_suites(C.primer_suites)
        self.dna_regions = C.dna_regions
        self.data_object = {}
        self.data_object['general'] = {}
        self.warn_msg = """\n\tThe config File seems to be okay. If the items above look correct then press 'c' to continue the pipeline\n"""
        self.res_headers = []
        self.env = {}
        self.utils = PipelneUtils()

    def convert_and_save_ini(self, analysis_dir):
        new_ini_file = os.path.join(analysis_dir, self.general_config_dict['run'] + '.ini')
        #new_ini_file = os.path.join(self.general_config_dict['output_dir'],self.general_config_dict['run'],self.general_config_dict['run'] + '.ini')
        # converts csv to ini and saves to output_dir
        if self.general_config_dict['platform'] == 'vamps':
            self.save_ini_file(new_ini_file)
        else:
            self.convert_csv_to_ini(new_ini_file)
        self.general_config_dict['configPath'] = new_ini_file  # change path and type to new ini
        # regardless of what they were before

    def validate(self, analysis_dir):
        if self.general_config_dict['platform'] in C.illumina_list:
            self.warn_msg = self.validate_illumina_ini(analysis_dir)
        elif self.general_config_dict['platform'] == '454':
            data = self.validate_454_ini(analysis_dir)
        elif self.general_config_dict['platform'] == 'ion_torrent':
            pass
        elif self.general_config_dict['platform'] == 'vamps':
            data = self.validate_vamps_ini(analysis_dir)
        else:
            sys.exit("Unknown platform and configFile type for validation")
        return self.data_object

    def get_general_data(self):
        """ """
        return self.data_object['general']

    def validate_vamps_ini(self, analysis_dir):
        # configPath is the new configPath
        'todo: Andy, what should be here, just directory name or directory + number.ini?'
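        # By this point convert_and_save_ini() has pointed configPath at the freshly written
        # per-run ini, so re-reading it below yields the merged settings; the only additional
        # check is that any fasta_file named there actually exists on disk.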
self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath']) if 'fasta_file' in self.data_object and not os.path.exists(self.data_object['fasta_file']): sys.exit("Fasta file path doesn't exist: "+self.data_object['fasta_file'] ) elif 'fasta_file' in self.data_object['general'] and not os.path.exists(self.data_object['general']['fasta_file']): sys.exit("Fasta file path doesn't exist: "+self.data_object['general']['fasta_file'] ) def validate_454_ini(self, analysis_dir): print("TODO - write validation def for 454/ini") #self.data_object = self.create_dictionary_from_ini() # 454 ini file requirements: def validate_illumina_ini(self, analysis_dir): """ The csv headers are checked earlier """ print("Validating ini type Config File (may have been converted from csv)") new_ini_file = os.path.join(analysis_dir, self.general_config_dict['run'] + '.ini') print("New ini file location: "+new_ini_file) return_code = False error_code = False warn_code = False msg = '' error=False warn=False #print('configpath',self.general_config_dict['configPath']) # configPath here is the new configPath self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath']) (error_code,warn_code) = self.check_for_missing_values(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_for_datasets(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_domain_suite_region(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_project_name(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_dataset_name(self.data_object) if error_code: error=True if warn_code: warn=True (error_code,warn_code) = self.check_projects_and_datasets(self.data_object) if error_code: error=True if warn_code: warn=True #print(self.data_object['input_dir']) #print(self.data_object['input_files']) if 'input_dir' not in self.data_object['general'] and 'input_files' not in self.data_object['general']: logger.warning("No input directory and no input files") warn=True elif not os.path.isdir(self.data_object['general']['input_dir']): logger.error("That is not a directory: "+self.data_object['general']['input_dir']) error=True elif self.data_object['general']['input_file_format'] == 'fastq' and self.data_object['general']['platform'] in C.illumina_list: file_exists = False # if 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir']: for dirname, dirnames, filenames in os.walk(self.data_object['general']['input_dir']): # if not filenames: for file_name in filenames: if os.path.isfile(os.path.join(dirname, file_name)): file_exists = True break if not file_exists: logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir']) error=True elif 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir'] and ('input_files' not in self.data_object['general'] or not self.data_object['general']['input_files']): logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir']) error=True if error: sys.exit( """\n\t\033[91mTHERE WERE SEVERE PROBLEMS WITH THE CSV and/or CONFIG FILE - EXITING PLEASE CORRECT THEM AND START OVER.\033[0m\n To view the errors add ' --loglevel info' to the command line.\n""") elif warn: msg = """\n\t\033[93mTHERE WERE NON-FATAL PROBLEMS WITH THE CSV 
and/or CONFIG FILE THAT MAY OR MAY NOT CAUSE PROBLEMS.\033[0m\n To view the warnings add ' --loglevel warning' to the command line.\n""" print("\033[92mCSV File Passed Vaidation! (with warnings)\033[0m") else: print("\033[92mCSV File Passed Vaidation!\033[0m") return msg def validate_dictionary(self, config_info): """ This is only used for data that comes in as a dictionary rather than a file such as with vamps user uploads """ print("TODO - Validating input dictionary") # must be a general section # should I create a dict here??? -That would render much code in # runconfig useless. # are we going to continue developing ini style config files if # no one uses them? configDict = config_info return configDict def populate_data_object_454(self, args): data = {} data['general'] = {} test_datasets = {} dataset_counter = {} headers = '' if self.runobj: infile = self.runobj.configPath else: infile = args.configPath data['general']['input_dir'] = args.input_dir #data['general']['output_dir'] = os.path.join(args.output_dir,args.run) data['general']['output_dir'] = args.output_dir data['general']['platform'] = args.platform data['general']['run'] = args.run #data['general']['run_date'] = args.run data['general']["input_file_format"] = args.input_file_format data['general']["input_file_suffix"] = args.input_file_suffix return data['general'] def get_input_files(self): files_list = [] if os.path.isdir(self.general_config_dict['input_dir']): for infile in glob.glob( os.path.join(self.general_config_dict['input_dir'], '*') ): if os.path.isdir(infile) == True: for infile2 in glob.glob( os.path.join( infile,'*') ): if os.path.isdir(infile2) == True: pass else: sub_dir = os.path.basename(infile) files_list.append(os.path.join(sub_dir,os.path.basename(infile2))) else: files_list.append(os.path.basename(infile)) # else: # if fasta_file: # pass # logger.warning("No input directory or directory permissions problem: "+self.general_config_dict['input_dir']) return files_list def check_for_input_files(self, data_object): file_count = 0 files_list = [] imports_list = [] lanes_list = [] #input_dir = os.path.join(data_object['general']['input_dir'],"fasta") input_dir = data_object['general']['input_dir'] if os.path.isdir(input_dir): p = data_object['general']['input_dir'], '*'+data_object['general']['input_file_suffix'] for infile in glob.glob( os.path.join(input_dir, '*'+data_object['general']['input_file_suffix']) ): files_list.append(os.path.basename(infile)) for x in data_object: if 'file_prefix' in data_object[x]: pass #print(data_object[x]['file_prefix']) #if os.path.basename(infile).split('-')[0] == data_object[x]['file_prefix']: #lanes_list.append(data_object[x]['lane']) file_count += 1 else: logger.info("No input directory or directory permissions problem: "+input_dir) print("No input directory or directory permissions problem: "+input_dir) if not file_count: #sys.exit("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'") logger.info("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'") data_object['general']['files_list'] = files_list data_object['general']['file_count'] = file_count # all the files in an illumina directory should be the same type #data_object['general']['file_formats_list'] = [data_object['general']["input_file_format"]] * file_count #data_object['general']['lanes_list'] = lanes_list #print("Files LIST",data_object['general']['files_list']) return data_object def 
check_for_missing_values(self, data): missing_key = '' error = False warn = False for item in data: if item == 'general': for k,v in data[item].items(): if not k: #sys.exit("ERROR: key for: '"+v+"' is missing or corrupt - Exiting") logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing") warn=True if v == '': logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing") warn=True for item in data: if item != 'general': for k,v in data[item].items(): if not k: #sys.exit("ERROR: key for: '"+v+"' is missing or corrupt - Exiting") logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing") warn=True if not v: if (k == 'barcode' or k == 'adaptor'): #these could be empty logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing") else: logger.error("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing") error=True return (error,warn) def check_for_datasets(self,data): error = False warn=False for item in data: if item != 'general': #print('ds',data[item]['dataset']) if not data[item]['dataset']: #if 'dataset' not in data[item]: logger.error("Current dataset name is missing or corrupt - Exiting (key: "+item+")") error=True return (error,warn) def check_domain_suite_region(self,data): error = False warn=False for item in data: if item != 'general': primer_suite = self.convert_primer_suites(data[item]['primer_suite']) dna_region = self.convert_primer_suites(data[item]['dna_region']) # CHECK MUST MATCH: "Domain","Primer Suite","DNA Region" if primer_suite not in self.primer_suites: logger.error("Primer Suite not found: "+primer_suite+" - Exiting (key: "+item+")") error=True if dna_region not in self.dna_regions: logger.error("DNA Region not found: "+dna_region+" - Exiting (key: "+item+")") error=True if dna_region not in primer_suite: logger.error("DNA Region ("+dna_region+") not found in Primer Suite ("+primer_suite+") - Exiting (key: "+item+")") error=True return (error, warn) def convert_primer_suites(self, suite): import re if type(suite) is list: conv_suite = [re.sub(r'[_ -]', '', item.lower()) for item in suite] if type(suite) is str: conv_suite = re.sub(r'[_ -]', '', suite.lower()) # suite.lower().translate(None, '_- ') return conv_suite def check_project_name(self, data): """ # CHECK: project name format: 3 parts; end with Bv6,Ev9,Av6 or something similar """ error =False warn =False for item in data: if item != 'general': try: (a,b,c) = data[item]['project'].split('_') except: logger.error("project not in correct format: ") logger.error(data[item]['project']) logger.error(" - Exiting (key: ") logger.error(data[item]) error=True (a,b,c) = data[item]['project'].split('_') #if c[0] not in [i[0].upper() for i in domains]: # sys.exit("ERROR : Project suffix has incorrect/non-existant domain: "+c) # logger.error("c[1:] = ") # logger.error(c[1:]) # logger.error("c.lower() =") # logger.error(c.lower()) # logger.error("self.dna_regions") # logger.error(self.dna_regions ) if (c[1:].lower() not in self.dna_regions) and (c.lower() not in self.dna_regions): logger.error("Project suffix has incorrect DNA region: ") logger.error(c) logger.error(" - Exiting (key: ") logger.error(data[item]) error = True return (error, warn) def check_dataset_name(self,data): """ # CHECK: dataset name can be ONLY alphanumeric and underscore and cannot start with a number! 
""" error =False warn =False for item in data: if item != 'general': dataset_name = data[item]['dataset'] if not re.match("^[A-Za-z0-9_]*$", dataset_name): logger.error("Dataset name has illeagal character(s): "+dataset_name+" (must be alphanumeric and underscore only)") error = True #if re.match("^[0-9]", dataset_name): # logger.error("Dataset name cannot begin with a digit: "+dataset_name) # error = True return (error, warn) def get_my_conn(self): try: host = self.general_config_dict['database_host'] except: raise try: db = self.general_config_dict['database_name'] except: raise if self.utils.is_local(): host = 'localhost' db = "test_env454" self.my_conn = MyConnection(host = host, db = db) def check_projects_and_datasets(self, data): self.get_my_conn() project_dataset = {} projects = {} datasets = {} error =False warn =False for item in data: if item != 'general': #project_dataset[data[item]['project']+'--'+data[item]['dataset']] = 1 datasets[data[item]['dataset']] = data[item]['project'] projects[data[item]['project']] = 1 for p in projects: #print(p) my_sql = """SELECT project FROM project WHERE project = '%s'""" % (p) res = self.my_conn.execute_fetch_select(my_sql) if res: logger.warning("project '"+p+"' already exists in the database - is this okay?") warn = True else: logger.debug("project '"+p+"' is new") ds_found_count = 0 for d in datasets: if datasets[d] == p: #print("\t%s" % (d)) my_sql = """SELECT dataset FROM dataset WHERE dataset = '%s'""" % (d) res = self.my_conn.execute_fetch_select(my_sql) if res: ds_found_count += 1 if ds_found_count >3: logger.warning("\t\tPossibly more .... - Exiting after just three") break logger.warning("\tdataset '"+d+"' already exists in the database - is this okay?") warn=True else: logger.debug("\tdataset '"+d+"' is new") logger.debug("\tDataset Count: "+str(len(datasets))) return (error,warn) def get_confirmation(self, steps, general_data): print("\n") for item,value in general_data.items(): #print(len(value)) if type(value) != bool and len(value) > 80: tmp = value.split(',') print("%-20s = %s .. %s" % (item,tmp[0],tmp[-1])) else: print("%-20s = %-20s" % (item,value)) print("\nStep(s) to be performed: \033[1;36m",steps,'\033[0m') print("\n"+self.warn_msg+"\n") if 'validate' in steps.split(','): # print(we are done) sys.exit() if self.utils.is_local(): return 'c' else: return 'c' # return raw_input("\nDoes this look okay? 
(q to quit, v to view configFile, c to continue) ") def convert_csv_to_ini(self, new_ini_file): #print(self.args) from pipeline.get_ini import readCSV print('CSV path', self.general_config_dict['csvPath']) my_csv = readCSV(file_path = self.general_config_dict['csvPath']) content = my_csv.read_csv() headers = content[1].keys() headers_clean = [x.strip('"').replace(" ", "_").lower() for x in headers] projects = {} #print #print(content[1]) #print # get list of keys keys_list = [] if self.check_headers(headers_clean): logger.info("CSV headers okay") for k,values in content.items(): keys_list.append(values['barcode_index']+"_"+values['run_key']+"_"+values['lane']) fh = open(new_ini_file,'w') # general section fh.write("#\n#\tCreated by MBL Pipeline for run: "+self.general_config_dict['run']+" on "+self.general_config_dict['date']+"\n#\n\n") fh.write("[general]\n") fh.write("run = "+self.general_config_dict['run']+"\n") fh.write("configPath = "+new_ini_file+"\n") fh.write("configPath_orig = " + self.general_config_dict['configPath']+"\n") fh.write("platform = " + self.general_config_dict['platform']+"\n") fh.write("output_dir = " + os.path.dirname(new_ini_file)+"\n") #fh.write("output_dir = "+os.path.join(self.general_config_dict['baseoutputdir'],self.general_config_dict['run'])+"\n") if self.general_config_dict['platform'] in C.illumina_list: #fh.write("input_file_suffix = " + self.general_config_dict['input_file_suffix']+"\n") fh.write("input_file_format = " + self.general_config_dict['input_file_format']+"\n") fh.write("anchor_file = " + self.general_config_dict['anchor_file']+"\n") fh.write("primer_file = " + self.general_config_dict['primer_file']+"\n") fh.write("compressed = " + str(self.general_config_dict['compressed'])+"\n") fh.write("do_perfect = " + str(self.general_config_dict['do_perfect'])+"\n") fh.write("lane_name = " + str(self.general_config_dict['lane_name'])+"\n") fh.write("database_host = " + self.general_config_dict['database_host']+"\n") fh.write("database_name = " + self.general_config_dict['database_name']+"\n") fh.write("input_dir = " + self.general_config_dict['input_dir']+"\n") fh.write("require_distal = " + str(self.general_config_dict['require_distal'])+"\n") fh.write("use_cluster = " + str(self.general_config_dict['use_cluster'])+"\n") fh.write("date = " + str(datetime.date.today())+"\n") fh.write("site = " + self.general_config_dict['site']+"\n") fh.write("load_vamps_database = " + str(self.general_config_dict['load_vamps_database'])+"\n") fh.write("idx_keys = " +','.join(keys_list)+"\n") if 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != '': file_list = self.get_input_files() fh.write("input_files = " + ','.join(file_list)+"\n") else: fh.write("input_files = \n") #fh.write(getattr(args,'force_runkey', "")) for k, values in content.items(): fh.write("\n") if self.general_config_dict['platform'] in C.illumina_list: fh.write("["+values['barcode_index']+"_"+values['run_key']+"_"+values['lane']+"]\n") elif self.general_config_dict['platform'] == '454': fh.write("["+values['lane']+"_"+values['run_key']+"]\n") for v in values: if v == "env_sample_source": try: new_val = [str(j[0]) for j in self.env if j[1] == values[v]][0] except: text = """There was an error in env_sample_source. Please check your metadata. 
Possible values: ----------- air extreme habitat host associated human associated human-amniotic-fluid human-blood human-gut human-oral human-skin human-urine human-vaginal indoor microbial mat/biofilm miscellaneous_natural_or_artificial_environment plant associated sediment soil/sand unknown wastewater/sludge water-freshwater water-marine ----------- """ print(text) raise fh.write("env_sample_source_id = "+new_val+"\n") else: fh.write(v+" = "+values[v]+"\n") fh.close() return new_ini_file def save_ini_file(self,new_ini_file): # give it a new name out_fh = open(new_ini_file,'w') #for line in open(os.path.abspath(self.general_config_dict['configPath']),"r"): # out_fh.write(line) self.general_config_dict['configPath_original'] = self.general_config_dict['configPath'] self.general_config_dict['configPath'] = new_ini_file out_fh.write("#\n#\tCreated by MBL Pipeline for run: "+self.general_config_dict['run']+" on "+self.general_config_dict['date']+"\n#\n\n") out_fh.write("[general]\n") for item in self.general_config_dict: out_fh.write(item+" = "+str(self.general_config_dict[item]) + "\n") #out_fh.write("\n["+self.general_config_dict['platform']+"]\n") #for item in self.general_config_dict: # if item not in C.general_run_items: # out_fh.write(item+" = "+str(self.general_config_dict[item]) + "\n") if 'fasta_file' in self.general_config_dict and self.general_config_dict['fasta_file'] != '': (path,fasta) = os.path.split(self.general_config_dict['fasta_file']) if 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != path: sys.exit("Your input_dir and fasta_file directory don't agree - Exiting\n\t"+self.general_config_dict['input_dir']+" != "+self.general_config_dict['fasta_file']) out_fh.write("input_dir = "+path+"\n") out_fh.write("input_files = "+fasta+"\n") #out_fh.write("input_file_suffix = fasta\n") elif 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != '': file_list = self.get_input_files() out_fh.write("input_files = " + ','.join(file_list)+"\n") else: out_fh.write("input_files = \n") out_fh.close() def check_headers(self, headers): if self.general_config_dict['platform'] in C.illumina_list: pl = self.general_config_dict['platform'] known_header_list = self.known_header_list[pl] elif self.general_config_dict['platform'] == '454': known_header_list = self.known_header_list['454'] else: logger.error("in utils: check_headers - unknown platform") #print( sorted(known_header_list)) #print(sorted(headers)) self.res_headers = headers if "env_sample_source" in headers: self.env_source_to_id(headers) if sorted(known_header_list) != sorted(self.res_headers): print("=" * 40) print("csv file header problem") print("%-20s %-20s" % ("REQUIRED", "YOUR CSV")) for i in sorted(known_header_list): if i in headers: print("%-20s%-20s" % (i,i)) else: print("%-20s%-20s" % (i,"----------- <--- missing")) for i in headers: if i not in known_header_list: print("%-20s%-20s" % (" ",i+" <--- extra")) print("=" * 40) sys.exit("ERROR : unknown or missing headers\n") else: return True def env_source_to_id(self, headers): logger.error("self.utils.is_local() LLL2 metadata") logger.error(self.utils.is_local()) if self.utils.is_local(): self.my_conn = MyConnection(host = 'localhost', db="test_env454") else: self.my_conn = MyConnection(host='bpcdb1', db="env454") # self.my_conn = MyConnection() my_sql = """SELECT * FROM env_sample_source""" self.env = self.my_conn.execute_fetch_select(my_sql) self.res_headers = ["env_sample_source_id" if x=="env_sample_source" 
else x for x in headers] def configDictionaryFromFile_ini(self, config_file_path): import configparser configDict = {} user_config = configparser.ConfigParser() user_config.read(config_file_path) for section in user_config.sections(): section_dict = configDict[section] = {} for option in user_config.options(section): section_dict[option] = user_config.get(section,option) if section_dict[option] == 'True' or section_dict[option] == 'true': section_dict[option] = True elif section_dict[option] == 'False' or section_dict[option] == 'false': section_dict[option] = False return configDict def get_values(self, args, general_config_dict = {} ): collector={} for item in self.pipeline_run_items[args.platform]: # set collector[item] to the default first collector[item] = self.pipeline_run_items[args.platform][item] # now look for args (then ini) values to replace if item in args and getattr( args, item ) != None: collector[item] = getattr( args, item ) elif general_config_dict and item in general_config_dict[args.platform] and general_config_dict[args.platform][item] != '': collector[item] = general_config_dict[args.platform][item] # get all the items from general_config_dict['general'] if 'general' in general_config_dict: for item in general_config_dict['general']: collector[item] = general_config_dict['general'][item] return collector def validate_args(self): """ # THOUGHTS # vamps users # single project and dataset # Supply an ini file OR commandline (for web interface), but no csv file # # MBL pipeline # REQUIRE a csv file and a ini file """ collector={} if self.args.configPath: general_config_dict = self.configDictionaryFromFile_ini(self.args.configPath) if self.args.platform in general_config_dict and 'general' in general_config_dict: collector= self.get_values( self.args, general_config_dict) else: sys.exit("The ini file needs both a [general] and ["+ self.args.platform +"] section - Exiting.") else: # no configPath collector= self.get_values( self.args ) collector['current_db_host_name'] = self.utils.find_in_nested_dict(C.db_cnf, {'host': collector['database_host'], 'db': collector['database_name']}) if not collector['current_db_host_name']: sys.exit("""Please check -db_host and -db_name parameters, the current combination does not exist: 'db_host' = %s, 'db_name' = %s """ % (collector['database_host'], collector['database_name'])) if self.args.platform in C.illumina_list: print("Starting Illumina Pipeline") if not self.args.csvPath: sys.exit("illumina requires a csv file - Exiting") elif self.args.platform == 'vamps': print("Starting VAMPS Pipeline:") if 'project' not in collector or collector['project'] == '': collector['project'] = collector['project'][:1].capitalize() + collector['project'][1:] else: logger.debug("No project found in vamps pipeline") if self.args.fasta_file: collector['project'] = self.args.fasta_file collector['from_fasta'] = True elif self.args.platform == '454': print("Starting 454 Pipeline") elif self.args.platform == 'ion_torrent': print("Starting Ion Torrent Pipeline") else: sys.exit("Validate args: Unknown Platform") if self.args.configPath: collector['configPath'] = self.args.configPath else: collector['configPath'] = "" # these are all the bool items in the collector # they need to be converted from str to bool here for i in collector: if collector[i] == 'True' or collector[i] == 'true': collector[i] = True elif collector[i] == 'False' or collector[i] == 'false': collector[i] = False #collector['runcode'] = self.args.run collector['run'] = self.args.run 
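        # run, platform, input_dir (when given) and today's date come straight from the
        # command line and override anything collected from the ini or platform defaults above.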
        #collector['run_date'] = self.args.run
        #collector['steps'] = self.args.steps
        collector['platform'] = self.args.platform
        if self.args.input_dir:
            collector['input_dir'] = self.args.input_dir
        collector['date'] = str(datetime.date.today())
        #print(collector)
        return collector
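
# --- Hedged usage sketch (illustrative only) -----------------------------------
# The real entry point is pipeline-ui.py (see the MetadataUtils docstring); the
# block below merely smoke-tests the suite/region normalisation that
# convert_primer_suites() and check_domain_suite_region() rely on, using the
# same regex, so it runs without the constants module or a database. The
# example strings come from the commented-out CSV row earlier in this file.
if __name__ == '__main__':
    import re

    def _normalise(name):
        # same normalisation as convert_primer_suites(): lowercase, drop '_', '-' and spaces
        return re.sub(r'[_ -]', '', name.lower())

    primer_suite = _normalise("Bacterial v6 Suite")   # -> "bacterialv6suite"
    dna_region = _normalise("v6")                      # -> "v6"
    # check_domain_suite_region() accepts a region when it is a substring of
    # its (normalised) primer suite:
    assert dna_region in primer_suite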