def check_for_missing_values(self, data):
    missing_key = ''
    error = False
    warn = False
    for item in data:
        if item == 'general':
            for k, v in data[item].items():
                if not k:
                    #sys.exit("ERROR: key for: '"+v+"' is missing or corrupt - Exiting")
                    logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing")
                    warn = True
                if v == '':
                    logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing")
                    warn = True
    for item in data:
        if item != 'general':
            for k, v in data[item].items():
                if not k:
                    #sys.exit("ERROR: key for: '"+v+"' is missing or corrupt - Exiting")
                    logger.warning("(key: "+item+") key for: '"+v+"' is missing or corrupt - Continuing")
                    warn = True
                if not v:
                    if k == 'barcode' or k == 'adaptor':  # these could be empty
                        logger.warning("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing")
                    else:
                        logger.error("(key: "+item+") value of: '"+k+"' is missing or corrupt - Continuing")
                        error = True
    return (error, warn)
def write_status_to_vamps_db(site='vampsdev', id='0', status='Test', message=''):
    """
    This should be available to write status updates to vamps:vamps_upload_status.
    It is especially important for MoBeDAC uploads because the qiime site will 'see'
    and react to the message in the db.  <-- not true any longer 2014-02-01 AAV
    """
    import ConMySQL
    from pipeline.db_upload import MyConnection

    today = str(datetime.date.today())
    if site == 'vamps':
        db_host = 'vampsdb'
        db_name = 'vamps'
        db_home = '/xraid2-2/vampsweb/vamps/'
    else:
        db_host = 'bpcweb7'
        db_name = 'vamps'
        db_home = '/xraid2-2/vampsweb/vampsdev/'
    #obj = ConMySQL.New(db_host, db_name, db_home)
    #my_conn = MyConnection(db_host, db_name)
    obj = ConMySQL.New(db_host, db_name, db_home)
    conn = obj.get_conn()
    cursor = conn.cursor()
    query = "update vamps_upload_status set status='%s', status_message='%s', date='%s' where id='%s'" \
            % (status, message, today, id)
    try:
        cursor.execute(query)
        #print("executing", query)
    except:
        conn.rollback()
        logger.error("ERROR status update failed")
    else:
        conn.commit()
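# --- Illustrative sketch (not part of the pipeline) -------------------------------
# A typical status update as the uploader might record it; the run id and the
# message below are made-up example values.
def _example_write_status():
    write_status_to_vamps_db(site='vampsdev', id='1234',
                             status='GAST_SUCCESS',
                             message='Loading VAMPS Finished')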
def get_reference_databases(self, dna_region):
    # if dna_region == v6v4(a) change it to v4v6
    # other reverse regions?
    if dna_region == 'v6v4':
        dna_region = 'v4v6'
    if dna_region == 'v6v4a':
        dna_region = 'v4v6a'

    if C.use_full_length:
        if os.path.exists(os.path.join(self.refdb_dir, 'refssu.udb')):
            refdb = os.path.join(self.refdb_dir, 'refssu.udb')
            taxdb = os.path.join(self.refdb_dir, 'refssu.tax')
        elif os.path.exists(os.path.join(self.refdb_dir, 'refssu.fa')):
            refdb = os.path.join(self.refdb_dir, 'refssu.fa')
            taxdb = os.path.join(self.refdb_dir, 'refssu.tax')
    else:
        if os.path.exists(os.path.join(self.refdb_dir, C.refdbs[dna_region])):
            refdb = os.path.join(self.refdb_dir, C.refdbs[dna_region])
            taxdb = os.path.join(self.refdb_dir, 'ref'+dna_region+'.tax')
        elif os.path.exists(os.path.join(self.refdb_dir, 'ref'+dna_region+'.fa')):
            refdb = os.path.join(self.refdb_dir, 'ref'+dna_region+'.fa')
            taxdb = os.path.join(self.refdb_dir, 'ref'+dna_region+'.tax')
        elif os.path.exists(os.path.join(self.refdb_dir, 'refssu.udb')):
            refdb = os.path.join(self.refdb_dir, 'refssu.udb')
            taxdb = os.path.join(self.refdb_dir, 'refssu.tax')
        elif os.path.exists(os.path.join(self.refdb_dir, 'refssu.fa')):
            refdb = os.path.join(self.refdb_dir, 'refssu.fa')
            taxdb = os.path.join(self.refdb_dir, 'refssu.tax')
        else:
            # refdb was never assigned on this path, so report the region and directory instead
            logger.error("Could not find a reference database for region "+dna_region+" in "+self.refdb_dir+" Exiting")
            sys.exit()

    logger.info('tax_file '+taxdb)
    logger.info('ref_file '+refdb)
    return (refdb, taxdb)
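# --- Illustrative sketch (not part of the pipeline) -------------------------------
# Resolving the reference and taxonomy files for a reversed region name: 'v6v4' is
# normalized to 'v4v6' before the filesystem lookup. 'gast_obj' stands for an
# instance of the class this method belongs to and is a hypothetical name.
def _example_reference_lookup(gast_obj):
    refdb, taxdb = gast_obj.get_reference_databases('v6v4')
    logger.info('resolved refdb=%s taxdb=%s' % (refdb, taxdb))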
def check_headers(self, headers):
    if self.general_config_dict['platform'] in C.illumina_list:
        pl = self.general_config_dict['platform']
        known_header_list = self.known_header_list[pl]
    elif self.general_config_dict['platform'] == '454':
        known_header_list = self.known_header_list['454']
    else:
        logger.error("in utils: check_headers - unknown platform")
    #print(sorted(known_header_list))
    #print(sorted(headers))
    self.res_headers = headers
    if "env_sample_source" in headers:
        self.env_source_to_id(headers)
    if sorted(known_header_list) != sorted(self.res_headers):
        print("=" * 40)
        print("csv file header problem")
        print("%-20s %-20s" % ("REQUIRED", "YOUR CSV"))
        for i in sorted(known_header_list):
            if i in headers:
                print("%-20s%-20s" % (i, i))
            else:
                print("%-20s%-20s" % (i, "----------- <--- missing"))
        for i in headers:
            if i not in known_header_list:
                print("%-20s%-20s" % (" ", i+" <--- extra"))
        print("=" * 40)
        sys.exit("ERROR : unknown or missing headers\n")
    else:
        return True
def chmod_all(self, dir_name):
    try:
        call(['chmod', '-R', 'ug+w', dir_name])
    except Exception as e:
        # log the actual exception instance, not the Exception class
        logger.error("call(['chmod', '-R', 'ug+w', %s]) didn't work:" % dir_name)
        logger.error(e)
def write_seq_frequencies_in_file(self, out_file, fa_file_name, seq_in_file):
    try:
        with open(out_file, "a") as myfile:
            myfile.write(str(fa_file_name) + ": " + str(seq_in_file) + "\n")
    except Exception as e:
        # log the actual exception instance, not the Exception class
        logger.error(e)
def env_source_to_id(self, headers):
    # informational traces, not errors
    logger.debug("self.utils.is_local() LLL2 metadata")
    logger.debug(self.utils.is_local())
    if self.utils.is_local():
        self.my_conn = MyConnection(host='localhost', db="test_env454")
    else:
        self.my_conn = MyConnection(host='bpcdb1', db="env454")
    # self.my_conn = MyConnection()
    my_sql = """SELECT * FROM env_sample_source"""
    self.env = self.my_conn.execute_fetch_select(my_sql)
    self.res_headers = ["env_sample_source_id" if x == "env_sample_source" else x for x in headers]
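# --- Illustrative sketch (not part of the pipeline) -------------------------------
# Shows only the header rename performed at the end of env_source_to_id above; the
# header list is a made-up example and no database connection is needed here.
def _example_env_source_rename():
    headers = ['project', 'dataset', 'env_sample_source']
    renamed = ["env_sample_source_id" if x == "env_sample_source" else x for x in headers]
    assert renamed == ['project', 'dataset', 'env_sample_source_id']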
def check_for_datasets(self, data):
    error = False
    warn = False
    for item in data:
        if item != 'general':
            #print 'ds', data[item]['dataset']
            if not data[item]['dataset']:
            #if 'dataset' not in data[item]:
                logger.error("Current dataset name is missing or corrupt - Exiting (key: "+item+")")
                error = True
    return (error, warn)
def file_to_db_upload_seq(my_file_to_db_upload, filename, sequences):
    # for filename in filenames:
    insert_seq_time_start = time.time()
    try:
        logger.debug("\n----------------\nfilename = %s" % filename)
        my_file_to_db_upload.seq.insert_seq(sequences)
        insert_seq_time = (time.time() - insert_seq_time_start)
        logger.debug("insert_seq() took %s sec to finish" % insert_seq_time)
    except:  # catch everything
        logger.error("\r[pipelineprocessor] Unexpected:")  # handle unexpected exceptions
        logger.error(sys.exc_info()[0])  # info about curr exception (type, value, traceback)
        raise  # re-throw caught exception
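# --- Illustrative sketch (not part of the pipeline) -------------------------------
# How a caller might drive file_to_db_upload_seq over several files. 'uploader' and
# 'files_with_sequences' (an iterable of (filename, sequences) pairs) are
# hypothetical placeholders, not names used elsewhere in this module.
def _example_upload_sequences(uploader, files_with_sequences):
    for filename, sequences in files_with_sequences:
        file_to_db_upload_seq(uploader, filename, sequences)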
def file_to_db_upload_all_but_seq(my_file_to_db_upload, filename, no_run_info_list, full_upload):
    total_time = 0
    try:
        my_file_to_db_upload.get_gast_result(os.path.basename(filename))
        filename_base_no_suff = get_filename_base_no_suff(filename)
        run_info_ill_id = my_file_to_db_upload.get_run_info_ill_id(filename_base_no_suff)
        if run_info_ill_id:
            my_file_to_db_upload.collect_project_ids(run_info_ill_id)
            seq_in_file = len(my_file_to_db_upload.seq.fasta_dict)
            my_file_to_db_upload.put_seq_statistics_in_file(filename, seq_in_file)
            total_time += seq_in_file

            start_fasta_next = time.time()

            start_insert_pdr_info_time = 0
            start_insert_pdr_info_time = time.time()
            my_file_to_db_upload.insert_pdr_info(run_info_ill_id)
            insert_pdr_info_time = (time.time() - start_insert_pdr_info_time)

            start_insert_taxonomy_time = 0
            start_insert_taxonomy_time = time.time()
            my_file_to_db_upload.insert_taxonomy()
            insert_taxonomy_time = (time.time() - start_insert_taxonomy_time)

            insert_sequence_uniq_info_time = 0
            start_insert_sequence_uniq_info_time = time.time()
            my_file_to_db_upload.insert_sequence_uniq_info()
            insert_sequence_uniq_info_time = (time.time() - start_insert_sequence_uniq_info_time)

            logger.debug("start_fasta_loop took %s sec to finish" % (time.time() - start_fasta_next))
            logger.debug("insert_pdf_info_query_time took %s sec to finish" % insert_pdr_info_time)
            logger.debug("start_insert_taxonomy_upload_time took %s sec to finish" % insert_taxonomy_time)
            logger.debug("insert_sequence_uniq_info_time took %s sec to finish" % insert_sequence_uniq_info_time)

            return total_time
        else:
            utils = PipelneUtils()
            no_run_info_list.append(filename_base_no_suff)
            utils.print_both(
                "ERROR: There is no run info for %s yet, please check if it's in the csv and uploaded to the db" % filename)
            return 0
    except:  # catch everything
        logger.error("\r[pipelineprocessor] Unexpected:")  # handle unexpected exceptions
        logger.error(sys.exc_info()[0])  # info about curr exception (type, value, traceback)
        raise  # re-throw caught exception
def check_dataset_name(self, data):
    """
    # CHECK: dataset name can be ONLY alphanumeric and underscore
            and cannot start with a number!
    """
    error = False
    warn = False
    for item in data:
        if item != 'general':
            dataset_name = data[item]['dataset']
            if not re.match("^[A-Za-z0-9_]*$", dataset_name):
                logger.error("Dataset name has illegal character(s): "+dataset_name+" (must be alphanumeric and underscore only)")
                error = True
            #if re.match("^[0-9]", dataset_name):
            #    logger.error("Dataset name cannot begin with a digit: "+dataset_name)
            #    error = True
    return (error, warn)
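# --- Illustrative sketch (not part of the pipeline) -------------------------------
# Worked examples of the dataset-name rule enforced above; the sample names are
# made up. The leading-digit restriction is currently commented out in
# check_dataset_name, so a name such as "01_sample" still passes.
def _example_dataset_name_rule():
    import re
    assert re.match("^[A-Za-z0-9_]*$", "sample_01")      # accepted
    assert not re.match("^[A-Za-z0-9_]*$", "sample-01")  # rejected: hyphen
    assert re.match("^[A-Za-z0-9_]*$", "01_sample")      # accepted while the digit check is disabled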
def check_if_array_job_is_done(self, job_name):
    cluster_done = False
    check_qstat_cmd_line = "qstat -r | grep %s | wc -l" % job_name
    logger.debug("check_qstat_cmd_line = %s" % check_qstat_cmd_line)
    try:
        p = subprocess.Popen(check_qstat_cmd_line, stdout=subprocess.PIPE, shell=True)
        (output, err) = p.communicate()
        num_proc = int(output)
        logger.debug("qstat is running %s '%s' processes" % (num_proc, job_name))
        # pprint(p)
        if num_proc == 0:
            cluster_done = True
        # print("cluster_done from check_if_cluster_is_done = %s" % cluster_done)
    except:
        logger.error("%s can be done only on a cluster." % job_name)
        raise
    return cluster_done
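# --- Illustrative sketch (not part of the pipeline) -------------------------------
# A simple polling loop around the qstat check above. 'job_runner' stands for the
# object these methods live on; the job name and the 10-second interval are made-up
# example values.
def _example_wait_for_array_job(job_runner, job_name="gast_example", poll_seconds=10):
    import time
    while not job_runner.check_if_array_job_is_done(job_name):
        time.sleep(poll_seconds)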
def check_project_name(self, data):
    """
    # CHECK: project name format: 3 parts; end with Bv6, Ev9, Av6 or something similar
    """
    error = False
    warn = False
    for item in data:
        if item != 'general':
            try:
                (a, b, c) = data[item]['project'].split('_')
            except:
                # log the key name here; concatenating the dict itself would raise a TypeError,
                # and re-splitting after the failure would crash, so skip to the next item
                logger.error("project not in correct format: "+data[item]['project']+" - Exiting (key: "+item+")")
                error = True
                continue
            #if c[0] not in [i[0].upper() for i in domains]:
            #    sys.exit("ERROR : Project suffix has incorrect/non-existant domain: "+c)
            if (c[1:] not in self.dna_regions) and (c.lower() not in self.dna_regions):
                logger.error("Project suffix has incorrect DNA region: "+c+" - Exiting (key: "+item+")")
                error = True
    return (error, warn)
def gast2tax(self):
    for key in self.idx_keys:
        output_dir = os.path.join(self.basedir, key)
        gast_dir = os.path.join(output_dir, 'gast')
        if key in self.runobj.samples:
            dna_region = self.runobj.samples[key].dna_region
        else:
            dna_region = self.runobj.dna_region
        if not dna_region:
            logger.error("gast2tax: We have no DNA Region: Setting dna_region to 'unknown'")
            self.runobj.run_status_file_h.write("gast2tax: We have no DNA Region: Setting dna_region to 'unknown'")
            dna_region = 'unknown'
        (refdb, taxdb) = self.get_reference_databases(dna_region)
        #print tax_file

        max_distance = C.max_distance['default']
        if dna_region in C.max_distance:
            max_distance = C.max_distance[dna_region]

        unique_file = 'Not Found'
        names_file = 'Not Found'
        if self.runobj.platform == 'vamps':
            unique_file = os.path.join(output_dir, key+'.unique.fa')
            names_file = os.path.join(output_dir, key+'.names')
        elif self.runobj.platform == 'illumina':
            file_prefix = self.runobj.samples[key].file_prefix
            unique_file = os.path.join(self.input_dir, file_prefix+"-PERFECT_reads.fa.unique")
            names_file = os.path.join(self.input_dir, file_prefix+"-PERFECT_reads.fa.unique.names")
        else:
            pass

        #usearch_filename = os.path.join(self.gast_dir, "uc")
        #uc_results = self.parse_uclust(usearch_filename)
        #print uc_results

        ref_taxa = self.load_reftaxa(taxdb)

        names_file = os.path.join(output_dir, key+'.names')
        self.assign_taxonomy(gast_dir, dna_region, names_file, ref_taxa)

    return ("SUCCESS", "gast2tax")
def check_domain_suite_region(self, data):
    error = False
    warn = False
    for item in data:
        if item != 'general':
            # CHECK MUST MATCH: "Domain", "Primer Suite", "DNA Region"
            if data[item]['primer_suite'] not in self.primer_suites:
                logger.error("Primer Suite not found: "+data[item]['primer_suite']+" - Exiting (key: "+item+")")
                error = True
            #if dataset_items['domain'] not in domains:
            #    sys.exit("ERROR: Domain not found: "+dataset_items['domain'])
            if data[item]['dna_region'] not in self.dna_regions:
                logger.error("DNA Region not found: "+data[item]['dna_region']+" - Exiting (key: "+item+")")
                error = True
            # "Bacterial v6", "BacterialV6Suite", "v6"
            #if dataset_items['domain'][:6] != dataset_items['primer_suite'][:6]:
            #    sys.exit("ERROR: Domain ("+dataset_items['domain']+") -- Primer Suite ("+dataset_items['primer_suite']+") mismatch.")
            #if dataset_items['domain'][-2:].lower() != dataset_items['dna_region'].lower():
            #    sys.exit("ERROR: DNA Region ("+dataset_items['dna_region']+") -- Domain ("+dataset_items['domain']+") mismatch.")
            if data[item]['dna_region'] not in data[item]['primer_suite']:
                logger.error("DNA Region ("+data[item]['dna_region']+") not found in Primer Suite ("+data[item]['primer_suite']+") - Exiting (key: "+item+")")
                error = True
    return (error, warn)
def vampsupload(runobj):
    """
    Upload data files to VAMPS database
    """
    # for vamps 'new_lane_keys' will be prefix
    # of the uniques and names file
    # that was just created in vamps_gast.py
    # or we can get the 'lane_keys' directly from the config_file
    # for illumina:
    # a unique idx_key is a concatenation of barcode_index and run_key
    idx_keys = get_keys(runobj)

    # if(runobj.vamps_user_upload):
    #     idx_keys = [runobj.user+runobj.runcode]
    # else:
    #     idx_keys = convert_unicode_dictionary_to_str(json.loads(open(runobj.trim_status_file_name,"r").read()))["new_lane_keys"]

    # NOT NEEDED HERE: Find duplicate project names
    # if vamps user uploads this has already been done and this project is
    # already in vamps_upload_info table
    # if data from a csv file (illumina and 454) this also is not needed
    # as data is checked in metadata.py

    myvamps = Vamps(runobj, idx_keys)
    # Create files
    myvamps.create_vamps_files()
    # put files in db
    result_code = myvamps.load_vamps_db()

    if result_code[:5] == 'ERROR':
        logger.error("load_vamps_db failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db(runobj.site, runobj.run, "GAST_ERROR", result_code)
        sys.exit("load_vamps_db failed")
    elif runobj.vamps_user_upload:
        logger.debug("Finished loading VAMPS data. %s" % result_code)
        write_status_to_vamps_db(runobj.site, runobj.run, 'GAST_SUCCESS', 'Loading VAMPS Finished')
def gast(runobj):
    # for vamps 'new_lane_keys' will be prefix
    # of the uniques and names file
    # that was just created in vamps_gast.py
    # or we can get the 'lane_keys' directly from the config_file
    # for illumina:
    # a unique idx_key is a concatenation of barcode_index and run_key
    # Should return a list not a string
    idx_keys = get_keys(runobj)

    # get GAST object
    mygast = Gast(runobj, idx_keys)

    # Check for unique files and create them if not there
    result_code = mygast.check_for_uniques_files(idx_keys)
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code[0] == "ERROR":
        logger.error("uniques not found failed")
        sys.exit("uniques not found failed")
    sleep(5)

    # CLUSTERGAST
    result_code = mygast.clustergast()
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code[0] == "ERROR":
        logger.error("clustergast failed")
        sys.exit("clustergast failed")
    sleep(5)

    # GAST_CLEANUP
    result_code = mygast.gast_cleanup()
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code[0] == "ERROR":
        logger.error("gast_cleanup failed")
        sys.exit("gast_cleanup failed")
    sleep(5)

    # GAST2TAX
    result_code = mygast.gast2tax()
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code[0] == "ERROR":
        logger.error("gast2tax failed")
        sys.exit("gast2tax failed")
def extract_zipped_file(run_date, outdir, filename):
    """
    """
    # check if zipped
    assert os.path.isdir(outdir)
    archivename = os.path.join(outdir, run_date + '.zip')
    if zipfile.is_zipfile(archivename):
        zf = zipfile.ZipFile(archivename, 'r')
        try:
            data = zf.read(filename)
        except KeyError:
            logger.error('ERROR: Did not find %s in zip file' % filename)
        else:
            # a successful read is informational, not an error; the logger also
            # needs a single formatted message rather than extra positional args
            logger.info('%s :' % filename)
            logger.info(repr(data))
        zf.close()
    else:
        logger.error("No zipfile archive found: %s" % archivename)
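# --- Illustrative sketch (not part of the pipeline) -------------------------------
# Reading one member out of a run archive with the helper above; the run date,
# output directory and member name are made-up example values.
def _example_extract_run_archive():
    extract_zipped_file('20140201', '/tmp/out', 'run_info.csv')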
def check_domain_suite_region(self, data):
    error = False
    warn = False
    for item in data:
        if item != 'general':
            primer_suite = self.convert_primer_suites(data[item]['primer_suite'])
            dna_region = self.convert_primer_suites(data[item]['dna_region'])

            # CHECK MUST MATCH: "Domain", "Primer Suite", "DNA Region"
            if primer_suite not in self.primer_suites:
                logger.error("Primer Suite not found: "+primer_suite+" - Exiting (key: "+item+")")
                error = True
            if dna_region not in self.dna_regions:
                logger.error("DNA Region not found: "+dna_region+" - Exiting (key: "+item+")")
                error = True
            if dna_region not in primer_suite:
                logger.error("DNA Region ("+dna_region+") not found in Primer Suite ("+primer_suite+") - Exiting (key: "+item+")")
                error = True
    return (error, warn)
def gast_cleanup(self):
    """
    gast_cleanup - follows clustergast, explodes the data and copies to gast_concat and gast files
    """
    logger.info("Starting GAST Cleanup")
    self.runobj.run_status_file_h.write("Starting gast_cleanup\n")
    for key in self.idx_keys:
        output_dir = os.path.join(self.basedir, key)
        gast_dir = os.path.join(output_dir, 'gast')
        if key in self.runobj.samples:
            dna_region = self.runobj.samples[key].dna_region
        else:
            dna_region = self.runobj.dna_region
        if not dna_region:
            logger.error("gast_cleanup: We have no DNA Region: Setting dna_region to 'unknown'")
            self.runobj.run_status_file_h.write("gast_cleanup: We have no DNA Region: Setting dna_region to 'unknown'\n")
            dna_region = 'unknown'
        # find gast_dir
        # for vamps user upload
        # basedir is like avoorhis_3453211
        # and outdir is like avoorhis_3453211/2012-06-25
        # for MBL pipeline
        # basedir is like 1_AGTCG
        # and outdir is like 1_AGTCG/2012-06-25
        unique_file = 'Not Found'
        names_file = 'Not Found'
        if self.runobj.platform == 'vamps':
            unique_file = os.path.join(output_dir, key+'.unique.fa')
            names_file = os.path.join(output_dir, key+'.names')
        elif self.runobj.platform == 'illumina':
            file_prefix = self.runobj.samples[key].file_prefix
            unique_file = os.path.join(self.input_dir, file_prefix+"-PERFECT_reads.fa.unique")
            names_file = os.path.join(self.input_dir, file_prefix+"-PERFECT_reads.fa.unique.names")
        else:
            pass
        print 'UNIQUE FILE', unique_file
        #print 'names file', names_file

        if not os.path.exists(gast_dir):
            logger.error("Could not find gast directory: "+gast_dir+" Exiting")
            sys.exit()
        clustergast_filename_single = os.path.join(gast_dir, "gast"+dna_region)
        logger.debug('gast filesize:'+str(os.path.getsize(clustergast_filename_single)))
        gast_filename = os.path.join(gast_dir, "gast")
        gastconcat_filename = os.path.join(gast_dir, "gast_concat")
        #dupes_filename = os.path.join(gast_dir, "dupes")
        #nonhits_filename = os.path.join(gast_dir, "nonhits")
        copies = {}
        nonhits = {}
        # open and read names file
        names_fh = open(names_file, 'r')
        for line in names_fh:
            s = line.strip().split("\t")
            index_read = s[0]
            copies[index_read] = s[1].split(',')
            if index_read in nonhits:
                nonhits[index_read] += 1
            else:
                nonhits[index_read] = 1
        names_fh.close()
        #print nonhits
        #print copies

        #######################################
        #
        # Insert records with valid gast hits into gast_file
        #
        #######################################
        # read the .gast file from clustergast
        concat = {}
        gast_fh = open(gast_filename, 'w')
        if os.path.exists(clustergast_filename_single):
            in_gast_fh = open(clustergast_filename_single, 'r')
        else:
            print "No clustergast file found:", clustergast_filename_single, "\nExiting"
            # write() takes a single string, so build the message first
            self.runobj.run_status_file_h.write("No clustergast file found: "+clustergast_filename_single+" Exiting\n")
            sys.exit()

        for line in in_gast_fh:
            s = line.strip().split()
            if len(s) == 4:
                read_id = s[0]
                refhvr_id = s[1].split('|')[0]
                distance = s[2]
                alignment = s[3]
                #print read_id, refhvr_id
                # if this was in the gast table zero it out because it had a valid hit
                # so we don't insert them as non-hits later
                if read_id in nonhits:
                    del nonhits[read_id]
                    #print 'deleting', read_id
                #print 'nonhits', nonhits
                if read_id not in copies:
                    logger.info(read_id+' not in names file: Skipping')
                    continue
                # give the same ref and dist for each duplicate
                for id in copies[read_id]:
                    if id != read_id:
                        #print id, read_id, distance, refhvr_id
                        gast_fh.write(id + "\t" + refhvr_id + "\t" + distance + "\t" + alignment + "\n")
        in_gast_fh.close()

        #######################################
        #
        # Insert a record for any valid sequence that had no blast hit
        # and therefore no gast result into gast_filename
        #
        #######################################
        for read in sorted(nonhits.iterkeys()):
            for d in copies[read]:
                gast_fh.write(d+"\t0\t1\t\n")
        gast_fh.close()

        # concatenate the two gast files
        clustergast_fh = open(clustergast_filename_single, 'a')
        shutil.copyfileobj(open(gast_filename, 'rb'), clustergast_fh)
        clustergast_fh.close()

        # then open again and get data for gast concat
        concat = {}
        print clustergast_filename_single
        for line in open(clustergast_filename_single, 'r'):
            data = line.strip().split("\t")
            id = data[0]
            refhvr_id = data[1].split('|')[0]
            distance = data[2]
            #print 'data', data
            if id in concat:
                concat[id]['refhvrs'].append(refhvr_id)
            else:
                concat[id] = {}
                concat[id]['refhvrs'] = [refhvr_id]
            concat[id]['distance'] = distance

        #######################################
        #
        # Insert records into gast_concat_filename
        #
        #######################################
        # first we need to open the gast_filename
        gastconcat_fh = open(gastconcat_filename, 'w')
        for id, value in concat.iteritems():
            #print 'trying gastconcat', id, value
            gastconcat_fh.write(id + "\t" + concat[id]['distance'] + "\t" + ' '.join(concat[id]['refhvrs']) + "\n")
        gastconcat_fh.close()

    print "Finished gast_cleanup"
    logger.info("Finished gast_cleanup")
    return ("SUCCESS", "gast_cleanup")
def gather_files_per_key(self, key):
    file_collector = {}
    out_gast_dir = os.path.join(self.global_gast_dir, key)  # directory
    file_collector['gast_concat_file'] = os.path.join(out_gast_dir, 'gast_concat')
    file_collector['tagtax_file'] = os.path.join(out_gast_dir, 'tagtax_terse')
    if not os.path.exists(file_collector['gast_concat_file']):
        logger.warning("Could not find gast_concat_file file: "+file_collector['gast_concat_file'])
    if not os.path.exists(file_collector['tagtax_file']):
        logger.warning("Could not find tagtax_file file: "+file_collector['tagtax_file'])

    #print key, self.runobj.platform
    if self.runobj.vamps_user_upload:
        file_collector['unique_file'] = os.path.join(out_gast_dir, 'unique.fa')
        file_collector['original_fa_file'] = os.path.join(out_gast_dir, 'fasta.fa')
        if self.runobj.fasta_file:
            grep_cmd = ['grep', '-c', '>', self.runobj.fasta_file]
        else:
            grep_cmd = ['grep', '-c', '>', file_collector['unique_file']]
    else:
        if self.runobj.platform == 'illumina':
            #unique_file = os.path.join(self.basedir, C.gast_dir, 'unique.fa')
            reads_dir = dirs.check_dir(dirs.reads_overlap_dir)
            file_prefix = self.runobj.samples[key].file_prefix
            file_collector['unique_file'] = os.path.join(reads_dir, file_prefix+"-PERFECT_reads.fa.unique")
            # ANNA What is the correct file here:
            file_collector['original_fa_file'] = os.path.join(reads_dir, file_prefix+"-PERFECT_reads.fa.unique")
            grep_cmd = ['grep', '-c', '>', file_collector['unique_file']]
        elif self.runobj.platform == '454':
            pass
        else:
            sys.exit("no usable platform found")
        if not os.path.exists(file_collector['unique_file']):
            logger.error("Could not find unique_file: "+file_collector['unique_file'])

    # get dataset_count here from unique_file
    # the dataset_count should be from the non-unique file
    # but if we don't have that must use uniques
    try:
        dataset_count = subprocess.check_output(grep_cmd).strip()
    except:
        dataset_count = 0
    print key, ": Sequence Count", dataset_count

    # output files to be created:
    file_collector['taxes_file'] = os.path.join(out_gast_dir, 'vamps_data_cube_uploads.txt')
    file_collector['summed_taxes_file'] = os.path.join(out_gast_dir, 'vamps_junk_data_cube_pipe.txt')
    file_collector['distinct_taxes_file'] = os.path.join(out_gast_dir, 'vamps_taxonomy_pipe.txt')
    file_collector['sequences_file'] = os.path.join(out_gast_dir, 'vamps_sequences_pipe.txt')
    file_collector['export_file'] = os.path.join(out_gast_dir, 'vamps_export_pipe.txt')
    file_collector['projects_datasets_file'] = os.path.join(out_gast_dir, 'vamps_projects_datasets_pipe.txt')
    file_collector['project_info_file'] = os.path.join(out_gast_dir, 'vamps_projects_info_pipe.txt')

    return (file_collector, dataset_count, out_gast_dir)
def check_project_name(self, data):
    """
    # CHECK: project name format: 3 parts; end with Bv6, Ev9, Av6 or something similar
    """
    error = False
    warn = False
    for item in data:
        if item != 'general':
            try:
                (a, b, c) = data[item]['project'].split('_')
            except:
                # one consolidated message; log the key name rather than the whole dict,
                # and skip this item instead of re-splitting the same bad value
                logger.error("project not in correct format: "+data[item]['project']+" - Exiting (key: "+item+")")
                error = True
                continue
            #if c[0] not in [i[0].upper() for i in domains]:
            #    sys.exit("ERROR : Project suffix has incorrect/non-existant domain: "+c)
            # logger.error("c[1:] = ")
            # logger.error(c[1:])
            # logger.error("c.lower() =")
            # logger.error(c.lower())
            # logger.error("self.dna_regions")
            # logger.error(self.dna_regions)
            if (c[1:].lower() not in self.dna_regions) and (c.lower() not in self.dna_regions):
                logger.error("Project suffix has incorrect DNA region: "+c+" - Exiting (key: "+item+")")
                error = True
    return (error, warn)
def gast(runobj):
    logger.info("STARTING GAST()")
    # logger.info("vsearch version: " % utils.get_vsearch_version)
    # for vamps 'new_lane_keys' will be prefix
    # of the uniques and names file
    # that was just created in vamps_gast.py
    # or we can get the 'lane_keys' directly from the config_file
    # for illumina:
    # a unique idx_key is a concatenation of barcode_index and run_key
    # Should return a list not a string
    idx_keys = get_keys(runobj)

    # get GAST object
    mygast = Gast(runobj, idx_keys)

    # Check for unique files and create them if not there
    result_code = mygast.check_for_unique_files(idx_keys)
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code['status'] == 'ERROR':
        logger.error("uniques not found failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db(runobj.site, runobj.run, "GAST ERROR", "uniques file not found - failed")
        sys.exit("uniques not found failed")
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db(runobj.site, runobj.run, result_code['status'], result_code['message'])
    sleep(5)

    # CLUSTERGAST
    result_code = mygast.clustergast()
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code['status'] == 'ERROR':
        logger.error("clustergast failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db(runobj.site, runobj.run, "GAST ERROR", "clustergast failed")
        sys.exit("clustergast failed")
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db(runobj.site, runobj.run, result_code['status'], result_code['message'])
    sleep(5)

    # GAST_CLEANUP
    result_code = mygast.gast_cleanup()
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code['status'] == 'ERROR':
        logger.error("gast_cleanup failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db(runobj.site, runobj.run, "GAST ERROR", "gast_cleanup failed")
        sys.exit("gast_cleanup failed")
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db(runobj.site, runobj.run, result_code['status'], result_code['message'])
    sleep(5)

    # GAST2TAX
    result_code = mygast.gast2tax()
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code['status'] == 'ERROR':
        logger.error("gast2tax failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db(runobj.site, runobj.run, "GAST ERROR", "gast2tax failed")
        sys.exit("gast2tax failed")
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db(runobj.site, runobj.run, result_code['status'], result_code['message'])
def clustergast(self):
    """
    clustergast - runs the GAST pipeline on the cluster.
    GAST uses UClust to identify the best matches of a read sequence
    to reference sequences in a reference database.
    VAMPS: The uniques and names files have previously been created in trim_run.py.
    Illumina :
    """
    logger.info("Starting Clustergast")
    self.runobj.run_status_file_h.write("Starting clustergast\n")
    # Step1: create empty gast table in database: gast_<rundate>
    # Step2: Count the number of sequences so the job can be split for nodes
    # $facount = `grep -c \">\" $fasta_uniqs_filename`;
    # $calcs = `/bioware/seqinfo/bin/calcnodes -t $facount -n $nodes -f 1`;
    # /bioware/seqinfo/bin/fastasampler -n $start,$end ${gastDir}/${fasta_uniqs_filename} $tmp_fasta_filename
    # $usearch_binary --global --query $tmp_fasta_filename --iddef 3 --gapopen 6I/1E --db $refhvr_fa --uc $tmp_usearch_filename --maxaccepts $max_accepts --maxrejects $max_rejects --id $pctid_threshold
    #
    # sort the results for valid hits, saving only the ids and pct identity
    # grep -P \"^H\\t\" $tmp_usearch_filename | sed -e 's/|.*\$//' | awk '{print \$9 \"\\t\" \$4 \"\\t\" \$10 \"\\t\" \$8}' | sort -k1,1b -k2,2gr | clustergast_tophit > $gast_filename
    # Submit the script
    # /usr/local/sge/bin/lx24-amd64/qsub $qsub_priority $script_filename

    calcnodes = C.calcnodes_cmd
    sqlImportCommand = C.mysqlimport_cmd
    #qsub = '/usr/local/sge/bin/lx24-amd64/qsub'
    clusterize = C.clusterize_cmd

    ###################################################################
    # use fasta.uniques file
    # split into smaller files
    # usearch --cluster each

    #######################################
    #
    # Split the uniques fasta and run UClust per node
    #
    #######################################
    qsub_prefix = 'clustergast_sub_'
    gast_prefix = 'gast_'
    if self.use_cluster:
        logger.info("Using cluster for clustergast")
    else:
        logger.info("Not using cluster")

    counter = 0
    for key in self.idx_keys:
        print key
        counter += 1
        print "\nFile:", str(counter)
        if counter >= self.limit:
            pass
        cluster_nodes = C.cluster_nodes
        logger.info("Cluster nodes set to: "+str(cluster_nodes))
        output_dir = os.path.join(self.basedir, key)
        gast_dir = os.path.join(output_dir, 'gast')
        # SMPL1_3_NNNNCGCTC_3
        #print 'samples', key, self.runobj.samples
        if key in self.runobj.samples:
            dna_region = self.runobj.samples[key].dna_region
        else:
            dna_region = self.runobj.dna_region
        if not dna_region:
            logger.error("clustergast: We have no DNA Region: Setting dna_region to 'unknown'")
            dna_region = 'unknown'
        (refdb, taxdb) = self.get_reference_databases(dna_region)
        #print 'DBs', refdb, taxdb

        # if no dna_region OR no refdb can be found then use refssu
        # if refdb contains refssu
        # then add this to grep command
        # and change usearch to usearch64
        unique_file = 'Not Found'
        names_file = 'Not Found'
        if self.runobj.platform == 'vamps':
            unique_file = os.path.join(output_dir, key+'.unique.fa')
        elif self.runobj.platform == 'illumina':
            file_prefix = self.runobj.samples[key].file_prefix
            unique_file = os.path.join(self.input_dir, file_prefix+"-PERFECT_reads.fa.unique")
            names_file = os.path.join(self.input_dir, file_prefix+"-PERFECT_reads.fa.unique.names")
        else:
            pass
        print 'UNIQUE FILE', unique_file
        #print gast_dir
        #sys.exit("EXIT")

        i = 0
        if cluster_nodes:
            grep_cmd = ['grep', '-c', '>', unique_file]
            logger.debug(' '.join(grep_cmd))
            facount = subprocess.check_output(grep_cmd).strip()
            logger.debug(key+' count '+facount)
            calcnode_cmd = [calcnodes, '-t', str(facount), '-n', str(cluster_nodes), '-f', '1']
            calcout = subprocess.check_output(calcnode_cmd).strip()
            logger.debug("calcout:\n"+calcout)
            #calcout:
            # node=1 start=1 end=1 rows=1
            # node=2 start=2 end=2 rows=1
            # node=3 start=3 end=3 rows=1
            lines = calcout.split("\n")
            gast_file_list = []
            for line in lines:
                i += 1
                if i >= cluster_nodes:
                    continue
                script_filename = os.path.join(gast_dir, qsub_prefix + str(i))
                gast_filename = os.path.join(gast_dir, gast_prefix + str(i))
                fastasamp_filename = os.path.join(gast_dir, 'samp_' + str(i))
                clustergast_filename = os.path.join(gast_dir, key+".gast_" + str(i))
                gast_file_list.append(clustergast_filename)
                usearch_filename = os.path.join(gast_dir, "uc_" + str(i))
                log_file = os.path.join(gast_dir, 'clustergast.log_' + str(i))

                data = line.split()
                if len(data) < 2:
                    continue
                start = data[1].split('=')[1]
                end = data[2].split('=')[1]

                if self.use_cluster:
                    fh = open(script_filename, 'w')
                    qstat_name = "gast" + key + '_' + self.runobj.run + "_" + str(i)
                    fh.write("#!/bin/csh\n")
                    fh.write("#$ -j y\n")
                    fh.write("#$ -o " + log_file + "\n")
                    fh.write("#$ -N " + qstat_name + "\n\n")
                    #fh.write("source /xraid/bioware/Modules/etc/profile.modules\n");
                    #fh.write("module load bioware\n\n");
                    # setup environment
                    fh.write("source /xraid/bioware/Modules/etc/profile.modules\n")
                    fh.write("module load bioware\n\n")

                cmd1 = self.get_fastasampler_cmd(unique_file, fastasamp_filename, start, end)
                logger.debug("fastasampler command: "+cmd1)
                if self.use_cluster:
                    fh.write(cmd1 + "\n")
                else:
                    subprocess.call(cmd1, shell=True)

                cmd2 = self.get_usearch_cmd(fastasamp_filename, refdb, usearch_filename)
                logger.debug("usearch command: "+cmd2)
                print 'usearch', cmd2
                if self.use_cluster:
                    fh.write(cmd2 + "\n")
                else:
                    subprocess.call(cmd2, shell=True)

                cmd3 = self.get_grep_cmd(usearch_filename, clustergast_filename)
                logger.debug("grep command: "+cmd3)
                if self.use_cluster:
                    fh.write(cmd3 + "\n")
                    fh.close()
                    # make script executable and run it
                    os.chmod(script_filename, stat.S_IRWXU)
                    qsub_cmd = clusterize + " " + script_filename
                    # on vamps and vampsdev qsub cannot be run - unless you call it from the
                    # cluster aware directories /xraid2-2/vampsweb/vamps and /xraid2-2/vampsweb/vampsdev
                    qsub_cmd = C.qsub_cmd + " " + script_filename
                    logger.debug("qsub command: "+qsub_cmd)
                    #subprocess.call(qsub_cmd, shell=True)
                    proc = subprocess.Popen(qsub_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    # proc.communicate will block - probably not what we want
                    #(stdout, stderr) = proc.communicate()   # block the last one here
                    #print stderr, stdout
                else:
                    subprocess.call(cmd3, shell=True)
                    print cmd3
        else:
            #fastasamp_filename = os.path.join(gast_dir, 'samp')
            usearch_filename = os.path.join(gast_dir, "uc")
            clustergast_filename_single = os.path.join(gast_dir, "gast"+dna_region)
            print usearch_filename, clustergast_filename_single
            cmd1 = self.get_usearch_cmd(unique_file, refdb, usearch_filename)
            print cmd1
            subprocess.call(cmd1, shell=True)
            cmd2 = self.get_grep_cmd(usearch_filename, clustergast_filename_single)
            print cmd2
            subprocess.call(cmd2, shell=True)

        if self.use_cluster:
            # wait here for all the clustergast scripts to finish
            temp_file_list = gast_file_list
            c = False
            maxwaittime = C.maxwaittime  # seconds
            sleeptime = C.sleeptime      # seconds
            counter = 0
            while c == False:
                counter += 1
                if counter >= maxwaittime / sleeptime:
                    raise Exception("Max wait time exceeded in gast.py")
                for index, file in enumerate(temp_file_list):
                    #print temp_file_list
                    if os.path.exists(file) and os.path.getsize(file) > 0:
                        # remove from tmp list
                        logger.debug("Found file now removing from list: "+file)
                        temp_file_list = temp_file_list[:index] + temp_file_list[index+1:]
                if temp_file_list:
                    logger.info("waiting for clustergast files to fill...")
                    logger.debug(' '.join(temp_file_list))
                    logger.info("\ttime: "+str(counter * sleeptime)+" | files left: "+str(len(temp_file_list)))
                    time.sleep(sleeptime)
                else:
                    c = True

        # now concatenate all the clustergast_files into one file (if they were split)
        if cluster_nodes:
            # gast file
            clustergast_filename_single = os.path.join(gast_dir, "gast"+dna_region)
            clustergast_fh = open(clustergast_filename_single, 'w')
            # have to turn off cluster above to be able to 'find' these files for concatenation
            for n in range(1, i-1):
                #cmd = "cat "+ gast_dir + key+".gast_" + str(n) + " >> " + gast_dir + key+".gast"
                file = os.path.join(gast_dir, key+".gast_" + str(n))
                if os.path.exists(file):
                    shutil.copyfileobj(open(file, 'rb'), clustergast_fh)
                else:
                    logger.info("Could not find file: "+os.path.basename(file)+" Skipping")
            clustergast_fh.flush()
            clustergast_fh.close()

        if not self.test:
            # remove tmp files
            for n in range(i+1):
                #print "Trying to remove "+os.path.join(gast_dir, "uc_"+str(n))
                if os.path.exists(os.path.join(gast_dir, "uc_"+str(n))):
                    os.remove(os.path.join(gast_dir, "uc_"+str(n)))
                #print "Trying to remove "+os.path.join(gast_dir, "samp_"+str(n))
                if os.path.exists(os.path.join(gast_dir, "samp_"+str(n))):
                    os.remove(os.path.join(gast_dir, "samp_"+str(n)))
                #print "Trying to remove "+os.path.join(self.gast_dir, key+".gast_"+str(n))
                if os.path.exists(os.path.join(gast_dir, key+".gast_"+str(n))):
                    os.remove(os.path.join(gast_dir, key+".gast_"+str(n)))

    print "Finished clustergast"
    logger.info("Finished clustergast")
    return ("SUCCESS", "Clustergast")
def validate_illumina_ini(self, analysis_dir):
    """
    The csv headers are checked earlier
    """
    print("Validating ini type Config File (may have been converted from csv)")
    new_ini_file = os.path.join(analysis_dir, self.general_config_dict['run'] + '.ini')
    print("New ini file location: "+new_ini_file)
    return_code = False
    error_code = False
    warn_code = False
    msg = ''
    error = False
    warn = False
    #print('configpath', self.general_config_dict['configPath'])

    # configPath here is the new configPath
    self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath'])

    (error_code, warn_code) = self.check_for_missing_values(self.data_object)
    if error_code: error = True
    if warn_code: warn = True
    (error_code, warn_code) = self.check_for_datasets(self.data_object)
    if error_code: error = True
    if warn_code: warn = True
    (error_code, warn_code) = self.check_domain_suite_region(self.data_object)
    if error_code: error = True
    if warn_code: warn = True
    (error_code, warn_code) = self.check_project_name(self.data_object)
    if error_code: error = True
    if warn_code: warn = True
    (error_code, warn_code) = self.check_dataset_name(self.data_object)
    if error_code: error = True
    if warn_code: warn = True
    (error_code, warn_code) = self.check_projects_and_datasets(self.data_object)
    if error_code: error = True
    if warn_code: warn = True

    #print(self.data_object['input_dir'])
    #print(self.data_object['input_files'])

    if 'input_dir' not in self.data_object['general'] and 'input_files' not in self.data_object['general']:
        logger.warning("No input directory and no input files")
        warn = True
    elif not os.path.isdir(self.data_object['general']['input_dir']):
        logger.error("That is not a directory: "+self.data_object['general']['input_dir'])
        error = True
    elif self.data_object['general']['input_file_format'] == 'fastq' \
            and self.data_object['general']['platform'] in C.illumina_list:
        file_exists = False
        # if 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir']:
        for dirname, dirnames, filenames in os.walk(self.data_object['general']['input_dir']):
            # if not filenames:
            for file_name in filenames:
                if os.path.isfile(os.path.join(dirname, file_name)):
                    file_exists = True
                    break
        if not file_exists:
            logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir'])
            error = True
    elif 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir'] \
            and ('input_files' not in self.data_object['general'] or not self.data_object['general']['input_files']):
        logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir'])
        error = True

    if error:
        sys.exit("""\n\t\033[91mTHERE WERE SEVERE PROBLEMS WITH THE CSV and/or CONFIG FILE - EXITING
        PLEASE CORRECT THEM AND START OVER.\033[0m\n
        To view the errors add ' --loglevel info' to the command line.\n""")
    elif warn:
        msg = """\n\t\033[93mTHERE WERE NON-FATAL PROBLEMS WITH THE CSV and/or CONFIG FILE THAT MAY OR MAY NOT CAUSE PROBLEMS.\033[0m\n
        To view the warnings add ' --loglevel warning' to the command line.\n"""
        print("\033[92mCSV File Passed Validation! (with warnings)\033[0m")
    else:
        print("\033[92mCSV File Passed Validation!\033[0m")

    return msg