def projects(self, key, dataset_count, file_collector):
    """
    fill vamps_projects_datasets.txt file
    """
    logger.info("Starting vamps_upload: projects_datasets")
    if self.runobj.vamps_user_upload:
        project = self.runobj.project
        dataset = key
    else:
        if self.runobj.platform == 'illumina':
            project = self.runobj.samples[key].project
            dataset = self.runobj.samples[key].dataset
        elif self.runobj.platform == '454':
            pass
        else:
            pass
    project = project[0].capitalize() + project[1:]
    project_dataset = project + '--' + dataset
    date_trimmed = 'unknown'
    dataset_description = dataset
    dataset_count = str(dataset_count)
    has_tax = '1'  # true

    fh = open(file_collector['projects_datasets_file'], 'w')
    fh.write("\t".join(["HEADER", "project", "dataset", "dataset_count", "has_tax",
                        "date_trimmed", "dataset_info"]) + "\n")
    fh.write("\t" + "\t".join([project, dataset, dataset_count, has_tax,
                               date_trimmed, dataset_description]) + "\n")
    fh.close()
    logger.info("Finishing VAMPS projects()")
def chimera_checking(self, ref_or_novo):
    chimera_region_found = False
    output = {}
    for idx_key in self.input_file_names:
        # print "idx_key, self.input_file_names[idx_key] = %s, %s" % (idx_key, self.input_file_names)
        input_file_name  = os.path.join(self.indir,  self.input_file_names[idx_key] + self.chg_suffix)
        output_file_name = os.path.join(self.outdir, self.input_file_names[idx_key])
        dna_region = self.runobj.samples[idx_key].dna_region
        # print "dna_region = %s" % dna_region
        if dna_region in C.regions_to_chimera_check:
            chimera_region_found = True
        else:
            logger.debug('region not checked: ' + dna_region)
            continue

        # print "input_file_name = %s \noutput_file_name = %s" % (input_file_name, output_file_name)
        ref_db = self.get_ref_db(dna_region)
        # print "dna_region = %s; ref_db = %s; ref_or_novo = %s" % (dna_region, ref_db, ref_or_novo)
        uchime_cmd = self.create_chimera_cmd(input_file_name, output_file_name, ref_or_novo, ref_db)
        print "\n==================\n%s command: %s" % (ref_or_novo, uchime_cmd)

        try:
            logger.info("chimera checking command: " + str(uchime_cmd))
            output[idx_key] = subprocess.Popen(uchime_cmd, shell=True,
                                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except OSError, e:
            print "Problems with this command: %s" % (uchime_cmd)
            if self.utils.is_local():
                print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e)
            else:
                print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e)
            raise
def get_reference_databases(self, dna_region):
    # if dna region == v6v4(a) change it to v4v6
    # other reverse regions?
    if dna_region == 'v6v4':
        dna_region = 'v4v6'
    if dna_region == 'v6v4a':
        dna_region = 'v4v6a'

    if C.use_full_length:
        if os.path.exists(os.path.join(self.refdb_dir, 'refssu.udb')):
            refdb = os.path.join(self.refdb_dir, 'refssu.udb')
            taxdb = os.path.join(self.refdb_dir, 'refssu.tax')
        elif os.path.exists(os.path.join(self.refdb_dir, 'refssu.fa')):
            refdb = os.path.join(self.refdb_dir, 'refssu.fa')
            taxdb = os.path.join(self.refdb_dir, 'refssu.tax')
    else:
        if os.path.exists(os.path.join(self.refdb_dir, C.refdbs[dna_region])):
            refdb = os.path.join(self.refdb_dir, C.refdbs[dna_region])
            taxdb = os.path.join(self.refdb_dir, 'ref' + dna_region + '.tax')
        elif os.path.exists(os.path.join(self.refdb_dir, 'ref' + dna_region + '.fa')):
            refdb = os.path.join(self.refdb_dir, 'ref' + dna_region + '.fa')
            taxdb = os.path.join(self.refdb_dir, 'ref' + dna_region + '.tax')
        elif os.path.exists(os.path.join(self.refdb_dir, 'refssu.udb')):
            refdb = os.path.join(self.refdb_dir, 'refssu.udb')
            taxdb = os.path.join(self.refdb_dir, 'refssu.tax')
        elif os.path.exists(os.path.join(self.refdb_dir, 'refssu.fa')):
            refdb = os.path.join(self.refdb_dir, 'refssu.fa')
            taxdb = os.path.join(self.refdb_dir, 'refssu.tax')
        else:
            # report the directory searched; refdb is not bound on this path
            logger.error("Could not find a reference database in " + self.refdb_dir + " Exiting")
            sys.exit()

    logger.info('tax_file ' + taxdb)
    logger.info('ref_file ' + refdb)
    return (refdb, taxdb)
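
# A minimal sketch of the same lookup order as get_reference_databases(), assuming the
# refdb_dir layout used above (refssu.udb / refssu.fa / ref<region> files plus matching
# .tax files). resolve_refdb and its candidate list are illustrative refactoring only,
# not part of the pipeline's API, and approximate rather than reproduce C.refdbs.
import os

def resolve_refdb(refdb_dir, dna_region, use_full_length=False):
    # normalize reversed region names, mirroring the checks above
    dna_region = {'v6v4': 'v4v6', 'v6v4a': 'v4v6a'}.get(dna_region, dna_region)
    if use_full_length:
        candidates = ['refssu.udb', 'refssu.fa']
    else:
        candidates = ['ref' + dna_region + '.udb', 'ref' + dna_region + '.fa',
                      'refssu.udb', 'refssu.fa']
    for name in candidates:
        path = os.path.join(refdb_dir, name)
        if os.path.exists(path):
            # the taxonomy file shares the basename convention, with a .tax suffix
            base = 'refssu' if name.startswith('refssu') else 'ref' + dna_region
            return path, os.path.join(refdb_dir, base + '.tax')
    raise IOError("no reference database found in " + refdb_dir)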
def info(self, lane_keys):
    """
    fill vamps_project_info table
    """
    logger.info("Starting vamps_upload: projects_info")

    if self.runobj.site == 'vamps':
        db_host = 'vampsdb'
        db_name = 'vamps'
    else:
        db_host = 'vampsdev'
        db_name = 'vamps'
    myconn = MyConnection(host=db_host, db=db_name)

    query = "SELECT last_name,first_name,email,institution from vamps_auth where user='%s'" % (self.runobj.user)
    data = myconn.execute_fetch_select(query)

    fh = open(self.projects_info_file, 'w')
    title = "title"
    description = 'description'
    contact = data[0][1] + ' ' + data[0][0]
    email = data[0][2]
    institution = data[0][3]
    user = self.runobj.user
    fh.write("\t".join(["HEADER", "project", "title", "description", "contact",
                        "email", "institution", "user", "env_source_id"]) + "\n")
    fh.write("\t".join(["0", self.project, title, description, contact, email,
                        institution, user, self.runobj.env_source_id]) + "\n")
    # if this project already exists in the db???
    # the next step should update the table rather than add new to the db
    fh.close()
    logger.info("Finishing VAMPS info()")
def exports(self, lane_keys):
    """
    fill vamps_exports table
    """
    logger.info("Starting vamps_upload: exports")
    print "TODO: upload_vamps 5- exports"
    logger.info("Finishing VAMPS exports()")
def new_vamps(runobj):
    """
    """
    logger.info("STARTING NEW_VAMPS()")
    idx_keys = get_keys(runobj)
    myvamps = Vamps(runobj, idx_keys)
    myvamps.create_vamps_files()
def chimera_reference(self, lane_keys):
    chimera_region_found = False
    output = {}
    cluster_id_list = []
    for lane_key in lane_keys:
        dna_region = self.run.samples[lane_key].dna_region
        if dna_region in C.regions_to_chimera_check:
            chimera_region_found = True
        else:
            logger.debug('region not checked: ' + dna_region)
            continue

        out_fileName = self.prefix[lane_key] + ".chimeras.db"

        # which ref db to use?
        ref_db = ''
        if dna_region.upper() == 'ITS':
            logger.debug("got an ITS dna region so using refdb: " + self.its_refdb)
            ref_db = self.its_refdb
        else:
            logger.debug("using standard refdb: " + self.refdb)
            ref_db = self.refdb

        uchime_cmd = ["clusterize"]
        uchime_cmd.append(self.usearch_cmd)
        uchime_cmd.append("--uchime")
        uchime_cmd.append(self.files[lane_key]['abund'])
        uchime_cmd.append("--uchimeout")
        uchime_cmd.append(out_fileName)
        uchime_cmd.append("--db")
        uchime_cmd.append(ref_db)

        try:
            logger.info("chimera reference command: " + str(uchime_cmd))
            output[lane_key] = subprocess.check_output(uchime_cmd)
            #print 'outsplit',output[lane_key].split()[2]
            cluster_id_list.append(output[lane_key].split()[2])
            #print 'Have %d bytes in output' % len(output)
            #print 'ref',lane_key,output,len(output)
            if len(output[lane_key]) < 50 and len(output[lane_key]) > 40:
                logger.debug(lane_key + " uchime ref seems to have been submitted successfully")
            else:
                print >>sys.stderr, "uchime ref may be broke"
        except OSError, e:
            print >>sys.stderr, "Execution failed:", e
def projects(self, lane_keys):
    """
    fill vamps_projects_datasets table
    """
    logger.info("Starting vamps_upload: projects_datasets")

    date_trimmed = 'unknown'
    dataset_description = self.dataset
    dataset_count = str(self.dataset_count)
    has_tax = '1'  # true

    fh = open(self.projects_datasets_file, 'w')
    fh.write("\t".join(["HEADER", "project", "dataset", "dataset_count", "has_tax",
                        "date_trimmed", "dataset_info"]) + "\n")
    fh.write("\t".join(["0", self.project, self.dataset, dataset_count, has_tax,
                        date_trimmed, dataset_description]) + "\n")
    fh.close()
    logger.info("Finishing VAMPS projects()")
def info(self, key, file_collector):
    """
    fill vamps_project_info.txt file
    """
    logger.info("Starting vamps_upload: projects_info")
    print "Starting vamps_upload: projects_info"
    if self.runobj.vamps_user_upload:
        user = self.runobj.user
        project = self.runobj.project
        sample_source_id = self.runobj.env_source_id
    else:
        if self.runobj.platform == 'illumina':
            user = self.runobj.samples[key].data_owner
            project = self.runobj.samples[key].project
            sample_source_id = self.runobj.samples[key].env_sample_source_id
        elif self.runobj.platform == '454':
            pass
        else:
            pass
    project = project[0].capitalize() + project[1:]

    cursor = self.conn.cursor()
    query = "SELECT last_name,first_name,email,institution from vamps_auth where user='%s'" % (user)
    #data = myconn.execute_fetch_select(query)
    cursor.execute(query)
    data = cursor.fetchone()

    fh = open(file_collector['project_info_file'], 'w')
    title = "title"
    description = 'description'
    contact = data[1] + ' ' + data[0]
    email = data[2]
    institution = data[3]
    fh.write("\t".join(["HEADER", "project", "title", "description", "contact",
                        "email", "institution", "user", "env_source_id"]) + "\n")
    fh.write("\t" + "\t".join([project, title, description, contact, email,
                               institution, user, sample_source_id]) + "\n")
    # if this project already exists in the db???
    # the next step should update the table rather than add new to the db
    fh.close()
    self.conn.commit()
    cursor.close()
    logger.info("Finishing VAMPS info()")
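
# The query above interpolates `user` directly into the SQL string. A small sketch of
# the same lookup using DB-API parameter binding instead (this is how MySQLdb/pymysql
# cursors accept parameters); fetch_contact and the conn argument are illustrative
# stand-ins, not the pipeline's own interface.
def fetch_contact(conn, user):
    cursor = conn.cursor()
    cursor.execute(
        "SELECT last_name, first_name, email, institution "
        "FROM vamps_auth WHERE user = %s", (user,))
    row = cursor.fetchone()   # e.g. ('Smith', 'Jane', 'jane@example.org', 'MBL')
    cursor.close()
    return row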
def chimera_denovo(self, lane_keys):
    chimera_region_found = False
    output = {}
    cluster_id_list = []
    for lane_key in lane_keys:
        dna_region = self.run.samples[lane_key].dna_region
        if dna_region in C.regions_to_chimera_check:
            chimera_region_found = True
        else:
            logger.debug('region not checked: ' + dna_region)
            continue

        out_fileName = self.prefix[lane_key] + ".chimeras.txt"

        #clusterize uchime454 -replace -r self.rundate -t chimeras_denovo
        uchime_cmd = ["clusterize"]
        uchime_cmd.append(self.usearch_cmd)
        uchime_cmd.append("--uchime")
        uchime_cmd.append(self.files[lane_key]['abund'])
        uchime_cmd.append("--uchimeout")
        uchime_cmd.append(out_fileName)
        uchime_cmd.append("--abskew")
        uchime_cmd.append(self.abskew)

        try:
            logger.info("chimera denovo command: " + str(uchime_cmd))
            output[lane_key] = subprocess.check_output(uchime_cmd)
            #print output[lane_key]
            #print output[lane_key].split()[2]
            cluster_id_list.append(output[lane_key].split()[2])
            #print 'Have %d bytes in output' % len(output)
            #print 'denovo',lane_key,output,len(output)
            # len(output) is normally = 47
            if len(output[lane_key]) < 50 and len(output[lane_key]) > 40:
                logger.debug(lane_key + " uchime denovo seems to have been submitted successfully")
            else:
                logger.debug("uchime denovo may have broken")
        except OSError, e:
            print >>sys.stderr, "Execution failed:", e
def check_for_input_files(self, data_object):
    file_count = 0
    files_list = []
    imports_list = []
    lanes_list = []

    #input_dir = os.path.join(data_object['general']['input_dir'],"fasta")
    input_dir = data_object['general']['input_dir']
    if os.path.isdir(input_dir):
        p = data_object['general']['input_dir'], '*' + data_object['general']['input_file_suffix']

        for infile in glob.glob(os.path.join(input_dir, '*' + data_object['general']['input_file_suffix'])):
            files_list.append(os.path.basename(infile))
            for x in data_object:
                if 'file_prefix' in data_object[x]:
                    pass
                    #print data_object[x]['file_prefix']
                    #if os.path.basename(infile).split('-')[0] == data_object[x]['file_prefix']:
                    #    lanes_list.append(data_object[x]['lane'])
            file_count += 1
    else:
        logger.info("No input directory or directory permissions problem: " + input_dir)
        print "No input directory or directory permissions problem: " + input_dir

    if not file_count:
        #sys.exit("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'")
        logger.info("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'")

    data_object['general']['files_list'] = files_list
    data_object['general']['file_count'] = file_count
    # all the files in an illumina directory should be the same type
    #data_object['general']['file_formats_list'] = [data_object['general']["input_file_format"]] * file_count
    #data_object['general']['lanes_list'] = lanes_list
    #print "Files LIST",data_object['general']['files_list']

    return data_object
def check_for_input_files(self, data_object):
    file_count = 0
    files_list = []
    imports_list = []
    lanes_list = []

    #input_dir = os.path.join(data_object['general']['input_dir'],"fasta")
    input_dir = data_object['general']['input_dir']
    if os.path.isdir(input_dir):
        p = data_object['general']['input_dir'], '*' + data_object['general']['input_file_suffix']

        for infile in glob.glob(os.path.join(input_dir, '*' + data_object['general']['input_file_suffix'])):
            files_list.append(os.path.basename(infile))
            for x in data_object:
                if 'file_prefix' in data_object[x]:
                    pass
                    #print(data_object[x]['file_prefix'])
                    #if os.path.basename(infile).split('-')[0] == data_object[x]['file_prefix']:
                    #    lanes_list.append(data_object[x]['lane'])
            file_count += 1
    else:
        logger.info("No input directory or directory permissions problem: " + input_dir)
        print("No input directory or directory permissions problem: " + input_dir)

    if not file_count:
        #sys.exit("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'")
        logger.info("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+data_object['general']['input_file_suffix']+"'")

    data_object['general']['files_list'] = files_list
    data_object['general']['file_count'] = file_count
    # all the files in an illumina directory should be the same type
    #data_object['general']['file_formats_list'] = [data_object['general']["input_file_format"]] * file_count
    #data_object['general']['lanes_list'] = lanes_list
    #print("Files LIST",data_object['general']['files_list'])

    return data_object
def check_for_uniques_files(self, keys):
    logger.info("Checking for uniques file")
    if self.runobj.platform == 'vamps':
        # one fasta file or (one project and dataset from db)
        if os.path.exists(self.runobj.fasta_file):
            output_dir = os.path.join(self.basedir, keys[0])
            uniques_file = os.path.join(output_dir, keys[0] + '.unique.fa')
            names_file = os.path.join(output_dir, keys[0] + '.names')
            #import pipeline.fastaunique as fu
            #mothur_cmd = C.mothur_cmd+" \"#unique.seqs(fasta="+self.runobj.fasta_file+", outputdir="+os.path.join(self.basedir,keys[0])+"/);\"";
            fastaunique_cmd = C.fastaunique_cmd + " -x -i " + self.runobj.fasta_file + " -o " + uniques_file + " -n " + names_file
            print fastaunique_cmd
            #mothur_cmd = site_base+"/clusterize_vamps -site vampsdev -rd "+user+"_"+runcode+"_gast -rc "+runcode+" -u "+user+" /bioware/mothur/mothur \"#unique.seqs(fasta="+fasta_file+");\"";
            subprocess.call(fastaunique_cmd, shell=True)
            #shutil.move('a.txt', 'b.kml')
            #os.rename(filename, filename[7:])
            #os.rename(filename, filename[7:])
        else:
            if self.runobj.project and self.runobj.dataset:
                pass
            else:
                pass
            #get from database
    else:
        pass
        # for key in keys:
        #     fasta_file = ""
        #     output_dir = os.path.join(self.basedir,key)
        #     unique_file = os.path.join(output_dir, key+'.unique.fa')
        #     if not os.path.exists(unique_file):
        #         mothur_cmd = C.mothur_cmd+" \"#unique.seqs(fasta="+fasta_file+", outputdir="+os.path.join(self.basedir,key)+"/);\"";
        #
        #         #mothur_cmd = site_base+"/clusterize_vamps -site vampsdev -rd "+user+"_"+runcode+"_gast -rc "+runcode+" -u "+user+" /bioware/mothur/mothur \"#unique.seqs(fasta="+fasta_file+");\"";
        #         subprocess.call(mothur_cmd, shell=True)

    return ("SUCCESS", "check for uniques")
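
# A sketch of the same fastaunique invocation built as an argument list, so the file
# paths need no shell quoting and shell=True can be dropped. C.fastaunique_cmd and the
# -x/-i/-o/-n flags are taken from the string above and are assumptions about that tool.
import subprocess

def run_fastaunique(fastaunique_cmd, fasta_file, uniques_file, names_file):
    cmd = [fastaunique_cmd, "-x",
           "-i", fasta_file,
           "-o", uniques_file,
           "-n", names_file]
    # check_call raises CalledProcessError on a non-zero exit status
    subprocess.check_call(cmd)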
def chimera_denovo(self): chimera_region_found = False output = {} cluster_id_list = [] for idx_key in self.idx_keys: input_file_name = os.path.join(self.indir, idx_key +'.abund.fa') if os.path.isfile(input_file_name): output_file_name = os.path.join(self.outdir, idx_key +'.chimera.denovo') #open(output_file_name, 'a').close() # make sure file exists log_file = os.path.join(self.outdir,idx_key+".denovo.log") dna_region = self.runobj.samples[idx_key].dna_region logger.debug("dna_region = %s" % dna_region) if self.runobj.vamps_user_upload: # VAMPS users can chimera check regardless of region chosen chimera_region_found = True else: if dna_region in C.regions_to_chimera_check: chimera_region_found = True else: logger.debug('region not checked: ' + dna_region) continue self.utils.print_both("input_file_name = %s \noutput_file_name = %s" % (input_file_name, output_file_name)) # uchime_cmd = C.clusterize_cmd # uchime_cmd += " " # uchime_cmd += self.usearch_cmd # uchime_cmd += " --uchime " # uchime_cmd += input_file_name # uchime_cmd += " --uchimeout " # uchime_cmd += output_file_name # uchime_cmd += " --abskew " # uchime_cmd += self.abskew uchime_cmd='' if self.use_cluster: uchime_cmd += C.clusterize_cmd uchime_cmd += " " uchime_cmd += " -log " uchime_cmd += log_file uchime_cmd += " " uchime_cmd += self.usearch_cmd uchime_cmd += " -uchime_denovo " uchime_cmd += input_file_name uchime_cmd += " -uchimeout " uchime_cmd += output_file_name logger.debug("uchime_denovo_cmd = %s" % (uchime_cmd)) try: logger.info("chimera denovo command: " + str(uchime_cmd)) # subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) #output[idx_key] = subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output[idx_key] = subprocess.check_output(uchime_cmd, shell=True) self.utils.print_both("output[idx_key] = %s" % output[idx_key]) self.utils.print_both(output[idx_key].split()[2]) cluster_id_list.append(output[idx_key].split()[2]) except OSError, e: self.utils.print_both("Problems with this command: %s" % (uchime_cmd)) if self.utils.is_local(): print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e) else: print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e) self.utils.print_both("Execution of %s failed: %s" % (uchime_cmd, e)) raise
def chimera(runobj): chimera_cluster_ids = [] logger.debug("Starting Chimera Checker") # lets read the trim status file out here and keep those details out of the Chimera code idx_keys = get_keys(runobj) #new_lane_keys = convert_unicode_dictionary_to_str(json.loads(open(runobj.trim_status_file_name,"r").read()))["new_lane_keys"] # Open run STATUS File here. # open in append mode because we may start the run in the middle # say at the gast stage and don't want to over write. # if we re-run trimming we'll get two trim status reports runobj.run_status_file_h = open(runobj.run_status_file_name, "a") mychimera = Chimera(runobj) logger.debug("\nStarting DeNovo Chimera") c_den = mychimera.chimera_denovo() logger.debug("Ending DeNovo Chimera") if c_den[0] == 'SUCCESS': chimera_cluster_ids += c_den[2] # add a list to a list logger.debug("chimera_cluster_ids: "+' '.join(chimera_cluster_ids)) chimera_code='PASS' elif c_den[0] == 'NOREGION': chimera_code='NOREGION' elif c_den[0] == 'FAIL': chimera_code = 'FAIL' else: chimera_code='FAIL' logger.debug("Chimera DeNovo Code: "+chimera_code) logger.debug("\nStarting Reference Chimera") c_ref = mychimera.chimera_reference() if c_ref[0] == 'SUCCESS': chimera_cluster_ids += c_ref[2] chimera_code='PASS' elif c_ref[0] == 'NOREGION': chimera_code = 'NOREGION' elif c_ref[0] == 'FAIL': chimera_code='FAIL' else: chimera_code='FAIL' #print chimera_cluster_ids runobj.chimera_status_file_h = open(runobj.chimera_status_file_name,"w") if chimera_code == 'PASS': if runobj.use_cluster: chimera_cluster_code = wait_for_cluster_to_finish(chimera_cluster_ids) if chimera_cluster_code[0] == 'SUCCESS': logger.info("Chimera checking finished successfully") runobj.chimera_status_file_h.write("CHIMERA SUCCESS\n") runobj.run_status_file_h.write("CHIMERA SUCCESS\n") else: logger.info("3-Chimera checking Failed") runobj.chimera_status_file_h.write("3-CHIMERA ERROR: "+str(chimera_cluster_code[1])+" "+str(chimera_cluster_code[2])+"\n") runobj.run_status_file_h.write("3-CHIMERA ERROR: "+str(chimera_cluster_code[1])+" "+str(chimera_cluster_code[2])+"\n") sys.exit("3-Chimera checking Failed") else: chimera_cluster_code = ['SUCCESS','Not using cluster'] logger.info("Chimera checking finished without using cluster") runobj.chimera_status_file_h.write("CHIMERA SUCCESS--no cluster\n") runobj.run_status_file_h.write("CHIMERA SUCCESS--no cluster\n") elif chimera_code == 'NOREGION': logger.info("No regions found that need chimera checking") runobj.chimera_status_file_h.write("CHIMERA CHECK NOT NEEDED\n") runobj.run_status_file_h.write("CHIMERA CHECK NOT NEEDED\n") elif chimera_code == 'FAIL': logger.info("1-Chimera checking Failed") runobj.chimera_status_file_h.write("1-CHIMERA ERROR: \n") runobj.run_status_file_h.write("1-CHIMERA ERROR: \n") sys.exit("1-Chimera Failed") else: logger.info("2-Chimera checking Failed") runobj.chimera_status_file_h.write("2-CHIMERA ERROR: \n") runobj.run_status_file_h.write("2-CHIMERA ERROR: \n") sys.exit("2-Chimera checking Failed") sleep(2) if chimera_code == 'PASS' and chimera_cluster_code[0] == 'SUCCESS': logger.info("Writing Chimeras to deleted files") mychimera.write_chimeras_to_deleted_file() # should also recreate fasta # then read chimera files and place (or replace) any chimeric read_id # into the deleted file. 
        mymblutils = MBLPipelineFastaUtils(idx_keys, runobj)

        # write new cleaned files that remove chimera if appropriate
        # these are in fasta_mbl_pipeline.py
        # the cleaned files are renamed to the original name:
        # lane_key.unique.fa
        # lane_key.trimmed.fa
        # lane_key.names --
        # lane_key.abund.fa -- this file is for the uclust chimera script
        # lane_key.deleted.txt -- no change in this file
        # THE ORDER IS IMPORTANT HERE:
        mymblutils.write_clean_fasta_file()
        mymblutils.write_clean_names_file()
        mymblutils.write_clean_uniques_file()
        mymblutils.write_clean_abundance_file()
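
# The if/elif ladders in chimera() above map the first element of the (status, message,
# cluster_ids) tuples returned by chimera_denovo()/chimera_reference() onto a
# chimera_code. A compact sketch of that mapping (illustrative only, not the
# pipeline's own helper):
def chimera_code_from(status_tuple):
    status = status_tuple[0]
    return {'SUCCESS': 'PASS', 'NOREGION': 'NOREGION', 'FAIL': 'FAIL'}.get(status, 'FAIL')

# usage sketch: chimera_code = chimera_code_from(c_den); on success the job ids are in c_den[2]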
def exports(self, key, refid_collector, tax_collector, read_id_lookup, file_collector):
    """
    fill vamps_exports.txt file
    """
    logger.info("Starting vamps_upload: exports")
    print "Starting vamps_upload: exports"

    if self.runobj.vamps_user_upload:
        project = self.runobj.project
        dataset = key
    else:
        if self.runobj.platform == 'illumina':
            project = self.runobj.samples[key].project
            dataset = self.runobj.samples[key].dataset
        elif self.runobj.platform == '454':
            pass
        else:
            pass
    project = project[0].capitalize() + project[1:]
    project_dataset = project + '--' + dataset
    date_trimmed = 'unknown'
    dataset_description = dataset

    fh = open(file_collector['export_file'], 'w')
    # t.read_id, t.project, t.dataset, g.refhvr_ids, x.distance, x.taxonomy, t.sequence, x.rank, t.entry_date
    fh.write("\t".join(["HEADER", "read_id", "project", "dataset", "refhvr_ids", "distance",
                        "taxonomy", "sequence", "rank", "entry_date"]) + "\n")
    today = str(datetime.date.today())

    # open original fasta file
    if os.path.exists(file_collector['original_fa_file']) and os.path.getsize(file_collector['original_fa_file']) > 0:
        f = FastaReader(file_collector['original_fa_file'])
        while f.next():
            datarow = ['']
            id = f.id.split('|')[0]
            seq = f.seq
            if id in read_id_lookup:
                tax = read_id_lookup[id]
            else:
                tax = ''
            if tax in tax_collector:
                rank = tax_collector[tax]['rank']
            else:
                rank = 'NA'
            if id in refid_collector:
                distance = refid_collector[id]['distance']
                refhvr_ids = refid_collector[id]['refhvr_ids']
            else:
                distance = '1.0'
                refhvr_ids = '0'

            datarow.append(id)
            datarow.append(project)
            datarow.append(dataset)
            datarow.append(refhvr_ids)
            datarow.append(distance)
            datarow.append(tax)
            datarow.append(seq)
            datarow.append(rank)
            datarow.append(today)

            w = "\t".join(datarow)
            #print 'w',w
            fh.write(w + "\n")
    fh.close()
    logger.info("Finishing VAMPS exports()")
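
# The export file above is a plain tab-separated table. A small sketch of the same row
# writing using the csv module, which handles joining and line endings; the column list
# mirrors the HEADER line written above, and write_export_rows is illustrative only.
import csv

def write_export_rows(path, rows):
    # rows: iterables of [read_id, project, dataset, refhvr_ids, distance,
    #                     taxonomy, sequence, rank, entry_date]
    out = open(path, 'w')
    writer = csv.writer(out, delimiter='\t', lineterminator='\n')
    writer.writerow(["HEADER", "read_id", "project", "dataset", "refhvr_ids",
                     "distance", "taxonomy", "sequence", "rank", "entry_date"])
    for row in rows:
        writer.writerow([''] + list(row))  # leading empty field, as in datarow = ['']
    out.close()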
def chimera_denovo(self): chimera_region_found = False output = {} cluster_id_list = [] for idx_key in self.idx_keys: input_file_name = os.path.join(self.indir, idx_key + '.abund.fa') if os.path.isfile(input_file_name): output_file_name = os.path.join(self.outdir, idx_key + '.chimera.denovo') #open(output_file_name, 'a').close() # make sure file exists log_file = os.path.join(self.outdir, idx_key + ".denovo.log") dna_region = self.runobj.samples[idx_key].dna_region logger.debug("dna_region = %s" % dna_region) if self.runobj.vamps_user_upload: # VAMPS users can chimera check regardless of region chosen chimera_region_found = True else: if dna_region in C.regions_to_chimera_check: chimera_region_found = True else: logger.debug('region not checked: ' + dna_region) continue self.utils.print_both( "input_file_name = %s \noutput_file_name = %s" % (input_file_name, output_file_name)) # uchime_cmd = C.clusterize_cmd # uchime_cmd += " " # uchime_cmd += self.usearch_cmd # uchime_cmd += " --uchime " # uchime_cmd += input_file_name # uchime_cmd += " --uchimeout " # uchime_cmd += output_file_name # uchime_cmd += " --abskew " # uchime_cmd += self.abskew uchime_cmd = '' if self.use_cluster: uchime_cmd += C.clusterize_cmd uchime_cmd += " " uchime_cmd += " -log " uchime_cmd += log_file uchime_cmd += " " uchime_cmd += self.usearch_cmd uchime_cmd += " -uchime_denovo " uchime_cmd += input_file_name uchime_cmd += " -uchimeout " uchime_cmd += output_file_name logger.debug("uchime_denovo_cmd = %s" % (uchime_cmd)) try: logger.info("chimera denovo command: " + str(uchime_cmd)) # subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) self.utils.print_both("chimera denovo command: " + str(uchime_cmd)) #output[idx_key] = subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output[idx_key] = subprocess.check_output(uchime_cmd, shell=True) self.utils.print_both("chimera denovo result: " + str(output[idx_key])) #self.utils.print_both("output[idx_key] = %s" % output[idx_key]) #if idx_key in output and len(output[idx_key].split()) > 1: #self.utils.print_both(output[idx_key].split()[2]) items = output[idx_key].split() if len(items) > 2: cluster_id_list.append(items[2]) except OSError: e = sys.exc_info()[1] self.utils.print_both( "Error: Problems with this command: %s" % (uchime_cmd)) if self.utils.is_local(): print >> sys.stderr, "Error: Execution of %s failed: %s" % ( uchime_cmd, e) else: print >> sys.stderr, "Error: Execution of %s failed: %s" % ( uchime_cmd, e) self.utils.print_both( "Error: Execution of %s failed: %s" % (uchime_cmd, e)) raise # ??? if not chimera_region_found: return ('NOREGION', 'No regions found that need checking', '') # ??? # for idx_key in output: # if len(output[idx_key]) > 50 or len(output[idx_key]) < 40: # return ('ERROR','uchime ref may have broken or empty', idx_key) # finally self.utils.print_both('Finished Chimera Denovo') if cluster_id_list: return ('SUCCESS', 'uchime ref seems to have been submitted successfully', cluster_id_list) else: return ('ERROR', 'uchime ref returned no cluster IDs', cluster_id_list)
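
# A sketch of the uchime_denovo command line above assembled from a list of parts
# instead of repeated string concatenation. The clusterize wrapper, -log option, and
# usearch flag names are the ones used above and are assumptions about those tools;
# build_uchime_denovo_cmd itself is illustrative, not the pipeline's API.
def build_uchime_denovo_cmd(clusterize_cmd, usearch_cmd, in_fa, out_chimera, log_file, use_cluster=True):
    parts = []
    if use_cluster:
        # submit through the cluster wrapper and capture its log
        parts += [clusterize_cmd, "-log", log_file]
    parts += [usearch_cmd, "-uchime_denovo", in_fa, "-uchimeout", out_chimera]
    return " ".join(parts)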
def assign_taxonomy(self, gast_dir, dna_region, names_file, ref_taxa): from pipeline.taxonomy import Taxonomy,consensus #results = uc_results results = {} # open gast_file to get results tagtax_terse_filename = os.path.join(gast_dir,"tagtax_terse") tagtax_long_filename = os.path.join(gast_dir,"tagtax_long") tagtax_terse_fh = open(tagtax_terse_filename,'w') tagtax_long_fh = open(tagtax_long_filename,'w') tagtax_long_fh.write("\t".join(["read_id","taxonomy","distance","rank","refssu_count","vote","minrank","taxa_counts","max_pcts","na_pcts","refhvr_ids"])+"\n") gast_file = os.path.join(gast_dir, "gast"+dna_region) if not os.path.exists(gast_file): logger.info("Could not find gast file: "+gast_file) sys.exit("Could not find gast file: "+gast_file) for line in open(gast_file,'r'): # must split on tab because last field may be empty and must be maintained as blank data=line.strip().split("\t") if len(data) == 3: data.append("") # 0=id, 1=ref, 2=dist, 3=align results[data[0]]=[data[1].split('|')[0],data[2],data[3]] for read in results: #print read, results[read] pass for line in open(names_file,'r'): data=line.strip().split("\t") dupes = data[1].split(",") read = data[0] taxObjects = [] distance =0 refs_for ={} #print 'read',read if read not in results: results[read]=["Unknown", '1', "NA", '0', '0', "NA", "0;0;0;0;0;0;0;0", "0;0;0;0;0;0;0;0", "100;100;100;100;100;100;100;100"] refs_for[read] = [ "NA" ] else: #print 'read in res',read, results[read] for i in range( 0,len(results[read])): #for resultread in results[read]: #print results[read] ref = results[read][0] if ref in ref_taxa: for tax in ref_taxa[ref]: for t in tax: taxObjects.append(Taxonomy(t)) else: pass if read in refs_for: if results[read][0] not in refs_for[read]: refs_for[read].append(results[read][0]) else: refs_for[read] = [results[read][0]] # should all be the same distance distance = results[read][1] #Lookup the consensus taxonomy for the array taxReturn = consensus(taxObjects, C.majority) # 0=taxObj, 1=winning vote, 2=minrank, 3=rankCounts, 4=maxPcts, 5=naPcts; taxon = taxReturn[0].taxstring() rank = taxReturn[0].depth() #print read,taxon,rank,taxReturn[0],taxReturn[1] if not taxon: taxon = "Unknown" # (taxonomy, distance, rank, refssu_count, vote, minrank, taxa_counts, max_pcts, na_pcts) results[read] = [ taxon, str(distance), rank, str(len(taxObjects)), str(taxReturn[1]), taxReturn[2], taxReturn[3], taxReturn[4], taxReturn[5] ] #print "\t".join([read,taxon, str(distance), rank, str(len(taxObjects)), str(taxReturn[1]), taxReturn[2], taxReturn[3], taxReturn[4], taxReturn[5]]) + "\n" #read_id taxonomy distance rank refssu_count vote minrank taxa_counts max_pcts na_pcts refhvr_ids #D4ZHLFP1:25:B022DACXX:3:1101:12919:40734 1:N:0:TGACCA|frequency:162 Bacteria;Proteobacteria;Gammaproteobacteria 0.117 class 2 100 genus 1;1;1;2;2;2;0;0 100;100;100;50;50;50;0;0 0;0;0;0;0;0;100;100 v6_CI671 #D4ZHLFP1:25:B022DACXX:3:1101:10432:76870 1:N:0:TGACCA|frequency:105 Bacteria;Proteobacteria;Gammaproteobacteria 0.017 class 1 100 class 1;1;1;0;0;0;0;0 100;100;100;0;0;0;0;0 0;0;0;100;100;100;100;100 v6_BW306 # Replace hash with final taxonomy results, for each copy of the sequence for d in dupes: # print OUT join("\t", $d, @{$results{$read}}, join(",", sort @{$refs_for{$read}})) . 
"\n"; tagtax_long_fh.write( d+"\t"+"\t".join(results[read])+"\t"+','.join(sorted(refs_for[read])) + "\n") tagtax_terse_fh.write(d+"\t"+results[read][0]+"\t"+results[read][2]+"\t"+results[read][3]+"\t"+','.join(sorted(refs_for[read]))+"\t"+results[read][1]+"\n") tagtax_terse_fh.close() return results
def chimera(runobj): chimera_cluster_ids = [] logger.debug("Starting Chimera Checker") # lets read the trim status file out here and keep those details out of the Chimera code idx_keys = get_keys(runobj) #new_lane_keys = convert_unicode_dictionary_to_str(json.loads(open(runobj.trim_status_file_name,"r").read()))["new_lane_keys"] mychimera = Chimera(runobj) c_den = mychimera.chimera_denovo(idx_keys) if c_den[0] == 'SUCCESS': chimera_cluster_ids += c_den[2] chimera_code='PASS' elif c_den[0] == 'NOREGION': chimera_code='NOREGION' elif c_den[0] == 'FAIL': chimera_code = 'FAIL' else: chimera_code='FAIL' c_ref = mychimera.chimera_reference(idx_keys) if c_ref[0] == 'SUCCESS': chimera_cluster_ids += c_ref[2] chimera_code='PASS' elif c_ref[0] == 'NOREGION': chimera_code = 'NOREGION' elif c_ref[0] == 'FAIL': chimera_code='FAIL' else: chimera_code='FAIL' #print chimera_cluster_ids runobj.chimera_status_file_h = open(runobj.chimera_status_file_name,"w") if chimera_code == 'PASS': chimera_cluster_code = wait_for_cluster_to_finish(chimera_cluster_ids) if chimera_cluster_code[0] == 'SUCCESS': logger.info("Chimera checking finished successfully") runobj.chimera_status_file_h.write("CHIMERA SUCCESS\n") runobj.run_status_file_h.write("CHIMERA SUCCESS\n") else: logger.info("3-Chimera checking Failed") runobj.chimera_status_file_h.write("3-CHIMERA ERROR: "+str(chimera_cluster_code[1])+" "+str(chimera_cluster_code[2])+"\n") runobj.run_status_file_h.write("3-CHIMERA ERROR: "+str(chimera_cluster_code[1])+" "+str(chimera_cluster_code[2])+"\n") sys.exit("3-Chimera checking Failed") elif chimera_code == 'NOREGION': logger.info("No regions found that need chimera checking") runobj.chimera_status_file_h.write("CHIMERA CHECK NOT NEEDED\n") runobj.run_status_file_h.write("CHIMERA CHECK NOT NEEDED\n") elif chimera_code == 'FAIL': logger.info("1-Chimera checking Failed") runobj.chimera_status_file_h.write("1-CHIMERA ERROR: \n") runobj.run_status_file_h.write("1-CHIMERA ERROR: \n") sys.exit("1-Chimera Failed") else: logger.info("2-Chimera checking Failed") runobj.chimera_status_file_h.write("2-CHIMERA ERROR: \n") runobj.run_status_file_h.write("2-CHIMERA ERROR: \n") sys.exit("2-Chimera checking Failed") sleep(2) if chimera_code == 'PASS' and chimera_cluster_code[0] == 'SUCCESS': mychimera.write_chimeras_to_deleted_file(idx_keys) # should also recreate fasta # then read chimera files and place (or replace) any chimeric read_id # into the deleted file. mymblutils = MBLPipelineFastaUtils(idx_keys, mychimera.outdir) # write new cleaned files that remove chimera if apropriate # these are in fasta_mbl_pipeline.py # the cleaned file are renamed to the original name: # lane_key.unique.fa # lane_key.trimmed.fa # lane_key.names -- # lane_key.abund.fa -- this file is for the uclust chimera script # lane_key.deleted.txt -- no change in this file # THE ORDER IS IMPORTANT HERE: mymblutils.write_clean_fasta_file() mymblutils.write_clean_names_file() mymblutils.write_clean_uniques_file() mymblutils.write_clean_abundance_file() # write keys file for each lane_key - same fields as db table? for easy writing # write primers file for each lane_key # Write new clean files to the database # rawseq table not used # trimseq # runkeys # primers # run primers mymblutils.write_clean_files_to_database()
# this will read the args and ini file and return a dictionary
data_object = v.validate_args()

# for attr in dir(data_object):
#     print("obj.%s = %s" % (attr, getattr(data_object, attr)))

# set logging
print("\nLog Level set to:", args.loglevel)
logger.setLevel(args.loglevel.upper())
logger.info("Starting pipeline")

##############
#
# Test cl parameters
#
##############
# CL RULES:
# for ini file: (no plurals)
# 1) CL: input_dir ONLY shall be supplied on CL - no input filenames
#
# 2) All input files should be in the same directory AND of the same format
#
# 3) Supply an input_file_suffix on the CL if there are varying file types in the
#    input_dir and you only are using some (default will read all files)
# 4)
#
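
# A minimal sketch of rules 1-3 above: only an input directory is given on the command
# line, and an optional suffix narrows which files are picked up (default: all files).
# The flag names and collect_input_files are illustrative, not the pipeline's actual
# interface.
import argparse, glob, os

def collect_input_files(argv=None):
    p = argparse.ArgumentParser()
    p.add_argument("-input_dir", required=True)
    p.add_argument("-input_file_suffix", default="")
    a = p.parse_args(argv)
    return sorted(glob.glob(os.path.join(a.input_dir, "*" + a.input_file_suffix)))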
def chimera_reference(self): chimera_region_found = False output = {} cluster_id_list = [] for idx_key in self.run_keys: dna_region = self.runobj.samples[idx_key].dna_region if self.runobj.vamps_user_upload: # VAMPS users can chimera check regardless of region chosen chimera_region_found = True else: if dna_region in C.regions_to_chimera_check: chimera_region_found = True else: logger.debug('region not checked: ' + dna_region) continue input_file_name = os.path.join(self.indir, idx_key +'.abund.fa') output_file_name = os.path.join(self.outdir,idx_key+".chimera.ref") #open(output_file_name, 'a').close() # make sure file exists log_file = os.path.join(self.outdir,idx_key+".ref.log") logger.debug("OUT FILE NAME: " + output_file_name) #out_file_name = self.prefix[idx_key] + ".chimeras.db" input_file_name = os.path.join(self.indir, idx_key +'.abund.fa') if os.path.isfile(input_file_name): output_file_name = os.path.join(self.outdir,idx_key+".chimera.ref") #open(output_file_name, 'a').close() # make sure file exists log_file = os.path.join(self.outdir,idx_key+".ref.log") logger.debug("OUT FILE NAME: " + output_file_name) # which ref db to use? ref_db = '' if dna_region.upper() == 'ITS': logger.debug("got an ITS dna region so using refdb: " + self.its_refdb) ref_db = self.its_refdb else: logger.debug("using standard refdb: " + self.refdb) ref_db = self.refdb uchime_cmd='' if self.use_cluster: uchime_cmd = C.clusterize_cmd uchime_cmd += " " uchime_cmd += " -log " uchime_cmd += log_file uchime_cmd += " " uchime_cmd += self.usearch_cmd uchime_cmd += " -uchime_ref " uchime_cmd += input_file_name uchime_cmd += " -uchimeout " uchime_cmd += output_file_name uchime_cmd += " -db " uchime_cmd += ref_db uchime_cmd += " -strand " uchime_cmd += "plus" logger.debug("uchime_ref_cmd = %s" % (uchime_cmd)) try: logger.info("chimera reference command: " + str(uchime_cmd)) output[idx_key] = subprocess.check_output(uchime_cmd, shell=True) #print 'outsplit',output[idx_key].split()[2] cluster_id_list.append(output[idx_key].split()[2]) #print 'Have %d bytes in output' % len(output) #print 'ref',idx_key,output,len(output) if len(output[idx_key]) < 50 and len(output[idx_key]) > 40: logger.debug(idx_key + " uchime ref seems to have been submitted successfully") else: if self.use_cluster: print >>sys.stderr, "uchime ref may be broke" self.utils.print_both("uchime ref may be broke") except OSError, e: print >>sys.stderr, "Execution of chimera_reference failed: %s" % (uchime_cmd, e) self.utils.print_both("Execution of chimera_reference failed: %s" % (uchime_cmd, e)) raise
def chimera_denovo(self): chimera_region_found = False output = {} cluster_id_list = [] for idx_key in self.idx_keys: input_file_name = os.path.join(self.indir, idx_key +'.abund.fa') output_file_name = os.path.join(self.outdir, idx_key +'.chimera.denovo') #open(output_file_name, 'a').close() # make sure file exists log_file = os.path.join(self.outdir,idx_key+".denovo.log") dna_region = self.runobj.samples[idx_key].dna_region logger.debug("dna_region = %s" % dna_region) if self.runobj.vamps_user_upload: # VAMPS users can chimera check regardless of region chosen chimera_region_found = True else: if dna_region in C.regions_to_chimera_check: chimera_region_found = True else: logger.debug('region not checked: ' + dna_region) continue print "input_file_name = %s \noutput_file_name = %s" % (input_file_name, output_file_name) # uchime_cmd = C.clusterize_cmd # uchime_cmd += " " # uchime_cmd += self.usearch_cmd # uchime_cmd += " --uchime " # uchime_cmd += input_file_name # uchime_cmd += " --uchimeout " # uchime_cmd += output_file_name # uchime_cmd += " --abskew " # uchime_cmd += self.abskew uchime_cmd = C.clusterize_cmd uchime_cmd += " " uchime_cmd += " -log " uchime_cmd += log_file uchime_cmd += " " uchime_cmd += self.usearch_cmd uchime_cmd += " -uchime_denovo " uchime_cmd += input_file_name uchime_cmd += " -uchimeout " uchime_cmd += output_file_name logger.debug("uchime_denovo_cmd = %s" % (uchime_cmd)) try: logger.info("chimera denovo command: " + str(uchime_cmd)) # subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) #output[idx_key] = subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output[idx_key] = subprocess.check_output(uchime_cmd, shell=True) print "output[idx_key] = %s" % output[idx_key] print output[idx_key].split()[2] cluster_id_list.append(output[idx_key].split()[2]) except OSError, e: print "Problems with this command: %s" % (uchime_cmd) if self.utils.is_local(): print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e) else: print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e) raise
def trim_by_quality(self, infile=None, format='sanger', wsize=1, wstep=1, trim_ends='53', agg_action='min', exc_count=0, score_comp='>=', qual_score=0, filter_first50=False, filter_Ns=False, filter_Nx=0, failed_fastq=False, length=0, trim=0, clip=0, keep_zero_length=False): #format window_size = wsize window_step = wstep #trim_ends aggregation_action = agg_action exclude_count = exc_count score_comparison = score_comp quality_score = qual_score filter_length = length trim_length = trim clip_length = clip if not infile: sys.exit("illumina_fastq_trimmer: Need to specify an input file") if window_size < 1: sys.exit( 'illumina_fastq_trimmer: You must specify a strictly positive window size' ) if window_step < 1: sys.exit( 'illumina_fastq_trimmer: You must specify a strictly positive step size' ) print("\nRunning illumina Filtering") in_filepath = os.path.join(self.indir, infile) try: filebase = infile.split('/')[1].split('.')[0] except: filebase = infile.split('.')[0] out_filename = filebase + ".filtered.fastq" out_filepath = os.path.join(self.outdir, out_filename) #determine an exhaustive list of window indexes that can be excluded from aggregation exclude_window_indexes = [] last_exclude_indexes = [] for exclude_count in range(min(exclude_count, window_size)): if last_exclude_indexes: new_exclude_indexes = [] for exclude_list in last_exclude_indexes: for window_index in range(window_size): if window_index not in exclude_list: new_exclude = sorted(exclude_list + [window_index]) if new_exclude not in exclude_window_indexes + new_exclude_indexes: new_exclude_indexes.append(new_exclude) exclude_window_indexes += new_exclude_indexes last_exclude_indexes = new_exclude_indexes else: for window_index in range(window_size): last_exclude_indexes.append([window_index]) exclude_window_indexes = list(last_exclude_indexes) out = fastqWriter(open(out_filepath, 'wb'), format=format) action = ACTION_METHODS[aggregation_action] if failed_fastq: fail = fastqWriter(open(out_filepath + '.failed', 'wb'), format=format) num_reads = None num_reads_excluded = 0 count_of_unchaste = 0 count_of_trimmed = 0 count_of_first50 = 0 count_of_Ns = 0 if self.runobj.compressed: import gzip try: logger.info("illumina_filtering: opening compressed file: " + in_filepath) fp = gzip.open(in_filepath) except: logger.info("illumina_filtering: opening uncompressed file: " + in_filepath) fp = open(in_filepath) else: logger.info("illumina_filtering: opening uncompressed file: " + in_filepath) fp = open(in_filepath) for num_reads, fastq_read in enumerate(fastqReader(fp, format=format)): ############################################################################################ # Put chastity code here #print(fastq_read.identifier) seq = fastq_read.get_sequence() desc_items = fastq_read.identifier.split(':') if desc_items[7] == 'Y': count_of_unchaste += 1 #print('failed chastity') if failed_fastq: fail.write(fastq_read) continue # Filter reads with ambiguous bases if filter_Ns: countN = seq.count('N') if countN > 1 or (countN == 1 and seq[filter_Nx - 1:filter_Nx] != 'N'): #print('failed Ns',infile) count_of_Ns += 1 if failed_fastq: fail.write(fastq_read) continue # Filter reads below first 50 base quality if filter_first50: first50 = 50 first50_maxQ = 30 first50_maxQ_count = 34 quals = fastq_read.get_decimal_quality_scores()[:first50] count_lt30 = 0 for q in quals: if q < first50_maxQ: count_lt30 += 1 if count_lt30 >= first50_maxQ_count: #print('failed first50') if failed_fastq: fail.write(fastq_read) count_of_first50 += 1 continue ##### 
END CHASTITY ##################### ############################################################################################ ##### START Btails CODE ################ quality_list = fastq_read.get_decimal_quality_scores() for trim_end in trim_ends: if trim_end == '5': lwindow_position = 0 #left position of window while True: if lwindow_position >= len(quality_list): fastq_read.sequence = '' fastq_read.quality = '' break if self.exclude_and_compare( action, quality_list[ lwindow_position:lwindow_position + window_size], score_comparison, quality_score, exclude_window_indexes): fastq_read = fastq_read.slice( lwindow_position, None) break lwindow_position += window_step else: rwindow_position = len( quality_list) #right position of window while True: lwindow_position = rwindow_position - window_size #left position of window if rwindow_position <= 0 or lwindow_position < 0: fastq_read.sequence = '' fastq_read.quality = '' break if self.exclude_and_compare( action, quality_list[ lwindow_position:rwindow_position], score_comparison, quality_score, exclude_window_indexes): fastq_read = fastq_read.slice( None, rwindow_position) break rwindow_position -= window_step ######## END Btails CODE ############################### ############################################################################################ # put length/trim/clip code here quality_list = fastq_read.get_decimal_quality_scores() if filter_length: if len(quality_list) < filter_length: print('failed length') if failed_fastq: fail.write(fastq_read) continue # Trim initial bases -- remove first 10 bases from read 2 if clip_length: # remove from the front: fastq_read = fastq_read.slice(clip_length, None) count_of_trimmed += 1 # Trim to max length -- read 2 trim to 90. if trim_length: if len(quality_list) > trim_length: # remove from the end: fastq_read = fastq_read.slice( None, len(fastq_read.get_sequence()) - trim_length) count_of_trimmed += 1 if keep_zero_length or len(fastq_read): out.write(fastq_read) else: num_reads_excluded += 1 out.close() if failed_fastq: fail.close() print("file:", infile) print('count_of_trimmed (for length):', count_of_trimmed) print('count_of_first50 (avg first50 quals < 34):', count_of_first50) print("count_of_unchaste ('Y' in id):", count_of_unchaste) print('count_of_Ns (reads with N):', count_of_Ns) if num_reads is None: print("No valid FASTQ reads could be processed.") else: print("%i FASTQ reads were processed." % (num_reads + 1)) if num_reads_excluded: print("%i reads of zero length were excluded from the output." % num_reads_excluded) return out_filename
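
# trim_by_quality() above slides a window of wsize scores along the read's quality
# values and cuts the read once the aggregated window value satisfies the score
# comparison. A stripped-down sketch of that idea for the 3' end only, using min
# aggregation and a ">=" comparison on a plain list of Phred scores; trim_3prime is
# illustrative and omits the exclude_count/exclude_window_indexes handling above.
def trim_3prime(quality_list, window_size=1, window_step=1, qual_score=0):
    right = len(quality_list)
    while right > 0:
        left = right - window_size
        if left < 0:
            return 0  # nothing left to keep
        if min(quality_list[left:right]) >= qual_score:
            return right  # keep bases [0:right)
        right -= window_step
    return 0

# e.g. trim_3prime([30, 31, 28, 12, 9], window_size=2, qual_score=20) -> 3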
def convert_csv_to_ini(self, new_ini_file): #print(self.args) from pipeline.get_ini import readCSV print('CSV path', self.general_config_dict['csvPath']) my_csv = readCSV(file_path = self.general_config_dict['csvPath']) content = my_csv.read_csv() headers = content[1].keys() headers_clean = [x.strip('"').replace(" ", "_").lower() for x in headers] projects = {} #print #print(content[1]) #print # get list of keys keys_list = [] if self.check_headers(headers_clean): logger.info("CSV headers okay") for k,values in content.items(): keys_list.append(values['barcode_index']+"_"+values['run_key']+"_"+values['lane']) fh = open(new_ini_file,'w') # general section fh.write("#\n#\tCreated by MBL Pipeline for run: "+self.general_config_dict['run']+" on "+self.general_config_dict['date']+"\n#\n\n") fh.write("[general]\n") fh.write("run = "+self.general_config_dict['run']+"\n") fh.write("configPath = "+new_ini_file+"\n") fh.write("configPath_orig = " + self.general_config_dict['configPath']+"\n") fh.write("platform = " + self.general_config_dict['platform']+"\n") fh.write("output_dir = " + os.path.dirname(new_ini_file)+"\n") #fh.write("output_dir = "+os.path.join(self.general_config_dict['baseoutputdir'],self.general_config_dict['run'])+"\n") if self.general_config_dict['platform'] in C.illumina_list: #fh.write("input_file_suffix = " + self.general_config_dict['input_file_suffix']+"\n") fh.write("input_file_format = " + self.general_config_dict['input_file_format']+"\n") fh.write("anchor_file = " + self.general_config_dict['anchor_file']+"\n") fh.write("primer_file = " + self.general_config_dict['primer_file']+"\n") fh.write("compressed = " + str(self.general_config_dict['compressed'])+"\n") fh.write("do_perfect = " + str(self.general_config_dict['do_perfect'])+"\n") fh.write("lane_name = " + str(self.general_config_dict['lane_name'])+"\n") fh.write("database_host = " + self.general_config_dict['database_host']+"\n") fh.write("database_name = " + self.general_config_dict['database_name']+"\n") fh.write("input_dir = " + self.general_config_dict['input_dir']+"\n") fh.write("require_distal = " + str(self.general_config_dict['require_distal'])+"\n") fh.write("use_cluster = " + str(self.general_config_dict['use_cluster'])+"\n") fh.write("date = " + str(datetime.date.today())+"\n") fh.write("site = " + self.general_config_dict['site']+"\n") fh.write("load_vamps_database = " + str(self.general_config_dict['load_vamps_database'])+"\n") fh.write("idx_keys = " +','.join(keys_list)+"\n") if 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != '': file_list = self.get_input_files() fh.write("input_files = " + ','.join(file_list)+"\n") else: fh.write("input_files = \n") #fh.write(getattr(args,'force_runkey', "")) for k, values in content.items(): fh.write("\n") if self.general_config_dict['platform'] in C.illumina_list: fh.write("["+values['barcode_index']+"_"+values['run_key']+"_"+values['lane']+"]\n") elif self.general_config_dict['platform'] == '454': fh.write("["+values['lane']+"_"+values['run_key']+"]\n") for v in values: if v == "env_sample_source": try: new_val = [str(j[0]) for j in self.env if j[1] == values[v]][0] except: text = """There was an error in env_sample_source. Please check your metadata. 
Possible values: ----------- air extreme habitat host associated human associated human-amniotic-fluid human-blood human-gut human-oral human-skin human-urine human-vaginal indoor microbial mat/biofilm miscellaneous_natural_or_artificial_environment plant associated sediment soil/sand unknown wastewater/sludge water-freshwater water-marine ----------- """ print(text) raise fh.write("env_sample_source_id = "+new_val+"\n") else: fh.write(v+" = "+values[v]+"\n") fh.close() return new_ini_file
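
# env_sample_source above is translated to an id by scanning self.env, a list of
# (id, name) pairs. A dict-based sketch of the same lookup with a clearer error
# message; env_source_id and the example pairs are placeholders, not the real
# id assignments.
def env_source_id(env_pairs, name):
    lookup = dict((n, str(i)) for i, n in env_pairs)
    try:
        return lookup[name]
    except KeyError:
        raise ValueError("Unknown env_sample_source: %r (check your metadata)" % name)

# e.g. env_source_id([(10, 'air'), (50, 'soil/sand')], 'soil/sand') -> '50'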
def convert_csv_to_ini(self, new_ini_file): #print self.args from pipeline.get_ini import readCSV print 'CSV path', self.general_config_dict['csvPath'] my_csv = readCSV(file_path = self.general_config_dict['csvPath']) content = my_csv.read_csv() headers = content[1].keys() headers_clean = [x.strip('"').replace(" ", "_").lower() for x in headers] projects = {} #print #print content[1] #print # get list of keys keys_list = [] if self.check_headers(headers_clean): logger.info("CSV headers okay") for k,values in content.iteritems(): keys_list.append(values['barcode_index']+"_"+values['run_key']+"_"+values['lane']) fh = open(new_ini_file,'w') # general section fh.write("#\n#\tCreated by MBL Pipeline for run: "+self.general_config_dict['run']+" on "+self.general_config_dict['date']+"\n#\n\n") fh.write("[general]\n") fh.write("run = "+self.general_config_dict['run']+"\n") fh.write("configPath = "+new_ini_file+"\n") fh.write("configPath_orig = " + self.general_config_dict['configPath']+"\n") fh.write("platform = " + self.general_config_dict['platform']+"\n") fh.write("output_dir = " + os.path.dirname(new_ini_file)+"\n") #fh.write("output_dir = "+os.path.join(self.general_config_dict['baseoutputdir'],self.general_config_dict['run'])+"\n") if self.general_config_dict['platform'] == 'illumina': #fh.write("input_file_suffix = " + self.general_config_dict['input_file_suffix']+"\n") fh.write("input_file_format = " + self.general_config_dict['input_file_format']+"\n") fh.write("anchor_file = " + self.general_config_dict['anchor_file']+"\n") fh.write("primer_file = " + self.general_config_dict['primer_file']+"\n") fh.write("compressed = " + str(self.general_config_dict['compressed'])+"\n") fh.write("do_perfect = " + str(self.general_config_dict['do_perfect'])+"\n") fh.write("lane_name = " + str(self.general_config_dict['lane_name'])+"\n") fh.write("database_host = " + self.general_config_dict['database_host']+"\n") fh.write("database_name = " + self.general_config_dict['database_name']+"\n") fh.write("input_dir = " + self.general_config_dict['input_dir']+"\n") fh.write("require_distal = " + str(self.general_config_dict['require_distal'])+"\n") fh.write("use_cluster = " + str(self.general_config_dict['use_cluster'])+"\n") fh.write("date = " + str(datetime.date.today())+"\n") fh.write("site = " + self.general_config_dict['site']+"\n") fh.write("load_vamps_database = " + str(self.general_config_dict['load_vamps_database'])+"\n") fh.write("idx_keys = " +','.join(keys_list)+"\n") if 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != '': file_list = self.get_input_files() fh.write("input_files = " + ','.join(file_list)+"\n") else: fh.write("input_files = \n") #fh.write(getattr(args,'force_runkey', "")) for k, values in content.iteritems(): fh.write("\n") if self.general_config_dict['platform'] == 'illumina': fh.write("["+values['barcode_index']+"_"+values['run_key']+"_"+values['lane']+"]\n") elif self.general_config_dict['platform'] == '454': fh.write("["+values['lane']+"_"+values['run_key']+"]\n") for v in values: if v == "env_sample_source": try: new_val = [str(j[0]) for j in self.env if j[1] == values[v]][0] except: print """There was an error in env_sample_source. Please check your metadata. 
Possible values: ----------- air extreme habitat host associated human associated human-amniotic-fluid human-blood human-gut human-oral human-skin human-urine human-vaginal indoor microbial mat/biofilm miscellaneous_natural_or_artificial_environment plant associated sediment soil/sand unknown wastewater/sludge water-freshwater water-marine ----------- """ raise fh.write("env_sample_source_id = "+new_val+"\n") else: fh.write(v+" = "+values[v]+"\n") fh.close() return new_ini_file
def chimera(runobj): chimera_cluster_ids = [] logger.debug("Starting Chimera Checker") # lets read the trim status file out here and keep those details out of the Chimera code idx_keys = get_keys(runobj) # new_lane_keys = convert_unicode_dictionary_to_str(json.loads(open(runobj.trim_status_file_name,"r").read()))["new_lane_keys"] # Open run STATUS File here. # open in append mode because we may start the run in the middle # say at the gast stage and don't want to over write. # if we re-run trimming we'll get two trim status reports runobj.run_status_file_h = open(runobj.run_status_file_name, "a") mychimera = Chimera(runobj) logger.debug("\nStarting DeNovo Chimera") c_den = mychimera.chimera_denovo() logger.debug("Ending DeNovo Chimera") if c_den[0] == 'SUCCESS': chimera_cluster_ids += c_den[2] # add a list to a list logger.debug("chimera_cluster_ids: " + ' '.join(chimera_cluster_ids)) chimera_code = 'PASS' elif c_den[0] == 'NOREGION': chimera_code = 'NOREGION' elif c_den[0] == 'FAIL': chimera_code = 'FAIL' else: chimera_code = 'FAIL' logger.debug("Chimera DeNovo Code: " + chimera_code) logger.debug("\nStarting Reference Chimera") c_ref = mychimera.chimera_reference() if c_ref[0] == 'SUCCESS': chimera_cluster_ids += c_ref[2] chimera_code = 'PASS' elif c_ref[0] == 'NOREGION': chimera_code = 'NOREGION' elif c_ref[0] == 'FAIL': chimera_code = 'FAIL' else: chimera_code = 'FAIL' # logger.debug(chimera_cluster_ids) runobj.chimera_status_file_h = open(runobj.chimera_status_file_name, "w") if chimera_code == 'PASS': if runobj.use_cluster: chimera_cluster_code = wait_for_cluster_to_finish( chimera_cluster_ids) if chimera_cluster_code[0] == 'SUCCESS': logger.info("Chimera checking finished successfully") runobj.chimera_status_file_h.write("CHIMERA SUCCESS\n") runobj.run_status_file_h.write("CHIMERA SUCCESS\n") else: logger.info("3-Chimera checking Failed") runobj.chimera_status_file_h.write( "3-CHIMERA ERROR: " + str(chimera_cluster_code[1]) + " " + str(chimera_cluster_code[2]) + "\n") runobj.run_status_file_h.write("3-CHIMERA ERROR: " + str(chimera_cluster_code[1]) + " " + str(chimera_cluster_code[2]) + "\n") sys.exit("3-Chimera checking Failed") else: chimera_cluster_code = ['SUCCESS', 'Not using cluster'] logger.info("Chimera checking finished without using cluster") runobj.chimera_status_file_h.write("CHIMERA SUCCESS--no cluster\n") runobj.run_status_file_h.write("CHIMERA SUCCESS--no cluster\n") elif chimera_code == 'NOREGION': logger.info("No regions found that need chimera checking") runobj.chimera_status_file_h.write("CHIMERA CHECK NOT NEEDED\n") runobj.run_status_file_h.write("CHIMERA CHECK NOT NEEDED\n") elif chimera_code == 'FAIL': logger.info("1-Chimera checking Failed") runobj.chimera_status_file_h.write("1-CHIMERA ERROR: \n") runobj.run_status_file_h.write("1-CHIMERA ERROR: \n") sys.exit("1-Chimera Failed") else: logger.info("2-Chimera checking Failed") runobj.chimera_status_file_h.write("2-CHIMERA ERROR: \n") runobj.run_status_file_h.write("2-CHIMERA ERROR: \n") sys.exit("2-Chimera checking Failed") sleep(2) if chimera_code == 'PASS' and chimera_cluster_code[0] == 'SUCCESS': logger.info("Writing Chimeras to deleted files") mychimera.write_chimeras_to_deleted_file() # should also recreate fasta # then read chimera files and place (or replace) any chimeric read_id # into the deleted file. 
        mymblutils = MBLPipelineFastaUtils(idx_keys, runobj)

        # write new cleaned files that remove chimera if appropriate
        # these are in fasta_mbl_pipeline.py
        # the cleaned files are renamed to the original name:
        # lane_key.unique.fa
        # lane_key.trimmed.fa
        # lane_key.names --
        # lane_key.abund.fa -- this file is for the uclust chimera script
        # lane_key.deleted.txt -- no change in this file
        # THE ORDER IS IMPORTANT HERE:
        mymblutils.write_clean_fasta_file()
        mymblutils.write_clean_names_file()
        mymblutils.write_clean_uniques_file()
        mymblutils.write_clean_abundance_file()
def gast(runobj): logger.info("STARTING GAST()") # logger.info("vsearch version: " % utils.get_vsearch_version) # for vamps 'new_lane_keys' will be prefix # of the uniques and names file # that was just created in vamps_gast.py # or we can get the 'lane_keys' directly from the config_file # for illumina: # a unique idx_key is a concatenation of barcode_index and run_key # Should return a list not a string idx_keys = get_keys(runobj) # get GAST object mygast = Gast(runobj, idx_keys) # Check for unique files and create them if not there result_code = mygast.check_for_unique_files(idx_keys) runobj.run_status_file_h.write(json.dumps(result_code) + "\n") if result_code['status'] == 'ERROR': logger.error("uniques not found failed") if runobj.vamps_user_upload: write_status_to_vamps_db(runobj.site, runobj.run, "GAST ERROR", "uniques file not found - failed") sys.exit("uniques not found failed") elif runobj.vamps_user_upload: write_status_to_vamps_db(runobj.site, runobj.run, result_code['status'], result_code['message']) sleep(5) # CLUSTERGAST result_code = mygast.clustergast() runobj.run_status_file_h.write(json.dumps(result_code) + "\n") if result_code['status'] == 'ERROR': logger.error("clutergast failed") if runobj.vamps_user_upload: write_status_to_vamps_db(runobj.site, runobj.run, "GAST ERROR", "clustergast failed") sys.exit("clustergast failed") elif runobj.vamps_user_upload: write_status_to_vamps_db(runobj.site, runobj.run, result_code['status'], result_code['message']) sleep(5) # GAST_CLEANUP result_code = mygast.gast_cleanup() runobj.run_status_file_h.write(json.dumps(result_code) + "\n") if result_code['status'] == 'ERROR': logger.error("gast_cleanup failed") if runobj.vamps_user_upload: write_status_to_vamps_db(runobj.site, runobj.run, "GAST ERROR", "gast_cleanup failed") sys.exit("gast_cleanup failed") elif runobj.vamps_user_upload: write_status_to_vamps_db(runobj.site, runobj.run, result_code['status'], result_code['message']) sleep(5) # GAST2TAX result_code = mygast.gast2tax() runobj.run_status_file_h.write(json.dumps(result_code) + "\n") if result_code['status'] == 'ERROR': logger.error("gast2tax failed") if runobj.vamps_user_upload: write_status_to_vamps_db(runobj.site, runobj.run, "GAST ERROR", "gast2tax failed") sys.exit("gast2tax failed") elif runobj.vamps_user_upload: write_status_to_vamps_db(runobj.site, runobj.run, result_code['status'], result_code['message'])
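
# gast() above repeats the same pattern for every stage: run it, log the JSON result,
# write VAMPS status, and exit on error. A sketch of that pattern pulled into one
# helper; run_gast_step is illustrative refactoring, reusing write_status_to_vamps_db,
# logger, and the runobj fields exactly as they are used above.
import json, sys

def run_gast_step(runobj, name, step_fn):
    result = step_fn()
    runobj.run_status_file_h.write(json.dumps(result) + "\n")
    if result['status'] == 'ERROR':
        logger.error(name + " failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db(runobj.site, runobj.run, "GAST ERROR", name + " failed")
        sys.exit(name + " failed")
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db(runobj.site, runobj.run, result['status'], result['message'])
    return result

# usage sketch: run_gast_step(runobj, "clustergast", mygast.clustergast)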
parser.add_argument('-file_base', required=True, action="store", dest="file_base",
                    help='where the files are located')

## optional
parser.add_argument("-dna_region", required=False, action="store", dest="dna_region",
                    default='unknown', help="")
parser.add_argument("-domain", required=False, action="store", dest="domain",
                    default='unknown', help="")
parser.add_argument('-d', '--dataset', required=False, action="store", dest="dataset",
                    help='')
parser.add_argument("-p", "--project", required=False, action="store", dest="project",
                    help="")

logger.info("Starting vamps_load.py")
args = parser.parse_args()

data_object['infile'] = args.infile
data_object['datetime'] = str(datetime.date.today())
data_object['type'] = args.type
data_object['runcode'] = args.runcode
data_object['site'] = args.site
data_object['user'] = args.user
data_object['file_base'] = args.file_base
data_object['file_type'] = args.file_type

if args.dna_region:
    dna_region = args.dna_region
    data_object['dna_region'] = dna_region
def chimera_reference(self):

        chimera_region_found = False
        output = {}
        cluster_id_list = []

        for idx_key in self.run_keys:
            dna_region = self.runobj.samples[idx_key].dna_region
            if self.runobj.vamps_user_upload:
                # VAMPS users can chimera check regardless of region chosen
                chimera_region_found = True
            else:
                if dna_region in C.regions_to_chimera_check:
                    chimera_region_found = True
                else:
                    logger.debug('region not checked: ' + dna_region)
                    continue

            input_file_name  = os.path.join(self.indir,  idx_key + '.abund.fa')
            output_file_name = os.path.join(self.outdir, idx_key + ".chimera.ref")
            #open(output_file_name, 'a').close()  # make sure file exists
            log_file = os.path.join(self.outdir, idx_key + ".ref.log")
            logger.debug("OUT FILE NAME: " + output_file_name)
            #out_file_name = self.prefix[idx_key] + ".chimeras.db"

            # which ref db to use?
            ref_db = ''
            if dna_region.upper() == 'ITS':
                logger.debug("got an ITS dna region so using refdb: " + self.its_refdb)
                ref_db = self.its_refdb
            else:
                logger.debug("using standard refdb: " + self.refdb)
                ref_db = self.refdb

            uchime_cmd = C.clusterize_cmd
            uchime_cmd += " "
            uchime_cmd += " -log "
            uchime_cmd += log_file
            uchime_cmd += " "
            uchime_cmd += self.usearch_cmd
            uchime_cmd += " -uchime_ref "
            uchime_cmd += input_file_name
            uchime_cmd += " -uchimeout "
            uchime_cmd += output_file_name
            uchime_cmd += " -db "
            uchime_cmd += ref_db
            uchime_cmd += " -strand "
            uchime_cmd += "plus"

            logger.debug("uchime_ref_cmd = %s" % (uchime_cmd))

            try:
                logger.info("chimera reference command: " + str(uchime_cmd))
                output[idx_key] = subprocess.check_output(uchime_cmd, shell=True)
                #print 'outsplit', output[idx_key].split()[2]
                cluster_id_list.append(output[idx_key].split()[2])
                #print 'Have %d bytes in output' % len(output)
                #print 'ref', idx_key, output, len(output)
                if len(output[idx_key]) < 50 and len(output[idx_key]) > 40:
                    logger.debug(idx_key + " uchime ref seems to have been submitted successfully")
                else:
                    print >>sys.stderr, "uchime ref may be broken"
            except OSError, e:
                print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e)
                raise
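# The string concatenation above builds one long shell command for the
# clusterized uchime_ref run. A minimal sketch of the same assembly with a list
# join follows; the function name and the example paths and database name are
# illustrative assumptions, not values taken from the pipeline configuration.

def build_uchime_ref_cmd(clusterize_cmd, usearch_cmd, fasta_in, uchime_out,
                         log_file, ref_db, strand="plus"):
    """Assemble the clusterized uchime_ref command line (sketch)."""
    parts = [
        clusterize_cmd,
        "-log", log_file,
        usearch_cmd,
        "-uchime_ref", fasta_in,
        "-uchimeout", uchime_out,
        "-db", ref_db,
        "-strand", strand,
    ]
    return " ".join(parts)

# e.g. build_uchime_ref_cmd("clusterize", "usearch", "SMPL1_3.abund.fa",
#                           "SMPL1_3.chimera.ref", "SMPL1_3.ref.log", "refv6.udb")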
def load_database(self,key,out_gast_dir, file_collector): """ """ logger.info("Starting load VAMPS data") print "Starting load VAMPS data" # USER: vamps_db_tables if self.runobj.vamps_user_upload: user = self.runobj.user project = self.runobj.project data_cube_table = C.database_tables['vamps_user_uploads']['tax_dc_tbl'] summed_cube_table = C.database_tables['vamps_user_uploads']['tax_summed_tbl'] taxonomy_table = C.database_tables['vamps_user_uploads']['tax_tbl'] sequences_table = C.database_tables['vamps_user_uploads']['sequences_tbl'] export_table = C.database_tables['vamps_user_uploads']['export_tbl'] datasets_table = C.database_tables['vamps_user_uploads']['datasets_tbl'] users_table = C.database_tables['vamps_user_uploads']['users_tbl'] else: if self.runobj.platform == 'illumina': user = self.runobj[key].data_owner project = self.runobj.samples[key].project data_cube_table = C.database_tables['vamps_mbl_origin']['tax_dc_tbl'] summed_cube_table = C.database_tables['vamps_mbl_origin']['tax_summed_tbl'] taxonomy_table = C.database_tables['vamps_mbl_origin']['tax_tbl'] sequences_table = C.database_tables['vamps_mbl_origin']['sequences_tbl'] export_table = C.database_tables['vamps_mbl_origin']['export_tbl'] datasets_table = C.database_tables['vamps_mbl_origin']['datasets_tbl'] users_table = C.database_tables['vamps_mbl_origin']['users_tbl'] elif self.runobj.platform == '454': pass else: pass info_table = C.database_tables['vamps_mbl_origin']['info_tbl'] users_info_table = C.database_tables['vamps_user_uploads']['info_tbl'] cursor = self.conn.cursor() # # DATA_CUBE # print "loading "+key+": data_cube" if os.path.exists(file_collector['taxes_file']): for line in open(file_collector['taxes_file'],'r'): line = line.strip().split("\t") if line[0]=='HEADER': continue qDataCube = "insert ignore into %s (project, dataset, taxon_string,superkingdom,phylum,class, orderx,family,genus,species,strain,\ rank,knt,frequency,dataset_count,classifier)\ VALUES('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" \ % (data_cube_table, line[0],line[1],line[2],line[3],line[4],line[5],line[6],line[7], line[8],line[9],line[10],line[11],line[12],line[13],line[14],line[15]) #myconn.execute_no_fetch(qDataCube) #print qDataCube rows_affected = cursor.execute(qDataCube) else: print "taxes file not found for dataset "+key # # SUMMED (JUNK) DATA_CUBE # print "loading "+key+": junk_data_cube" if os.path.exists(file_collector['summed_taxes_file']): for line in open(file_collector['summed_taxes_file'],'r'): line = line.strip().split("\t") if line[0]=='HEADER': continue #line = line[1:] # remove leading empty tab #taxonomy sum_tax_counts frequency dataset_count rank project dataset project--dataset classifier qSummedCube = "insert ignore into %s (taxon_string,knt, frequency, dataset_count, rank, project, dataset, project_dataset, classifier)\ VALUES('%s','%s','%s','%s','%s','%s','%s','%s','%s')" \ % (summed_cube_table, line[0],line[1],line[2],line[3],line[4],line[5],line[6],line[7], line[8]) #myconn.execute_no_fetch(qSummedCube) #print qSummedCube cursor.execute(qSummedCube) else: print "summed taxes file not found for dataset "+key # # TAXONOMY # print "loading "+key+": taxonomy" if os.path.exists(file_collector['distinct_taxes_file']): for line in open(file_collector['distinct_taxes_file'],'r'): line = line.strip().split("\t") if line[0]=='HEADER': continue #line = line[1:] # remove leading empty tab qTaxonomy = "insert ignore into %s (taxon_string,rank,num_kids)\ VALUES('%s','%s','%s')" \ % 
(taxonomy_table, line[0],line[1],line[2]) #myconn.execute_no_fetch(qTaxonomy) cursor.execute(qTaxonomy) else: print "distinct taxes file not found for dataset "+key # # SEQUENCES # print "loading "+key+": sequences" if os.path.exists(file_collector['sequences_file']): for line in open(file_collector['sequences_file'],'r'): line = line.strip().split("\t") if line[0]=='HEADER': continue #line = line[1:] # remove leading empty tab # project dataset taxonomy refhvr_ids rank seq_count frequency distance read_id project_dataset qSequences = "insert ignore into %s (sequence,project, dataset, taxonomy,refhvr_ids,rank,seq_count,frequency,distance,rep_id, project_dataset)\ VALUES('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" \ % (sequences_table, line[0],line[1],line[2],line[3],line[4],line[5],line[6],line[7], line[8],line[9],line[10]) #myconn.execute_no_fetch(qSequences) cursor.execute(qSequences) else: print "sequences file not found for dataset "+key # # EXPORT # print "loading "+key+": export" if os.path.exists(file_collector['export_file']): for line in open(file_collector['export_file'],'r'): line = line.strip().split("\t") if line[0]=='HEADER': continue #line = line[1:] # remove leading empty tab # t.read_id, t.project, t.dataset, g.refhvr_ids, x.distance, x.taxonomy, t.sequence, x.rank," " t.entry_date # project dataset taxonomy refhvr_ids rank seq_count frequency distance read_id project_dataset qSequences = "insert ignore into %s (read_id, project, dataset, refhvr_ids, distance, taxonomy, sequence, rank, date_trimmed)\ VALUES('%s','%s','%s','%s','%s','%s','%s','%s','%s')" \ % (export_table, line[0],line[1],line[2],line[3],line[4],line[5],line[6],line[7], line[8]) #myconn.execute_no_fetch(qSequences) cursor.execute(qSequences) else: print "export file not found for dataset "+key # # PROJECTS_DATASETS # print "loading "+key+": projects_datasets" if os.path.exists(file_collector['projects_datasets_file']): for line in open(file_collector['projects_datasets_file'],'r'): line = line.strip().split("\t") # [1:] # split and remove the leading 'zero' if line[0]=='HEADER': continue qDatasets = "insert ignore into %s (project, dataset, dataset_count,has_tax,date_trimmed,dataset_info)\ VALUES('%s','%s','%s','%s','%s','%s')" \ % (datasets_table, line[0],line[1],line[2],line[3],line[4],line[5]) #myconn.execute_no_fetch(qDatasets) cursor.execute(qDatasets) qDatasets = "update %s set has_tax='1' where project='%s'" \ % (datasets_table, line[0]) #myconn.execute_no_fetch(qDatasets) cursor.execute(qDatasets) else: print "project_datasets file not found for dataset "+key # # INFO # print "loading "+key+": info" if os.path.exists(file_collector['project_info_file']): for line in open(file_collector['project_info_file'],'r'): line = line.strip().split("\t") #[1:] # split on tab and remove the leading 'zero' if line[0]=='HEADER': continue qInfo = "insert ignore into %s (project_name, title, description, contact, email, institution, user, env_source_id)\ VALUES('%s','%s','%s','%s','%s','%s','%s','%s')" \ % (users_info_table, line[0],line[1],line[2],line[3],line[4],line[5],line[6],line[7]) #myconn.execute_no_fetch(qInfo) cursor.execute(qInfo) qInfo = "update %s set has_tax='1' where project_name='%s'" \ % (users_info_table, line[0]) #myconn.execute_no_fetch(qInfo) cursor.execute(qInfo) else: print "upload_info file not found for dataset "+key # # USERS # print "loading users:"+key qUser = "******" \ % (users_table, project, user) #myconn.execute_no_fetch(qUser) cursor.execute(qUser) 
logger.info("Finished load VAMPS data") self.conn.commit() cursor.close()
def trim_by_quality(self, infile=None, format='sanger', wsize=1, wstep=1, trim_ends='53', agg_action='min', exc_count=0, score_comp='>=', qual_score=0, filter_first50=False, filter_Ns=False,filter_Nx=0, failed_fastq=False, length=0, trim=0, clip=0, keep_zero_length=False): #format window_size = wsize window_step = wstep #trim_ends aggregation_action = agg_action exclude_count = exc_count score_comparison = score_comp quality_score = qual_score filter_length = length trim_length = trim clip_length = clip if not infile: sys.exit( "illumina_fastq_trimmer: Need to specify an input file" ) if window_size < 1: sys.exit( 'illumina_fastq_trimmer: You must specify a strictly positive window size' ) if window_step < 1: sys.exit( 'illumina_fastq_trimmer: You must specify a strictly positive step size' ) print("\nRunning illumina Filtering") in_filepath = os.path.join(self.indir,infile) try: filebase = infile.split('/')[1].split('.')[0] except: filebase = infile.split('.')[0] out_filename = filebase+".filtered.fastq" out_filepath = os.path.join(self.outdir, out_filename) #determine an exhaustive list of window indexes that can be excluded from aggregation exclude_window_indexes = [] last_exclude_indexes = [] for exclude_count in range( min( exclude_count, window_size ) ): if last_exclude_indexes: new_exclude_indexes = [] for exclude_list in last_exclude_indexes: for window_index in range( window_size ): if window_index not in exclude_list: new_exclude = sorted( exclude_list + [ window_index ] ) if new_exclude not in exclude_window_indexes + new_exclude_indexes: new_exclude_indexes.append( new_exclude ) exclude_window_indexes += new_exclude_indexes last_exclude_indexes = new_exclude_indexes else: for window_index in range( window_size ): last_exclude_indexes.append( [ window_index ] ) exclude_window_indexes = list( last_exclude_indexes ) out = fastqWriter( open( out_filepath, 'wb' ), format = format ) action = ACTION_METHODS[ aggregation_action ] if failed_fastq: fail = fastqWriter( open( out_filepath+'.failed', 'wb' ), format = format ) num_reads = None num_reads_excluded = 0 count_of_unchaste = 0 count_of_trimmed = 0 count_of_first50 = 0 count_of_Ns = 0 if self.runobj.compressed: import gzip try: logger.info( "illumina_filtering: opening compressed file: "+in_filepath) fp = gzip.open( in_filepath ) except: logger.info( "illumina_filtering: opening uncompressed file: "+in_filepath) fp = open( in_filepath ) else: logger.info( "illumina_filtering: opening uncompressed file: "+in_filepath) fp = open( in_filepath ) for num_reads, fastq_read in enumerate( fastqReader( fp, format = format ) ): ############################################################################################ # Put chastity code here #print(fastq_read.identifier) seq = fastq_read.get_sequence() desc_items = fastq_read.identifier.split(':') if desc_items[7] == 'Y': count_of_unchaste += 1 #print('failed chastity') if failed_fastq: fail.write( fastq_read ) continue # Filter reads with ambiguous bases if filter_Ns: countN = seq.count('N') if countN > 1 or (countN == 1 and seq[filter_Nx-1:filter_Nx] != 'N'): #print('failed Ns',infile) count_of_Ns += 1 if failed_fastq: fail.write( fastq_read ) continue # Filter reads below first 50 base quality if filter_first50: first50 = 50 first50_maxQ = 30 first50_maxQ_count = 34 quals = fastq_read.get_decimal_quality_scores()[:first50] count_lt30 = 0 for q in quals: if q < first50_maxQ: count_lt30 += 1 if count_lt30 >= first50_maxQ_count: #print('failed first50') if failed_fastq: fail.write( 
fastq_read ) count_of_first50 += 1 continue ##### END CHASTITY ##################### ############################################################################################ ##### START Btails CODE ################ quality_list = fastq_read.get_decimal_quality_scores() for trim_end in trim_ends: if trim_end == '5': lwindow_position = 0 #left position of window while True: if lwindow_position >= len( quality_list ): fastq_read.sequence = '' fastq_read.quality = '' break if self.exclude_and_compare( action, quality_list[ lwindow_position:lwindow_position + window_size ], score_comparison, quality_score, exclude_window_indexes ): fastq_read = fastq_read.slice( lwindow_position, None ) break lwindow_position += window_step else: rwindow_position = len( quality_list ) #right position of window while True: lwindow_position = rwindow_position - window_size #left position of window if rwindow_position <= 0 or lwindow_position < 0: fastq_read.sequence = '' fastq_read.quality = '' break if self.exclude_and_compare( action, quality_list[ lwindow_position:rwindow_position ], score_comparison, quality_score, exclude_window_indexes ): fastq_read = fastq_read.slice( None, rwindow_position ) break rwindow_position -= window_step ######## END Btails CODE ############################### ############################################################################################ # put length/trim/clip code here quality_list = fastq_read.get_decimal_quality_scores() if filter_length: if len(quality_list) < filter_length: print('failed length') if failed_fastq: fail.write( fastq_read ) continue # Trim initial bases -- remove first 10 bases from read 2 if clip_length: # remove from the front: fastq_read = fastq_read.slice( clip_length, None ) count_of_trimmed += 1 # Trim to max length -- read 2 trim to 90. if trim_length: if len(quality_list) > trim_length: # remove from the end: fastq_read = fastq_read.slice( None, len(fastq_read.get_sequence()) - trim_length ) count_of_trimmed += 1 if keep_zero_length or len( fastq_read ): out.write( fastq_read ) else: num_reads_excluded += 1 out.close() if failed_fastq: fail.close() print("file:",infile) print('count_of_trimmed (for length):', count_of_trimmed) print('count_of_first50 (avg first50 quals < 34):', count_of_first50) print("count_of_unchaste ('Y' in id):", count_of_unchaste) print('count_of_Ns (reads with N):', count_of_Ns) if num_reads is None: print("No valid FASTQ reads could be processed.") else: print("%i FASTQ reads were processed." % ( num_reads + 1 )) if num_reads_excluded: print("%i reads of zero length were excluded from the output." % num_reads_excluded) return out_filename
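# A standalone sketch of the 3'-end rule the window loop above implements:
# slide a window in from the right and keep the read up to the first window
# whose aggregated quality passes the threshold. This assumes decimal Phred
# scores, 'min' aggregation and no excluded window positions; it is an
# illustration, not the pipeline code.

def trim_3prime_by_window(quals, window_size=3, window_step=1, min_q=20):
    """Return the kept read length after 3'-end sliding-window trimming (sketch)."""
    right = len(quals)
    while right > 0:
        left = right - window_size
        if left < 0:
            return 0                      # nothing passes: empty read
        if min(quals[left:right]) >= min_q:
            return right                  # keep bases [0:right]
        right -= window_step
    return 0

# e.g. trim_3prime_by_window([30, 32, 31, 28, 12, 9, 8], window_size=3, min_q=20) -> 4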
help="user name") ## optional parser.add_argument("-nd", "--no_distal", required=False, action='store_false', dest = "require_distal", default=True, help="") parser.add_argument('-min',"--minlength", required=False, action="store", dest = "minlength", help = '') parser.add_argument("-max","--maxlength", required=False, action="store", dest = "maxlength", help="") parser.add_argument("-file_type", required=True, action="store", dest = "file_type", default='fasta', help="sff, fasta or fastq") parser.add_argument('-file_base', required=True, action="store", dest = "file_base", help = '') parser.add_argument("-cl", "--use_cluster", required=False, action="store", dest = "use_cluster", default=True, help = '') logger.info("Starting vamps_trim.py") args = parser.parse_args() data_object['datetime'] = str(datetime.date.today()) data_object['runcode'] = args.runcode data_object['site'] = args.site data_object['user'] = args.user data_object['require_distal'] = args.require_distal data_object['use_cluster'] = args.use_cluster if data_object['use_cluster'] == 'True' or data_object['use_cluster'] == 'true': data_object['use_cluster'] = True else: data_object['use_cluster'] = False if args.minlength: minlength = args.minlength
sys.exit("unknown platform - Exiting") v = MetadataUtils(command_line_args=args) # this will read the args and ini file and return a dictionary data_object = v.validate_args() # for attr in dir(data_object): # print("obj.%s = %s" % (attr, getattr(data_object, attr))) # set logging print("\nLog Level set to:", args.loglevel) logger.setLevel(args.loglevel.upper()) logger.info("Starting pipeline") ############## # # Test cl parameters # ############## # CL RULES: # for ini file: (no plurals) # 1) CL: input_dir ONLY shall be supplied on CL - no input filenames # # 2) All input files should be in the same directory AND of the same format # # 3) Supply a input_file_suffix on the CL if there are varying file types in the # input_dir and you only are using some (default will read all files) # 4) #
def gast_cleanup(self): """ gast_cleanup - follows clustergast, explodes the data and copies to gast_concat and gast files """ logger.info("Starting GAST Cleanup") self.runobj.run_status_file_h.write("Starting gast_cleanup\n") for key in self.idx_keys: output_dir = os.path.join(self.basedir,key) gast_dir = os.path.join(output_dir,'gast') if key in self.runobj.samples: dna_region = self.runobj.samples[key].dna_region else: dna_region = self.runobj.dna_region if not dna_region: logger.error("gast_cleanup: We have no DNA Region: Setting dna_region to 'unknown'") self.runobj.run_status_file_h.write("gast_cleanup: We have no DNA Region: Setting dna_region to 'unknown'\n") dna_region = 'unknown' # find gast_dir # for vamps user upload # basedir is like avoorhis_3453211 # and outdir is like avoorhis_3453211/2012-06-25 # for MBL pipeline # basedir is like 1_AGTCG # and outdir is like 1_AGTCG/2012-06-25 unique_file = 'Not Found' names_file = 'Not Found' if self.runobj.platform == 'vamps': unique_file = os.path.join(output_dir, key+'.unique.fa') names_file = os.path.join(output_dir,key+'.names') elif self.runobj.platform == 'illumina': file_prefix = self.runobj.samples[key].file_prefix unique_file = os.path.join(self.input_dir,file_prefix+"-PERFECT_reads.fa.unique") names_file = os.path.join(self.input_dir,file_prefix+"-PERFECT_reads.fa.unique.names") else: pass print 'UNIQUE FILE',unique_file #print 'names file',names_file if not os.path.exists(gast_dir): logger.error("Could not find gast directory: "+gast_dir+" Exiting") sys.exit() clustergast_filename_single = os.path.join(gast_dir, "gast"+dna_region) logger.debug('gast filesize:'+str(os.path.getsize(clustergast_filename_single))) gast_filename = os.path.join(gast_dir, "gast") gastconcat_filename = os.path.join(gast_dir, "gast_concat") #dupes_filename = os.path.join(gast_dir, "dupes") #nonhits_filename = os.path.join(gast_dir, "nonhits") copies = {} nonhits = {} # open and read names file names_fh = open(names_file,'r') for line in names_fh: s = line.strip().split("\t") index_read = s[0] copies[index_read] = s[1].split(',') if index_read in nonhits: nonhits[index_read] += 1 else: nonhits[index_read] = 1 names_fh.close() #print nonhits #print copies ####################################### # # Insert records with valid gast hits into gast_file # ####################################### # read the .gast file from clustergast concat = {} gast_fh = open(gast_filename,'w') if(os.path.exists(clustergast_filename_single)): in_gast_fh = open(clustergast_filename_single,'r') else: print "No clustergast file found:",clustergast_filename_single,"\nExiting" self.runobj.run_status_file_h.write("No clustergast file found:",clustergast_filename_single," Exiting\n") sys.exit() for line in in_gast_fh: s = line.strip().split() if len(s) == 4: read_id = s[0] refhvr_id = s[1].split('|')[0] distance = s[2] alignment = s[3] #print read_id,refhvr_id # if this was in the gast table zero it out because it had a valid hit # so we don't insert them as non-hits later if read_id in nonhits: del nonhits[read_id] #print 'deleling',read_id #print 'nonhits',nonhits if read_id not in copies: logger.info(read_id+' not in names file: Skipping') continue # give the same ref and dist for each duplicate for id in copies[read_id]: if id != read_id: #print id,read_id,distance,refhvr_id gast_fh.write( id + "\t" + refhvr_id + "\t" + distance + "\t" + alignment + "\n" ) in_gast_fh.close() ####################################### # # Insert a record for any valid sequence that had no blast hit 
and therefore no gast result # into gast_filename # ####################################### for read in sorted(nonhits.iterkeys()): for d in copies[read]: gast_fh.write( d+"\t0\t1\t\n") gast_fh.close() # concatenate the two gast files clustergast_fh = open(clustergast_filename_single,'a') shutil.copyfileobj(open(gast_filename,'rb'), clustergast_fh) clustergast_fh.close() #the open again and get data for gast concat concat = {} print clustergast_filename_single for line in open(clustergast_filename_single,'r'): data = line.strip().split("\t") id = data[0] refhvr_id = data[1].split('|')[0] distance = data[2] #print 'data',data if id in concat: concat[id]['refhvrs'].append(refhvr_id) else: concat[id] = {} concat[id]['refhvrs'] = [refhvr_id] concat[id]['distance'] = distance ####################################### # # Insert records into gast_concat_filename # ####################################### # first we need to open the gast_filename gastconcat_fh = open(gastconcat_filename,'w') for id, value in concat.iteritems(): #print 'trying gastconcat', id,value gastconcat_fh.write( id + "\t" + concat[id]['distance'] + "\t" + ' '.join(concat[id]['refhvrs']) + "\n" ) gastconcat_fh.close() print "Finished gast_cleanup" logger.info("Finished gast_cleanup") return ("SUCCESS","gast_cleanup")
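# For orientation, each line of the concatenated gast file parsed above has the
# shape "read_id<TAB>refhvr_id|...<TAB>distance<TAB>alignment", and gast_concat
# collapses it to one row per read carrying all matching refhvr_ids. The sketch
# below mirrors that aggregation; the example values are made up and anything
# beyond what the code above shows about the columns is an assumption.

def build_gast_concat(gast_lines):
    """Aggregate gast rows into {read_id: (distance, [refhvr_ids])} (sketch)."""
    concat = {}
    for line in gast_lines:
        fields = line.rstrip("\n").split("\t")
        if len(fields) < 3:
            continue
        read_id, refhvr_id, distance = fields[0], fields[1].split('|')[0], fields[2]
        if read_id in concat:
            concat[read_id][1].append(refhvr_id)
        else:
            concat[read_id] = (distance, [refhvr_id])
    return concat

# e.g. build_gast_concat(["r1\tv6_AB123|x\t0.03\t...", "r1\tv6_CD456\t0.03\t..."])
#      -> {'r1': ('0.03', ['v6_AB123', 'v6_CD456'])}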
def taxonomy(self,key, dataset_count, file_collector): """ fill vamps_data_cube, vamps_junk_data_cube and vamps_taxonomy files """ logger.info("Starting vamps_upload: taxonomy") print "Starting vamps_upload: taxonomy" # SUMMED create a look-up if self.runobj.vamps_user_upload: project = self.runobj.project dataset = key else: if self.runobj.platform == 'illumina': project = self.runobj.samples[key].project dataset = self.runobj.samples[key].dataset elif self.runobj.platform == '454': pass else: pass project = project[0].capitalize() + project[1:] project_dataset = project+'--'+dataset taxa_lookup = {} read_id_lookup={} if os.path.exists(file_collector['tagtax_file']): for line in open(file_collector['tagtax_file'],'r'): line = line.strip() items = line.split("\t") taxa = items[1] if taxa[-3:] == ';NA': taxa = taxa[:-3] read_id=items[0] read_id_lookup[read_id]=taxa # the count here is the frequency of the taxon in the datasets if taxa in taxa_lookup: taxa_lookup[taxa] += 1 else: taxa_lookup[taxa] = 1 # DATA CUBE TABLE # taxa_lookup: {'Unknown': 146, 'Bacteria': 11888, 'Bacteria;Chloroflexi': 101} # dataset_count is 3 (3 taxa in this dataset) # frequency is 3/144 fh1 = open(file_collector['taxes_file'],'w') fh1.write("\t".join( ["HEADER","project", "dataset", "taxonomy", "superkingdom", "phylum", "class", "orderx", "family", "genus", "species", "strain", "rank", "knt", "frequency", "dataset_count", "classifier"]) + "\n") tax_collector={} summer=0 for tax,knt in taxa_lookup.iteritems(): #print tax,cnt summer += knt datarow = ['',project,dataset] taxa = tax.split(';') #if taxa[0] in C.domains: freq = float(knt) / int(dataset_count) rank = C.ranks[len(taxa)-1] for i in range(len(C.ranks)): if len(taxa) <= i: taxa.append(C.ranks[i] + "_NA") tax_collector[tax] = {} datarow.append(tax) datarow.append("\t".join(taxa)) datarow.append(rank) datarow.append(str(knt)) datarow.append(str(freq)) datarow.append(dataset_count) datarow.append(self.runobj.classifier) w = "\t".join(datarow) #print w fh1.write(w+"\n") tax_collector[tax]['rank'] = rank tax_collector[tax]['knt'] = knt tax_collector[tax]['freq'] = freq fh1.close() # # SUMMED DATA CUBE TABLE # fh2 = open(file_collector['summed_taxes_file'],'w') fh2.write("\t".join(["HEADER","taxonomy", "sum_tax_counts", "frequency", "dataset_count","rank", "project","dataset","project--dataset","classifier"] )+"\n") ranks_subarray = [] rank_list_lookup = {} for i in range(0, len(C.ranks)): ranks_subarray.append(C.ranks[i]) ranks_list = ";".join(ranks_subarray) # i.e., superkingdom, phylum, class # open data_cube file again # taxes_file: data_cube_uploads for line in open(file_collector['taxes_file'],'r'): line = line.strip().split("\t") knt = line[12] taxon = line[2] if line[0] == 'HEADER': continue if taxon in tax_collector: knt = tax_collector[taxon]['knt'] else: print 'ERROR tax not found in tax_collector: assigning zero' knt = 0 idx = len(ranks_subarray) l=[] for k in range(3,idx+3): l.append(line[k]) tax = ';'.join(l) #print 'rl tax',ranks_list,tax if tax in rank_list_lookup: rank_list_lookup[tax] += knt else: rank_list_lookup[tax] = knt for tax,knt in rank_list_lookup.iteritems(): #print 'tax2',tax taxa = tax.split(';') #if taxa[0] in C.domains: rank = len( taxa ) -1 frequency = float(knt) / int(dataset_count) if len(tax) - len(''.join(taxa)) >= rank: datarow = [''] datarow.append(tax) datarow.append(str(knt)) datarow.append(str(frequency)) datarow.append(str(dataset_count)) datarow.append(str(rank)) datarow.append(project) datarow.append(dataset) 
                datarow.append(project_dataset)
                datarow.append(self.runobj.classifier)
                w = "\t".join(datarow)
                #print w
                fh2.write(w + "\n")
        fh2.close()

        #
        # DISTINCT TAXONOMY
        #
        fh3 = open(file_collector['distinct_taxes_file'], 'w')
        fh3.write("\t".join(["HEADER", "taxon_string", "rank", "num_kids"]) + "\n")
        taxon_string_lookup = {}
        for line in open(file_collector['summed_taxes_file'], 'r'):
            if line.split()[0] == 'HEADER':
                continue
            items = line.strip().split()
            taxon_string = items[0]
            #print taxon_string
            if taxon_string in taxon_string_lookup:
                taxon_string_lookup[taxon_string] += 1
            else:
                taxon_string_lookup[taxon_string] = 1

        for taxon_string, v in taxon_string_lookup.iteritems():
            datarow = ['']
            datarow.append(taxon_string)
            taxa = taxon_string.split(';')
            if taxa[0] in C.domains:
                rank = str(len(taxa) - 1)
                datarow.append(rank)
                # rank is a string here, so compare it as an int
                if int(rank) == 7 or taxon_string[-3:] == '_NA':
                    num_kids = '0'
                else:
                    num_kids = '1'
                datarow.append(num_kids)
                w = "\t".join(datarow)
                #print 'w', w
                fh3.write(w + "\n")
        fh3.close()

        return (tax_collector, read_id_lookup)
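# The data-cube rows above always carry one column per rank; shorter taxonomy
# strings are padded with "<rank>_NA" placeholders. A minimal sketch of that
# padding follows; the RANKS list is assumed to mirror C.ranks and is listed
# here only for illustration.

RANKS = ["superkingdom", "phylum", "class", "orderx",
         "family", "genus", "species", "strain"]  # assumed to mirror C.ranks

def pad_taxonomy(tax_string, ranks=RANKS):
    """Split a ';'-joined taxonomy and pad it to one value per rank (sketch)."""
    taxa = tax_string.split(';') if tax_string else []
    for i in range(len(ranks)):
        if len(taxa) <= i:
            taxa.append(ranks[i] + "_NA")
    return taxa

# e.g. pad_taxonomy("Bacteria;Chloroflexi")
#      -> ['Bacteria', 'Chloroflexi', 'class_NA', 'orderx_NA', 'family_NA',
#          'genus_NA', 'species_NA', 'strain_NA']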
def clustergast(self): """ clustergast - runs the GAST pipeline on the cluster. GAST uses UClust to identify the best matches of a read sequence to references sequences in a reference database. VAMPS: The uniques and names files have previously been created in trim_run.py. Illumina : """ logger.info("Starting Clustergast") self.runobj.run_status_file_h.write("Starting clustergast\n") # Step1: create empty gast table in database: gast_<rundate> # Step2: Count the number of sequences so the job can be split for nodes # $facount = `grep -c \">\" $fasta_uniqs_filename`; # $calcs = `/bioware/seqinfo/bin/calcnodes -t $facount -n $nodes -f 1`; # /bioware/seqinfo/bin/fastasampler -n $start,$end ${gastDir}/${fasta_uniqs_filename} $tmp_fasta_filename # $usearch_binary --global --query $tmp_fasta_filename --iddef 3 --gapopen 6I/1E --db $refhvr_fa --uc $tmp_usearch_filename --maxaccepts $max_accepts --maxrejects $max_rejects --id $pctid_threshold # # sort the results for valid hits, saving only the ids and pct identity # grep -P \"^H\\t\" $tmp_usearch_filename | sed -e 's/|.*\$//' | awk '{print \$9 \"\\t\" \$4 \"\\t\" \$10 \"\\t\" \$8}' | sort -k1,1b -k2,2gr | clustergast_tophit > $gast_filename # Submit the script # /usr/local/sge/bin/lx24-amd64/qsub $qsub_priority $script_filename calcnodes = C.calcnodes_cmd sqlImportCommand = C.mysqlimport_cmd #qsub = '/usr/local/sge/bin/lx24-amd64/qsub' clusterize = C.clusterize_cmd ################################################################### # use fasta.uniques file # split into smaller files # usearch --cluster each ####################################### # # Split the uniques fasta and run UClust per node # ####################################### qsub_prefix = 'clustergast_sub_' gast_prefix = 'gast_' if self.use_cluster: logger.info("Using cluster for clustergast") else: logger.info("Not using cluster") counter=0 for key in self.idx_keys: print key counter +=1 print "\nFile:",str(counter) if counter >= self.limit: pass cluster_nodes = C.cluster_nodes logger.info("Cluster nodes set to: "+str(cluster_nodes)) output_dir = os.path.join(self.basedir,key) gast_dir = os.path.join(output_dir,'gast') # SMPL1_3_NNNNCGCTC_3 #print 'samples',key,self.runobj.samples if key in self.runobj.samples: dna_region = self.runobj.samples[key].dna_region else: dna_region = self.runobj.dna_region if not dna_region: logger.error("clustergast: We have no DNA Region: Setting dna_region to 'unknown'") dna_region = 'unknown' (refdb,taxdb) = self.get_reference_databases(dna_region) #print 'DBs',refdb,taxdb # if no dna_region OR no refdb can be found then use # refssu #if refdb contains refssu #the add this to grep command #and change usearch to usearch64 unique_file = 'Not Found' names_file = 'Not Found' if self.runobj.platform == 'vamps': unique_file = os.path.join(output_dir, key+'.unique.fa') elif self.runobj.platform == 'illumina': file_prefix = self.runobj.samples[key].file_prefix unique_file = os.path.join(self.input_dir,file_prefix+"-PERFECT_reads.fa.unique") names_file = os.path.join(self.input_dir,file_prefix+"-PERFECT_reads.fa.unique.names") else: pass print 'UNIQUE FILE',unique_file #print gast_dir #sys.exit("EXIT") i = 0 if cluster_nodes: grep_cmd = ['grep','-c','>',unique_file] logger.debug( ' '.join(grep_cmd) ) facount = subprocess.check_output(grep_cmd).strip() logger.debug( key+' count '+facount) calcnode_cmd = [calcnodes,'-t',str(facount),'-n',str(cluster_nodes),'-f','1'] calcout = subprocess.check_output(calcnode_cmd).strip() logger.debug("calcout:\n"+calcout) 
#calcout: # node=1 start=1 end=1 rows=1 # node=2 start=2 end=2 rows=1 # node=3 start=3 end=3 rows=1 lines = calcout.split("\n") gast_file_list = [] for line in lines: i += 1 if i >= cluster_nodes: continue script_filename = os.path.join(gast_dir,qsub_prefix + str(i)) gast_filename = os.path.join(gast_dir, gast_prefix + str(i)) fastasamp_filename = os.path.join(gast_dir, 'samp_' + str(i)) clustergast_filename = os.path.join(gast_dir, key+".gast_" + str(i)) gast_file_list.append(clustergast_filename) usearch_filename= os.path.join(gast_dir, "uc_" + str(i)) log_file = os.path.join(gast_dir, 'clustergast.log_' + str(i)) data = line.split() if len(data) < 2: continue start = data[1].split('=')[1] end = data[2].split('=')[1] if self.use_cluster: fh = open(script_filename,'w') qstat_name = "gast" + key + '_' + self.runobj.run + "_" + str(i) fh.write("#!/bin/csh\n") fh.write("#$ -j y\n" ) fh.write("#$ -o " + log_file + "\n") fh.write("#$ -N " + qstat_name + "\n\n") #fh.write("source /xraid/bioware/Modules/etc/profile.modules\n"); #fh.write("module load bioware\n\n"); # setup environment fh.write("source /xraid/bioware/Modules/etc/profile.modules\n") fh.write("module load bioware\n\n") cmd1 = self.get_fastasampler_cmd(unique_file, fastasamp_filename,start,end) logger.debug("fastasampler command: "+cmd1) if self.use_cluster: fh.write(cmd1 + "\n") else: subprocess.call(cmd1,shell=True) cmd2 = self.get_usearch_cmd(fastasamp_filename, refdb, usearch_filename) logger.debug("usearch command: "+cmd2) print 'usearch',cmd2 if self.use_cluster: fh.write(cmd2 + "\n") else: subprocess.call(cmd2,shell=True) cmd3 = self.get_grep_cmd(usearch_filename, clustergast_filename) logger.debug("grep command: "+cmd3) if self.use_cluster: fh.write(cmd3 + "\n") fh.close() # make script executable and run it os.chmod(script_filename, stat.S_IRWXU) qsub_cmd = clusterize + " " + script_filename # on vamps and vampsdev qsub cannot be run - unless you call it from the # cluster aware directories /xraid2-2/vampsweb/vamps and /xraid2-2/vampsweb/vampsdev qsub_cmd = C.qsub_cmd + " " + script_filename logger.debug("qsub command: "+qsub_cmd) #subprocess.call(qsub_cmd, shell=True) proc = subprocess.Popen(qsub_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # proc.communicate will block - probably not what we want #(stdout, stderr) = proc.communicate() #block the last onehere #print stderr,stdout else: subprocess.call(cmd3,shell=True) print cmd3 else: #fastasamp_filename = os.path.join(gast_dir, 'samp') usearch_filename= os.path.join(gast_dir, "uc") clustergast_filename_single = os.path.join(gast_dir, "gast"+dna_region) print usearch_filename,clustergast_filename_single cmd1 = self.get_usearch_cmd(unique_file,refdb,usearch_filename) print cmd1 subprocess.call(cmd1,shell=True) cmd2 = self.get_grep_cmd(usearch_filename, clustergast_filename_single) print cmd2 subprocess.call(cmd2,shell=True) if self.use_cluster: # wait here for all the clustergast scripts to finish temp_file_list = gast_file_list c = False maxwaittime = C.maxwaittime # seconds sleeptime = C.sleeptime # seconds counter = 0 while c == False: counter += 1 if counter >= maxwaittime / sleeptime: raise Exception("Max wait time exceeded in gast.py") for index, file in enumerate(temp_file_list): #print temp_file_list if os.path.exists(file) and os.path.getsize(file) > 0: # remove from tmp list logger.debug("Found file now removing from list: "+file) temp_file_list = temp_file_list[:index] + temp_file_list[index+1:] if temp_file_list: logger.info("waiting for 
clustergast files to fill...") logger.debug(' '.join(temp_file_list)) logger.info("\ttime: "+str(counter * sleeptime)+" | files left: "+str(len(temp_file_list))) time.sleep(sleeptime) else: c = True # now concatenate all the clustergast_files into one file (if they were split) if cluster_nodes: # gast file clustergast_filename_single = os.path.join(gast_dir, "gast"+dna_region) clustergast_fh = open(clustergast_filename_single,'w') # have to turn off cluster above to be able to 'find' these files for concatenation for n in range(1,i-1): #cmd = "cat "+ gast_dir + key+".gast_" + str(n) + " >> " + gast_dir + key+".gast" file = os.path.join(gast_dir, key+".gast_" + str(n)) if(os.path.exists(file)): shutil.copyfileobj(open(file,'rb'), clustergast_fh) else: logger.info( "Could not find file: "+os.path.basename(file)+" Skipping") clustergast_fh.flush() clustergast_fh.close() if not self.test: # remove tmp files for n in range(i+1): #print "Trying to remove "+os.path.join(gast_dir,"uc_"+str(n)) if os.path.exists(os.path.join(gast_dir,"uc_"+str(n))): os.remove(os.path.join(gast_dir,"uc_"+str(n))) pass #print "Trying to remove "+os.path.join(gast_dir,"samp_"+str(n)) if os.path.exists(os.path.join(gast_dir,"samp_"+str(n))): os.remove(os.path.join(gast_dir,"samp_"+str(n))) pass #print "Trying to remove "+os.path.join(self.gast_dir,key+".gast_"+str(n)) if os.path.exists(os.path.join(gast_dir,key+".gast_"+str(n))): os.remove(os.path.join(gast_dir,key+".gast_"+str(n))) pass print "Finished clustergast" logger.info("Finished clustergast") return ("SUCCESS","Clustergast")
def sequences(self,key,tax_collector, read_id_lookup, file_collector): """ fill vamps_sequences.txt file """ logger.info("Starting vamps_upload: sequences") print "Starting vamps_upload: sequences" if self.runobj.vamps_user_upload: project = self.runobj.project dataset = key else: if self.runobj.platform == 'illumina': project = self.runobj.samples[key].project dataset = self.runobj.samples[key].dataset elif self.runobj.platform == '454': pass else: pass project = project[0].capitalize() + project[1:] project_dataset = project+'--'+dataset # open gast_concat table to get the distances and the ferids refid_collector={} #if os.path.exists(gast_concat_file): for line in open(file_collector['gast_concat_file'],'r'): line = line.strip() items=line.split() id=items[0] distance=items[1] refhvr_ids=items[2] refid_collector[id]={} refid_collector[id]['distance']=distance refid_collector[id]['refhvr_ids']=refhvr_ids fh = open(file_collector['sequences_file'],'w') fh.write("\t".join(["HEADER","project","dataset","taxonomy","refhvr_ids", "rank", "seq_count","frequency","distance","read_id","project_dataset"] )+"\n") # open uniques fa file if os.path.exists(file_collector['unique_file']) and os.path.getsize(file_collector['unique_file']) > 0: f = FastaReader(file_collector['unique_file']) while f.next(): datarow = [''] id = f.id.split('|')[0] seq = f.seq if id in read_id_lookup: tax = read_id_lookup[id] else: tax = '' if tax in tax_collector: rank = tax_collector[tax]['rank'] cnt = tax_collector[tax]['knt'] freq = tax_collector[tax]['freq'] else: rank = 'NA' cnt = 0 freq = 0 if id in refid_collector: distance = refid_collector[id]['distance'] refhvr_ids = refid_collector[id]['refhvr_ids'] else: distance = '1.0' refhvr_ids = '0' datarow.append(seq) datarow.append(project) datarow.append(dataset) datarow.append(tax) datarow.append(refhvr_ids) datarow.append(rank) datarow.append(str(cnt)) datarow.append(str(freq)) datarow.append(distance) datarow.append(id) datarow.append(project_dataset) w = "\t".join(datarow) #print 'w',w fh.write(w+"\n") fh.close() logger.info("") return refid_collector
parser.add_argument('-l', '--loglevel', required=False, action="store", dest = "loglevel", default='ERROR', help = 'Sets logging level... DEBUG, INFO, WARNING, ERROR, CRITICAL') steps ='gast,vampsupload' #steps ='gast' #steps ='vampsupload' args = parser.parse_args() # set logging loggerlevel = args.loglevel.upper() print "\nLog Level set to:",loggerlevel logger.setLevel(loggerlevel) logger.info("Starting vamps_gast.py") # fill command line object data_object['datetime'] = str(datetime.date.today()) data_object['runcode'] = args.runcode data_object['site'] = args.site data_object['user'] = args.user data_object['project'] = args.project[:1].capitalize() + args.project[1:] data_object['dataset'] = args.dataset data_object['dna_region'] = args.dna_region data_object['domain'] = args.domain data_object['from_fasta'] = args.from_fasta data_object['fasta_file'] = args.fasta_file data_object['baseoutputdir'] = args.baseoutputdir data_object['output_dir'] = args.output_dir data_object['load_db'] = args.load_db
def gast(runobj):
    logger.info("STARTING GAST()")
    # logger.info("vsearch version: " % utils.get_vsearch_version)
    # For VAMPS, 'new_lane_keys' will be the prefix of the uniques and names
    # files that were just created in vamps_gast.py, or we can get the
    # 'lane_keys' directly from the config file.
    # For Illumina, a unique idx_key is a concatenation of barcode_index and run_key.
    # Should return a list, not a string.
    idx_keys = get_keys(runobj)

    # get GAST object
    mygast = Gast(runobj, idx_keys)

    # Check for unique files and create them if not there
    result_code = mygast.check_for_unique_files(idx_keys)
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code['status'] == 'ERROR':
        logger.error("uniques not found failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db(runobj.site, runobj.run,
                                     "GAST ERROR", "uniques file not found - failed")
        sys.exit("uniques not found failed")
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db(runobj.site, runobj.run,
                                 result_code['status'], result_code['message'])
    sleep(5)

    # CLUSTERGAST
    result_code = mygast.clustergast()
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code['status'] == 'ERROR':
        logger.error("clustergast failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db(runobj.site, runobj.run,
                                     "GAST ERROR", "clustergast failed")
        sys.exit("clustergast failed")
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db(runobj.site, runobj.run,
                                 result_code['status'], result_code['message'])
    sleep(5)

    # GAST_CLEANUP
    result_code = mygast.gast_cleanup()
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code['status'] == 'ERROR':
        logger.error("gast_cleanup failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db(runobj.site, runobj.run,
                                     "GAST ERROR", "gast_cleanup failed")
        sys.exit("gast_cleanup failed")
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db(runobj.site, runobj.run,
                                 result_code['status'], result_code['message'])
    sleep(5)

    # GAST2TAX
    result_code = mygast.gast2tax()
    runobj.run_status_file_h.write(json.dumps(result_code) + "\n")
    if result_code['status'] == 'ERROR':
        logger.error("gast2tax failed")
        if runobj.vamps_user_upload:
            write_status_to_vamps_db(runobj.site, runobj.run,
                                     "GAST ERROR", "gast2tax failed")
        sys.exit("gast2tax failed")
    elif runobj.vamps_user_upload:
        write_status_to_vamps_db(runobj.site, runobj.run,
                                 result_code['status'], result_code['message'])