def __init__(self, runobj = None): self.utils = PipelneUtils() self.runobj = runobj self.run_keys = self.runobj.run_keys self.rundate = self.runobj.run try: self.use_cluster = self.runobj.use_cluster except: self.use_cluster = True self.chg_suffix = ".chg" self.chimeras_suffix = ".chimeras" self.ref_suffix = ".db" self.denovo_suffix = ".txt" self.nonchimeric_suffix = "." + C.nonchimeric_suffix #".nonchimeric.fa" self.chimeric_suffix = ".chimeric.fa" self.base_suffix = "unique" + self.chimeras_suffix try: if self.runobj.lane_name: lane_name = self.runobj.lane_name else: lane_name = '' except: lane_name = '' if self.runobj.vamps_user_upload: os.environ['SGE_ROOT'] ='/opt/sge' os.environ['SGE_CELL'] ='grendel' path = os.environ['PATH'] os.environ['PATH'] = '/opt/sge/bin/lx24-amd64:'+path site = self.runobj.site dir_prefix = self.runobj.user + '_' + self.runobj.run self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) self.idx_keys = convert_unicode_dictionary_to_str(json.loads(open(self.runobj.trim_status_file_name,"r").read()))["new_lane_keys"] self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir) self.indir = self.dirs.check_dir(self.dirs.trimming_dir) self.outdir = self.dirs.check_dir(self.dirs.chimera_dir) self.gast_dir = self.dirs.check_dir(self.dirs.gast_dir) else: site = '' dir_prefix = self.runobj.run self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) self.indir = self.dirs.check_dir(self.dirs.reads_overlap_dir) self.outdir = self.dirs.check_dir(self.dirs.chimera_dir) # self.usearch_cmd = C.usearch_cmd self.usearch_cmd = C.usearch6_cmd #self.abskew = C.chimera_checking_abskew self.refdb = C.chimera_checking_refdb_6 self.its_refdb = C.chimera_checking_its_refdb_6 self.input_file_names = self.make_chimera_input_illumina_file_names()
def __init__(self, runobj = None): self.utils = PipelneUtils() self.runobj = runobj self.run_keys = self.runobj.run_keys self.rundate = self.runobj.run try: self.use_cluster = self.runobj.use_cluster except: self.use_cluster = True self.chg_suffix = ".chg" self.chimeras_suffix = ".chimeras" self.ref_suffix = ".db" self.denovo_suffix = ".txt" self.nonchimeric_suffix = "." + C.nonchimeric_suffix #".nonchimeric.fa" self.chimeric_suffix = ".chimeric.fa" self.base_suffix = "unique" + self.chimeras_suffix try: if self.runobj.lane_name: lane_name = self.runobj.lane_name else: lane_name = '' except: lane_name = '' if self.runobj.vamps_user_upload: site = self.runobj.site dir_prefix = self.runobj.user + '_' + self.runobj.run self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) self.idx_keys = convert_unicode_dictionary_to_str(json.loads(open(self.runobj.trim_status_file_name,"r").read()))["new_lane_keys"] self.indir = self.dirs.check_dir(self.dirs.trimming_dir) self.outdir = self.dirs.check_dir(self.dirs.chimera_dir) else: site = '' dir_prefix = self.runobj.run self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) self.indir = self.dirs.check_dir(self.dirs.reads_overlap_dir) self.outdir = self.dirs.check_dir(self.dirs.chimera_dir) # self.usearch_cmd = C.usearch_cmd self.usearch_cmd = C.usearch6_cmd #self.abskew = C.chimera_checking_abskew self.refdb = C.chimera_checking_refdb_6 self.its_refdb = C.chimera_checking_its_refdb_6 self.input_file_names = self.make_chimera_input_illumina_file_names()
def __init__(self, runobj = None): self.utils = PipelneUtils() self.runobj = runobj self.rundate = self.runobj.run self.use_cluster = 1 self.unique_fasta_files = [] # if self.runobj.vamps_user_upload: # site = self.runobj.site # dir_prefix = self.runobj.user + '_' + self.runobj.run # else: # site = '' # dir_prefix = self.runobj.run # dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, site = site) if self.runobj.vamps_user_upload: site = self.runobj.site dir_prefix=self.runobj.user+'_'+self.runobj.run else: site = '' dir_prefix = self.runobj.run if self.runobj.lane_name: lane_name = self.runobj.lane_name else: lane_name = '' self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir) self.fasta_dir = self.dirs.check_dir(self.dirs.reads_overlap_dir) self.gast_dir = self.dirs.check_dir(self.dirs.gast_dir) host_name = runobj.database_host database_name = runobj.database_name self.filenames = [] self.my_conn = MyConnection(host = 'newbpcdb2.jbpc-np.mbl.edu', db="env454") # self.my_conn = MyConnection() # self.my_conn = MyConnection(host = 'localhost', db="test_env454") self.sequence_table_name = "sequence_ill" self.sequence_field_name = "sequence_comp" self.my_csv = None self.unique_file_counts = self.dirs.unique_file_counts self.dirs.delete_file(self.unique_file_counts) self.seq_id_dict = {} self.tax_id_dict = {} self.run_id = None # self.nonchimeras_suffix = ".nonchimeric.fa" self.nonchimeric_suffix = "." + C.nonchimeric_suffix #".nonchimeric.fa" self.fa_unique_suffix = ".fa." + C.unique_suffix #.fa.unique self.v6_unique_suffix = "MERGED_V6_PRIMERS_REMOVED." + C.unique_suffix self.suff_list = [self.nonchimeric_suffix, self.fa_unique_suffix, self.v6_unique_suffix] # self.merge_unique_suffix = "." + C.filtered_suffix + "." + C.unique_suffix #.MERGED-MAX-MISMATCH-3.unique self.suffix_used = ""
def __init__(self, runobj = None): self.utils = PipelneUtils() self.runobj = runobj self.run_keys = self.runobj.run_keys self.rundate = self.runobj.run self.chg_suffix = ".chg" self.chimeras_suffix = ".chimeras" self.ref_suffix = ".db" self.denovo_suffix = ".txt" self.nonchimeric_suffix = "." + C.nonchimeric_suffix #".nonchimeric.fa" self.chimeric_suffix = ".chimeric.fa" self.base_suffix = "unique" + self.chimeras_suffix try: if self.runobj.lane_name: lane_name = self.runobj.lane_name else: lane_name = '' except: lane_name = '' if self.runobj.vamps_user_upload: site = self.runobj.site dir_prefix = self.runobj.user + '_' + self.runobj.run self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) self.idx_keys = convert_unicode_dictionary_to_str(json.loads(open(self.runobj.trim_status_file_name,"r").read()))["new_lane_keys"] self.indir = self.dirs.check_dir(self.dirs.trimming_dir) self.outdir = self.dirs.check_dir(self.dirs.chimera_dir) else: site = '' dir_prefix = self.runobj.run self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) self.indir = self.dirs.check_dir(self.dirs.reads_overlap_dir) self.outdir = self.dirs.check_dir(self.dirs.chimera_dir) # self.usearch_cmd = C.usearch_cmd self.usearch_cmd = C.usearch6_cmd #self.abskew = C.chimera_checking_abskew self.refdb = C.chimera_checking_refdb_6 self.its_refdb = C.chimera_checking_its_refdb_6 self.input_file_names = self.make_chimera_input_illumina_file_names()
def __init__(self, runobj): self.utils = PipelneUtils() self.runobj = runobj self.out_files = {} self.id_dataset_idx = {} self.in_file_path = self.runobj.input_dir if self.runobj.vamps_user_upload: site = self.runobj.site dir_prefix=self.runobj.user+'_'+self.runobj.run else: site = '' dir_prefix = self.runobj.run if self.runobj.lane_name: lane_name = self.runobj.lane_name else: lane_name = '' dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) self.dirs = dirs self.out_file_path = dirs.check_dir(dirs.analysis_dir) self.results_path = dirs.check_dir(dirs.reads_overlap_dir)
def __init__(self, runobj): self.utils = PipelneUtils() self.runobj = runobj self.out_files = {} self.id_dataset_idx = {} self.in_file_path = self.runobj.input_dir if self.runobj.vamps_user_upload: site = self.runobj.site dir_prefix=self.runobj.user+'_'+self.runobj.run else: site = '' dir_prefix = self.runobj.run if self.runobj.lane_name: lane_name = self.runobj.lane_name else: lane_name = '' dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) self.dirs = dirs self.out_file_path = dirs.check_dir(dirs.analysis_dir) self.results_path = dirs.check_dir(dirs.reads_overlap_dir) self.platform = self.runobj.platform
def __init__(self, runobj = None): self.runobj = runobj self.rundate = self.runobj.run self.use_cluster = 1 # if self.runobj.vamps_user_upload: # site = self.runobj.site # dir_prefix = self.runobj.user + '_' + self.runobj.run # else: # site = '' # dir_prefix = self.runobj.run # dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, site = site) if self.runobj.vamps_user_upload: site = self.runobj.site dir_prefix=self.runobj.user+'_'+self.runobj.run else: site = '' dir_prefix = self.runobj.run if self.runobj.lane_name: lane_name = self.runobj.lane_name else: lane_name = '' dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) self.analysis_dir = dirs.check_dir(dirs.analysis_dir) self.fasta_dir = dirs.check_dir(dirs.reads_overlap_dir) self.gast_dir = dirs.check_dir(dirs.gast_dir) host_name = runobj.database_host database_name = runobj.database_name self.filenames = [] self.my_conn = MyConnection(host = 'newbpcdb2', db="env454") # self.my_conn = MyConnection() self.sequence_table_name = "sequence_ill" self.sequence_field_name = "sequence_comp" self.my_csv = None self.unique_file_counts = dirs.unique_file_counts dirs.delete_file(self.unique_file_counts) self.seq_id_dict = {} self.tax_id_dict = {} self.run_id = None
class dbUpload: """db upload methods""" Name = "dbUpload" """ TODO: add tests and test case TODO: change hardcoded values to args: self.sequence_table_name = "sequence_ill", self.sequence_field_name = "sequence_comp" TODO: generalize all bulk uploads and all inserts? to not copy and paste TODO: add refssu_id TODO: change csv validaton for new fields Order: # put_run_info # insert_seq() # insert_pdr_info() # gast # insert_taxonomy() # insert_sequence_uniq_info_ill() """ def __init__(self, runobj = None): self.utils = PipelneUtils() self.runobj = runobj self.rundate = self.runobj.run self.use_cluster = 1 self.unique_fasta_files = [] # if self.runobj.vamps_user_upload: # site = self.runobj.site # dir_prefix = self.runobj.user + '_' + self.runobj.run # else: # site = '' # dir_prefix = self.runobj.run # dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, site = site) if self.runobj.vamps_user_upload: site = self.runobj.site dir_prefix=self.runobj.user+'_'+self.runobj.run else: site = '' dir_prefix = self.runobj.run if self.runobj.lane_name: lane_name = self.runobj.lane_name else: lane_name = '' self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir) self.fasta_dir = self.dirs.check_dir(self.dirs.reads_overlap_dir) self.gast_dir = self.dirs.check_dir(self.dirs.gast_dir) host_name = runobj.database_host database_name = runobj.database_name self.filenames = [] self.my_conn = MyConnection(host = 'newbpcdb2.jbpc-np.mbl.edu', db="env454") # self.my_conn = MyConnection() # self.my_conn = MyConnection(host = 'localhost', db="test_env454") self.sequence_table_name = "sequence_ill" self.sequence_field_name = "sequence_comp" self.my_csv = None self.unique_file_counts = self.dirs.unique_file_counts self.dirs.delete_file(self.unique_file_counts) self.seq_id_dict = {} self.tax_id_dict = {} self.run_id = None # self.nonchimeras_suffix = ".nonchimeric.fa" self.nonchimeric_suffix = "." + C.nonchimeric_suffix #".nonchimeric.fa" self.fa_unique_suffix = ".fa." + C.unique_suffix #.fa.unique self.v6_unique_suffix = "MERGED_V6_PRIMERS_REMOVED." + C.unique_suffix self.suff_list = [self.nonchimeric_suffix, self.fa_unique_suffix, self.v6_unique_suffix] # self.merge_unique_suffix = "." + C.filtered_suffix + "." + C.unique_suffix #.MERGED-MAX-MISMATCH-3.unique self.suffix_used = "" # self.refdb_dir = '/xraid2-2/vampsweb/blastdbs/' def get_fasta_file_names(self): files_names = self.dirs.get_all_files(self.fasta_dir) self.unique_fasta_files = [f for f in files_names.keys() if f.endswith(tuple(self.suff_list))] # needs return because how it's called from pipelineprocesor return self.unique_fasta_files def get_run_info_ill_id(self, filename_base): my_sql = """SELECT run_info_ill_id FROM run_info_ill JOIN run using(run_id) WHERE file_prefix = '%s' and run = '%s' """ % (filename_base, self.rundate) res = self.my_conn.execute_fetch_select(my_sql) if res: return int(res[0][0]) def make_seq_upper(self, filename): read_fasta = fastalib.ReadFasta(filename) sequences = [seq.upper() for seq in read_fasta.sequences] #here we make uppercase for VAMPS compartibility read_fasta.close() return sequences def insert_seq(self, sequences): query_tmpl = "INSERT IGNORE INTO %s (%s) VALUES (COMPRESS(%s))" val_tmpl = "'%s'" my_sql = query_tmpl % (self.sequence_table_name, self.sequence_field_name, ')), (COMPRESS('.join([val_tmpl % key for key in sequences])) seq_id = self.my_conn.execute_no_fetch(my_sql) self.utils.print_both("sequences in file: %s\n" % (len(sequences))) return seq_id # try: # query_tmpl = "INSERT IGNORE INTO %s (%s) VALUES (COMPRESS(%s))" # val_tmpl = "'%s'" # my_sql = query_tmpl % (self.sequence_table_name, self.sequence_field_name, ')), (COMPRESS('.join([val_tmpl % key for key in sequences])) # seq_id = self.my_conn.execute_no_fetch(my_sql) # # print "sequences in file: %s" % (len(sequences)) # self.utils.print_both("sequences in file: %s\n" % (len(sequences))) # return seq_id # except self.my_conn.conn.cursor._mysql_exceptions.Error as err: # if err.errno == 1582: # self.utils.print_both(("ERROR: _mysql_exceptions.OperationalError: (1582, \"Incorrect parameter count in the call to native function 'COMPRESS'\"), there is an empty fasta in %s") % self.fasta_dir) # else: # raise # except: # if len(sequences) == 0: # self.utils.print_both(("ERROR: There are no sequences, please check if there are correct fasta files in the directory %s") % self.fasta_dir) # raise def get_seq_id_dict(self, sequences): id_name = self.sequence_table_name + "_id" query_tmpl = """SELECT %s, uncompress(%s) FROM %s WHERE %s in (COMPRESS(%s))""" val_tmpl = "'%s'" try: my_sql = query_tmpl % (id_name, self.sequence_field_name, self.sequence_table_name, self.sequence_field_name, '), COMPRESS('.join([val_tmpl % key for key in sequences])) res = self.my_conn.execute_fetch_select(my_sql) one_seq_id_dict = dict((y, int(x)) for x, y in res) self.seq_id_dict.update(one_seq_id_dict) except: if len(sequences) == 0: self.utils.print_both(("ERROR: There are no sequences, please check if there are correct fasta files in the directory %s") % self.fasta_dir) raise def get_id(self, table_name, value): id_name = table_name + '_id' my_sql = """SELECT %s FROM %s WHERE %s = '%s'""" % (id_name, table_name, table_name, value) res = self.my_conn.execute_fetch_select(my_sql) if res: return int(res[0][0]) def get_sequence_id(self, seq): my_sql = """SELECT sequence_ill_id FROM sequence_ill WHERE COMPRESS('%s') = sequence_comp""" % (seq) res = self.my_conn.execute_fetch_select(my_sql) if res: return int(res[0][0]) def insert_pdr_info(self, fasta, run_info_ill_id): res_id = "" if (not run_info_ill_id): self.utils.print_both("ERROR: There is no run info yet, please check if it's uploaded to env454") # ------- insert sequence info per run/project/dataset -------- seq_upper = fasta.seq.upper() sequence_ill_id = self.seq_id_dict[seq_upper] seq_count = int(fasta.id.split('|')[-1].split(':')[-1]) # print run_info_ill_id, sequence_ill_id, seq_count my_sql = """INSERT IGNORE INTO sequence_pdr_info_ill (run_info_ill_id, sequence_ill_id, seq_count) VALUES (%s, %s, %s)""" % (run_info_ill_id, sequence_ill_id, seq_count) try: res_id = self.my_conn.execute_no_fetch(my_sql) return res_id except: self.utils.print_both("Offensive query: %s" % my_sql) raise def make_gast_files_dict(self): return self.dirs.get_all_files(self.gast_dir, "gast") def gast_filename(self, filename): # todo: if filename in make_gast_files_dict, use it full path gast_file_names = self.make_gast_files_dict() gast_file_name_path = "" for gast_file_name_path, tpls in gast_file_names.iteritems(): if any(t.endswith(filename) for t in tpls): return gast_file_name_path def get_gast_result(self, filename): gast_file_name = self.gast_filename(filename) self.utils.print_both("current gast_file_name = %s." % gast_file_name) try: with open(gast_file_name) as fd: gast_dict = dict([(l.split("\t")[0], l.split("\t")[1:]) for l in fd]) return gast_dict except IOError, e: # print dir(e) #['__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__getitem__', '__getslice__', '__hash__', '__init__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', 'args', 'errno', 'filename', 'message', 'strerror'] # print "errno = %s" % e.errno logger.debug("errno = %s" % e.errno) if e.errno == 2: # suppress "No such file or directory" error pass # except OSError, e: except TypeError, e: self.utils.print_both("Check if there is a gast file under %s for %s." % (self.gast_dir, filename)) pass
def __init__(self, run_object = None, idx_keys=None): self.runobj = run_object if self.runobj.site == 'vamps' or self.runobj.site == 'vampsdev' or self.runobj.site == 'new_vamps': sys.path.append('/groups/vampsweb/py_mbl_sequencing_pipeline') else: sys.path.append('/bioware/linux/seqinfo/bin/python_pipeline/py_mbl_sequencing_pipeline') from pipeline.pipelinelogging import logger from pipeline.utils import Dirs self.logger = logger self.logger.info('STARTING VAMPS') if self.runobj.vamps_user_upload: site = self.runobj.site if site == 'new_vamps': # vampsdev and vamps and new_vamps NOT local installation dir_prefix = self.runobj.project_dir elif self.runobj.mobedac: dir_prefix = 'mobedac_' + self.runobj.user + '_' + self.runobj.run else: dir_prefix = self.runobj.user + '_' + self.runobj.run else: site = '' dir_prefix = self.runobj.run dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, site = site) #self.basedir = self.runobj.output_dir #self.outdir = os.path.join(self.runobj.output_dir,self.prefix) self.idx_keys = idx_keys if self.runobj.vamps_user_upload: self.idx_keys = [self.runobj.user+self.runobj.run] self.iterator = self.runobj.datasets else: self.idx_keys = idx_keys self.iterator = self.idx_keys self.use_cluster = self.runobj.use_cluster os.environ['SGE_ROOT']='/opt/sge' os.environ['SGE_CELL']='grendel' path = os.environ['PATH'] os.environ['PATH'] = path + ':/opt/sge/bin/lx24-amd64' #First step is to check for (or create via mothur) # a uniques fasta file and a names file # one for each dataset. # If we are here from a vamps gast process # then there should be just one dataset to gast # but if MBL pipe then many datasets are prbably involved. # 1) clustergast # 2) gast cleanup # 3) gast2tax self.global_gast_dir = dirs.check_dir(dirs.gast_dir) if not os.path.exists(self.global_gast_dir): sys.exit("Could not find global gast dir: "+self.global_gast_dir) if self.runobj.site == 'vamps': db_host = 'vampsdb' db_name = 'vamps' db_home = '/xraid2-2/vampsweb/vamps/' else: db_host = 'bpcweb7' db_name = 'vamps' db_home = '/xraid2-2/vampsweb/vampsdev/' obj=ConMySQL.New(db_host, db_name, db_home) self.conn = obj.get_conn()
def __init__(self, run_object = None, idx_keys=None): self.runobj = run_object self.outdir = self.runobj.output_dir self.indir = self.runobj.input_dir if self.runobj.vamps_user_upload: site = self.runobj.site dir_prefix=self.runobj.user+'_'+self.runobj.run+'_trim' else: site = '' dir_prefix = self.runobj.run dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, site = site) self.analysis_dir = dirs.check_dir(dirs.analysis_dir) self.trimming_dir = dirs.check_dir(dirs.trimming_dir) # do something with 'run'. self.run = self.runobj.run logger.debug("Rundate:" + str(self.run)) self.use_cluster = self.runobj.use_cluster self.idx_keys = idx_keys self.seqDirs = {} self.dna_regions = {} self.taxonomic_domain = {} self.expanded_primers = {} self.proximal_primers = {} self.distal_primers = {} self.anchor_name = {} self.adtnl_anchors ={} self.anchors ={} self.psuite = {} self.id_list_passed = {} self.deleted_ids = {} self.trimmed_ids = {} self.uniques = {} self.names = {} self.uniquefa ={} self.abundfa ={} self.fa ={} self.statsFileName = 'run_trim_stats' os.environ['SGE_ROOT']='/usr/local/sge' os.environ['SGE_CELL']='grendel' path = os.environ['PATH'] os.environ['PATH'] = '/usr/local/sge/bin/lx24-amd64:'+path self.runbin={} # print 'SAMPLES: ' # for idx_key in self.runobj.samples: # print idx_key,self.runobj.samples[idx_key] # print 'SAMPLES: ' if self.runobj.vamps_user_upload: for idx_key in self.runobj.samples: sample = self.runobj.samples[idx_key] # strip off surrounding single or double quotes self.seqDirs[idx_key] = sample.direction self.dna_regions[idx_key] = sample.dna_region self.taxonomic_domain[idx_key] = sample.taxonomic_domain # this should be defaiult 'suite' of anchors # but also in ini file: anchor=XXXXX will be added to defaults self.anchor_name[idx_key] = sample.anchor self.adtnl_anchors[idx_key] = sample.stop_sequences #list self.anchors[idx_key] = {} #self.id_list_all[idx_key] = [] self.id_list_passed[idx_key] = [] self.deleted_ids[idx_key] = {} self.deleted_ids['nokey'] = {} self.trimmed_ids[idx_key] = {} self.uniques[idx_key] = {} self.names[idx_key] = {} self.fa[idx_key] = FastaOutput(os.path.join(self.trimming_dir, idx_key) + ".trimmed.fa") ##################### # # PrimerSuite Class # ##################### print self.taxonomic_domain[idx_key],self.dna_regions[idx_key],idx_key self.psuite[idx_key] = PrimerSuite(self.runobj, self.taxonomic_domain[idx_key],self.dna_regions[idx_key],idx_key) #self.runbin['psuite'][idx_key]= PrimerSuite(self.taxonomic_domain[idx_key],self.dna_regions[idx_key]) if(self.seqDirs[idx_key] == 'F' or self.seqDirs[idx_key] == 'B'): self.proximal_primers[idx_key] = self.psuite[idx_key].primer_expanded_seq_list['F'] self.distal_primers[idx_key] = self.psuite[idx_key].primer_expanded_seq_list['R'] if self.anchor_name[idx_key]: self.anchors[idx_key] = get_anchor_list(self.runobj, self.anchor_name[idx_key], self.adtnl_anchors[idx_key]) if(self.seqDirs[idx_key] == 'R' or self.seqDirs[idx_key] == 'B'): self.proximal_primers[idx_key] = [revcomp(primer_seq) for primer_seq in self.psuite[idx_key].primer_expanded_seq_list['R'] ] self.distal_primers[idx_key] = [revcomp(primer_seq) for primer_seq in self.psuite[idx_key].primer_expanded_seq_list['F'] ] if self.anchor_name[idx_key]: self.anchors[idx_key] = [revcomp( anchor ) for anchor in get_anchor_list(self.runobj, self.anchor_name[idx_key], self.adtnl_anchors[idx_key]) ] if len(self.proximal_primers[idx_key]) == 0 and len(self.distal_primers[idx_key]) == 0: logger.debug("**** Didn't find any primers that match any of the domain/regions in the lane/key sections") else: if self.runobj.platform == '454': for idx_key in self.idx_keys: sample = self.runobj.samples[idx_key] # strip off surrounding single or double quotes self.seqDirs[idx_key] = sample.direction self.dna_regions[idx_key] = sample.dna_region self.taxonomic_domain[idx_key] = sample.taxonomic_domain # this should be defaiult 'suite' of anchors # but also in ini file: anchor=XXXXX will be added to defaults self.anchor_name[idx_key] = sample.anchor self.adtnl_anchors[idx_key] = sample.stop_sequences #list self.anchors[idx_key] = {} #self.id_list_all[idx_key] = [] self.id_list_passed[idx_key] = [] self.deleted_ids[idx_key] = {} self.deleted_ids['nokey'] = {} self.trimmed_ids[idx_key] = {} self.uniques[idx_key] = {} self.names[idx_key] = {} self.fa[idx_key] = FastaOutput(os.path.join(self.trimming_dir, idx_key) + ".trimmed.fa") ##################### # # PrimerSuite Class # ##################### self.psuite[idx_key] = PrimerSuite(self.runobj, self.taxonomic_domain[idx_key],self.dna_regions[idx_key],idx_key) #self.runbin['psuite'][idx_key]= PrimerSuite(self.taxonomic_domain[idx_key],self.dna_regions[idx_key]) if(self.seqDirs[idx_key] == 'F' or self.seqDirs[idx_key] == 'B'): self.proximal_primers[idx_key] = self.psuite[idx_key].primer_expanded_seq_list['F'] self.distal_primers[idx_key] = self.psuite[idx_key].primer_expanded_seq_list['R'] if self.anchor_name[idx_key]: self.anchors[idx_key] = get_anchor_list(self.runobj, self.anchor_name[idx_key], self.adtnl_anchors[idx_key]) if(self.seqDirs[idx_key] == 'R' or self.seqDirs[idx_key] == 'B'): self.proximal_primers[idx_key] = [revcomp(primer_seq) for primer_seq in self.psuite[idx_key].primer_expanded_seq_list['R'] ] self.distal_primers[idx_key] = [revcomp(primer_seq) for primer_seq in self.psuite[idx_key].primer_expanded_seq_list['F'] ] if self.anchor_name[idx_key]: self.anchors[idx_key] = [revcomp( anchor ) for anchor in get_anchor_list(self.runobj, self.anchor_name[idx_key], self.adtnl_anchors[idx_key]) ] if len(self.proximal_primers[idx_key]) == 0 and len(self.distal_primers[idx_key]) == 0: logger.debug("**** Didn't find any primers that match any of the domain/regions in the lane/key sections") elif self.runobj.platform == 'illumina': # create our directories for each key pass
def __init__(self, run_object = None, idx_keys=None): self.runobj = run_object if self.runobj.vamps_user_upload: site = self.runobj.site if self.runobj.mobedac: dir_prefix = 'mobedac_' + self.runobj.user + '_' + self.runobj.run else: dir_prefix = self.runobj.user + '_' + self.runobj.run + '_gast' else: site = '' dir_prefix = self.runobj.run dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, site = site) #self.basedir = self.runobj.output_dir #self.outdir = os.path.join(self.runobj.output_dir,self.prefix) self.idx_keys = idx_keys if self.runobj.vamps_user_upload: self.idx_keys = [self.runobj.user+self.runobj.run] self.iterator = self.runobj.datasets else: self.idx_keys = idx_keys self.iterator = self.idx_keys self.use_cluster = self.runobj.use_cluster os.environ['SGE_ROOT']='/usr/local/sge' os.environ['SGE_CELL']='grendel' path = os.environ['PATH'] os.environ['PATH'] = path + ':/usr/local/sge/bin/lx24-amd64' #First step is to check for (or create via mothur) # a uniques fasta file and a names file # one for each dataset. # If we are here from a vamps gast process # then there should be just one dataset to gast # but if MBL pipe then many datasets are prbably involved. # 1) clustergast # 2) gast cleanup # 3) gast2tax self.global_gast_dir = dirs.check_dir(dirs.gast_dir) if not os.path.exists(self.global_gast_dir): sys.exit("Could not find global gast dir: "+self.global_gast_dir) if self.runobj.site == 'vamps': db_host = 'vampsdb' db_name = 'vamps' db_home = '/xraid2-2/vampsweb/vamps/' else: db_host = 'vampsdev' db_name = 'vamps' db_home = '/xraid2-2/vampsweb/vampsdev/' obj=ConMySQL.New(db_host, db_name, db_home) self.conn = obj.get_conn()
class Chimera: """ Define here """ def __init__(self, runobj = None): self.utils = PipelneUtils() self.runobj = runobj self.run_keys = self.runobj.run_keys self.rundate = self.runobj.run self.chg_suffix = ".chg" self.chimeras_suffix = ".chimeras" self.ref_suffix = ".db" self.denovo_suffix = ".txt" self.nonchimeric_suffix = "." + C.nonchimeric_suffix #".nonchimeric.fa" self.chimeric_suffix = ".chimeric.fa" self.base_suffix = "unique" + self.chimeras_suffix try: if self.runobj.lane_name: lane_name = self.runobj.lane_name else: lane_name = '' except: lane_name = '' if self.runobj.vamps_user_upload: site = self.runobj.site dir_prefix = self.runobj.user + '_' + self.runobj.run self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) self.idx_keys = convert_unicode_dictionary_to_str(json.loads(open(self.runobj.trim_status_file_name,"r").read()))["new_lane_keys"] self.indir = self.dirs.check_dir(self.dirs.trimming_dir) self.outdir = self.dirs.check_dir(self.dirs.chimera_dir) else: site = '' dir_prefix = self.runobj.run self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) self.indir = self.dirs.check_dir(self.dirs.reads_overlap_dir) self.outdir = self.dirs.check_dir(self.dirs.chimera_dir) # self.usearch_cmd = C.usearch_cmd self.usearch_cmd = C.usearch6_cmd #self.abskew = C.chimera_checking_abskew self.refdb = C.chimera_checking_refdb_6 self.its_refdb = C.chimera_checking_its_refdb_6 self.input_file_names = self.make_chimera_input_illumina_file_names() # pprint(self.run_keys) # self.output_file_names = self.make_chimera_output_illumina_file_names(self.input_file_names) def make_chimera_input_illumina_file_names(self): input_file_names = {} for idx_key in self.run_keys: file_name = idx_key + "_" + C.filtered_suffix + ".unique" if os.path.exists(os.path.join(self.indir, file_name)): input_file_names[idx_key] = file_name return input_file_names # def make_chimera_output_illumina_file_names(self, input_file_names): # output_file_names = {} # for idx_key, input_file_name in input_file_names.iteritems(): # output_file_names[idx_key] = input_file_name # return output_file_names def get_current_dirname(self, in_or_out = ""): if in_or_out == "": cur_dirname = self.indir else: cur_dirname = self.outdir return cur_dirname def is_chimera_check_file(self, filename): return filename.endswith((self.chimeras_suffix + self.denovo_suffix, self.chimeras_suffix + self.ref_suffix, self.chimeric_suffix, self.nonchimeric_suffix)) def get_current_filenames(self, cur_dirname): cur_file_names = [] if cur_dirname == self.indir: cur_file_names = self.input_file_names.values() elif cur_dirname == self.outdir: cur_file_names = self.get_chimera_file_names(self.outdir) return cur_file_names def get_chimera_file_names(self, cur_dirname): cur_file_names = [] for dirname, dirnames, filenames in os.walk(cur_dirname): cur_file_names = [filename for filename in filenames if (self.is_chimera_check_file(filename))] return cur_file_names # def illumina_frequency_size(self, in_or_out = "", find = "frequency:", replace = ";size="): # cur_dirname = self.get_current_dirname(in_or_out) # cur_file_names = self.get_current_filenames(cur_dirname) # # print "cur_file_names: " # # pprint(cur_file_names) # change_from_suffix = "" # change_to_suffix = self.chg_suffix # # print "find = %s, replace = %s" % (find, replace) # regex = re.compile(r"%s" % find) # # for cur_file_name in cur_file_names: # file_name = os.path.join(cur_dirname, cur_file_name) # with open(file_name + change_from_suffix, "r") as sources: # lines = sources.readlines() # with open(file_name + change_to_suffix, "w") as target: # for line in lines: # target.write(regex.sub(replace, line)) def read_file(self, source_name): with open(source_name, "r") as sources: return sources.readlines() def illumina_sed(self, lines, target_name, regex, replace, uppercase): with open(target_name, "w") as target: for line in lines: if line.startswith(">"): line1 = regex.sub(replace, line) else: if (uppercase): line1 = line.upper() else: line1 = line target.write(line1) def call_illumina_sed(self, from_to): """ from_to = from_frequency_to_size or from_size_to_frequency """ sed_from_to = namedtuple('sed_from_to', 'find, replace, cur_dirname, cur_file_names, change_from_suffix, change_to_suffix, uppercase') from_frequency_to_size = sed_from_to( find = "frequency:", replace = ";size=", cur_dirname = self.indir, cur_file_names = self.get_current_filenames(self.indir), change_from_suffix = "", change_to_suffix = self.chg_suffix, uppercase = True ) from_size_to_frequency = sed_from_to( find = ";size=", replace = "frequency:", cur_dirname = self.outdir, cur_file_names = self.get_chimera_file_names(self.outdir), change_from_suffix = "", change_to_suffix = "", uppercase = False ) if (from_to == "from_frequency_to_size"): tuple_name = from_frequency_to_size elif (from_to == "from_size_to_frequency"): tuple_name = from_size_to_frequency regex = re.compile(r"%s" % tuple_name.find) # print "find = %s, replace = %s" % (find, replace) for cur_file_name in tuple_name.cur_file_names: file_name = os.path.join(tuple_name.cur_dirname, cur_file_name) source_name = file_name + tuple_name.change_from_suffix target_name = file_name + tuple_name.change_to_suffix lines = self.read_file(source_name) self.illumina_sed(lines, target_name, regex, tuple_name.replace, tuple_name.uppercase) def illumina_freq_to_size_in_chg(self): # TODO: not used? find1 = "frequency:" replace1 = ";size=" regex1 = re.compile(r"%s" % find1) # print "cur_file_names: " # pprint(cur_file_names) cur_dirname = self.get_current_dirname() cur_file_names = self.get_current_filenames(cur_dirname) change_from_suffix = "" change_to_suffix = self.chg_suffix # print "find = %s, replace = %s" % (find, replace) for cur_file_name in cur_file_names: file_name = os.path.join(cur_dirname, cur_file_name) with open(file_name + change_from_suffix, "r") as sources: lines = sources.readlines() with open(file_name + change_to_suffix, "w") as target: # line2 = [regex1.sub(replace1, line) if line.startswith(">") else line.upper() for line in lines] for line in lines: if line.startswith(">"): line1 = regex1.sub(replace1, line) else: line1 = line.upper() # print line1 target.write(line1) def illumina_size_to_freq_in_chimer(self): find1 = ";size=" replace1 = "frequency:" regex1 = re.compile(r"%s" % find1) cur_file_names = self.get_chimera_file_names(self.outdir) for file_chim in cur_file_names: file_chim_path = os.path.join(self.outdir, file_chim) with open(file_chim_path, "r") as sources: lines = sources.readlines() with open(file_chim_path, "w") as target: for line in lines: line1 = regex1.sub(replace1, line) target.write(line1) def illumina_rm_size_files(self): for idx_key in self.input_file_names: file_name = os.path.join(self.indir, self.input_file_names[idx_key] + self.chg_suffix) if os.path.exists(file_name): os.remove(file_name) # def illumina_chimera_size_files(self): # # import os # [os.rename(f, f.replace('_', '-')) for f in os.listdir('.') if not f.startswith('.')] def check_if_cluster_is_done(self, time_before): cluster_done = False check_qstat_cmd_line = "qstat | grep \"%s\" | grep usearch | wc -l" % time_before # check_qstat_cmd_line = "qstat | grep usearch" print "check_qstat_cmd_line = %s" % check_qstat_cmd_line try: p = subprocess.Popen(check_qstat_cmd_line, stdout=subprocess.PIPE, shell=True) (output, err) = p.communicate() num_proc = int(output) print "qstat is running %s 'usearch' processes" % num_proc # pprint(p) if (num_proc == 0): cluster_done = True # print "cluster_done from check_if_cluster_is_done = %s" % cluster_done except: print "Chimera checking can be done only on a cluster." raise return cluster_done def create_chimera_cmd(self, input_file_name, output_file_name, ref_or_novo, ref_db = ""): """ http://www.drive5.com/usearch/manual/uchime_denovo.html from usearch -help Chimera detection (UCHIME ref. db. mode): usearch -uchime q.fasta [-db db.fasta] [-chimeras ch.fasta] [-nonchimeras good.fasta] [-uchimeout results.uch] [-uchimealns results.alns] Chimera detection (UCHIME de novo mode): usearch -uchime amplicons.fasta [-chimeras ch.fasta] [-nonchimeras good.fasta] [-uchimeout results.uch] [-uchimealns results.alns] Input is estimated amplicons with integer abundances specified using ";size=N". usearch -uchime_denovo amplicons.fasta -uchimeout results.uchime """ uchime_cmd_append = "" db_cmd_append = "" dir_cmd_append = "" if (ref_or_novo == "denovo"): uchime_cmd_append = " -uchime_denovo " output_file_name = output_file_name + self.chimeras_suffix + self.denovo_suffix elif (ref_or_novo == "ref"): uchime_cmd_append = " -uchime_ref " output_file_name = output_file_name + self.chimeras_suffix + self.ref_suffix db_cmd_append = " -db " + ref_db dir_cmd_append = " -strand plus" else: print "Incorrect method, should be \"denovo\" or \"ref\"" print "output_file_name = %s" % output_file_name uchime_cmd = C.clusterize_cmd uchime_cmd += " " uchime_cmd += self.usearch_cmd uchime_cmd += uchime_cmd_append + input_file_name uchime_cmd += db_cmd_append uchime_cmd += " -uchimeout " + output_file_name """if we need nonchimeric for denovo and db separate we might create them here # uchime_cmd += " -nonchimeras " # uchime_cmd += (output_file_name + self.nonchimeric_suffix) """ uchime_cmd += " -chimeras " + (output_file_name + self.chimeric_suffix) uchime_cmd += dir_cmd_append uchime_cmd += " -notrunclabels" # print "uchime_cmd FROM create_chimera_cmd = %s" % (uchime_cmd) return uchime_cmd def get_ref_db(self, dna_region): ref_db = '' if dna_region.upper() == 'ITS': logger.debug("got an ITS dna region so using refdb: " + self.its_refdb) ref_db = self.its_refdb else: logger.debug("using standard refdb: " + self.refdb) ref_db = self.refdb return ref_db def chimera_checking(self, ref_or_novo): chimera_region_found = False output = {} for idx_key in self.input_file_names: # print "idx_key, self.input_file_names[idx_key] = %s, %s" % (idx_key, self.input_file_names) input_file_name = os.path.join(self.indir, self.input_file_names[idx_key] + self.chg_suffix) output_file_name = os.path.join(self.outdir, self.input_file_names[idx_key]) dna_region = self.runobj.samples[idx_key].dna_region # print "dna_region = %s" % dna_region if dna_region in C.regions_to_chimera_check: chimera_region_found = True else: logger.debug('region not checked: ' + dna_region) continue # print "input_file_name = %s \noutput_file_name = %s" % (input_file_name, output_file_name) ref_db = self.get_ref_db(dna_region) # print "dna_region = %s; ref_db = %s; ref_or_novo = %s" % (dna_region, ref_db, ref_or_novo) uchime_cmd = self.create_chimera_cmd(input_file_name, output_file_name, ref_or_novo, ref_db) print "\n==================\n%s command: %s" % (ref_or_novo, uchime_cmd) try: logger.info("chimera checking command: " + str(uchime_cmd)) output[idx_key] = subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) except OSError, e: print "Problems with this command: %s" % (uchime_cmd) if self.utils.is_local(): print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e) else: print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e) raise # ??? if not chimera_region_found: return ('NOREGION', 'No regions found that need checking', '') else: return ("The usearch commands were created")
def __init__(self, run_object=None, idx_keys=None): self.runobj = run_object if self.runobj.site == 'vamps' or self.runobj.site == 'vampsdev' or self.runobj.site == 'new_vamps': sys.path.append('/groups/vampsweb/py_mbl_sequencing_pipeline') else: sys.path.append( '/bioware/linux/seqinfo/bin/python_pipeline/py_mbl_sequencing_pipeline' ) from pipeline.pipelinelogging import logger from pipeline.utils import Dirs self.logger = logger self.logger.info('STARTING VAMPS') if self.runobj.vamps_user_upload: site = self.runobj.site if site == 'new_vamps': # vampsdev and vamps and new_vamps NOT local installation dir_prefix = self.runobj.project_dir elif self.runobj.mobedac: dir_prefix = 'mobedac_' + self.runobj.user + '_' + self.runobj.run else: dir_prefix = self.runobj.user + '_' + self.runobj.run else: site = '' dir_prefix = self.runobj.run dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, site=site) #self.basedir = self.runobj.output_dir #self.outdir = os.path.join(self.runobj.output_dir,self.prefix) self.idx_keys = idx_keys if self.runobj.vamps_user_upload: self.idx_keys = [self.runobj.user + self.runobj.run] self.iterator = self.runobj.datasets else: self.idx_keys = idx_keys self.iterator = self.idx_keys self.use_cluster = self.runobj.use_cluster os.environ['SGE_ROOT'] = '/opt/sge' os.environ['SGE_CELL'] = 'grendel' path = os.environ['PATH'] os.environ['PATH'] = path + ':/opt/sge/bin/lx24-amd64' #First step is to check for (or create via mothur) # a uniques fasta file and a names file # one for each dataset. # If we are here from a vamps gast process # then there should be just one dataset to gast # but if MBL pipe then many datasets are prbably involved. # 1) clustergast # 2) gast cleanup # 3) gast2tax self.global_gast_dir = dirs.check_dir(dirs.gast_dir) if not os.path.exists(self.global_gast_dir): sys.exit("Could not find global gast dir: " + self.global_gast_dir) if self.runobj.site == 'vamps': db_host = 'vampsdb' db_name = 'vamps' db_home = '/xraid2-2/vampsweb/vamps/' else: db_host = 'bpcweb7' db_name = 'vamps' db_home = '/xraid2-2/vampsweb/vampsdev/' obj = ConMySQL.New(db_host, db_name, db_home) self.conn = obj.get_conn()
class Chimera: """ Define here """ def __init__(self, runobj = None): self.utils = PipelneUtils() self.runobj = runobj self.run_keys = self.runobj.run_keys self.rundate = self.runobj.run try: self.use_cluster = self.runobj.use_cluster except: self.use_cluster = True self.chg_suffix = ".chg" self.chimeras_suffix = ".chimeras" self.ref_suffix = ".db" self.denovo_suffix = ".txt" self.nonchimeric_suffix = "." + C.nonchimeric_suffix #".nonchimeric.fa" self.chimeric_suffix = ".chimeric.fa" self.base_suffix = "unique" + self.chimeras_suffix try: if self.runobj.lane_name: lane_name = self.runobj.lane_name else: lane_name = '' except: lane_name = '' if self.runobj.vamps_user_upload: site = self.runobj.site dir_prefix = self.runobj.user + '_' + self.runobj.run self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) self.idx_keys = convert_unicode_dictionary_to_str(json.loads(open(self.runobj.trim_status_file_name,"r").read()))["new_lane_keys"] self.indir = self.dirs.check_dir(self.dirs.trimming_dir) self.outdir = self.dirs.check_dir(self.dirs.chimera_dir) else: site = '' dir_prefix = self.runobj.run self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) self.indir = self.dirs.check_dir(self.dirs.reads_overlap_dir) self.outdir = self.dirs.check_dir(self.dirs.chimera_dir) # self.usearch_cmd = C.usearch_cmd self.usearch_cmd = C.usearch6_cmd #self.abskew = C.chimera_checking_abskew self.refdb = C.chimera_checking_refdb_6 self.its_refdb = C.chimera_checking_its_refdb_6 self.input_file_names = self.make_chimera_input_illumina_file_names() # pprint(self.run_keys) # self.output_file_names = self.make_chimera_output_illumina_file_names(self.input_file_names) def make_chimera_input_illumina_file_names(self): input_file_names = {} for idx_key in self.run_keys: file_name = idx_key + "_" + C.filtered_suffix + ".unique" if os.path.exists(os.path.join(self.indir, file_name)): input_file_names[idx_key] = file_name return input_file_names # def make_chimera_output_illumina_file_names(self, input_file_names): # output_file_names = {} # for idx_key, input_file_name in input_file_names.iteritems(): # output_file_names[idx_key] = input_file_name # return output_file_names def get_current_dirname(self, in_or_out = ""): if in_or_out == "": cur_dirname = self.indir else: cur_dirname = self.outdir return cur_dirname def is_chimera_check_file(self, filename): return filename.endswith((self.chimeras_suffix + self.denovo_suffix, self.chimeras_suffix + self.ref_suffix, self.chimeric_suffix, self.nonchimeric_suffix)) def get_current_filenames(self, cur_dirname): cur_file_names = [] if cur_dirname == self.indir: cur_file_names = self.input_file_names.values() elif cur_dirname == self.outdir: cur_file_names = self.get_chimera_file_names(self.outdir) return cur_file_names def get_chimera_file_names(self, cur_dirname): cur_file_names = [] for dirname, dirnames, filenames in os.walk(cur_dirname): cur_file_names = [filename for filename in filenames if (self.is_chimera_check_file(filename))] return cur_file_names # def illumina_frequency_size(self, in_or_out = "", find = "frequency:", replace = ";size="): # cur_dirname = self.get_current_dirname(in_or_out) # cur_file_names = self.get_current_filenames(cur_dirname) # # print "cur_file_names: " # # pprint(cur_file_names) # change_from_suffix = "" # change_to_suffix = self.chg_suffix # # print "find = %s, replace = %s" % (find, replace) # regex = re.compile(r"%s" % find) # # for cur_file_name in cur_file_names: # file_name = os.path.join(cur_dirname, cur_file_name) # with open(file_name + change_from_suffix, "r") as sources: # lines = sources.readlines() # with open(file_name + change_to_suffix, "w") as target: # for line in lines: # target.write(regex.sub(replace, line)) def read_file(self, source_name): with open(source_name, "r") as sources: return sources.readlines() def illumina_sed(self, lines, target_name, regex, replace, uppercase): with open(target_name, "w") as target: for line in lines: if line.startswith(">"): line1 = regex.sub(replace, line) else: if (uppercase): line1 = line.upper() else: line1 = line target.write(line1) def call_illumina_sed(self, from_to): """ from_to = from_frequency_to_size or from_size_to_frequency """ sed_from_to = namedtuple('sed_from_to', 'find, replace, cur_dirname, cur_file_names, change_from_suffix, change_to_suffix, uppercase') from_frequency_to_size = sed_from_to( find = "frequency:", replace = ";size=", cur_dirname = self.indir, cur_file_names = self.get_current_filenames(self.indir), change_from_suffix = "", change_to_suffix = self.chg_suffix, uppercase = True ) from_size_to_frequency = sed_from_to( find = ";size=", replace = "frequency:", cur_dirname = self.outdir, cur_file_names = self.get_chimera_file_names(self.outdir), change_from_suffix = "", change_to_suffix = "", uppercase = False ) if (from_to == "from_frequency_to_size"): tuple_name = from_frequency_to_size elif (from_to == "from_size_to_frequency"): tuple_name = from_size_to_frequency regex = re.compile(r"%s" % tuple_name.find) # print "find = %s, replace = %s" % (find, replace) if (not tuple_name.cur_file_names) and (tuple_name == from_frequency_to_size): self.utils.print_both('ERROR: Did not find uniqued files (".unique") in %s, please check if the previous step has finished. Exiting.\n' % self.indir) sys.exit() for cur_file_name in tuple_name.cur_file_names: file_name = os.path.join(tuple_name.cur_dirname, cur_file_name) source_name = file_name + tuple_name.change_from_suffix target_name = file_name + tuple_name.change_to_suffix lines = self.read_file(source_name) self.illumina_sed(lines, target_name, regex, tuple_name.replace, tuple_name.uppercase) def illumina_freq_to_size_in_chg(self): # TODO: not used? find1 = "frequency:" replace1 = ";size=" regex1 = re.compile(r"%s" % find1) # print "cur_file_names: " # pprint(cur_file_names) cur_dirname = self.get_current_dirname() cur_file_names = self.get_current_filenames(cur_dirname) change_from_suffix = "" change_to_suffix = self.chg_suffix # print "find = %s, replace = %s" % (find, replace) for cur_file_name in cur_file_names: file_name = os.path.join(cur_dirname, cur_file_name) with open(file_name + change_from_suffix, "r") as sources: lines = sources.readlines() with open(file_name + change_to_suffix, "w") as target: # line2 = [regex1.sub(replace1, line) if line.startswith(">") else line.upper() for line in lines] for line in lines: if line.startswith(">"): line1 = regex1.sub(replace1, line) else: line1 = line.upper() # print line1 target.write(line1) def illumina_size_to_freq_in_chimer(self): find1 = ";size=" replace1 = "frequency:" regex1 = re.compile(r"%s" % find1) cur_file_names = self.get_chimera_file_names(self.outdir) for file_chim in cur_file_names: file_chim_path = os.path.join(self.outdir, file_chim) with open(file_chim_path, "r") as sources: lines = sources.readlines() with open(file_chim_path, "w") as target: for line in lines: line1 = regex1.sub(replace1, line) target.write(line1) def illumina_rm_size_files(self): for idx_key in self.input_file_names: file_name = os.path.join(self.indir, self.input_file_names[idx_key] + self.chg_suffix) if os.path.exists(file_name): os.remove(file_name) # def illumina_chimera_size_files(self): # # import os # [os.rename(f, f.replace('_', '-')) for f in os.listdir('.') if not f.startswith('.')] def check_if_cluster_is_done(self, time_before): cluster_done = False check_qstat_cmd_line = "qstat | grep \"%s\" | grep usearch | wc -l" % time_before # check_qstat_cmd_line = "qstat | grep usearch" self.utils.print_both("check_qstat_cmd_line = %s" % check_qstat_cmd_line) try: p = subprocess.Popen(check_qstat_cmd_line, stdout=subprocess.PIPE, shell=True) (output, err) = p.communicate() num_proc = int(output) self.utils.print_both("qstat is running %s 'usearch' processes" % num_proc) # pprint(p) if (num_proc == 0): cluster_done = True # print "cluster_done from check_if_cluster_is_done = %s" % cluster_done except: self.utils.print_both("Chimera checking can be done only on a cluster.") raise return cluster_done def create_chimera_cmd(self, input_file_name, output_file_name, ref_or_novo, ref_db = ""): """ http://www.drive5.com/usearch/manual/uchime_denovo.html from usearch -help Chimera detection (UCHIME ref. db. mode): usearch -uchime q.fasta [-db db.fasta] [-chimeras ch.fasta] [-nonchimeras good.fasta] [-uchimeout results.uch] [-uchimealns results.alns] Chimera detection (UCHIME de novo mode): usearch -uchime amplicons.fasta [-chimeras ch.fasta] [-nonchimeras good.fasta] [-uchimeout results.uch] [-uchimealns results.alns] Input is estimated amplicons with integer abundances specified using ";size=N". usearch -uchime_denovo amplicons.fasta -uchimeout results.uchime """ uchime_cmd_append = "" db_cmd_append = "" dir_cmd_append = "" if (ref_or_novo == "denovo"): uchime_cmd_append = " -uchime_denovo " output_file_name = output_file_name + self.chimeras_suffix + self.denovo_suffix elif (ref_or_novo == "ref"): uchime_cmd_append = " -uchime_ref " output_file_name = output_file_name + self.chimeras_suffix + self.ref_suffix db_cmd_append = " -db " + ref_db dir_cmd_append = " -strand plus" else: self.utils.print_both("Incorrect method, should be \"denovo\" or \"ref\"") self.utils.print_both("output_file_name = %s" % output_file_name) uchime_cmd = C.clusterize_cmd uchime_cmd += " " uchime_cmd += self.usearch_cmd uchime_cmd += uchime_cmd_append + input_file_name uchime_cmd += db_cmd_append uchime_cmd += " -uchimeout " + output_file_name """if we need nonchimeric for denovo and db separate we might create them here # uchime_cmd += " -nonchimeras " # uchime_cmd += (output_file_name + self.nonchimeric_suffix) """ uchime_cmd += " -chimeras " + (output_file_name + self.chimeric_suffix) uchime_cmd += dir_cmd_append uchime_cmd += " -notrunclabels" # print "uchime_cmd FROM create_chimera_cmd = %s" % (uchime_cmd) return uchime_cmd def get_ref_db(self, dna_region): ref_db = '' if dna_region.upper() == 'ITS': logger.debug("got an ITS dna region so using refdb: " + self.its_refdb) ref_db = self.its_refdb else: logger.debug("using standard refdb: " + self.refdb) ref_db = self.refdb return ref_db def chimera_checking(self, ref_or_novo): chimera_region_found = False output = {} for idx_key in self.input_file_names: # print "idx_key, self.input_file_names[idx_key] = %s, %s" % (idx_key, self.input_file_names) input_file_name = os.path.join(self.indir, self.input_file_names[idx_key] + self.chg_suffix) output_file_name = os.path.join(self.outdir, self.input_file_names[idx_key]) dna_region = self.runobj.samples[idx_key].dna_region # print "dna_region = %s" % dna_region if dna_region in C.regions_to_chimera_check: chimera_region_found = True else: logger.debug('region not checked: ' + dna_region) continue # print "input_file_name = %s \noutput_file_name = %s" % (input_file_name, output_file_name) ref_db = self.get_ref_db(dna_region) # print "dna_region = %s; ref_db = %s; ref_or_novo = %s" % (dna_region, ref_db, ref_or_novo) uchime_cmd = self.create_chimera_cmd(input_file_name, output_file_name, ref_or_novo, ref_db) self.utils.print_both("\n==================\n%s command: %s" % (ref_or_novo, uchime_cmd)) try: logger.info("chimera checking command: " + str(uchime_cmd)) output[idx_key] = subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) except OSError, e: self.utils.print_both("Problems with this command: %s" % (uchime_cmd)) if self.utils.is_local(): print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e) else: print >>sys.stderr, "Execution of %s failed: %s" % (uchime_cmd, e) self.utils.print_both("Execution of %s failed: %s" % (uchime_cmd, e)) raise # ??? if not chimera_region_found: return ('NOREGION', 'No regions found that need checking', '') else: return ("The usearch commands were created")
def __init__(self, runobj=None): self.utils = PipelneUtils() self.runobj = runobj self.run_keys = self.runobj.run_keys self.rundate = self.runobj.run try: self.use_cluster = self.runobj.use_cluster except: self.use_cluster = True self.chg_suffix = ".chg" self.chimeras_suffix = ".chimeras" self.ref_suffix = ".db" self.denovo_suffix = ".txt" self.nonchimeric_suffix = "." + C.nonchimeric_suffix #".nonchimeric.fa" self.chimeric_suffix = ".chimeric.fa" self.base_suffix = "unique" + self.chimeras_suffix self.cluster_slots = { "grendel": [12, 8], "cricket": [40], "cluster5": [32] } try: if self.runobj.lane_name: lane_name = self.runobj.lane_name else: lane_name = '' except: lane_name = '' if self.runobj.vamps_user_upload: os.environ['SGE_ROOT'] = '/opt/sge' os.environ['SGE_CELL'] = 'grendel' path = os.environ['PATH'] os.environ['PATH'] = '/opt/sge/bin/lx24-amd64:' + path site = self.runobj.site dir_prefix = self.runobj.user + '_' + self.runobj.run self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name=lane_name, site=site) self.idx_keys = convert_unicode_dictionary_to_str( json.loads( open(self.runobj.trim_status_file_name, "r").read()))["new_lane_keys"] self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir) self.indir = self.dirs.check_dir(self.dirs.trimming_dir) self.outdir = self.dirs.check_dir(self.dirs.chimera_dir) self.gast_dir = self.dirs.check_dir(self.dirs.gast_dir) else: site = '' dir_prefix = self.runobj.run self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name=lane_name, site=site) self.indir = self.dirs.check_dir(self.dirs.reads_overlap_dir) self.outdir = self.dirs.check_dir(self.dirs.chimera_dir) # self.usearch_cmd = C.usearch_cmd self.usearch_cmd = C.usearch6_cmd if self.utils.is_local(): self.usearch_cmd = C.usearch6_cmd_local #self.abskew = C.chimera_checking_abskew self.refdb = C.chimera_checking_refdb if self.utils.is_local(): self.refdb_local = C.chimera_checking_refdb_local self.its_refdb = C.chimera_checking_its_refdb self.input_file_names = self.make_chimera_input_illumina_file_names()
class Chimera: """ Define here """ def __init__(self, runobj=None): self.utils = PipelneUtils() self.runobj = runobj self.run_keys = self.runobj.run_keys self.rundate = self.runobj.run try: self.use_cluster = self.runobj.use_cluster except: self.use_cluster = True self.chg_suffix = ".chg" self.chimeras_suffix = ".chimeras" self.ref_suffix = ".db" self.denovo_suffix = ".txt" self.nonchimeric_suffix = "." + C.nonchimeric_suffix #".nonchimeric.fa" self.chimeric_suffix = ".chimeric.fa" self.base_suffix = "unique" + self.chimeras_suffix self.cluster_slots = { "grendel": [12, 8], "cricket": [40], "cluster5": [32] } try: if self.runobj.lane_name: lane_name = self.runobj.lane_name else: lane_name = '' except: lane_name = '' if self.runobj.vamps_user_upload: os.environ['SGE_ROOT'] = '/opt/sge' os.environ['SGE_CELL'] = 'grendel' path = os.environ['PATH'] os.environ['PATH'] = '/opt/sge/bin/lx24-amd64:' + path site = self.runobj.site dir_prefix = self.runobj.user + '_' + self.runobj.run self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name=lane_name, site=site) self.idx_keys = convert_unicode_dictionary_to_str( json.loads( open(self.runobj.trim_status_file_name, "r").read()))["new_lane_keys"] self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir) self.indir = self.dirs.check_dir(self.dirs.trimming_dir) self.outdir = self.dirs.check_dir(self.dirs.chimera_dir) self.gast_dir = self.dirs.check_dir(self.dirs.gast_dir) else: site = '' dir_prefix = self.runobj.run self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name=lane_name, site=site) self.indir = self.dirs.check_dir(self.dirs.reads_overlap_dir) self.outdir = self.dirs.check_dir(self.dirs.chimera_dir) # self.usearch_cmd = C.usearch_cmd self.usearch_cmd = C.usearch6_cmd if self.utils.is_local(): self.usearch_cmd = C.usearch6_cmd_local #self.abskew = C.chimera_checking_abskew self.refdb = C.chimera_checking_refdb if self.utils.is_local(): self.refdb_local = C.chimera_checking_refdb_local self.its_refdb = C.chimera_checking_its_refdb self.input_file_names = self.make_chimera_input_illumina_file_names() # self.output_file_names = self.make_chimera_output_illumina_file_names(self.input_file_names) def get_ref_db(self, dna_region): ref_db = '' if dna_region.upper() == 'ITS': ref_db = C.chimera_checking_its_refdb logger.debug("got an ITS dna region so using refdb: " + ref_db) else: ref_db = C.chimera_checking_refdb if self.utils.is_local(): ref_db = C.chimera_checking_refdb_local logger.debug("using standard refdb: " + ref_db) return ref_db def make_chimera_input_illumina_file_names(self): input_file_names = {} for idx_key in self.run_keys: file_name = idx_key + "_" + C.filtered_suffix + ".unique" if os.path.exists(os.path.join(self.indir, file_name)): input_file_names[idx_key] = file_name return input_file_names def get_current_dirname(self, in_or_out=""): if in_or_out == "": cur_dirname = self.indir else: cur_dirname = self.outdir return cur_dirname def is_chimera_check_file(self, filename): return filename.endswith( (self.chimeras_suffix + self.denovo_suffix, self.chimeras_suffix + self.ref_suffix, self.chimeric_suffix, self.nonchimeric_suffix)) def get_current_filenames(self, cur_dirname): cur_file_names = [] if cur_dirname == self.indir: cur_file_names = self.input_file_names.values() elif cur_dirname == self.outdir: cur_file_names = self.get_chimera_file_names(self.outdir) return cur_file_names def get_chimera_file_names(self, cur_dirname): cur_file_names = [] for dirname, dirnames, filenames in os.walk(cur_dirname): cur_file_names = [ filename for filename in filenames if (self.is_chimera_check_file(filename)) ] return cur_file_names def read_file(self, source_name): with open(source_name, "r") as sources: return sources.readlines() def illumina_sed(self, lines, target_name, regex, replace, uppercase): with open(target_name, "w") as target: for line in lines: if line.startswith(">"): line1 = regex.sub(replace, line) else: if (uppercase): line1 = line.upper() else: line1 = line target.write(line1) def call_illumina_sed(self, from_to): """ from_to = from_frequency_to_size or from_size_to_frequency """ sed_from_to = namedtuple( 'sed_from_to', 'find, replace, cur_dirname, cur_file_names, change_from_suffix, change_to_suffix, uppercase' ) from_frequency_to_size = sed_from_to( find="frequency:", replace=";size=", cur_dirname=self.indir, cur_file_names=self.get_current_filenames(self.indir), change_from_suffix="", change_to_suffix=self.chg_suffix, uppercase=True) from_size_to_frequency = sed_from_to( find=";size=", replace="frequency:", cur_dirname=self.outdir, cur_file_names=self.get_chimera_file_names(self.outdir), change_from_suffix="", change_to_suffix="", uppercase=False) if (from_to == "from_frequency_to_size"): tuple_name = from_frequency_to_size elif (from_to == "from_size_to_frequency"): tuple_name = from_size_to_frequency regex = re.compile(r"%s" % tuple_name.find) # logger.debug("find = %s, replace = %s" % (find, replace)) if (not tuple_name.cur_file_names) and (tuple_name == from_frequency_to_size): self.utils.print_both( 'ERROR: Did not find uniqued files ("%s") in %s, please check if the previous step has finished. Exiting.\n' % (C.filtered_suffix + ".unique", self.indir)) sys.exit() for cur_file_name in tuple_name.cur_file_names: file_name = os.path.join(tuple_name.cur_dirname, cur_file_name) source_name = file_name + tuple_name.change_from_suffix target_name = file_name + tuple_name.change_to_suffix lines = self.read_file(source_name) self.illumina_sed(lines, target_name, regex, tuple_name.replace, tuple_name.uppercase) def illumina_freq_to_size_in_chg(self): find1 = "frequency:" replace1 = ";size=" regex1 = re.compile(r"%s" % find1) # logger.debug("cur_file_names: ") # pprint(cur_file_names) cur_dirname = self.get_current_dirname() cur_file_names = self.get_current_filenames(cur_dirname) change_from_suffix = "" change_to_suffix = self.chg_suffix # logger.debug("find = %s, replace = %s" % (find, replace)) for cur_file_name in cur_file_names: file_name = os.path.join(cur_dirname, cur_file_name) lines = self.utils.read_file(file_name + change_from_suffix) with open(file_name + change_to_suffix, "w") as target: for line in lines: if line.startswith(">"): line1 = regex1.sub(replace1, line) else: line1 = line.upper() # logger.debug(line1) target.write(line1) def illumina_size_to_freq_in_chimer(self): find1 = ";size=" replace1 = "frequency:" regex1 = re.compile(r"%s" % find1) cur_file_names = self.get_chimera_file_names(self.outdir) for file_chim in cur_file_names: file_chim_path = os.path.join(self.outdir, file_chim) lines = self.utils.read_file(file_chim_path) with open(file_chim_path, "w") as target: for line in lines: line1 = regex1.sub(replace1, line) target.write(line1) def illumina_rm_size_files(self): for idx_key in self.input_file_names: file_name = os.path.join( self.indir, self.input_file_names[idx_key] + self.chg_suffix) if os.path.exists(file_name): pass # os.remove(file_name) def check_if_chimera_dir_empty(self): if not os.listdir(self.outdir): self.utils.print_both( 'ERROR: Did not find files in %s, something is wrong. First check if you ran the command on a cluster. Exiting.\n' % self.outdir) sys.exit() def check_if_cluster_is_done(self, time_before): cluster_done = False check_qstat_cmd_line = "qstat | grep \"%s\" | grep chimera_ch | wc -l" % time_before # check_qstat_cmd_line = "qstat | grep vsearch" self.utils.print_both("check_qstat_cmd_line = %s" % check_qstat_cmd_line) try: p = subprocess.Popen(check_qstat_cmd_line, stdout=subprocess.PIPE, shell=True) (output, err) = p.communicate() num_proc = int(output) self.utils.print_both("qstat is running %s 'vsearch' processes" % num_proc) # pprint(p) if (num_proc == 0): cluster_done = True # logger.debug("cluster_done from check_if_cluster_is_done = %s" % cluster_done) except: self.utils.print_both( "Chimera checking can be done only on a cluster.") raise return cluster_done def create_chimera_cmd(self, ref_db): """ /usr/local/bin/vsearch -uchime_denovo /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/reads_overlap/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chg -uchimeout /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.txt -chimeras /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.txt.chimeric.fa -notrunclabels --- /usr/local/bin/vsearch -uchime_ref /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/reads_overlap/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chg -uchimeout /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.db -chimeras /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.db.chimeric.fa -notrunclabels -strand plus -db /groups/g454/blastdbs/rRNA16S.gold.fasta """ command_line = [] ref_or_novo_options = { self.denovo_suffix: "-uchime_denovo", self.ref_suffix: "-uchime_ref" } for suff, opt in ref_or_novo_options.items(): input_file_name = self.indir + "/$filename_base" + self.chg_suffix output_file_name = self.outdir + "/$filename_base" + self.chimeras_suffix + suff ref_add = "" if (opt == "-uchime_ref"): ref_add = "-strand plus -db %s" % ref_db uchime_cmd = """%s %s %s -uchimeout %s -chimeras %s%s -notrunclabels %s """ % (self.usearch_cmd, opt, input_file_name, output_file_name, output_file_name, self.chimeric_suffix, ref_add) logger.debug("UUU = uchime_cmd = %s" % uchime_cmd) logger.debug("+++") command_line.append(uchime_cmd) return command_line def create_chimera_cmd_old(self, input_file_name, output_file_name, ref_or_novo, ref_db=""): """ http://www.drive5.com/usearch/manual/uchime_denovo.html from usearch -help Chimera detection (UCHIME ref. db. mode): usearch -uchime q.fasta [-db db.fasta] [-chimeras ch.fasta] [-nonchimeras good.fasta] [-uchimeout results.uch] [-uchimealns results.alns] Chimera detection (UCHIME de novo mode): usearch -uchime amplicons.fasta [-chimeras ch.fasta] [-nonchimeras good.fasta] [-uchimeout results.uch] [-uchimealns results.alns] Input is estimated amplicons with integer abundances specified using ";size=N". usearch -uchime_denovo amplicons.fasta -uchimeout results.uchime """ uchime_cmd_append = "" db_cmd_append = "" dir_cmd_append = "" if (ref_or_novo == "denovo"): uchime_cmd_append = " -uchime_denovo " output_file_name = output_file_name + self.chimeras_suffix + self.denovo_suffix elif (ref_or_novo == "ref"): uchime_cmd_append = " -uchime_ref " output_file_name = output_file_name + self.chimeras_suffix + self.ref_suffix db_cmd_append = " -db " + ref_db dir_cmd_append = " -strand plus" else: self.utils.print_both( "Error: Incorrect method, should be \"denovo\" or \"ref\"") self.utils.print_both("output_file_name = %s" % output_file_name) uchime_cmd = C.clusterize_cmd if self.utils.is_local(): uchime_cmd = "" uchime_cmd += " " uchime_cmd += self.usearch_cmd logger.debug("self.usearch_cmd FROM create_chimera_cmd = %s" % (uchime_cmd)) uchime_cmd += uchime_cmd_append + input_file_name logger.debug("uchime_cmd_append FROM create_chimera_cmd = %s" % (uchime_cmd_append)) uchime_cmd += db_cmd_append logger.debug("db_cmd_append FROM create_chimera_cmd = %s" % (db_cmd_append)) uchime_cmd += " -uchimeout " + output_file_name """if we need nonchimeric for denovo and db separate we might create them here""" uchime_cmd += " -nonchimeras " uchime_cmd += (output_file_name + self.nonchimeric_suffix) uchime_cmd += " -chimeras " + (output_file_name + self.chimeric_suffix) uchime_cmd += dir_cmd_append uchime_cmd += " -notrunclabels" logger.debug("uchime_cmd FROM create_chimera_cmd = %s" % (uchime_cmd)) return uchime_cmd def get_sge_cluster_name(self): # import subprocess result = subprocess.run(['qstat', '-F'], stdout=subprocess.PIPE) a1 = result.stdout.decode('utf-8').split() for line in a1: if (line.find("hostname") != -1): #qf:hostname=grendel-01.bpcservers.private return line.split("=")[1].split("-")[0] def get_sge_slot_number( self ): # doesn't work on cricket because: qc:slots=12 and qc:slots=8 result = subprocess.run(['qstat', '-F', 'slots'], stdout=subprocess.PIPE) a1 = result.stdout.decode('utf-8').split() slots = [] for line in a1: if line.startswith('qc:slots'): slots.append(int(line.split("=")[-1])) slots_uniq = set(slots) return max(slots_uniq) # TODO: temp! take from util. change illumina-files to use util, too # create_job_array_script(self, command_line, dir_to_run, files_list, runobj) # feb 25 2019 removed, because didn't work on grendel: # Use the allslots pe and all available slots on that cluster # #$ -pe allslots %s def create_job_array_script(self, script_file_name_base, command_line, dir_to_run, files_list): # sge_slot_number = self.get_sge_slot_number() sge_cluster_name = self.get_sge_cluster_name() sge_slot_number = self.cluster_slots[sge_cluster_name][0] logger.debug("sge_slot_number FROM create_job_array_script = %s" % (sge_slot_number)) files_string = " ".join(files_list) files_list_size = len(files_list) # command_file_name = os.path.basename(command_line.split(" ")[0]) script_file_name = script_file_name_base + "_" + self.runobj.run + "_" + self.runobj.lane_name + ".sh" script_file_name_full = os.path.join(dir_to_run, script_file_name) log_file_name = script_file_name + ".sge_script.sh.log" email_mbl = C.email_mbl # self.utils.make_users_email() text = ( '''#!/bin/bash #$ -cwd #$ -S /bin/bash #$ -N %s # Giving the name of the output log file #$ -o %s # Combining output/error messages into one file #$ -j y # Send mail to these users #$ -M %s # Send mail at job end (e); -m as sends abort, suspend. #$ -m as # max_running_tasks #$ -tc 15 -# Use the allslots pe and all available slots on that cluster #$ -pe allslots %s #$ -t 1-%s # Now the script will iterate %s times. file_list=(%s) i=$(expr $SGE_TASK_ID - 1) echo "i = $i" . /bioware/root/Modules/etc/profile.modules module load bioware module load vsearch INFILE=${file_list[$i]} filename=$(basename $INFILE) echo "INFILE = $INFILE" filename_base="${filename%%.*}" echo "filename_base = $filename_base" echo "%s" echo "%s" %s %s ''' % (script_file_name, log_file_name, email_mbl, sge_slot_number, files_list_size, files_list_size, files_string, command_line[0], command_line[1], command_line[0], command_line[1]) # ''' % (script_file_name, log_file_name, email_mbl, files_list_size, files_list_size, files_string, command_line) ) self.utils.open_write_close(script_file_name_full, text) return script_file_name def create_not_SGE_script(self, script_file_name_base, command_line, dir_to_run, files_list): files_string = " ".join(files_list) script_file_name = script_file_name_base + "_" + self.runobj.run + "_" + self.runobj.lane_name + ".sh" script_file_name_full = os.path.join(dir_to_run, script_file_name) text = ( '''#!/bin/bash file_list=(%s) . /bioware/root/Modules/etc/profile.modules module load bioware module load vsearch n=0 for INFILE in "${file_list[@]}" do n=$[n + 1] echo $n echo "INFILE = $INFILE" filename=$(basename $INFILE) filename_base="${filename%.*}" echo "filename_base = $filename_base" echo "%s" echo "%s" %s %s done ''' % (files_string, command_line[0], command_line[1], command_line[0], command_line[1]) # ''' % (script_file_name, log_file_name, email_mbl, files_list_size, files_list_size, files_string, command_line) ) self.utils.open_write_close(script_file_name_full, text) return script_file_name def chimera_checking(self): chimera_region_found = False file_list = self.dirs.get_all_files_by_ext(self.indir, self.chg_suffix) logger.debug("FFF = file_list = %s" % (file_list)) # TODO: method dna_region = list( set([ self.runobj.samples[idx_key].dna_region for idx_key in self.input_file_names ]))[0] if dna_region in C.regions_to_chimera_check: chimera_region_found = True else: logger.debug('region not checked: ' + dna_region) ref_db = self.get_ref_db(dna_region) command_line = self.create_chimera_cmd(ref_db) sh_script_file_name = self.create_job_array_script( "chimera_checking", command_line, self.indir, file_list) script_file_name_full = os.path.join(self.indir, sh_script_file_name) self.utils.call_sh_script(script_file_name_full, self.indir) self.utils.print_both("self.dirs.chmod_all(%s)" % (self.indir)) self.dirs.chmod_all(self.indir) logger.debug('sh_script_file_name: ' + sh_script_file_name) if not chimera_region_found: return ('NOREGION', 'No regions found that need checking', '') else: return ("The vsearch commands were created") def get_chimeric_ids(self): ids = set() chimera_file_names = self.get_chimera_file_names(self.outdir) file_ratio = self.check_chimeric_stats() for file_name in chimera_file_names: # logger.debug("from get_chimeric_ids: file_name = %s" % file_name) if file_name.endswith(self.chimeric_suffix): both_or_denovo = self.get_chimeras_suffix( file_ratio, file_name) # TODO: run ones for each file_base = ".".join(file_name.split(".")[0:3]) (for txt and db) if file_name.endswith(both_or_denovo): file_name_path = os.path.join(self.outdir, file_name) self.utils.print_both("Get ids from %s" % file_name_path) read_fasta = fa.ReadFasta(file_name_path) ids.update(set(read_fasta.ids)) return ids def get_chimeras_suffix(self, file_ratio, file_name): """ use only de-novo (.txt) chimeric if check_chimeric_stats shows ratio ref to de-novo > 3 e.g. if denovo_only: chimeric_suffix = self.chimeras_suffix + self.denovo_suffix + self.chimeric_suffix if no: chimeras_suffix = self.chimeric_suffix if file_name.endswith(chimeric_suffix): ... # first_name, last_name = get_name() """ # for file_basename in file_ratio: (percent_ref, ratio) = file_ratio[".".join(file_name.split(".")[0:3])] chimeric_fa_suffix = "" # logger.debug("percent_ref = %s, ratio = %s" % (percent_ref, ratio)) # if (percent_ref > 15) and (ratio > 2): if ratio > 3: chimeric_fa_suffix = self.chimeras_suffix + self.denovo_suffix + self.chimeric_suffix else: chimeric_fa_suffix = self.chimeric_suffix return chimeric_fa_suffix def move_out_chimeric(self): chimeric_ids = self.get_chimeric_ids() for idx_key in self.input_file_names: fasta_file_path = os.path.join(self.indir, self.input_file_names[idx_key]) read_fasta = fa.ReadFasta(fasta_file_path) read_fasta.close() non_chimeric_file = fasta_file_path + self.nonchimeric_suffix non_chimeric_fasta = fa.FastaOutput(non_chimeric_file) fasta = fa.SequenceSource(fasta_file_path, lazy_init=False) while fasta.next(): if not fasta.id in chimeric_ids: non_chimeric_fasta.store(fasta, store_frequencies=False) non_chimeric_fasta.close() def check_chimeric_stats(self): all_lines_suffix = self.denovo_suffix # ".txt" or ".db, doesn't matter" chimera_ref_suffix = self.ref_suffix + self.chimeric_suffix #".db.chimeric.fa" chimera_denovo_suffix = self.denovo_suffix + self.chimeric_suffix # ".txt.chimeric.fa" filenames = self.get_basenames(self.get_current_filenames(self.outdir)) file_ratio = {} for file_basename in filenames: # logger.debug(file_basename) all_lines = 0 ref_lines = 0 denovo_lines = 0 ratio = 0 percent_ref = 0 percent_denovo = 0 all_lines_file_name = os.path.join( self.outdir, file_basename + all_lines_suffix) ref_lines_file_name = os.path.join( self.outdir, file_basename + chimera_ref_suffix) denovo_lines_file_name = os.path.join( self.outdir, file_basename + chimera_denovo_suffix) all_lines = int(self.wccount(all_lines_file_name) or 0) ref_lines = int(self.get_fa_lines_count(ref_lines_file_name) or 0) denovo_lines = int( self.get_fa_lines_count(denovo_lines_file_name) or 0) # denovo_lines = int(denovo_lines or 0) if (ref_lines == 0) or (all_lines == 0): file_ratio[file_basename] = (0, 0) continue else: percent_ref = self.percent_count(all_lines, ref_lines) if (denovo_lines == 0): file_ratio[file_basename] = ( percent_ref, percent_ref ) #use ref instead of ratio, because we are actually looking for a huge difference between ref and denovo (ref > 15 and denovo = 0) continue if (denovo_lines > 0): ratio = self.count_ratio(ref_lines, denovo_lines) percent_denovo = self.percent_count(all_lines, denovo_lines) file_ratio[file_basename] = (percent_ref, ratio) # percent_ref = int(percent_ref or 0) if (percent_ref > 15): self.utils.print_both("=" * 50) self.utils.print_both(file_basename) # logger.debug("all_lines_file_name = %s, ref_lines_file_name = %s, denovo_lines_file_name = %s" % (all_lines_file_name, ref_lines_file_name, denovo_lines_file_name)) self.utils.print_both( "all_lines = %s, ref_lines = %s, denovo_lines = %s" % (all_lines, ref_lines, denovo_lines)) self.utils.print_both("ratio = %s" % ratio) self.utils.print_both("percent_ref = %s, percent_denovo = %s" % (percent_ref, percent_denovo)) return file_ratio def get_basenames(self, filenames): file_basenames = set() for f in filenames: file_basename = ".".join(f.split(".")[0:3]) if file_basename.endswith(self.base_suffix): file_basenames.add(file_basename) return file_basenames def wccount(self, filename): return subprocess.check_output(['wc', '-l', filename]).split()[0] def count_ratio(self, ref_num, denovo_num): try: return float(ref_num or 0) / float(denovo_num or 0) except ZeroDivisionError: # logger.debug("There is no denovo chimeras to count ratio.") pass def get_fa_lines_count(self, file_name): # todo: use fastalib to get cnt? # return fa.SequenceSource(file_name, lazy_init = False).total_seq try: file_open = open(file_name) return len([l for l in file_open.readlines() if l.startswith('>')]) except IOError: e = sys.exc_info()[1] self.utils.print_both(e) return 0 # logger.error("%s\nThere is no such file: %s" % (e, file_name)) def percent_count(self, all_lines, chimeric_count): try: return float(chimeric_count or 0) * 100 / float(all_lines or 0) except ZeroDivisionError: # logger.error("There is no denovo chimeras to count ratio.") pass """ ----------------------------------------------------------------------------- For 454. not tested """ def chimera_denovo(self): chimera_region_found = False output = {} cluster_id_list = [] for idx_key in self.idx_keys: input_file_name = os.path.join(self.indir, idx_key + '.abund.fa') if os.path.isfile(input_file_name): output_file_name = os.path.join(self.outdir, idx_key + '.chimera.denovo') #open(output_file_name, 'a').close() # make sure file exists log_file = os.path.join(self.outdir, idx_key + ".denovo.log") dna_region = self.runobj.samples[idx_key].dna_region logger.debug("dna_region = %s" % dna_region) if self.runobj.vamps_user_upload: # VAMPS users can chimera check regardless of region chosen chimera_region_found = True else: if dna_region in C.regions_to_chimera_check: chimera_region_found = True else: logger.debug('region not checked: ' + dna_region) continue self.utils.print_both( "input_file_name = %s \noutput_file_name = %s" % (input_file_name, output_file_name)) # uchime_cmd = C.clusterize_cmd # uchime_cmd += " " # uchime_cmd += self.usearch_cmd # uchime_cmd += " --uchime " # uchime_cmd += input_file_name # uchime_cmd += " --uchimeout " # uchime_cmd += output_file_name # uchime_cmd += " --abskew " # uchime_cmd += self.abskew uchime_cmd = '' if self.use_cluster: uchime_cmd += C.clusterize_cmd uchime_cmd += " " uchime_cmd += " -log " uchime_cmd += log_file uchime_cmd += " " uchime_cmd += self.usearch_cmd uchime_cmd += " -uchime_denovo " uchime_cmd += input_file_name uchime_cmd += " -uchimeout " uchime_cmd += output_file_name logger.debug("uchime_denovo_cmd = %s" % (uchime_cmd)) try: logger.info("chimera denovo command: " + str(uchime_cmd)) # subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) self.utils.print_both("chimera denovo command: " + str(uchime_cmd)) #output[idx_key] = subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output[idx_key] = subprocess.check_output(uchime_cmd, shell=True) self.utils.print_both("chimera denovo result: " + str(output[idx_key])) #self.utils.print_both("output[idx_key] = %s" % output[idx_key]) #if idx_key in output and len(output[idx_key].split()) > 1: #self.utils.print_both(output[idx_key].split()[2]) items = output[idx_key].split() if len(items) > 2: cluster_id_list.append(items[2]) except OSError: e = sys.exc_info()[1] self.utils.print_both( "Error: Problems with this command: %s" % (uchime_cmd)) if self.utils.is_local(): print >> sys.stderr, "Error: Execution of %s failed: %s" % ( uchime_cmd, e) else: print >> sys.stderr, "Error: Execution of %s failed: %s" % ( uchime_cmd, e) self.utils.print_both( "Error: Execution of %s failed: %s" % (uchime_cmd, e)) raise # ??? if not chimera_region_found: return ('NOREGION', 'No regions found that need checking', '') # ??? # for idx_key in output: # if len(output[idx_key]) > 50 or len(output[idx_key]) < 40: # return ('ERROR','uchime ref may have broken or empty', idx_key) # finally self.utils.print_both('Finished Chimera Denovo') if cluster_id_list: return ('SUCCESS', 'uchime ref seems to have been submitted successfully', cluster_id_list) else: return ('ERROR', 'uchime ref returned no cluster IDs', cluster_id_list) def chimera_reference(self): chimera_region_found = False output = {} cluster_id_list = [] for idx_key in self.run_keys: dna_region = self.runobj.samples[idx_key].dna_region if self.runobj.vamps_user_upload: # VAMPS users can chimera check regardless of region chosen chimera_region_found = True else: if dna_region in C.regions_to_chimera_check: chimera_region_found = True else: logger.debug('region not checked: ' + dna_region) continue input_file_name = os.path.join(self.indir, idx_key + '.abund.fa') output_file_name = os.path.join(self.outdir, idx_key + ".chimera.ref") #open(output_file_name, 'a').close() # make sure file exists log_file = os.path.join(self.outdir, idx_key + ".ref.log") logger.debug("OUT FILE NAME: " + output_file_name) #out_file_name = self.prefix[idx_key] + ".chimeras.db" input_file_name = os.path.join(self.indir, idx_key + '.abund.fa') if os.path.isfile(input_file_name): output_file_name = os.path.join(self.outdir, idx_key + ".chimera.ref") #open(output_file_name, 'a').close() # make sure file exists log_file = os.path.join(self.outdir, idx_key + ".ref.log") logger.debug("OUT FILE NAME: " + output_file_name) # which ref db to use? ref_db = '' if dna_region.upper() == 'ITS': logger.debug("got an ITS dna region so using refdb: " + self.its_refdb) ref_db = self.its_refdb else: logger.debug("using standard refdb: " + self.refdb) ref_db = self.refdb uchime_cmd = '' if self.use_cluster: uchime_cmd = C.clusterize_cmd uchime_cmd += " " uchime_cmd += " -log " uchime_cmd += log_file uchime_cmd += " " uchime_cmd += self.usearch_cmd uchime_cmd += " -uchime_ref " uchime_cmd += input_file_name uchime_cmd += " -uchimeout " uchime_cmd += output_file_name uchime_cmd += " -db " uchime_cmd += ref_db uchime_cmd += " -strand " uchime_cmd += "plus" logger.debug("uchime_ref_cmd = %s" % (uchime_cmd)) try: logger.info("vsearch version: " % (self.utils.get_vsearch_version)) logger.info("chimera reference command: " + str(uchime_cmd)) output[idx_key] = subprocess.check_output(uchime_cmd, shell=True) #logger.debug('outsplit',output[idx_key].split()[2]) cluster_id_list.append(output[idx_key].split()[2]) #logger.debug('Have %d bytes in output' % len(output)) #logger.debug('ref',idx_key,output,len(output)) if len(output[idx_key]) < 50 and len(output[idx_key]) > 40: logger.debug( idx_key + " uchime ref seems to have been submitted successfully" ) else: if self.use_cluster: print >> sys.stderr, "Error: uchime ref may be broke" self.utils.print_both( "Error: uchime ref may be broke") except OSError: e = sys.exc_info()[1] print >> sys.stderr, "Error: Execution of chimera_reference failed: %s" % ( uchime_cmd, e) self.utils.print_both( "Error: Execution of chimera_reference failed: %s" % (uchime_cmd, e)) raise if not chimera_region_found: return ('NOREGION', 'No regions found that need checking', '') for idx_key in output: if (len(output[idx_key]) > 50 or len(output[idx_key]) < 40) and self.use_cluster: return ('ERROR', 'uchime ref may have broken or empty', idx_key) self.utils.print_both('Finished Chimera Reference') return ('SUCCESS', 'uchime ref seems to have been submitted successfully', cluster_id_list) def write_chimeras_to_deleted_file(self): for idx_key in self.run_keys: # open deleted file and append chimera to it # open and read both chimeras files: chimeras.db and chimeras.txt # hash to remove dupes chimera_deleted = {} denovo_file = os.path.join(self.outdir, idx_key + '.chimera.denovo') ref_file = os.path.join(self.outdir, idx_key + ".chimera.ref") # deleted file is in trimming dir for vampsuser deleted_file = os.path.join(self.indir, idx_key + ".deleted.txt") for file in [denovo_file, ref_file]: if os.path.isfile(file): fh = open(file, "r") # make a list of chimera deleted read_ids for line in fh.readlines(): lst = line.strip().split() id = lst[1].split(';')[0] chimera_yesno = lst[-1] if (chimera_yesno) == 'Y': chimera_deleted[id] = 'chimera' # open to append as trimming deletions are already there fh_del = open(deleted_file, "a") for id in chimera_deleted: fh_del.write(id + "\tChimera\n") fh_del.close()