def __init__(self, runobj = None):
    """Set up run metadata, working directories, and the env454 DB connection.

    NOTE(review): this module-level copy appears to be a leftover older version
    of dbUpload.__init__ below — confirm whether it is still referenced.
    """
    self.runobj = runobj
    self.rundate = self.runobj.run
    self.use_cluster = 1

    # VAMPS user uploads are namespaced per user; regular pipeline runs are not.
    if self.runobj.vamps_user_upload:
        upload_site = self.runobj.site
        prefix = self.runobj.user + '_' + self.runobj.run
    else:
        upload_site = ''
        prefix = self.runobj.run
    run_lane = self.runobj.lane_name if self.runobj.lane_name else ''

    dirs = Dirs(self.runobj.vamps_user_upload, prefix, self.runobj.platform, lane_name = run_lane, site = upload_site)
    self.analysis_dir = dirs.check_dir(dirs.analysis_dir)
    self.fasta_dir = dirs.check_dir(dirs.reads_overlap_dir)
    self.gast_dir = dirs.check_dir(dirs.gast_dir)

    # NOTE(review): read but never used below — kept so a missing attribute
    # on runobj still fails loudly here.
    host_name = runobj.database_host
    database_name = runobj.database_name

    self.filenames = []
    self.my_conn = MyConnection(host = 'newbpcdb2', db="env454")
    self.sequence_table_name = "sequence_ill"
    self.sequence_field_name = "sequence_comp"
    self.my_csv = None

    # Start each run with a fresh unique-file-counts file.
    self.unique_file_counts = dirs.unique_file_counts
    dirs.delete_file(self.unique_file_counts)

    self.seq_id_dict = {}
    self.tax_id_dict = {}
    self.run_id = None
    self.nonchimeric_suffix = "." + C.nonchimeric_suffix  # ".nonchimeric.fa"
    self.fa_unique_suffix = ".fa." + C.unique_suffix      # ".fa.unique"
    self.suffix_used = ""
class dbUpload:
    """db upload methods

    Uploads sequences, per run/project/dataset counts, and GAST taxonomy
    results for an Illumina run into the env454 database.

    Call order:
        put_run_info
        insert_seq()
        insert_pdr_info()
        gast
        insert_taxonomy()
        insert_sequence_uniq_info_ill()

    TODO: add tests and test case
    TODO: change hardcoded values to args:
          sequence_table_name = "sequence_ill", sequence_field_name = "sequence_comp"
    TODO: generalize all bulk uploads and all inserts to not copy and paste
    TODO: add refssu_id
    TODO: change csv validaton for new fields
    """
    Name = "dbUpload"

    def __init__(self, runobj = None):
        """Wire up run metadata, working directories, and the env454 DB connection."""
        self.utils = PipelneUtils()
        self.runobj = runobj
        self.rundate = self.runobj.run
        self.use_cluster = 1
        self.unique_fasta_files = []
        # VAMPS user uploads are namespaced per user; regular pipeline runs are not.
        if self.runobj.vamps_user_upload:
            site = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
        else:
            site = ''
            dir_prefix = self.runobj.run
        if self.runobj.lane_name:
            lane_name = self.runobj.lane_name
        else:
            lane_name = ''
        self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site)
        self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir)
        self.fasta_dir = self.dirs.check_dir(self.dirs.reads_overlap_dir)
        self.gast_dir = self.dirs.check_dir(self.dirs.gast_dir)
        # NOTE(review): read but unused; kept so a missing attribute on runobj
        # still fails loudly here.  TODO: use these instead of the hardcoded host/db.
        host_name = runobj.database_host
        database_name = runobj.database_name
        self.filenames = []
        self.my_conn = MyConnection(host = 'newbpcdb2.jbpc-np.mbl.edu', db="env454")
        self.sequence_table_name = "sequence_ill"
        self.sequence_field_name = "sequence_comp"
        self.my_csv = None
        # Start each run with a fresh unique-file-counts file.
        self.unique_file_counts = self.dirs.unique_file_counts
        self.dirs.delete_file(self.unique_file_counts)
        self.seq_id_dict = {}
        self.tax_id_dict = {}
        self.run_id = None
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix  # ".nonchimeric.fa"
        self.fa_unique_suffix = ".fa." + C.unique_suffix      # ".fa.unique"
        self.v6_unique_suffix = "MERGED_V6_PRIMERS_REMOVED." + C.unique_suffix
        self.suff_list = [self.nonchimeric_suffix, self.fa_unique_suffix, self.v6_unique_suffix]
        self.suffix_used = ""

    def get_fasta_file_names(self):
        """Collect fasta files under self.fasta_dir whose names end with a known suffix.

        Caches the result on self.unique_fasta_files and also returns it,
        because of how it's called from pipelineprocessor.
        """
        files_names = self.dirs.get_all_files(self.fasta_dir)
        self.unique_fasta_files = [f for f in files_names.keys() if f.endswith(tuple(self.suff_list))]
        return self.unique_fasta_files

    def get_run_info_ill_id(self, filename_base):
        """Return the run_info_ill_id for this file prefix and run date, or None."""
        my_sql = """SELECT run_info_ill_id FROM run_info_ill
                    JOIN run using(run_id)
                    WHERE file_prefix = '%s' and run = '%s'""" % (filename_base, self.rundate)
        res = self.my_conn.execute_fetch_select(my_sql)
        if res:
            return int(res[0][0])

    def make_seq_upper(self, filename):
        """Read a fasta file and return its sequences uppercased (for VAMPS compatibility)."""
        read_fasta = fastalib.ReadFasta(filename)
        sequences = [seq.upper() for seq in read_fasta.sequences]
        read_fasta.close()
        return sequences

    def insert_seq(self, sequences):
        """Bulk-insert compressed sequences; duplicates are skipped via INSERT IGNORE.

        NOTE(review): values are spliced into the SQL string; sequences come
        from fasta files (A/C/G/T...), but parameterized queries would be safer.
        An empty `sequences` list produces a malformed query (COMPRESS with no
        argument) and will raise from the driver.
        """
        query_tmpl = "INSERT IGNORE INTO %s (%s) VALUES (COMPRESS(%s))"
        val_tmpl = "'%s'"
        my_sql = query_tmpl % (self.sequence_table_name, self.sequence_field_name,
                               ')), (COMPRESS('.join([val_tmpl % key for key in sequences]))
        seq_id = self.my_conn.execute_no_fetch(my_sql)
        self.utils.print_both("sequences in file: %s\n" % (len(sequences)))
        return seq_id

    def get_seq_id_dict(self, sequences):
        """Look up ids for the given sequences and merge them into self.seq_id_dict.

        The cache maps uncompressed sequence text -> sequence_ill_id (int).
        """
        id_name = self.sequence_table_name + "_id"
        query_tmpl = """SELECT %s, uncompress(%s) FROM %s WHERE %s in (COMPRESS(%s))"""
        val_tmpl = "'%s'"
        try:
            my_sql = query_tmpl % (id_name, self.sequence_field_name, self.sequence_table_name,
                                   self.sequence_field_name,
                                   '), COMPRESS('.join([val_tmpl % key for key in sequences]))
            res = self.my_conn.execute_fetch_select(my_sql)
            one_seq_id_dict = dict((y, int(x)) for x, y in res)
            self.seq_id_dict.update(one_seq_id_dict)
        except Exception:
            # Most common failure: nothing to look up because the fasta was empty.
            if len(sequences) == 0:
                self.utils.print_both(("ERROR: There are no sequences, please check if there are correct fasta files in the directory %s") % self.fasta_dir)
            raise

    def get_id(self, table_name, value):
        """Generic lookup: return <table_name>_id where the <table_name> column equals value."""
        id_name = table_name + '_id'
        my_sql = """SELECT %s FROM %s WHERE %s = '%s'""" % (id_name, table_name, table_name, value)
        res = self.my_conn.execute_fetch_select(my_sql)
        if res:
            return int(res[0][0])

    def get_sequence_id(self, seq):
        """Return the sequence_ill_id for a raw sequence, or None if absent."""
        my_sql = """SELECT sequence_ill_id FROM sequence_ill WHERE COMPRESS('%s') = sequence_comp""" % (seq)
        res = self.my_conn.execute_fetch_select(my_sql)
        if res:
            return int(res[0][0])

    def insert_pdr_info(self, fasta, run_info_ill_id):
        """Insert sequence count info per run/project/dataset for one fasta record.

        The read count is parsed from the fasta id, which ends in "...|...:<count>".
        Returns the driver's result id, or "" when run_info_ill_id is missing.
        """
        res_id = ""
        if (not run_info_ill_id):
            self.utils.print_both("ERROR: There is no run info yet, please check if it's uploaded to env454")
            # Bail out instead of building an INSERT with an empty id.
            return res_id
        seq_upper = fasta.seq.upper()
        sequence_ill_id = self.seq_id_dict[seq_upper]
        seq_count = int(fasta.id.split('|')[-1].split(':')[-1])
        my_sql = """INSERT IGNORE INTO sequence_pdr_info_ill (run_info_ill_id, sequence_ill_id, seq_count)
                    VALUES (%s, %s, %s)""" % (run_info_ill_id, sequence_ill_id, seq_count)
        try:
            res_id = self.my_conn.execute_no_fetch(my_sql)
            return res_id
        except Exception:
            self.utils.print_both("Offensive query: %s" % my_sql)
            raise

    def make_gast_files_dict(self):
        """Return {path: files} for all "gast" files under self.gast_dir."""
        return self.dirs.get_all_files(self.gast_dir, "gast")

    def gast_filename(self, filename):
        """Return the full path of the gast file matching filename, or None.

        TODO: if filename is in make_gast_files_dict, use its full path directly.
        """
        gast_file_names = self.make_gast_files_dict()
        # .items() instead of Py2-only .iteritems(); works on both 2 and 3.
        for gast_file_name_path, tpls in gast_file_names.items():
            if any(t.endswith(filename) for t in tpls):
                return gast_file_name_path

    def get_gast_result(self, filename):
        """Parse the gast result file for filename into {read_id: [taxonomy fields]}.

        Returns None (best-effort) when the file is missing or no gast file
        could be located for this filename.
        """
        gast_file_name = self.gast_filename(filename)
        self.utils.print_both("current gast_file_name = %s." % gast_file_name)
        try:
            with open(gast_file_name) as fd:
                gast_dict = dict([(l.split("\t")[0], l.split("\t")[1:]) for l in fd])
            return gast_dict
        except IOError as e:
            logger.debug("errno = %s" % e.errno)
            if e.errno == 2:
                # Suppress "No such file or directory" — gast output is optional.
                pass
        except TypeError as e:
            # gast_filename() returned None, so open(None) raised.
            self.utils.print_both("Check if there is a gast file under %s for %s." % (self.gast_dir, filename))
            pass