def run(self):
    """
    09-08-05
        Connect to the database, build acc2gene_id through
        MdbId2GeneId.setup_acc2gene_id() and pass the input file on to
        parse_input_filename().

        --db_connect()
        --org_short2long()
        --org2tax_id()
        --setup_acc2gene_id()
        if self.new_table
            --create_output_table()
        --parse_input_filename()

    "end" is issued on the cursor only when self.commit is true
    (presumably to close the open transaction -- confirm against the
    database in use).
    """
    (conn, curs) = db_connect(self.hostname, self.dbname)
    long_organism = org_short2long(self.organism)
    # Resolve the tax_id once; it is needed both for the accession mapping
    # and as an argument to the parser.
    tax_id = org2tax_id(long_organism)
    acc2gene_id = MdbId2GeneId().setup_acc2gene_id(self.acc_file, Set([tax_id]))
    if self.new_table:
        self.create_output_table(curs, self.output_table)
    self.parse_input_filename(
        curs, self.input_filename, self.output_table, acc2gene_id,
        tax_id, self.up_length, self.comment, long_organism, self.type)
    if self.commit:
        curs.execute("end")
def run(self):
    """
    09-19-05
        rewrite

    Call sequence:
        --db_connect()
        --get_global_gene_id2gene_no()
        --org2tax_id()
        --get_gene_id2mt_no_list()
        --return_gene_id_set()
        --submit()

    conn.commit() is called only when self.needcommit is true.
    """
    conn, curs = db_connect(self.hostname, self.dbname, self.schema)
    gene_id2gene_no = get_global_gene_id2gene_no(curs, self.organism)
    # 01-14-06 the block below is commented out, kept for future use
    #   gene_no2tf_set = get_gene_no2tf_set(curs)  #12-15-05 just yeast.
    #   #12-15-05 convert gene_no(integer) into gene_id(string)
    #   gene_id2mt_no_list = {}
    #   for gene_no, tf_set in gene_no2tf_set.iteritems():
    #       gene_id2mt_no_list[repr(gene_no)] = list(tf_set)
    gene_id2mt_no_list = get_gene_id2mt_no_list(org2tax_id(self.organism))
    gene_id_set = self.return_gene_id_set(
        self.dir, gene_id2gene_no, self.min_frequency)
    # NOTE(review): output_table is a bare name, not self.output_table; it
    # only resolves if a module-level output_table exists -- confirm.
    self.submit(
        curs, output_table, gene_id_set, gene_id2gene_no, gene_id2mt_no_list)
    if self.needcommit:
        conn.commit()
def run(self):
    """
    07-31-05
        Transform every file in self.files through transform_one_file(),
        creating self.outputdir first if it does not exist.

        --org2tax_id()
        --get_unigene2gene_list()
        --transform_one_file()    (once per file)

    Progress ("i/total:<TAB>filename") is written to stderr. The position
    comes from enumerate() rather than self.files.index(f), so it stays
    correct when the same file name is listed more than once and does not
    rescan the list on every iteration.
    """
    if not os.path.isdir(self.outputdir):
        os.makedirs(self.outputdir)
    tax_id = org2tax_id(self.organism)
    mapping_dict = get_unigene2gene_list(self.mapping_file, tax_id)
    no_of_files = len(self.files)
    sys.stderr.write("\tTotally, %d files to be processed.\n" % no_of_files)
    for position, f in enumerate(self.files):
        sys.stderr.write("%d/%d:\t%s" % (position + 1, no_of_files, f))
        self.transform_one_file(
            f, self.delimiter, self.outputdir, mapping_dict, self.type)
        sys.stderr.write("\n")
def run(self):
    """
    09-28-05
    12-19-05
        use class_list and output_fname_list to ease program writing
    12-30-05
        fix a bug in indexing darwin_instance_list
    2006-09-25
    2007-02-08
        add context_prediction_csv_format

    For every position i where self.running_bit[i] == '1', class_list[i] is
    instantiated with output_fname_list[i], start()-ed immediately, and all
    started instances are join()-ed at the end (presumably thread-like
    workers -- the classes are defined outside this method).

    All output files are placed in self.output_dir. The context csv name is
    derived from the basename of self.input_fname, like the cluster and
    prediction outputs, so an input path containing directories can no
    longer push that file outside self.output_dir.
    """
    input_basename = os.path.basename(self.input_fname)
    tf_darwin_ofname = os.path.join(
        self.output_dir, '%s.tf.darwin' % self.cluster_bs_table)
    cluster_darwin_ofname = os.path.join(
        self.output_dir, '%s.cluster.darwin' % input_basename)
    prediction_darwin_ofname = os.path.join(
        self.output_dir, '%s.prediction.darwin' % input_basename)
    pattern_darwin_ofname = os.path.join(
        self.output_dir, '%s.pattern.darwin' % self.pattern_table)
    context_prediction_csv_fname = os.path.join(
        self.output_dir, '%s.context.csv' % input_basename)

    if not os.path.isdir(self.output_dir):
        os.makedirs(self.output_dir)

    conn, curs = db_connect(self.hostname, self.dbname, self.schema)
    tax_id = org2tax_id(self.organism)
    #gene_no2id = get_gene_no2gene_id(curs)    #Watch, if unigene, should use this.
    gene_id2symbol = get_gene_id2gene_symbol(curs, tax_id)
    gene_id2symbol = self.replace_prime_in_gene_id2symbol(gene_id2symbol)    #01-26-06
    #gene_no2symbol = dict_transfer(gene_no2id, gene_id2symbol)
    #Jasmine wants the gene symbol 09-28-05
    #gene_id is integer in gene.gene table and same as gene_no, so just use it.
    go_no2name = get_go_no2name(curs)    #09-28-05 Jasmine wants the go_name, not go_id

    #2006-09-25 use gene_id2symbol to replace mt_no2tf_name
    #mt_no2tf_name = get_mt_no2tf_name()
    mt_no2tf_name = gene_id2symbol

    # class_list and output_fname_list are parallel: position i of
    # self.running_bit switches on class_list[i] writing to output_fname_list[i].
    class_list = [tf_darwin_format, cluster_darwin_format, prediction_darwin_format,
                  pattern_darwin_format, context_prediction_csv_format]
    output_fname_list = [tf_darwin_ofname, cluster_darwin_ofname, prediction_darwin_ofname,
                         pattern_darwin_ofname, context_prediction_csv_fname]

    darwin_instance_list = []
    for i, bit in enumerate(self.running_bit):
        if bit != '1':
            continue
        # NOTE(review): debug and report are bare names here, not self
        # attributes; they must exist at module level -- confirm.
        darwin_instance = class_list[i](
            self.hostname, self.dbname, self.schema, self.pattern_table,
            self.cluster_bs_table, self.input_fname, self.lm_bit, self.acc_cut_off,
            output_fname_list[i], gene_id2symbol, go_no2name, mt_no2tf_name,
            debug, report)    #2006-09-25
        darwin_instance_list.append(darwin_instance)
        # Start the instance just created (12-30-05: not the one at index i).
        darwin_instance.start()

    for darwin_instance in darwin_instance_list:
        darwin_instance.join()
def run(self):
    """
    11-15-05
        correct a bug related to self.size
    2006-08-27
        if sequence is empty, ignore it.
    2006-11-27
        add running_type

    Declare a database cursor over (id, sequence) rows and write them out
    in batches of self.size rows, one file per batch, named
    self.prefix + counter inside self.folder (created if missing). Each
    non-empty sequence is written as a '>id' line followed by the sequence.

    running_type 1 reads from prom_seq for self.organism; running_type 2
    reads from sequence.raw_sequence joined with sequence.annot_assembly
    for the organism's tax_id. Any other value writes an error to stderr
    and exits with status 3.

    Every output file is closed explicitly, also when writing fails,
    instead of relying on 'del' to release the handle.
    """
    if not os.path.isdir(self.folder):
        os.makedirs(self.folder)
    (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
    if self.running_type == 1:
        #09-14-05 not null sequence and no 'random' in chromosome
        curs.execute("DECLARE crs CURSOR FOR SELECT id, sequence from prom_seq \
            where sequence is not null and strpos(chromosome, 'random')=0 and organism='%s'" % self.organism)
    elif self.running_type == 2:
        #2006-11-27 from a specific tax_id, the later condition will guarantee that sequence is not empty
        curs.execute("DECLARE crs CURSOR FOR SELECT r.id, r.sequence from sequence.raw_sequence r, sequence.annot_assembly a\
            where r.acc_ver=a.acc_ver and a.tax_id=%s" % org2tax_id(self.organism))
    else:
        sys.stderr.write("Unsupported running_type: %s\n" % self.running_type)
        sys.exit(3)

    # The same statement pulls every batch from the declared cursor.
    fetch_statement = "fetch %s from crs" % self.size
    curs.execute(fetch_statement)
    rows = curs.fetchall()
    counter = 0
    sys.stderr.write("Starting to output...\n")
    while rows:
        output_file = os.path.join(self.folder, '%s%s' % (self.prefix, counter))
        of = open(output_file, 'w')
        try:
            for seq_id, sequence in rows:
                if sequence:    # 2006-08-27 skip empty sequences
                    of.write('>%s\n%s\n' % (seq_id, sequence))
        finally:
            of.close()
        counter += 1
        if self.report:
            # Backspaces move back over the previously printed counter.
            sys.stderr.write('%s%s' % ('\x08' * 20, counter))
        curs.execute(fetch_statement)
        rows = curs.fetchall()
    del conn, curs
    sys.stderr.write("Done.\n")
def run(self):
    """
    09-18-05
    09-30-05
        way of calling get_mt_id2no() changed
        get gene_id_dict is a thread now, to speed up

    The get_gene_id_dict instance is start()-ed first, the database lookups
    (get_mt_id2no(), get_mt_no2matches()) run in the meantime, and the
    instance is join()-ed before its gene_id_dict is handed to
    dump2output_table(). "end" is issued on the cursor only when
    self.commit is true.
    """
    conn, curs = db_connect(self.hostname, self.dbname)
    tax_id = org2tax_id(self.organism)

    # Kick off the gene_id_dict construction before touching the database.
    gene_id_dict_worker = get_gene_id_dict(self.acc_file, Set([tax_id]))
    gene_id_dict_worker.start()

    mt_id2no = get_mt_id2no(curs, self.matrix_table)
    mt_no2matches = self.get_mt_no2matches(
        curs, self.input_table, self.prom_seq_table, self.top_number,
        self.organism, mt_id2no)

    # gene_id_dict is required from here on, so wait for the worker.
    gene_id_dict_worker.join()
    gene_id_dict = gene_id_dict_worker.gene_id_dict
    self.dump2output_table(
        curs, self.output_table, mt_no2matches, gene_id_dict, tax_id)

    if self.commit:
        curs.execute("end")
def run(self):
    """
    11-15-05
        correct a bug related to self.size
    2006-08-27
        if sequence is empty, ignore it.
    2006-11-27
        add running_type

    Declare a database cursor over (id, sequence) rows and write them out
    in batches of self.size rows, one file per batch, named
    self.prefix + counter inside self.folder (created if missing). Each
    non-empty sequence is written as a '>id' line followed by the sequence.

    running_type 1 reads from prom_seq for self.organism; running_type 2
    reads from sequence.raw_sequence joined with sequence.annot_assembly
    for the organism's tax_id. Any other value writes an error to stderr
    and exits with status 3.

    Every output file is closed explicitly, also when writing fails,
    instead of relying on 'del' to release the handle.
    """
    if not os.path.isdir(self.folder):
        os.makedirs(self.folder)
    (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
    if self.running_type==1:
        #09-14-05 not null sequence and no 'random' in chromosome
        curs.execute("DECLARE crs CURSOR FOR SELECT id, sequence from prom_seq \
            where sequence is not null and strpos(chromosome, 'random')=0 and organism='%s'"%self.organism)
    elif self.running_type==2:
        #2006-11-27 from a specific tax_id, the later condition will guarantee that sequence is not empty
        curs.execute("DECLARE crs CURSOR FOR SELECT r.id, r.sequence from sequence.raw_sequence r, sequence.annot_assembly a\
            where r.acc_ver=a.acc_ver and a.tax_id=%s"%org2tax_id(self.organism))
    else:
        sys.stderr.write("Unsupported running_type: %s\n"%self.running_type)
        sys.exit(3)

    # The same statement pulls every batch from the declared cursor.
    fetch_statement = "fetch %s from crs"%self.size
    curs.execute(fetch_statement)
    rows = curs.fetchall()
    counter = 0
    sys.stderr.write("Starting to output...\n")
    while rows:
        output_file = os.path.join(self.folder, '%s%s'%(self.prefix, counter))
        of = open(output_file, 'w')
        try:
            for seq_id, sequence in rows:
                if sequence:    # 2006-08-27 skip empty sequences
                    of.write('>%s\n%s\n'%(seq_id, sequence))
        finally:
            of.close()
        counter += 1
        if self.report:
            # Backspaces move back over the previously printed counter.
            sys.stderr.write('%s%s'%('\x08'*20, counter))
        curs.execute(fetch_statement)
        rows = curs.fetchall()
    del conn, curs
    sys.stderr.write("Done.\n")