Example #1
0
    def run(self):
        """
        Parse self.input_filename into self.output_table for one organism.

        09-08-05

        Call sequence:
            --db_connect()
            --org_short2long()
            --org2tax_id()
            --setup_acc2gene_id()
            if self.new_table
                --create_output_table()
            --parse_input_filename()

        The tax_id of the organism is looked up once and reused both for the
        accession-to-gene_id mapping and for the parsing step. When
        self.commit is true, "end" is executed on the cursor as the last step.
        """
        (conn, curs) = db_connect(self.hostname, self.dbname)
        long_organism = org_short2long(self.organism)
        # Resolve the tax_id a single time; it is needed as a one-element set
        # for the accession mapping and as a plain value for the parser.
        tax_id = org2tax_id(long_organism)
        tax_id_set = Set([tax_id])

        acc2gene_id = MdbId2GeneId().setup_acc2gene_id(self.acc_file, tax_id_set)
        if self.new_table:
            self.create_output_table(curs, self.output_table)
        self.parse_input_filename(
            curs, self.input_filename, self.output_table, acc2gene_id,
            tax_id, self.up_length, self.comment, long_organism, self.type)
        if self.commit:
            # NOTE(review): presumably "end" commits the open transaction on
            # the database behind db_connect() -- confirm.
            curs.execute("end")
Example #2
0
	def run(self):
		"""
		Build the gene_id set and hand it to submit() with the lookup tables.

		09-19-05
			rewrite

		Call sequence:
			--db_connect()
			--get_global_gene_id2gene_no()
			--org2tax_id()
			--get_gene_id2mt_no_list()
			--return_gene_id_set()
			--submit()

		conn.commit() is called at the end only when self.needcommit is true.
		"""
		conn, curs = db_connect(self.hostname, self.dbname, self.schema)
		gene_id2gene_no = get_global_gene_id2gene_no(curs, self.organism)
		tax_id = org2tax_id(self.organism)

		# 01-14-06: the earlier way of building gene_id2mt_no_list is disabled
		# and kept here for future reference:
		#	gene_no2tf_set = get_gene_no2tf_set(curs)	#12-15-05 just yeast.
		#	#12-15-05 convert gene_no(integer) into gene_id(string)
		#	gene_id2mt_no_list = {}
		#	for gene_no, tf_set in gene_no2tf_set.iteritems():
		#		gene_id2mt_no_list[repr(gene_no)] = list(tf_set)
		gene_id2mt_no_list = get_gene_id2mt_no_list(tax_id)

		gene_id_set = self.return_gene_id_set(
			self.dir, gene_id2gene_no, self.min_frequency)

		# NOTE(review): output_table is not read from self here, unlike the
		# other settings; it must resolve to a name defined outside this
		# method -- confirm that this is intended.
		self.submit(
			curs, output_table, gene_id_set,
			gene_id2gene_no, gene_id2mt_no_list)

		if self.needcommit:
			conn.commit()
Example #3
0
	def run(self):
		"""
		Run transform_one_file() on every entry of self.files.

		07-31-05

		self.outputdir is created when it does not exist yet. The mapping
		passed to every transform_one_file() call comes from
		get_unigene2gene_list(self.mapping_file, tax_id).

		Progress is written to stderr as 'position/total: filename'. The
		position comes from enumerate() rather than self.files.index(), so a
		file name listed more than once is numbered by where it actually is
		in the list, and reporting no longer rescans the list for every file.
		"""
		if not os.path.isdir(self.outputdir):
			os.makedirs(self.outputdir)
		tax_id = org2tax_id(self.organism)
		mapping_dict = get_unigene2gene_list(self.mapping_file, tax_id)
		no_of_files = len(self.files)
		sys.stderr.write("\tTotally, %d files to be processed.\n"%no_of_files)
		for i, f in enumerate(self.files):
			# i is 0-based; the progress message counts from 1
			sys.stderr.write("%d/%d:\t%s"%(i+1, no_of_files, f))
			self.transform_one_file(f, self.delimiter, self.outputdir, mapping_dict, self.type)
			sys.stderr.write("\n")
Example #4
0
	def run(self):
		"""
		Start one output-format instance per enabled position of
		self.running_bit, then join all of them.

		09-28-05
		12-19-05
			use class_list and output_fname_list to ease program writing
		12-30-05
			fix a bug in indexing darwin_instance_list
		2006-09-25
			gene_id2symbol also serves as mt_no2tf_name
		2007-02-08
			add context_prediction_csv_format

		Position i of self.running_bit pairs with class_list[i] and
		output_fname_list[i]; the class is instantiated and start()ed only
		when that position holds '1'.
		"""
		out_dir = self.output_dir
		tf_darwin_ofname = os.path.join(out_dir, '%s.tf.darwin'%self.cluster_bs_table)
		input_basename = os.path.basename(self.input_fname)
		cluster_darwin_ofname = os.path.join(out_dir, '%s.cluster.darwin'%input_basename)
		prediction_darwin_ofname = os.path.join(out_dir, '%s.prediction.darwin'%input_basename)
		pattern_darwin_ofname = os.path.join(out_dir, '%s.pattern.darwin'%self.pattern_table)

		if not os.path.isdir(self.output_dir):
			os.makedirs(self.output_dir)
		conn, curs = db_connect(self.hostname, self.dbname, self.schema)

		tax_id = org2tax_id(self.organism)
		# 09-28-05 gene symbols and go names are wanted in the output
		# (rather than gene ids / go ids).
		# 01-26-06 the symbols go through replace_prime_in_gene_id2symbol().
		gene_id2symbol = self.replace_prime_in_gene_id2symbol(
			get_gene_id2gene_symbol(curs, tax_id))
		go_no2name = get_go_no2name(curs)

		# 2006-09-25 gene_id2symbol replaces the former get_mt_no2tf_name()
		mt_no2tf_name = gene_id2symbol

		# NOTE(review): unlike the other output names, this one is built from
		# self.input_fname without os.path.basename() -- confirm intended.
		context_prediction_csv_fname = os.path.join(self.output_dir, '%s.context.csv'%self.input_fname)

		# the two lists are parallel and indexed by position in running_bit
		class_list = [
			tf_darwin_format,
			cluster_darwin_format,
			prediction_darwin_format,
			pattern_darwin_format,
			context_prediction_csv_format,
		]
		output_fname_list = [
			tf_darwin_ofname,
			cluster_darwin_ofname,
			prediction_darwin_ofname,
			pattern_darwin_ofname,
			context_prediction_csv_fname,
		]

		darwin_instance_list = []
		for i, bit in enumerate(self.running_bit):
			if bit != '1':
				continue
			# NOTE(review): debug and report are not read from self; they
			# must resolve to names defined outside this method -- confirm.
			instance = class_list[i](
				self.hostname, self.dbname, self.schema, self.pattern_table,
				self.cluster_bs_table, self.input_fname, self.lm_bit, self.acc_cut_off,
				output_fname_list[i], gene_id2symbol, go_no2name, mt_no2tf_name, debug, report)
			darwin_instance_list.append(instance)
			instance.start()

		# wait for every started instance
		for instance in darwin_instance_list:
			instance.join()
Example #5
0
    def run(self):
        """
		11-15-05
			correct a bug related to self.size
		2006-08-27
			if sequence is empty, ignore it.
		2006-11-27
			add running_type
		"""
        if not os.path.isdir(self.folder):
            os.makedirs(self.folder)
        (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        if self.running_type == 1:
            curs.execute(
                "DECLARE crs CURSOR FOR SELECT id, sequence from prom_seq \
				where sequence is not null and strpos(chromosome, 'random')=0 and organism='%s'"
                % self.organism)
            #09-14-05	not null sequence and no 'random' in chromosome
        elif self.running_type == 2:
            curs.execute(
                "DECLARE crs CURSOR FOR SELECT r.id, r.sequence from sequence.raw_sequence r, sequence.annot_assembly a\
				where r.acc_ver=a.acc_ver and a.tax_id=%s" % org2tax_id(self.organism))
            #2006-11-27 from a specific tax_id, the later condition will guarantee that sequence is not empty
        else:
            sys.stderr.write("Unsupported running_type: %s\n" %
                             self.running_type)
            sys.exit(3)
        curs.execute("fetch %s from crs" % self.size)
        rows = curs.fetchall()
        counter = 0
        sys.stderr.write("Starting to output...\n")
        while rows:
            output_file = os.path.join(self.folder,
                                       '%s%s' % (self.prefix, counter))
            of = open(output_file, 'w')
            for row in rows:
                id, sequence = row
                if sequence:
                    of.write('>%s\n%s\n' % (id, sequence))
            del of
            counter += 1
            if self.report:
                sys.stderr.write('%s%s' % ('\x08' * 20, counter))
            curs.execute("fetch %s from crs" % self.size)
            rows = curs.fetchall()
        del conn, curs
        sys.stderr.write("Done.\n")
Example #6
0
	def run(self):
		"""
		Collect the matches per mt_no and dump them into self.output_table.

		09-18-05
		09-30-05
			way of calling get_mt_id2no() changed
			get gene_id_dict is a thread now, to speed up

		The get_gene_id_dict instance is start()ed before the database
		lookups and join()ed before its gene_id_dict attribute is used.
		When self.commit is true, "end" is executed on the cursor last.
		"""
		conn, curs = db_connect(self.hostname, self.dbname)
		tax_id = org2tax_id(self.organism)

		# kick off the gene_id_dict construction first so that it runs
		# while the lookups below are made
		gene_id_dict_worker = get_gene_id_dict(self.acc_file, Set([tax_id]))
		gene_id_dict_worker.start()

		mt_id2no = get_mt_id2no(curs, self.matrix_table)
		mt_no2matches = self.get_mt_no2matches(
			curs, self.input_table, self.prom_seq_table,
			self.top_number, self.organism, mt_id2no)

		# gene_id_dict is only complete once the worker has finished
		gene_id_dict_worker.join()
		gene_id_dict = gene_id_dict_worker.gene_id_dict
		self.dump2output_table(
			curs, self.output_table, mt_no2matches, gene_id_dict, tax_id)

		if self.commit:
			curs.execute("end")
Example #7
0
	def run(self):
		"""
		Write sequences fetched from the database into numbered chunk files.

		11-15-05
			correct a bug related to self.size
		2006-08-27
			if sequence is empty, ignore it.
		2006-11-27
			add running_type

		An SQL cursor named crs is declared according to self.running_type:
			1: id and sequence from prom_seq for self.organism
			2: id and sequence from sequence.raw_sequence joined with
			   sequence.annot_assembly on acc_ver, for the organism's tax_id
		Any other running_type writes a message to stderr and exits with
		status 3.

		Rows are fetched self.size at a time. Every batch goes into its own
		file under self.folder, named self.prefix followed by the batch
		counter; each non-empty sequence is written as a '>id' line followed
		by the sequence line. self.folder is created if it is missing.

		Each output file is closed by a with-block as soon as its batch is
		written, instead of relying on 'del' to release the handle.
		"""
		if not os.path.isdir(self.folder):
			os.makedirs(self.folder)
		(conn, curs) =  db_connect(self.hostname, self.dbname, self.schema)
		# NOTE(review): the values below are interpolated directly into the
		# SQL text -- confirm they never come from untrusted input.
		if self.running_type==1:
			#09-14-05	not null sequence and no 'random' in chromosome
			curs.execute("DECLARE crs CURSOR FOR SELECT id, sequence from prom_seq \
				where sequence is not null and strpos(chromosome, 'random')=0 and organism='%s'"%self.organism)
		elif self.running_type==2:
			#2006-11-27 from a specific tax_id, the later condition will guarantee that sequence is not empty
			curs.execute("DECLARE crs CURSOR FOR SELECT r.id, r.sequence from sequence.raw_sequence r, sequence.annot_assembly a\
				where r.acc_ver=a.acc_ver and a.tax_id=%s"%org2tax_id(self.organism))
		else:
			sys.stderr.write("Unsupported running_type: %s\n"%self.running_type)
			sys.exit(3)

		# the same statement pulls every batch, so build it once
		fetch_sql = "fetch %s from crs"%self.size
		curs.execute(fetch_sql)
		rows = curs.fetchall()
		counter = 0
		sys.stderr.write("Starting to output...\n")
		while rows:
			output_file = os.path.join(self.folder, '%s%s'%(self.prefix, counter))
			# the with-block flushes and closes the file even on error
			with open(output_file, 'w') as of:
				for seq_id, sequence in rows:
					if sequence:
						of.write('>%s\n%s\n'%(seq_id, sequence))
			counter += 1
			if self.report:
				# backspaces overwrite the previously printed counter
				sys.stderr.write('%s%s'%('\x08'*20, counter))
			curs.execute(fetch_sql)
			rows = curs.fetchall()
		del conn, curs
		sys.stderr.write("Done.\n")