Beispiel #1
0
	def run(self):
		"""
		Compare the calls stored in the stock database against an external
		comparison dataset, then optionally output and/or store the result.

		Steps:
			1. reject an unsupported self.run_type (only 1 and 2 are accepted)
				before any database work starts.
			2. open a StockDB session and begin a transaction.
			3. read the comparison dataset (snpData2) from self.cmp_data_filename.
			4. fetch the database calls (snpData1) with the help of a raw MySQLdb cursor.
			5. run_type 1: row-wise comparison. The result is written to
				self.output_fname if that is set, and handed to submit_to_call_QC()
				if self.commit is set and self.input_dir is not.
				run_type 2: nothing is computed (column-wise QC is not implemented yet).
			6. commit the session if self.commit is set, otherwise roll it back.

		Exits the process with status 5 if self.run_type is not supported.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()

		# Fail fast: checking run_type here means sys.exit() can never fire in the
		# middle of an open transaction with a live MySQL connection.
		if self.run_type not in (1, 2):
			sys.stderr.write("run_type=%s is not supported.\n"%self.run_type)
			sys.exit(5)

		db = StockDB.StockDB(drivername=self.drivername, username=self.db_user,
			password=self.db_passwd, hostname=self.hostname, database=self.dbname)
		db.setup(create_tables=False)
		session = db.session
		session.begin()

		self.cmp_data_filename = self.findOutCmpDataFilename(self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod)
		header, strain_acc_list, category_list, data_matrix = read_data(self.cmp_data_filename)
		#it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db.
		#A list comprehension (instead of map()) yields a real list on every Python version.
		strain_acc_list = [int(strain_acc) for strain_acc in strain_acc_list]
		snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \
							data_matrix=data_matrix)	#category_list is not used.

		readme = formReadmeObj(sys.argv, self.ad, StockDB.README)
		session.save(readme)

		import MySQLdb
		from dbSNP2data import dbSNP2data
		conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user = self.db_user, passwd = self.db_passwd)
		try:
			curs = conn.cursor()
			snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(curs, StockDB.Calls.table.name, StockDB.SNPs.table.name)
			strain_info_data = self.get_strain_id_info(self.QC_method_id)
			data_matrix = self.get_data_matrix(db, strain_info_data.strain_id2index, snp_id2index, StockDB.Calls.table.name)
			strain_acc_list = [strain_info_data.strain_id2acc[strain_id] for strain_id in strain_info_data.strain_id_list]
			category_list = [strain_info_data.strain_id2category[strain_id] for strain_id in strain_info_data.strain_id_list]
			#two label columns followed by one column per SNP; the SNP name is the 1st item of each snp_id2info entry
			header = ['ecotypeid', 'strainid'] + [snp_id2info[snp_id][0] for snp_id in snp_id_list]
			#NOTE(review): the original comment called this "the stock_250k snps_table", but the value names stock.snps -- confirm which is intended.
			snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list, data_matrix=data_matrix, \
							snps_table='stock.snps')

			twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, curs=curs, \
								QC_method_id=self.QC_method_id, user=self.db_user, row_matching_by_which_value=0, debug=self.debug)
			if self.run_type==1:
				row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
			else:
				#run_type 2
				#twoSNPData.save_col_wise(session, readme)	#2008-08-18 need to implement a new one for 149SNP
				row_id2NA_mismatch_rate = {}
			if self.output_fname and self.run_type==1 and row_id2NA_mismatch_rate:
				self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname)

			if self.run_type==1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
				#if self.input_dir is given, no db submission.
				#row_id2NA_mismatch_rate might be None if it's method 0.
				self.submit_to_call_QC(session, row_id2NA_mismatch_rate, self.QC_method_id, self.db_user, \
									twoSNPData.row_id12row_id2, readme)
		finally:
			#the raw connection only serves curs above; release it whether or not the comparison succeeded
			conn.close()
		if self.commit:
			session.commit()
		else:
			session.rollback()
Beispiel #2
0
	def prepareTwoSNPData(self, db, max_mismatch_rate=0.25, min_no_of_non_NA_pairs=40, report=0):
		"""
		Build and return the TwoSNPData object that pairs the 149SNP data (snpData1)
		with the dataset it is compared against (snpData2).

		snpData1 is read from self.input_fname if that is set, otherwise from the database.
		snpData2 is snpData1 itself if self.QC_method_id==4. Otherwise it is read from the
		file returned by findOutCmpDataFilename(), which is also stored back into
		self.cmp_data_filename.

		Arguments:
			db: database object handed to self.get_data_matrix(); only used when
				self.input_fname is not set.
			max_mismatch_rate, min_no_of_non_NA_pairs, report: passed on to TwoSNPData unchanged.

		2009-9-23
			add arguments max_mismatch_rate & min_no_of_non_NA_pairs, and pass them to twoSNPData.
			However it's useless to control what should be inserted into db because TwoSNPData.qc_cross_match_table is
			not defined and even if it's defined, the table it'll create doesn't concord to the one in 149SNP db.
		2008-09-10
			if self.input_fname is given, get 149SNP data from it, instead of database
		2008-8-28
			split out of run() so that MpiQC149CrossMatch could call this easily
		"""
		if self.input_fname:
			header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
		else:
			#The raw MySQL connection is needed by this branch only: open it here (not
			#unconditionally) and close it as soon as the database reads are done.
			import MySQLdb
			from dbSNP2data import dbSNP2data
			conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user = self.db_user, passwd = self.db_passwd)
			try:
				curs = conn.cursor()
				snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(curs, StockDB.Calls.table.name, StockDB.SNPs.table.name)
				strain_info_data = self.get_strain_id_info(self.QC_method_id, ignore_strains_with_qc=False)
				data_matrix = self.get_data_matrix(db, strain_info_data.strain_id2index, snp_id2index, StockDB.Calls.table.name)
			finally:
				conn.close()
			strain_acc_list = [strain_info_data.strain_id2acc[strain_id] for strain_id in strain_info_data.strain_id_list]	#tg_ecotypeid
			category_list = [strain_info_data.strain_id2category[strain_id] for strain_id in strain_info_data.strain_id_list]	#strainid
			#two label columns followed by one column per SNP; the SNP name is the 1st item of each snp_id2info entry
			header = ['ecotypeid', 'strainid'] + [snp_id2info[snp_id][0] for snp_id in snp_id_list]
		#NOTE(review): the original comment called this "the stock_250k snps_table", but the value names stock.snps -- confirm which is intended.
		snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list, data_matrix=data_matrix, \
						snps_table='stock.snps')
		if self.QC_method_id==4:
			snpData2 = snpData1
		else:
			self.cmp_data_filename = self.findOutCmpDataFilename(self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod)
			header, strain_acc_list, category_list, data_matrix = read_data(self.cmp_data_filename)
			#it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db.
			#A list comprehension (instead of map()) yields a real list on every Python version.
			strain_acc_list = [int(strain_acc) for strain_acc in strain_acc_list]
			snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \
							data_matrix=data_matrix)	#category_list is not used to facilitate row-id matching

		twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, \
							QC_method_id=self.QC_method_id, user=self.db_user, row_matching_by_which_value=0, debug=self.debug,\
							max_mismatch_rate=max_mismatch_rate, min_no_of_non_NA_pairs=min_no_of_non_NA_pairs, report=report)
		return twoSNPData
    def prepareTwoSNPData(self,
                          db,
                          max_mismatch_rate=0.25,
                          min_no_of_non_NA_pairs=40,
                          report=0):
        """
        Build and return the TwoSNPData object that pairs the 149SNP data
        (snpData1) with the dataset it is compared against (snpData2).

        snpData1 is read from self.input_fname if that is set, otherwise from
        the database. snpData2 is snpData1 itself if self.QC_method_id == 4.
        Otherwise it is read from the file returned by
        findOutCmpDataFilename(), which is also stored back into
        self.cmp_data_filename.

        Arguments:
            db: database object handed to self.get_data_matrix(); only used
                when self.input_fname is not set.
            max_mismatch_rate, min_no_of_non_NA_pairs, report: passed on to
                TwoSNPData unchanged.

        2009-9-23
            add arguments max_mismatch_rate & min_no_of_non_NA_pairs, and pass them to twoSNPData.
            However it's useless to control what should be inserted into db because TwoSNPData.qc_cross_match_table is
            not defined and even if it's defined, the table it'll create doesn't concord to the one in 149SNP db.
        2008-09-10
            if self.input_fname is given, get 149SNP data from it, instead of database
        2008-8-28
            split out of run() so that MpiQC149CrossMatch could call this easily
        """
        if self.input_fname:
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.input_fname)
        else:
            # The raw MySQL connection is needed by this branch only: open it
            # here (not unconditionally) and close it as soon as the database
            # reads are done.
            import MySQLdb
            from dbSNP2data import dbSNP2data
            conn = MySQLdb.connect(db=self.dbname,
                                   host=self.hostname,
                                   user=self.db_user,
                                   passwd=self.db_passwd)
            try:
                curs = conn.cursor()
                snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
                    curs, StockDB.Calls.table.name, StockDB.SNPs.table.name)
                strain_info_data = self.get_strain_id_info(
                    self.QC_method_id, ignore_strains_with_qc=False)
                data_matrix = self.get_data_matrix(
                    db, strain_info_data.strain_id2index, snp_id2index,
                    StockDB.Calls.table.name)
            finally:
                conn.close()
            strain_acc_list = [
                strain_info_data.strain_id2acc[strain_id]
                for strain_id in strain_info_data.strain_id_list
            ]  #tg_ecotypeid
            category_list = [
                strain_info_data.strain_id2category[strain_id]
                for strain_id in strain_info_data.strain_id_list
            ]  #strainid
            # Two label columns followed by one column per SNP; the SNP name
            # is the 1st item of each snp_id2info entry.
            header = ['ecotypeid', 'strainid'] + [
                snp_id2info[snp_id][0] for snp_id in snp_id_list
            ]
        # NOTE(review): the original comment called this "the stock_250k
        # snps_table", but the value names stock.snps -- confirm which is intended.
        snpData1 = SNPData(header=header,
                           strain_acc_list=strain_acc_list,
                           category_list=category_list,
                           data_matrix=data_matrix,
                           snps_table='stock.snps')
        if self.QC_method_id == 4:
            snpData2 = snpData1
        else:
            self.cmp_data_filename = self.findOutCmpDataFilename(
                self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod)
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.cmp_data_filename)
            # It's ecotypeid, cast it to integer to be compatible to the later
            # ecotype_id_ls from db. A list comprehension (instead of map())
            # yields a real list on every Python version.
            strain_acc_list = [int(strain_acc) for strain_acc in strain_acc_list]
            # category_list is not used to facilitate row-id matching
            snpData2 = SNPData(header=header,
                               strain_acc_list=strain_acc_list,
                               data_matrix=data_matrix)

        twoSNPData = TwoSNPData(SNPData1=snpData1,
                                SNPData2=snpData2,
                                QC_method_id=self.QC_method_id,
                                user=self.db_user,
                                row_matching_by_which_value=0,
                                debug=self.debug,
                                max_mismatch_rate=max_mismatch_rate,
                                min_no_of_non_NA_pairs=min_no_of_non_NA_pairs,
                                report=report)
        return twoSNPData
Beispiel #4
0
    def run(self):
        """
        Compare the calls stored in the stock database against an external
        comparison dataset, then optionally output and/or store the result.

        Steps:
            1. reject an unsupported self.run_type (only 1 and 2 are
               accepted) before any database work starts.
            2. open a StockDB session and begin a transaction.
            3. read the comparison dataset (snpData2) from
               self.cmp_data_filename.
            4. fetch the database calls (snpData1) with the help of a raw
               MySQLdb cursor.
            5. run_type 1: row-wise comparison. The result is written to
               self.output_fname if that is set, and handed to
               submit_to_call_QC() if self.commit is set and self.input_dir
               is not.
               run_type 2: nothing is computed (column-wise QC is not
               implemented yet).
            6. commit the session if self.commit is set, otherwise roll it
               back.

        Exits the process with status 5 if self.run_type is not supported.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        # Fail fast: checking run_type here means sys.exit() can never fire in
        # the middle of an open transaction with a live MySQL connection.
        if self.run_type not in (1, 2):
            sys.stderr.write("run_type=%s is not supported.\n" % self.run_type)
            sys.exit(5)

        db = StockDB.StockDB(drivername=self.drivername,
                             username=self.db_user,
                             password=self.db_passwd,
                             hostname=self.hostname,
                             database=self.dbname)
        db.setup(create_tables=False)
        session = db.session
        session.begin()

        self.cmp_data_filename = self.findOutCmpDataFilename(
            self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod)
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.cmp_data_filename)
        # It's ecotypeid, cast it to integer to be compatible to the later
        # ecotype_id_ls from db. A list comprehension (instead of map())
        # yields a real list on every Python version.
        strain_acc_list = [int(strain_acc) for strain_acc in strain_acc_list]
        # category_list is not used.
        snpData2 = SNPData(header=header,
                           strain_acc_list=strain_acc_list,
                           data_matrix=data_matrix)

        readme = formReadmeObj(sys.argv, self.ad, StockDB.README)
        session.save(readme)

        import MySQLdb
        from dbSNP2data import dbSNP2data
        conn = MySQLdb.connect(db=self.dbname,
                               host=self.hostname,
                               user=self.db_user,
                               passwd=self.db_passwd)
        try:
            curs = conn.cursor()
            snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
                curs, StockDB.Calls.table.name, StockDB.SNPs.table.name)
            strain_info_data = self.get_strain_id_info(self.QC_method_id)
            data_matrix = self.get_data_matrix(
                db, strain_info_data.strain_id2index, snp_id2index,
                StockDB.Calls.table.name)
            strain_acc_list = [
                strain_info_data.strain_id2acc[strain_id]
                for strain_id in strain_info_data.strain_id_list
            ]
            category_list = [
                strain_info_data.strain_id2category[strain_id]
                for strain_id in strain_info_data.strain_id_list
            ]
            # Two label columns followed by one column per SNP; the SNP name
            # is the 1st item of each snp_id2info entry.
            header = ['ecotypeid', 'strainid'] + [
                snp_id2info[snp_id][0] for snp_id in snp_id_list
            ]
            # NOTE(review): the original comment called this "the stock_250k
            # snps_table", but the value names stock.snps -- confirm which is
            # intended.
            snpData1 = SNPData(header=header,
                               strain_acc_list=strain_acc_list,
                               category_list=category_list,
                               data_matrix=data_matrix,
                               snps_table='stock.snps')

            twoSNPData = TwoSNPData(SNPData1=snpData1,
                                    SNPData2=snpData2,
                                    curs=curs,
                                    QC_method_id=self.QC_method_id,
                                    user=self.db_user,
                                    row_matching_by_which_value=0,
                                    debug=self.debug)
            if self.run_type == 1:
                row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
            else:
                # run_type 2
                #twoSNPData.save_col_wise(session, readme)	#2008-08-18 need to implement a new one for 149SNP
                row_id2NA_mismatch_rate = {}
            if self.output_fname and self.run_type == 1 and row_id2NA_mismatch_rate:
                self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate,
                                                    self.output_fname)

            if self.run_type == 1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
                # If self.input_dir is given, no db submission.
                # row_id2NA_mismatch_rate might be None if it's method 0.
                self.submit_to_call_QC(session, row_id2NA_mismatch_rate,
                                       self.QC_method_id, self.db_user,
                                       twoSNPData.row_id12row_id2, readme)
        finally:
            # The raw connection only serves curs above; release it whether or
            # not the comparison succeeded.
            conn.close()
        if self.commit:
            session.commit()
        else:
            session.rollback()