def run(self):
    """
    Load two SNP datasets and write the second one re-ordered like the first.

    self.input_fname1 and self.input_fname2 are read into SNPData objects
    (turn_into_array=1). The result of
    TwoSNPData.order2ndSNPDataRowsSameAs1stSNPData() is written to
    self.output_fname.

    self.row_matching_by_which_value selects the mode:
        0     -> first input is loaded with ignore_2nd_column=1;
                 TwoSNPData receives row_matching_by_which_value=None
        1, 2  -> TwoSNPData receives the value minus one (0 or 1)
        other -> TwoSNPData receives None

    2008-06-02
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    match_mode = self.row_matching_by_which_value

    # Only mode 0 asks SNPData to ignore the 2nd column of the first input.
    first_kwargs = {'input_fname': self.input_fname1, 'turn_into_array': 1}
    if match_mode == 0:
        first_kwargs['ignore_2nd_column'] = 1
    reference_data = SNPData(**first_kwargs)
    data_to_reorder = SNPData(input_fname=self.input_fname2,
                              turn_into_array=1)

    # TwoSNPData counts from 0, the option from 1; anything else means None.
    matching_arg = match_mode - 1 if match_mode in (1, 2) else None

    pair = TwoSNPData(SNPData1=reference_data,
                      SNPData2=data_to_reorder,
                      debug=self.debug,
                      row_matching_by_which_value=matching_arg)
    pair.order2ndSNPDataRowsSameAs1stSNPData().tofile(self.output_fname)
Esempio n. 2
0
    def qcDataMatrixVSsnpData(self, pdata, snps_name2snps_id, snpData2, curs,
                              session, readme):
        """
        Compare one call data matrix (pdata) against the reference snpData2.

        pdata supplies header, ecotype_id_ls, call_info_id_ls and data_matrix
        for the first dataset. The comparison depends on self.run_type:
            1     -> row-wise, via TwoSNPData.cmp_row_wise()
            2     -> TwoSNPData.save_col_wise(session, readme); no row results
            other -> message on stderr, then sys.exit(5)

        Returns a PassingData with two attributes: row_id2NA_mismatch_rate
        (an empty dict for run_type 2) and row_id12row_id2 (copied from the
        TwoSNPData object).

        2008-08-16
            split from run() to enable one_by_one option
        """
        # Ecotype ids are passed as strain_acc_list and call_info ids as
        # category_list, i.e. swapped on purpose.
        # snps_table refers to the stock_250k snps table.
        call_snp_data = SNPData(
            header=pdata.header,
            strain_acc_list=pdata.ecotype_id_ls,
            category_list=pdata.call_info_id_ls,
            data_matrix=pdata.data_matrix,
            min_probability=self.min_probability,
            call_method_id=self.call_method_id,
            col_id2id=snps_name2snps_id,
            max_call_info_mismatch_rate=self.max_call_info_mismatch_rate,
            snps_table='stock_250k.snps')

        pair = TwoSNPData(
            SNPData1=call_snp_data,
            SNPData2=snpData2,
            curs=curs,
            QC_method_id=self.QC_method_id,
            user=self.user,
            row_matching_by_which_value=0,
            debug=self.debug)

        run_type = self.run_type
        if run_type not in (1, 2):
            sys.stderr.write("run_type=%s is not supported.\n" % run_type)
            sys.exit(5)
        if run_type == 1:
            mismatch_rates = pair.cmp_row_wise()
        else:
            pair.save_col_wise(session, readme)
            mismatch_rates = {}

        outcome = PassingData()
        outcome.row_id2NA_mismatch_rate = mismatch_rates
        outcome.row_id12row_id2 = pair.row_id12row_id2
        return outcome
Esempio n. 3
0
	def run(self):
		"""
		Compare the SNP calls stored in the stock database with the comparison
		dataset and, for run_type 1, output and/or submit the row-wise results.

		Steps:
			1. open a StockDB session and resolve self.cmp_data_filename through
			   self.findOutCmpDataFilename()
			2. load the comparison file as the second SNPData
			3. build the first SNPData from the database (StockDB.Calls / SNPs)
			4. run the comparison chosen by self.run_type
			   (1: TwoSNPData.cmp_row_wise(); 2: nothing yet; other: exit 5)
			5. write the rates to self.output_fname and/or hand them to
			   self.submit_to_call_QC() when the respective options are set
			6. commit the session if self.commit is set, otherwise roll back
		"""
		if self.debug:
			import pdb
			pdb.set_trace()

		db = StockDB.StockDB(
			drivername=self.drivername,
			username=self.db_user,
			password=self.db_passwd,
			hostname=self.hostname,
			database=self.dbname)
		db.setup(create_tables=False)
		session = db.session
		session.begin()

		# second dataset: the comparison file (its category_list is not used)
		self.cmp_data_filename = self.findOutCmpDataFilename(
			self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod)
		cmp_header, cmp_accessions, _cmp_categories, cmp_matrix = read_data(self.cmp_data_filename)
		# ecotype ids are cast to integer to be compatible with the ids from db
		cmp_accessions = map(int, cmp_accessions)
		reference_data = SNPData(header=cmp_header, strain_acc_list=cmp_accessions, data_matrix=cmp_matrix)

		readme = formReadmeObj(sys.argv, self.ad, StockDB.README)
		session.save(readme)

		# first dataset: pulled from the database
		import MySQLdb
		conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.db_user, passwd=self.db_passwd)
		curs = conn.cursor()
		from dbSNP2data import dbSNP2data
		calls_table = StockDB.Calls.table.name
		snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
			curs, calls_table, StockDB.SNPs.table.name)
		strain_info = self.get_strain_id_info(self.QC_method_id)
		db_matrix = self.get_data_matrix(db, strain_info.strain_id2index, snp_id2index, StockDB.Calls.table.name)
		db_accessions = [strain_info.strain_id2acc[strain_id] for strain_id in strain_info.strain_id_list]
		db_categories = [strain_info.strain_id2category[strain_id] for strain_id in strain_info.strain_id_list]
		# header = two id columns followed by one SNP name per snp_id
		db_header = ['ecotypeid', 'strainid'] + [
			snp_name for snp_name, _chromosome, _position in
			(snp_id2info[snp_id] for snp_id in snp_id_list)]
		# snps_table points at 'stock.snps'
		db_data = SNPData(header=db_header, strain_acc_list=db_accessions, category_list=db_categories,
			data_matrix=db_matrix, snps_table='stock.snps')

		pair = TwoSNPData(SNPData1=db_data, SNPData2=reference_data, curs=curs,
			QC_method_id=self.QC_method_id, user=self.db_user, row_matching_by_which_value=0,
			debug=self.debug)

		run_type = self.run_type
		if run_type not in (1, 2):
			sys.stderr.write("run_type=%s is not supported.\n"%run_type)
			sys.exit(5)
		# run_type 2 (column-wise saving) still needs an implementation for
		# 149SNP (note from 2008-08-18), so it produces no row results.
		mismatch_rates = pair.cmp_row_wise() if run_type == 1 else {}

		has_row_results = run_type == 1 and bool(mismatch_rates)
		if has_row_results and self.output_fname:
			self.output_row_id2NA_mismatch_rate(mismatch_rates, self.output_fname)

		# No db submission when self.input_dir is given: call_info_id2fname is
		# then fake, actually keyed by (array_id, ecotypeid).
		# The rates might be None if it's method 0.
		if has_row_results and self.commit and not self.input_dir:
			self.submit_to_call_QC(session, mismatch_rates, self.QC_method_id, self.db_user,
				pair.row_id12row_id2, readme)

		if self.commit:
			session.commit()
		else:
			session.rollback()
Esempio n. 4
0
    def qcDataMatrixVSsnpData(self, pdata, snps_name2snps_id, snpData2, curs, session, readme):
        """
        Run the QC comparison of a call data matrix against snpData2.

        The first dataset is assembled from pdata (header, ecotype_id_ls,
        call_info_id_ls, data_matrix). self.run_type picks the comparison:
            1     -> TwoSNPData.cmp_row_wise(), whose result is returned
            2     -> TwoSNPData.save_col_wise(session, readme); the returned
                     row_id2NA_mismatch_rate is an empty dict
            other -> error message on stderr and sys.exit(5)

        Returns a PassingData holding row_id2NA_mismatch_rate and
        row_id12row_id2 (the latter taken from the TwoSNPData object).

        2008-08-16
            split from run() to enable one_by_one option
        """
        # strain_acc_list receives the ecotype ids and category_list the
        # call_info ids (deliberately swapped).
        # snps_table is the stock_250k one.
        array_data = SNPData(header=pdata.header,
                             strain_acc_list=pdata.ecotype_id_ls,
                             category_list=pdata.call_info_id_ls,
                             data_matrix=pdata.data_matrix,
                             min_probability=self.min_probability,
                             call_method_id=self.call_method_id,
                             col_id2id=snps_name2snps_id,
                             max_call_info_mismatch_rate=self.max_call_info_mismatch_rate,
                             snps_table="stock_250k.snps")
        comparison = TwoSNPData(SNPData1=array_data,
                                SNPData2=snpData2,
                                curs=curs,
                                QC_method_id=self.QC_method_id,
                                user=self.user,
                                row_matching_by_which_value=0,
                                debug=self.debug)

        requested = self.run_type
        if requested == 2:
            comparison.save_col_wise(session, readme)
            rates_by_row = {}
        elif requested == 1:
            rates_by_row = comparison.cmp_row_wise()
        else:
            sys.stderr.write("run_type=%s is not supported.\n" % requested)
            sys.exit(5)

        bundle = PassingData()
        bundle.row_id2NA_mismatch_rate = rates_by_row
        bundle.row_id12row_id2 = comparison.row_id12row_id2
        return bundle
Esempio n. 5
0
	def plone_run(self, min_call_info_mismatch_rate=0.1):
		"""
		Cross-match the selected call_info entries against the QC dataset.

		The comparison dataset is read from self.cmp_data_filename. When that
		is empty and self.QC_method_id is not 0, the filename is taken from
		the text after '=' in the QC method's data_description; the program
		exits with status 3 if it is still unknown. Only ids listed in
		self.call_info_id_ls are handled. The pairwise distances are computed
		by TwoSNPData.cal_row_id2pairwise_dist(), which also does the database
		submission.

		Always returns None (also when there is nothing to compare, in which
		case "No good arrays." is written to stderr).

		2009-6-9
			pass self.max_mismatch_rate, self.min_no_of_non_NA_pairs to TwoSNPData to filter entries stored in db.
		2009-4-13
			add min_call_info_mismatch_rate
		2009-2-5
			add "create_tables=False" to db.setup()
		2008-07-02
			fix a bug which causes the program to continue read data even while call_info_id2fname is empty and input_dir is null.
		2008-07-01
			adjust to the newest functions in QC_250k.py
		2008-04-25
			return None if QC_method_id==0
		2008-04-20
			for plone to call it just to get row_id2NA_mismatch_rate
		"""
		import MySQLdb
		conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
		self.curs = curs = conn.cursor()

		# ORM-level database connection and session
		db = Stock_250kDB.Stock_250kDB(
			username=self.user,
			password=self.passwd,
			hostname=self.hostname,
			database=self.dbname)
		db.setup(create_tables=False)
		db.session.begin()

		# If cmp_data_filename is not specified, look for "...=<filename>" in
		# the data_description column of the QC method; give up if that fails.
		qc_method = QCMethod.query.get(self.QC_method_id)
		if not self.cmp_data_filename and self.QC_method_id != 0:
			description = qc_method.data_description
			if description:
				pieces = description.split('=')
				if len(pieces) > 1:
					self.cmp_data_filename = pieces[1].strip()
			if not self.cmp_data_filename:
				sys.stderr.write("cmp_data_filename is still nothing even after db query. please specify it on the commandline.\n")
				sys.exit(3)

		# comparison dataset; its category_list is not used
		header, strain_acc_list, category_list, data_matrix = read_data(self.cmp_data_filename)
		# ecotype ids are cast to integer to be compatible with the ids from db
		strain_acc_list = map(int, strain_acc_list)
		reference_data = SNPData(
			header=header,
			strain_acc_list=strain_acc_list,
			data_matrix=data_matrix,
			snps_table=self.QC_method_id2snps_table.get(self.QC_method_id),
			ignore_het=qc_method.ignore_het)

		if self.input_dir:
			# 04/22/08 Watch: this mapping is fake here, it is actually keyed by
			# (array_id, ecotypeid); no submission to db.
			available_id2fname = self.get_array_id2fname(curs, self.input_dir)
		else:
			call_data = self.get_call_info_id2fname(
				db, self.QC_method_id, self.call_method_id,
				filter_calls_QCed=0,
				max_call_info_mismatch_rate=1,
				min_call_info_mismatch_rate=min_call_info_mismatch_rate,
				debug=self.debug)
			available_id2fname = call_data.call_info_id2fname
			call_info_ls_to_return = call_data.call_info_ls_to_return	# not used further in this method

		# 2008-07-01 keep only the call_info ids that were asked for
		call_info_id2fname = {}
		for wanted_id in self.call_info_id_ls:
			if wanted_id not in available_id2fname:
				if self.report:
					sys.stderr.write("%s not in call_info_id2fname.\n"%wanted_id)
				continue
			call_info_id2fname[wanted_id] = available_id2fname[wanted_id]

		if call_info_id2fname:
			pdata = self.read_call_matrix(call_info_id2fname, self.min_probability)
			header = pdata.header
			call_info_id_ls = pdata.call_info_id_ls
			array_id_ls = pdata.array_id_ls	# not used further in this method
			ecotype_id_ls = pdata.ecotype_id_ls
			data_matrix = pdata.data_matrix
		elif self.input_dir:	#2008-07-02
			# input file is in SNP by strain format with a double header
			# (1st two lines): ecotype ids first, call_info ids second
			header, snps_name_ls, category_list, data_matrix = FilterStrainSNPMatrix.read_data(
				self.input_dir, double_header=1)
			ecotype_id_ls = header[0][2:]
			call_info_id_ls = header[1][2:]
			data_matrix = numpy.array(data_matrix).transpose()
			header = ['', ''] + snps_name_ls	# fake a header for SNPData
		else:	#2008-07-02
			sys.stderr.write("No good arrays.\n")
			return None

		# ecotype ids go in as strain_acc_list and call_info ids as
		# category_list (swapped on purpose); snps_table is the stock_250k one
		array_data = SNPData(
			header=header,
			strain_acc_list=ecotype_id_ls,
			category_list=call_info_id_ls,
			data_matrix=data_matrix,
			min_probability=self.min_probability,
			call_method_id=self.call_method_id,
			col_id2id=None,
			max_call_info_mismatch_rate=self.max_call_info_mismatch_rate,
			snps_table='stock_250k.snps')

		# 2009-6-9 cross-matching results whose mismatch rates are below
		# max_mismatch_rate would be put into db.
		pair = TwoSNPData(
			SNPData1=array_data,
			SNPData2=reference_data,
			curs=curs,
			QC_method_id=self.QC_method_id,
			user=self.user,
			row_matching_by_which_value=0,
			debug=self.debug,
			max_mismatch_rate=self.max_mismatch_rate,
			min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs)

		# 2008-05-01 create a cross match table temporarily
		pair.qc_cross_match_table = 'qc_cross_match'
		pair.new_QC_cross_match_table = self.new_QC_cross_match_table
		pair.cal_row_id2pairwise_dist()	# database submission is done along
		return None
    def prepareTwoSNPData(self,
                          db,
                          max_mismatch_rate=0.25,
                          min_no_of_non_NA_pairs=40,
                          report=0):
        """
        Build the TwoSNPData object used for the 149SNP QC / cross-matching.

        First dataset: read from self.input_fname when it is set, otherwise
        assembled from the database (StockDB.Calls / StockDB.SNPs tables).
        Second dataset: the first dataset itself when self.QC_method_id == 4,
        otherwise the comparison file resolved through
        self.findOutCmpDataFilename().

        Arguments:
            db: database object handed to self.get_data_matrix().
            max_mismatch_rate, min_no_of_non_NA_pairs, report: passed through
                to TwoSNPData unchanged.

        Returns:
            the TwoSNPData instance (created with row_matching_by_which_value=0).

        The raw MySQLdb connection is only opened on the database path and is
        closed again before returning. It used to be opened unconditionally
        and never closed.

        2009-9-23
            add arguments max_mismatch_rate & min_no_of_non_NA_pairs, and pass them to twoSNPData.
            However it's useless to control what should be inserted into db because TwoSNPData.qc_cross_match_table is
            not defined and even if it's defined, the table it'll create doesn't concord to the one in 149SNP db.
        2008-09-10
            if self.input_fname is given, get 149SNP data from it , instead of database
        2008-8-28
            split out of run() so that MpiQC149CrossMatch could call this easily
        """
        if self.input_fname:
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.input_fname)
        else:
            # Only this branch needs the cursor, so the connection lives here.
            import MySQLdb
            from dbSNP2data import dbSNP2data
            conn = MySQLdb.connect(db=self.dbname,
                                   host=self.hostname,
                                   user=self.db_user,
                                   passwd=self.db_passwd)
            try:
                curs = conn.cursor()
                snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
                    curs, StockDB.Calls.table.name, StockDB.SNPs.table.name)
                strain_info_data = self.get_strain_id_info(
                    self.QC_method_id, ignore_strains_with_qc=False)
                data_matrix = self.get_data_matrix(
                    db, strain_info_data.strain_id2index, snp_id2index,
                    StockDB.Calls.table.name)
                strain_acc_list = [
                    strain_info_data.strain_id2acc[strain_id]
                    for strain_id in strain_info_data.strain_id_list
                ]  #tg_ecotypeid
                category_list = [
                    strain_info_data.strain_id2category[strain_id]
                    for strain_id in strain_info_data.strain_id_list
                ]  #strainid
                header = ['ecotypeid', 'strainid']
                for snp_id in snp_id_list:
                    snp_name, chromosome, position = snp_id2info[snp_id]
                    header.append(snp_name)
            finally:
                # the cursor is not handed to TwoSNPData, so nothing needs
                # the connection after this point
                conn.close()

        # snps_table is set to the 'stock.snps' table
        snpData1 = SNPData(header=header,
                           strain_acc_list=strain_acc_list,
                           category_list=category_list,
                           data_matrix=data_matrix,
                           snps_table='stock.snps')

        if self.QC_method_id == 4:
            # method 4 compares the dataset against itself
            snpData2 = snpData1
        else:
            self.cmp_data_filename = self.findOutCmpDataFilename(
                self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod)
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.cmp_data_filename)
            # it's ecotypeid, cast it to integer to be compatible to the later
            # ecotype_id_ls from db; built as a real list so it can be indexed
            # and iterated more than once
            strain_acc_list = [int(strain_acc) for strain_acc in strain_acc_list]
            # category_list is not used to facilitate row-id matching
            snpData2 = SNPData(header=header,
                               strain_acc_list=strain_acc_list,
                               data_matrix=data_matrix)

        twoSNPData = TwoSNPData(SNPData1=snpData1,
                                SNPData2=snpData2,
                                QC_method_id=self.QC_method_id,
                                user=self.db_user,
                                row_matching_by_which_value=0,
                                debug=self.debug,
                                max_mismatch_rate=max_mismatch_rate,
                                min_no_of_non_NA_pairs=min_no_of_non_NA_pairs,
                                report=report)
        return twoSNPData
Esempio n. 7
0
    def doFilter(self, snpData, snpData_qc_strain, snpData_qc_snp,
                 min_call_probability, max_call_mismatch_rate, max_call_NA_rate,
                 max_snp_mismatch_rate, max_snp_NA_rate, npute_window_size,
                 output_dir=None):
        """
        Filter snpData with one set of thresholds, impute it and collect QC data.

        Pipeline, in order:
            1. drop rows by mismatch rate against snpData_qc_strain
               (max_call_mismatch_rate), then by NA rate (max_call_NA_rate)
            2. drop columns by mismatch rate against snpData_qc_snp
               (max_snp_mismatch_rate), then by NA rate (max_snp_NA_rate)
            3. merge with snpData_qc_snp (priority=2) and drop monomorphic columns
            4. if output_dir is given, write the matrix before imputation
            5. compare with snpData_qc_strain row- and column-wise
               (stored as *_mismatch_rate0)
            6. if more than 5 rows are left, impute with NPUTE.samplingImpute(),
               optionally write the imputed matrix, and compare again
               (stored as *_mismatch_rate1)

        min_call_probability is only used in the output file names and copied
        into the returned data.

        Returns a one-element list holding a PassingData with the removal
        counters, the mismatch-rate results and the thresholds used.

        2009-10-11
            replace imputeData() with NPUTE.samplingImpute(..., no_of_accessions_per_sampling=300, coverage=3) to avoid memory blowup.
        2008-12-22
            replace '=' and ',' with '_' in the output filename
        2008-05-19
            matrix_ls has to be of length >0 before concatenation
        2008-05-19
            use SNPData structure
        2008-05-18
            add onlyCommon=True to FilterAccessions.filterByError()
        2008-05-17
            add argument output_dir. if it's available, output data matrix before and after imputation
        2008-05-12
            add
            qcdata.no_of_accessions_filtered_by_mismatch
            qcdata.no_of_accessions_filtered_by_na
            qcdata.no_of_snps_filtered_by_mismatch
            qcdata.no_of_snps_filtered_by_na
            qcdata.no_of_monomorphic_snps_removed
        2008-05-11
            split up from computing_node_handler
        """
        qcdata = PassingData()

        # 1. accession (row) filtering; the TwoSNPData helper is a temporary
        #    and is released as soon as the comparison is done
        row_mismatch = TwoSNPData(SNPData1=snpData,
                                  SNPData2=snpData_qc_strain,
                                  row_matching_by_which_value=0,
                                  debug=self.debug).cmp_row_wise()
        working = SNPData.removeRowsByMismatchRate(snpData, row_mismatch,
                                                   max_call_mismatch_rate)
        qcdata.no_of_accessions_filtered_by_mismatch = working.no_of_rows_removed

        working = SNPData.removeRowsByNARate(working, max_call_NA_rate)
        qcdata.no_of_accessions_filtered_by_na = working.no_of_rows_removed

        # 2. SNP (column) filtering
        col_mismatch = TwoSNPData(SNPData1=working,
                                  SNPData2=snpData_qc_snp,
                                  row_matching_by_which_value=0,
                                  debug=self.debug).cmp_col_wise()
        working = SNPData.removeColsByMismatchRate(working, col_mismatch,
                                                   max_snp_mismatch_rate)
        qcdata.no_of_snps_filtered_by_mismatch = working.no_of_cols_filtered_by_mismatch

        working = SNPData.removeColsByNARate(working, max_snp_NA_rate)
        qcdata.no_of_snps_filtered_by_na = working.no_of_cols_filtered_by_na

        # 3. merge with the SNP QC data, then remove monomorphic columns
        working = TwoSNPData(SNPData1=working,
                             SNPData2=snpData_qc_snp,
                             row_matching_by_which_value=0,
                             debug=self.debug).mergeTwoSNPData(priority=2)
        working = SNPData.removeMonomorphicCols(working)
        qcdata.no_of_monomorphic_snps_removed = working.no_of_monomorphic_cols

        # 4. optional dump of the matrix before imputation
        if output_dir:
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)
            # the thresholds are encoded in the file name
            output_fname_prefix_ls = [
                'min_oligo_call_probability_%s' % min_call_probability,
                'max_array_mismatch_rate_%s' % max_call_mismatch_rate,
                'max_array_NA_rate_%s' % max_call_NA_rate,
                'max_snp_mismatch_rate_%s' % max_snp_mismatch_rate,
                'max_snp_NA_rate_%s' % max_snp_NA_rate,
                'npute_window_size_%s' % npute_window_size,
            ]
            before_fname = '_'.join(output_fname_prefix_ls + ['before_imputation.tsv'])
            working.tofile(os.path.join(output_dir, before_fname))

        # 5. QC of the filtered matrix before imputation
        pre_imputation = TwoSNPData(SNPData1=working,
                                    SNPData2=snpData_qc_strain,
                                    row_matching_by_which_value=0)
        row_id2NA_mismatch_rate0 = pre_imputation.cmp_row_wise()
        col_id2NA_mismatch_rate0 = pre_imputation.cmp_col_wise()
        del pre_imputation

        # 6. imputation, only with more than 5 rows left.
        #    2009-10-8: NPUTE.samplingImpute() is used instead of the older
        #    per-chromosome imputeData().
        if len(working.row_id_ls) > 5:
            imputed_matrix, imputed_snp_names = NPUTE.samplingImpute(
                working.col_id_ls,
                working.data_matrix,
                input_file_format=1,
                input_NA_char=0,
                lower_case_for_imputation=False,
                npute_window_size=int(npute_window_size),
                no_of_accessions_per_sampling=300,
                coverage=3)
            imputed = SNPData(row_id_ls=working.row_id_ls,
                              col_id_ls=imputed_snp_names,
                              data_matrix=imputed_matrix)
            if output_dir:  #2008-05-16 write the imputed data out as well
                after_fname = '_'.join(output_fname_prefix_ls + ['after_imputation.tsv'])
                imputed.tofile(os.path.join(output_dir, after_fname))

            post_imputation = TwoSNPData(SNPData1=imputed,
                                         SNPData2=snpData_qc_strain,
                                         row_matching_by_which_value=0)
            qcdata.row_id2NA_mismatch_rate1 = post_imputation.cmp_row_wise()
            qcdata.col_id2NA_mismatch_rate1 = post_imputation.cmp_col_wise()
            del post_imputation, imputed
        # with 5 rows or fewer the *_mismatch_rate1 attributes are left unset
        del working  # release the filtered matrix before returning

        qcdata.row_id2NA_mismatch_rate0 = row_id2NA_mismatch_rate0
        qcdata.col_id2NA_mismatch_rate0 = col_id2NA_mismatch_rate0

        # remember which thresholds produced these numbers
        qcdata.min_call_probability = min_call_probability
        qcdata.max_call_mismatch_rate = max_call_mismatch_rate
        qcdata.max_call_NA_rate = max_call_NA_rate
        qcdata.max_snp_mismatch_rate = max_snp_mismatch_rate
        qcdata.max_snp_NA_rate = max_snp_NA_rate
        qcdata.npute_window_size = npute_window_size
        return [qcdata]
Esempio n. 8
0
    def run(self):
        """
        Compare the SNP calls stored in the stock database with the comparison
        dataset and, for run_type 1, output and/or submit the row-wise results.

        Steps:
            1. open a StockDB session and resolve self.cmp_data_filename through
               self.findOutCmpDataFilename()
            2. load the comparison file as the second SNPData
            3. build the first SNPData from the database (StockDB.Calls / SNPs)
            4. run the comparison chosen by self.run_type
               (1: TwoSNPData.cmp_row_wise(); 2: nothing yet; other: exit 5)
            5. write the rates to self.output_fname and/or hand them to
               self.submit_to_call_QC() when the respective options are set
            6. commit the session if self.commit is set, otherwise roll back
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        db = StockDB.StockDB(drivername=self.drivername, username=self.db_user,
                             password=self.db_passwd, hostname=self.hostname,
                             database=self.dbname)
        db.setup(create_tables=False)
        session = db.session
        session.begin()

        # second dataset: the comparison file (its category_list is not used)
        self.cmp_data_filename = self.findOutCmpDataFilename(
            self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod)
        cmp_header, cmp_accessions, _cmp_categories, cmp_matrix = read_data(
            self.cmp_data_filename)
        # ecotype ids are cast to integer to be compatible with the ids from db
        cmp_accessions = map(int, cmp_accessions)
        reference_data = SNPData(header=cmp_header,
                                 strain_acc_list=cmp_accessions,
                                 data_matrix=cmp_matrix)

        readme = formReadmeObj(sys.argv, self.ad, StockDB.README)
        session.save(readme)

        # first dataset: pulled from the database
        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname, host=self.hostname,
                               user=self.db_user, passwd=self.db_passwd)
        curs = conn.cursor()
        from dbSNP2data import dbSNP2data
        calls_table = StockDB.Calls.table.name
        snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
            curs, calls_table, StockDB.SNPs.table.name)
        strain_info = self.get_strain_id_info(self.QC_method_id)
        db_matrix = self.get_data_matrix(db, strain_info.strain_id2index,
                                         snp_id2index, StockDB.Calls.table.name)
        db_accessions = [strain_info.strain_id2acc[strain_id]
                         for strain_id in strain_info.strain_id_list]
        db_categories = [strain_info.strain_id2category[strain_id]
                         for strain_id in strain_info.strain_id_list]
        # header = two id columns followed by one SNP name per snp_id
        db_header = ['ecotypeid', 'strainid'] + [
            snp_name for snp_name, _chromosome, _position in
            (snp_id2info[snp_id] for snp_id in snp_id_list)]
        # snps_table points at 'stock.snps'
        db_data = SNPData(header=db_header,
                          strain_acc_list=db_accessions,
                          category_list=db_categories,
                          data_matrix=db_matrix,
                          snps_table='stock.snps')

        pair = TwoSNPData(SNPData1=db_data,
                          SNPData2=reference_data,
                          curs=curs,
                          QC_method_id=self.QC_method_id,
                          user=self.db_user,
                          row_matching_by_which_value=0,
                          debug=self.debug)

        run_type = self.run_type
        if run_type not in (1, 2):
            sys.stderr.write("run_type=%s is not supported.\n" % run_type)
            sys.exit(5)
        # run_type 2 (column-wise saving) still needs an implementation for
        # 149SNP (note from 2008-08-18), so it produces no row results.
        mismatch_rates = pair.cmp_row_wise() if run_type == 1 else {}

        has_row_results = run_type == 1 and bool(mismatch_rates)
        if has_row_results and self.output_fname:
            self.output_row_id2NA_mismatch_rate(mismatch_rates,
                                                self.output_fname)

        # No db submission when self.input_dir is given: call_info_id2fname is
        # then fake, actually keyed by (array_id, ecotypeid).
        # The rates might be None if it's method 0.
        if has_row_results and self.commit and not self.input_dir:
            self.submit_to_call_QC(session, mismatch_rates, self.QC_method_id,
                                   self.db_user, pair.row_id12row_id2, readme)

        if self.commit:
            session.commit()
        else:
            session.rollback()
Esempio n. 9
0
    def plone_run(self, min_call_info_mismatch_rate=0.1):
        """
		2009-6-9
			pass self.max_mismatch_rate, self.min_no_of_non_NA_pairs to TwoSNPData to filter entries stored in db.
		2009-4-13
			add min_call_info_mismatch_rate
		2009-2-5
			add "create_tables=False" to db.setup()
		2008-07-02
			fix a bug which causes the program to continue read data even while call_info_id2fname is empty and input_dir is null.
		2008-07-01
			adjust to the newest functions in QC_250k.py
		2008-04-25
			return None if QC_method_id==0
		2008-04-20
			for plone to call it just to get row_id2NA_mismatch_rate
		"""

        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname,
                               host=self.hostname,
                               user=self.user,
                               passwd=self.passwd)
        curs = conn.cursor()
        self.curs = curs

        #database connection and etc
        db = Stock_250kDB.Stock_250kDB(username=self.user,
                                       password=self.passwd,
                                       hostname=self.hostname,
                                       database=self.dbname)
        db.setup(create_tables=False)
        session = db.session
        session.begin()
        #transaction = session.create_transaction()
        # if cmp_data_filename not specified, try to find in the data_description column in table QC_method.
        qm = QCMethod.query.get(self.QC_method_id)
        if not self.cmp_data_filename and self.QC_method_id != 0:
            if qm.data_description:
                data_description_ls = qm.data_description.split('=')
                if len(data_description_ls) > 1:
                    self.cmp_data_filename = qm.data_description.split(
                        '=')[1].strip()

        #after db query, cmp_data_filename is still nothing, exit program.
        if not self.cmp_data_filename and self.QC_method_id != 0:
            sys.stderr.write(
                "cmp_data_filename is still nothing even after db query. please specify it on the commandline.\n"
            )
            sys.exit(3)

        #from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.cmp_data_filename)
        strain_acc_list = map(
            int, strain_acc_list
        )  #it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
        snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \
            data_matrix=data_matrix, snps_table=self.QC_method_id2snps_table.get(self.QC_method_id), ignore_het=qm.ignore_het)
        #category_list is not used.

        if self.input_dir:
            #04/22/08 Watch: call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
            #no submission to db
            call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
        else:
            #call_info_id2fname = self.get_call_info_id2fname(curs, self.call_info_table, self.call_QC_table, self.QC_method_id)
            call_data = self.get_call_info_id2fname(db, self.QC_method_id, self.call_method_id, \
             filter_calls_QCed=0, max_call_info_mismatch_rate=1, min_call_info_mismatch_rate=min_call_info_mismatch_rate,\
             debug=self.debug)
            call_info_id2fname = call_data.call_info_id2fname
            call_info_ls_to_return = call_data.call_info_ls_to_return

        #2008-07-01 pick the call_info_ids to be handled
        new_call_info_id2fname = {}
        for call_info_id_wanted in self.call_info_id_ls:
            if call_info_id_wanted in call_info_id2fname:
                new_call_info_id2fname[
                    call_info_id_wanted] = call_info_id2fname[
                        call_info_id_wanted]
            elif self.report:
                sys.stderr.write("%s not in call_info_id2fname.\n" %
                                 (call_info_id_wanted))
        call_info_id2fname = new_call_info_id2fname

        if call_info_id2fname:
            pdata = self.read_call_matrix(call_info_id2fname,
                                          self.min_probability)
            header = pdata.header
            call_info_id_ls = pdata.call_info_id_ls
            array_id_ls = pdata.array_id_ls
            ecotype_id_ls = pdata.ecotype_id_ls
            data_matrix = pdata.data_matrix
        elif self.input_dir:  #2008-07-02
            #input file is SNP by strain format. double header (1st two lines)
            header, snps_name_ls, category_list, data_matrix = FilterStrainSNPMatrix.read_data(
                self.input_dir, double_header=1)
            ecotype_id_ls = header[0][2:]
            call_info_id_ls = header[1][2:]
            data_matrix = numpy.array(data_matrix)
            data_matrix = data_matrix.transpose()
            header = ['', ''] + snps_name_ls  #fake a header for SNPData
        else:  #2008-07-02
            sys.stderr.write("No good arrays.\n")
            return None

        snps_name2snps_id = None

        #swap the ecotype_id_ls and call_info_id_ls when passing them to SNPData. now strain_acc_list=ecotype_id_ls
        snpData1 = SNPData(header=header, strain_acc_list=ecotype_id_ls, category_list= call_info_id_ls, data_matrix=data_matrix, \
            min_probability=self.min_probability, call_method_id=self.call_method_id, col_id2id=snps_name2snps_id,\
            max_call_info_mismatch_rate=self.max_call_info_mismatch_rate, snps_table='stock_250k.snps') #snps_table is set to the stock_250k snps_table

        twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, curs=curs, \
             QC_method_id=self.QC_method_id, user=self.user, row_matching_by_which_value=0, debug=self.debug,\
             max_mismatch_rate=self.max_mismatch_rate, min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs)
        #2009-6-9 cross-matching results whose mismatch_rates are below max_mismatch_rate would be put into db.

        row_id2NA_mismatch_rate = None

        #2008-05-01 create a cross match table temporarily
        twoSNPData.qc_cross_match_table = 'qc_cross_match'
        twoSNPData.new_QC_cross_match_table = self.new_QC_cross_match_table
        twoSNPData.cal_row_id2pairwise_dist(
        )  #database submission is done along.
        return row_id2NA_mismatch_rate
Esempio n. 10
0
    def doFilter(
        self,
        snpData,
        snpData_qc_strain,
        snpData_qc_snp,
        min_call_probability,
        max_call_mismatch_rate,
        max_call_NA_rate,
        max_snp_mismatch_rate,
        max_snp_NA_rate,
        npute_window_size,
        output_dir=None,
    ):
        """
        Filter a SNP matrix against QC data, impute it, and collect QC statistics.

        Pipeline, in order:
            1. compare snpData with snpData_qc_strain row-wise and drop rows by
               mismatch rate, then by NA rate;
            2. compare the result with snpData_qc_snp column-wise and drop columns
               by mismatch rate, then by NA rate;
            3. merge the result with snpData_qc_snp (mergeTwoSNPData(priority=2))
               and remove monomorphic columns;
            4. optionally write the matrix to output_dir ("before_imputation");
            5. compare with snpData_qc_strain row- and column-wise (pre-imputation);
            6. if more than 5 rows are left, impute with NPUTE.samplingImpute(),
               optionally write the imputed matrix ("after_imputation"), and compare
               it with snpData_qc_strain again (post-imputation).

        Arguments:
            snpData -- SNPData object to be filtered.
            snpData_qc_strain -- SNPData used as the reference in all row-wise
                comparisons and in the pre-/post-imputation comparisons.
            snpData_qc_snp -- SNPData used as the reference in the column-wise
                comparison and as the second operand of the merge.
            min_call_probability -- not used for filtering in this method; it only
                goes into the output filename and onto the returned qcdata.
            max_call_mismatch_rate -- threshold handed to SNPData.removeRowsByMismatchRate().
            max_call_NA_rate -- threshold handed to SNPData.removeRowsByNARate().
            max_snp_mismatch_rate -- threshold handed to SNPData.removeColsByMismatchRate().
            max_snp_NA_rate -- threshold handed to SNPData.removeColsByNARate().
            npute_window_size -- cast to int and passed to NPUTE.samplingImpute().
            output_dir -- if given, it is created when missing and the matrices
                before and after imputation are written into it as .tsv files.

        Returns:
            A one-element list holding a PassingData object with the filter counts,
            the pre-imputation comparison results (row_id2NA_mismatch_rate0,
            col_id2NA_mismatch_rate0) and the thresholds used. The post-imputation
            results (row_id2NA_mismatch_rate1, col_id2NA_mismatch_rate1) are set
            only when imputation was actually run (more than 5 rows left).

        Change log:
            2009-10-11
                replace imputeData() with NPUTE.samplingImpute(..., no_of_accessions_per_sampling=300, coverage=3) to avoid memory blowup.
            2008-12-22
                replace '=' and ',' with '_' in the output filename
            2008-05-19
                matrix_ls has to be of length >0 before concatenation
            2008-05-19
                use SNPData structure
            2008-05-18
                add onlyCommon=True to FilterAccessions.filterByError()
            2008-05-17
                add argument output_dir. if it's available, output data matrix before and after imputation
            2008-05-12
                add
                qcdata.no_of_accessions_filtered_by_mismatch
                qcdata.no_of_accessions_filtered_by_na
                qcdata.no_of_snps_filtered_by_mismatch
                qcdata.no_of_snps_filtered_by_na
                qcdata.no_of_monomorphic_snps_removed
            2008-05-11
                split up from computing_node_handler
        """
        qcdata = PassingData()

        # ---- step 1: row (accession) filtering against the strain QC data ----
        strain_comparison = TwoSNPData(
            SNPData1=snpData,
            SNPData2=snpData_qc_strain,
            row_matching_by_which_value=0,
            debug=self.debug,
        )
        row_rates = strain_comparison.cmp_row_wise()
        del strain_comparison  # release the comparison structure as early as possible

        filtered = SNPData.removeRowsByMismatchRate(snpData, row_rates, max_call_mismatch_rate)
        qcdata.no_of_accessions_filtered_by_mismatch = filtered.no_of_rows_filtered_by_mismatch

        filtered = SNPData.removeRowsByNARate(filtered, max_call_NA_rate)
        qcdata.no_of_accessions_filtered_by_na = filtered.no_of_rows_filtered_by_na

        # ---- step 2: column (SNP) filtering against the SNP QC data ----
        snp_comparison = TwoSNPData(
            SNPData1=filtered,
            SNPData2=snpData_qc_snp,
            row_matching_by_which_value=0,
            debug=self.debug,
        )
        col_rates = snp_comparison.cmp_col_wise()
        del snp_comparison

        filtered = SNPData.removeColsByMismatchRate(filtered, col_rates, max_snp_mismatch_rate)
        qcdata.no_of_snps_filtered_by_mismatch = filtered.no_of_cols_filtered_by_mismatch

        filtered = SNPData.removeColsByNARate(filtered, max_snp_NA_rate)
        qcdata.no_of_snps_filtered_by_na = filtered.no_of_cols_filtered_by_na

        # ---- step 3: merge with the SNP QC data, then drop monomorphic SNPs ----
        # NOTE(review): priority=2 presumably lets SNPData2 win on conflicts -- confirm
        # against TwoSNPData.mergeTwoSNPData().
        merger = TwoSNPData(
            SNPData1=filtered,
            SNPData2=snpData_qc_snp,
            row_matching_by_which_value=0,
            debug=self.debug,
        )
        filtered = merger.mergeTwoSNPData(priority=2)
        del merger

        filtered = SNPData.removeMonomorphicCols(filtered)
        qcdata.no_of_monomorphic_snps_removed = filtered.no_of_monomorphic_cols

        # ---- step 4: optional dump of the matrix before imputation ----
        if output_dir:
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)
            # All thresholds are encoded in the filename; the same stem is reused
            # for the after-imputation file further down.
            fname_stem = "_".join(
                [
                    "min_oligo_call_probability_%s" % min_call_probability,
                    "max_array_mismatch_rate_%s" % max_call_mismatch_rate,
                    "max_array_NA_rate_%s" % max_call_NA_rate,
                    "max_snp_mismatch_rate_%s" % max_snp_mismatch_rate,
                    "max_snp_NA_rate_%s" % max_snp_NA_rate,
                    "npute_window_size_%s" % npute_window_size,
                ]
            )
            filtered.tofile(os.path.join(output_dir, fname_stem + "_before_imputation.tsv"))

        # ---- step 5: QC of the filtered matrix before imputation ----
        # (this comparison is built without the debug flag, as in the original code)
        pre_impute_comparison = TwoSNPData(
            SNPData1=filtered, SNPData2=snpData_qc_strain, row_matching_by_which_value=0
        )
        pre_impute_row_rates = pre_impute_comparison.cmp_row_wise()
        pre_impute_col_rates = pre_impute_comparison.cmp_col_wise()
        del pre_impute_comparison

        # ---- step 6: imputation, only when more than 5 rows survived ----
        # The older per-chromosome NPUTESNPData/imputeData() code path was replaced
        # by NPUTE.samplingImpute() (see the 2009-10-11 change-log entry).
        if len(filtered.row_id_ls) > 5:
            imputed_matrix, imputed_snps_name_ls = NPUTE.samplingImpute(
                filtered.col_id_ls,
                filtered.data_matrix,
                input_file_format=1,
                input_NA_char=0,
                lower_case_for_imputation=False,
                npute_window_size=int(npute_window_size),
                no_of_accessions_per_sampling=300,
                coverage=3,
            )
            imputed = SNPData(
                row_id_ls=filtered.row_id_ls,
                col_id_ls=imputed_snps_name_ls,
                data_matrix=imputed_matrix,
            )

            if output_dir:  # 2008-05-16 write the imputed data out as well
                imputed.tofile(os.path.join(output_dir, fname_stem + "_after_imputation.tsv"))

            post_impute_comparison = TwoSNPData(
                SNPData1=imputed, SNPData2=snpData_qc_strain, row_matching_by_which_value=0
            )
            qcdata.row_id2NA_mismatch_rate1 = post_impute_comparison.cmp_row_wise()
            qcdata.col_id2NA_mismatch_rate1 = post_impute_comparison.cmp_col_wise()
            del post_impute_comparison, imputed
        # When 5 or fewer rows are left, the *_rate1 attributes are deliberately
        # left unset on qcdata.
        del filtered

        # ---- bookkeeping: pre-imputation results and the thresholds used ----
        qcdata.row_id2NA_mismatch_rate0 = pre_impute_row_rates
        qcdata.col_id2NA_mismatch_rate0 = pre_impute_col_rates

        qcdata.min_call_probability = min_call_probability
        qcdata.max_call_mismatch_rate = max_call_mismatch_rate
        qcdata.max_call_NA_rate = max_call_NA_rate
        qcdata.max_snp_mismatch_rate = max_snp_mismatch_rate
        qcdata.max_snp_NA_rate = max_snp_NA_rate
        qcdata.npute_window_size = npute_window_size
        return [qcdata]