コード例 #1
0
    def readInData(
        cls, phenotype_fname, input_fname, eigen_vector_fname, phenotype_method_id_ls, test_type=1, report=0
    ):
        """
        Load the SNP matrix, the phenotype matrix and (optionally) the principal components.

        Arguments:
            phenotype_fname: phenotype file, read through read_data(turn_into_integer=0).
            input_fname: SNP data file, read through read_data().
            eigen_vector_fname: if given, PCs are loaded from it via cls.getPCFromFile().
                If empty, PCs are computed with pca_module.PCA_svd() when test_type is 4,
                otherwise PC_matrix is None.
            phenotype_method_id_ls: phenotype method ids to analyse. If empty, every
                phenotype column is selected.
            test_type: only consulted here to decide whether PCs have to be computed.
            report: handed to snpData.convertSNPAllele2Index().

        Returns:
            PassingData carrying snpData (the allele-index version), phenData, PC_matrix,
            which_phenotype_ls and phenotype_method_id_ls.

        2010-2-25
            call removeUnPhenotypedSNPData() to shrink the snp dataset by removing un-phenotyped ecotypes
        2009-3-20
            refactored out of run(), easy for MpiAssociation.py to call
        """
        header, strain_acc_list, category_list, data_matrix = read_data(input_fname)
        snpData = SNPData(
            header=header, strain_acc_list=strain_acc_list, category_list=category_list, data_matrix=data_matrix
        )

        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            phenotype_fname, turn_into_integer=0
        )
        snpData = cls.removeUnPhenotypedSNPData(
            snpData, header_phen, strain_acc_list_phen, data_matrix_phen, phenotype_method_id_ls
        )

        newSnpData, allele2index_ls = snpData.convertSNPAllele2Index(
            report
        )  # 0 (NA) or -2 (untouched) is all converted to -2 as 0 is used to denote allele
        newSnpData.header = snpData.header

        # Order the phenotype rows by the row ids of the (possibly shrunk) snpData, not by the
        # list read from the file: phenData below is labelled with snpData.strain_acc_list, so
        # both must refer to the same set of rows.
        data_matrix_phen = cls.get_phenotype_matrix_in_data_matrix_order(
            snpData.strain_acc_list, strain_acc_list_phen, data_matrix_phen
        )
        phenData = SNPData(header=header_phen, strain_acc_list=snpData.strain_acc_list, data_matrix=data_matrix_phen)

        if eigen_vector_fname:
            PC_data = cls.getPCFromFile(eigen_vector_fname)
            PC_matrix = PC_data.PC_matrix
        elif test_type == 4:  # eigen_vector_fname not given for this test_type. calculate PCs.
            import pca_module

            T, P, explained_var = pca_module.PCA_svd(newSnpData.data_matrix, standardize=False)
            PC_matrix = T
        else:
            PC_matrix = None

        del snpData
        if phenotype_method_id_ls:
            which_phenotype_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(phenData, Set(phenotype_method_id_ls))
        else:  # if not available, take all phenotypes
            which_phenotype_ls = range(len(phenData.col_id_ls))
        pdata = PassingData(
            snpData=newSnpData,
            phenData=phenData,
            PC_matrix=PC_matrix,
            which_phenotype_ls=which_phenotype_ls,
            phenotype_method_id_ls=phenotype_method_id_ls,
        )
        return pdata
コード例 #2
0
	def load_dstruc(self):
		"""
		Load both input matrices and build the column/row matching structures between them.
		
		Runs QualityControl.load_dstruc() first, then reads self.input_fname1 and
		self.input_fname2 with read_data() and stores everything as attributes of self.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		QualityControl.load_dstruc(self)
		
		# (header, strain_acc_list, category_list, data_matrix) for each of the two inputs
		first_dataset = read_data(self.input_fname1)
		self.header1, self.strain_acc_list1, self.category_list1, self.data_matrix1 = first_dataset
		second_dataset = read_data(self.input_fname2)
		self.header2, self.strain_acc_list2, self.category_list2, self.data_matrix2 = second_dataset
		
		col_matching = self.get_col_matching_dstruc(self.header1, self.header2)
		self.col_id2col_index1, self.col_id2col_index2, self.col_id12col_id2 = col_matching
		
		row_matching = self.get_row_matching_dstruc(
			self.strain_acc_list1, self.category_list1, self.strain_acc_list2)
		self.row_id2row_index1, self.row_id2row_index2, self.row_id12row_id2 = row_matching
コード例 #3
0
    def inputNodePrepare(self, snp_info=None):
        """
        Read all inputs and pickle them for the other nodes.

        Reads the SNP matrix (self.input_fname), the pickled SNP-context wrapper
        (self.snps_context_fname) and the phenotype matrix (self.phenotype_fname), passes the
        phenotype matrix through Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order()
        and works out which phenotype columns to use.

        Arguments:
            snp_info: stored untouched in the PassingData objects built here.

        Returns:
            PassingData with snpData_pickle, other_data_pickle and output_node_data_pickle
            (strings produced by cPickle.dumps(obj, -1)) plus params_ls, the result of
            self.generate_params().

        Side effects:
            sets self.phenotype_index_ls (all phenotype columns if none matched).

        2009-2-16
            get phenData.phenotype_method_id_ls in the same order as phenData.col_id_ls
        2009-2-11
            refactored out of run()
        """
        header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
        # category_list is not used to facilitate row-id matching
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                          data_matrix=data_matrix, turn_into_array=1)

        # Close the pickle file explicitly instead of relying on "del" and garbage collection.
        # NOTE(review): the file is opened in the default mode as before; binary pickles
        # need 'rb' on some platforms -- confirm how this file is written.
        picklef = open(self.snps_context_fname)
        try:
            snps_context_wrapper = cPickle.load(picklef)
        finally:
            picklef.close()
        gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
        del snps_context_wrapper
        gene_id_ls = sorted(gene_id2snps_id_ls.keys())

        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            self.phenotype_fname, turn_into_integer=0)
        phenData = SNPData(header=header_phen,
                           strain_acc_list=strain_acc_list_phen,
                           data_matrix=data_matrix_phen)
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
        phenData.phenotype_method_id_ls = get_phenotype_method_id_lsFromPhenData(
            phenData)  #2009-2-16

        self.phenotype_index_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
            phenData, Set(self.phenotype_method_id_ls))

        # nothing matched (or nothing requested): fall back to every phenotype column
        if not self.phenotype_index_ls:
            self.phenotype_index_ls = range(len(phenData.col_id_ls))

        pdata = PassingData(gene_id_ls=gene_id_ls, gene_id2snps_id_ls=gene_id2snps_id_ls,
                            phenotype_index_ls=self.phenotype_index_ls, snp_info=snp_info)
        params_ls = self.generate_params(self.gene_id_fname, pdata,
                                         self.block_size)

        other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls, gene_id_ls=pdata.gene_id_ls,
                                 phenData=phenData, phenotype_index_ls=self.phenotype_index_ls,
                                 snp_info=snp_info)
        other_data_pickle = cPickle.dumps(other_data, -1)
        del other_data

        output_node_data = PassingData(phenotype_label_ls=phenData.col_id_ls,
                                       phenotype_index_ls=self.phenotype_index_ls)
        output_node_data_pickle = cPickle.dumps(output_node_data, -1)

        snpData_pickle = cPickle.dumps(snpData, -1)
        # drop the unpickled copies once their pickled form exists
        del snpData, data_matrix
        return_data = PassingData(snpData_pickle=snpData_pickle, other_data_pickle=other_data_pickle,
                                  output_node_data_pickle=output_node_data_pickle, params_ls=params_ls)
        return return_data
コード例 #4
0
	def run(self):
		"""
		Test every column of the SNP matrix against one phenotype and output the results.
		
		Reads self.phenotype_fname and self.input_fname, puts the phenotype matrix through
		self.get_phenotype_matrix_in_data_matrix_order(), hands column self.which_phenotype
		to self._kruskal_wallis_whole_matrix() and writes the outcome with
		self.output_kw_results().
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		phenotype_input = read_data(self.phenotype_fname, turn_into_integer=0)
		header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = phenotype_input
		header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
		# the phenotype column slicing below is array-style, make sure the SNP matrix is an array too
		if type(data_matrix) is list:
			data_matrix = numpy.array(data_matrix)
		
		ordered_phen_matrix = self.get_phenotype_matrix_in_data_matrix_order(
			strain_acc_list, strain_acc_list_phen, data_matrix_phen)
		phenotype_column = ordered_phen_matrix[:, self.which_phenotype]
		kw_results = self._kruskal_wallis_whole_matrix(data_matrix, phenotype_column, self.min_data_point)
		# the first two header fields label the two id columns, the rest are the SNP ids
		snp_id_ls = header[2:]
		self.output_kw_results(kw_results, snp_id_ls, self.output_fname, self.minus_log_pvalue)
コード例 #5
0
	def run(self):
		"""
		Merge the rows of the input matrix that belong to the same target ecotype and write
		the merged matrix to self.output_fname.
		
		The second header field of the output is renamed to 'nativename' and the second
		output column holds the native name looked up for every target ecotype id.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		import MySQLdb
		conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
		curs = conn.cursor()
		
		# the duplicate -> target ecotype id mapping is optional
		ecotype_duplicate2tg_ecotypeid = None
		if self.ecotype_duplicate2tg_ecotypeid_table:
			ecotype_duplicate2tg_ecotypeid = self.get_ecotype_duplicate2tg_ecotypeid(
				curs, self.ecotype_duplicate2tg_ecotypeid_table)
		from pymodule import figureOutDelimiter
		delimiter = figureOutDelimiter(self.input_fname)
		header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)
		
		tg_ecotypeid2ecotypeid_duplicate_index_ls = self.get_tg_ecotypeid2ecotypeid_duplicate_index_ls(
			strain_acc_list, category_list, ecotype_duplicate2tg_ecotypeid)
		
		ecotypeid2nativename = get_ecotypeid2nativename(curs, ecotype_table=self.ecotype_table)
		merged = self.get_merged_matrix(
			tg_ecotypeid2ecotypeid_duplicate_index_ls, data_matrix,
			ecotypeid2nativename, self.stat_output_fname)
		tg_ecotypeid_ls, merge_matrix = merged
		
		tg_nativename_ls = [ecotypeid2nativename[ecotypeid] for ecotypeid in tg_ecotypeid_ls]
		header[1] = 'nativename'
		write_data_matrix(
			merge_matrix, self.output_fname, header, tg_ecotypeid_ls, tg_nativename_ls,
			delimiter=delimiter)
コード例 #6
0
    def run(self):
        """
        Drop the SNP columns whose locus log probability is too low, write the remaining
        matrix to self.output_fname and show a histogram of the log probabilities.

        self.nt_alphabet_bits[0] is passed (as int) to read_data() and
        self.nt_alphabet_bits[1] (as int) to write_data_matrix() as nt_alphabet.

        2007-04-30
        2007-05-14
            add nt_alphabet_bits
        """
        input_tuple = read_data(self.input_fname, int(self.nt_alphabet_bits[0]))
        header, strain_acc_list, category_list, data_matrix = input_tuple
        data_matrix = num.array(data_matrix)
        strain_homo_perc_vector = self.cal_strain_homo_perc_vector(data_matrix)
        snp_locus_log_prob = self.cal_snp_locus_log_prob(data_matrix, strain_homo_perc_vector)
        from sets import Set
        # NOTE(review): min_log_prob is not defined inside this method; presumably a
        # module-level name (or meant to be self.min_log_prob) -- confirm.
        cols_to_be_tossed_out_set = Set()
        for col_index, log_prob in enumerate(snp_locus_log_prob):
            if log_prob <= min_log_prob:
                cols_to_be_tossed_out_set.add(col_index)
        print("%sSNPs removed:" % (len(cols_to_be_tossed_out_set)))
        for col_index in cols_to_be_tossed_out_set:
            # header carries two leading id fields before the SNP names
            print('\t%s\t%s' % (col_index, header[2 + col_index]))
        output_nt_alphabet = int(self.nt_alphabet_bits[1])
        write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list,
                          cols_to_be_tossed_out=cols_to_be_tossed_out_set,
                          nt_alphabet=output_nt_alphabet)
        import pylab
        pylab.title("histogram of snp locus log probability")
        pylab.hist(snp_locus_log_prob, 20)
        pylab.show()
コード例 #7
0
	def inputNodePrepare(self, snp_info=None):
		"""
		Read all inputs and pickle them for the other nodes.
		
		Reads the SNP matrix (self.input_fname), the pickled SNP-context wrapper
		(self.snps_context_fname) and the phenotype matrix (self.phenotype_fname), passes the
		phenotype matrix through Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order()
		and works out which phenotype columns to use.
		
		Arguments:
			snp_info: stored untouched in the PassingData objects built here.
		
		Returns:
			PassingData with snpData_pickle, other_data_pickle and output_node_data_pickle
			(strings produced by cPickle.dumps(obj, -1)) plus params_ls, the result of
			self.generate_params().
		
		Side effects:
			sets self.phenotype_index_ls (all phenotype columns if none matched).
		
		2009-2-16
			get phenData.phenotype_method_id_ls in the same order as phenData.col_id_ls
		2009-2-11
			refactored out of run()
		"""
		header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
		#category_list is not used to facilitate row-id matching
		snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
						data_matrix=data_matrix, turn_into_array=1)
		
		# Close the pickle file explicitly instead of relying on "del" and garbage collection.
		# NOTE(review): the file is opened in the default mode as before; binary pickles
		# need 'rb' on some platforms -- confirm how this file is written.
		picklef = open(self.snps_context_fname)
		try:
			snps_context_wrapper = cPickle.load(picklef)
		finally:
			picklef.close()
		gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
		del snps_context_wrapper
		gene_id_ls = sorted(gene_id2snps_id_ls.keys())
		
		header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(self.phenotype_fname, turn_into_integer=0)
		phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen, data_matrix=data_matrix_phen)
		phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
		phenData.phenotype_method_id_ls = get_phenotype_method_id_lsFromPhenData(phenData)	#2009-2-16
		
		self.phenotype_index_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(phenData, Set(self.phenotype_method_id_ls))
		
		# nothing matched (or nothing requested): fall back to every phenotype column
		if not self.phenotype_index_ls:
			self.phenotype_index_ls = range(len(phenData.col_id_ls))
		
		pdata = PassingData(gene_id_ls=gene_id_ls, gene_id2snps_id_ls=gene_id2snps_id_ls,
						phenotype_index_ls=self.phenotype_index_ls, snp_info=snp_info)
		params_ls = self.generate_params(self.gene_id_fname, pdata, self.block_size)
		
		other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls, gene_id_ls=pdata.gene_id_ls, phenData=phenData,
								phenotype_index_ls=self.phenotype_index_ls, snp_info=snp_info)
		other_data_pickle = cPickle.dumps(other_data, -1)
		del other_data
		
		output_node_data = PassingData(phenotype_label_ls=phenData.col_id_ls,
								phenotype_index_ls=self.phenotype_index_ls)
		output_node_data_pickle = cPickle.dumps(output_node_data, -1)
		
		snpData_pickle = cPickle.dumps(snpData, -1)
		# drop the unpickled copies once their pickled form exists
		del snpData, data_matrix
		return_data = PassingData(snpData_pickle=snpData_pickle, other_data_pickle=other_data_pickle,
								output_node_data_pickle=output_node_data_pickle, params_ls=params_ls)
		return return_data
コード例 #8
0
	def run(self):
		"""
		Compare the SNP calls stored in the database against a comparison data file.
		
		snpData2 is built from the comparison file (self.cmp_data_filename, resolved through
		self.findOutCmpDataFilename()); snpData1 is built from the Calls/SNPs tables. Both are
		wrapped in a TwoSNPData object.
		
		self.run_type:
			1: row-wise comparison (twoSNPData.cmp_row_wise()); the result is optionally written
				to self.output_fname and, if self.commit is set and self.input_dir is not,
				submitted through self.submit_to_call_QC().
			2: nothing is computed (the column-wise save is commented out).
			anything else: error message on stderr and sys.exit(5).
		
		The session is committed if self.commit is set, otherwise rolled back.
		"""
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		db = StockDB.StockDB(drivername=self.drivername, username=self.db_user,
				   password=self.db_passwd, hostname=self.hostname, database=self.dbname)
		db.setup(create_tables=False)
		session = db.session
		session.begin()
		
		# comparison dataset (snpData2), read from a file
		self.cmp_data_filename = self.findOutCmpDataFilename(self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod)
		header, strain_acc_list, category_list, data_matrix = read_data(self.cmp_data_filename)
		strain_acc_list = map(int, strain_acc_list)	#it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
		snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \
							data_matrix=data_matrix)	#category_list is not used.
		
		readme = formReadmeObj(sys.argv, self.ad, StockDB.README)
		session.save(readme)
		
		# a second, raw MySQLdb connection is opened next to the StockDB session; its cursor is
		# used by dbSNP2data.get_snp_id2index_m() and handed to TwoSNPData
		import MySQLdb
		conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user = self.db_user, passwd = self.db_passwd)
		curs = conn.cursor()
		from dbSNP2data import dbSNP2data
		# database dataset (snpData1); header, strain_acc_list, category_list and data_matrix
		# are re-bound here, the file versions above already live in snpData2
		snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(curs, StockDB.Calls.table.name, StockDB.SNPs.table.name)
		strain_info_data = self.get_strain_id_info(self.QC_method_id)
		data_matrix = self.get_data_matrix(db, strain_info_data.strain_id2index, snp_id2index, StockDB.Calls.table.name)
		strain_acc_list = [strain_info_data.strain_id2acc[strain_id] for strain_id in strain_info_data.strain_id_list]
		category_list = [strain_info_data.strain_id2category[strain_id] for strain_id in strain_info_data.strain_id_list]
		header = ['ecotypeid', 'strainid']
		for snp_id in snp_id_list:
			snp_name, chromosome, position = snp_id2info[snp_id]
			header.append(snp_name)
		# NOTE(review): the trailing comment below mentions stock_250k while the value passed is 'stock.snps' -- confirm which is meant
		snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list, data_matrix=data_matrix, \
						snps_table='stock.snps')	#snps_table is set to the stock_250k snps_table
		
		twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, curs=curs, \
							QC_method_id=self.QC_method_id, user=self.db_user, row_matching_by_which_value=0, debug=self.debug)
		if self.run_type==1:
			row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
		elif self.run_type==2:
			#twoSNPData.save_col_wise(session, readme)	#2008-08-18 need to implement a new one for 149SNP
			row_id2NA_mismatch_rate = {}
		else:
			# exits before the commit/rollback at the end of this method is reached
			sys.stderr.write("run_type=%s is not supported.\n"%self.run_type)
			sys.exit(5)
		if self.output_fname and self.run_type==1 and row_id2NA_mismatch_rate:
			self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname)
		
		if self.run_type==1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
			#if self.input_dir is given, no db submission. call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
			#row_id2NA_mismatch_rate might be None if it's method 0.
			self.submit_to_call_QC(session, row_id2NA_mismatch_rate, self.QC_method_id, self.db_user, \
								twoSNPData.row_id12row_id2, readme)
		if self.commit:
			session.commit()
		else:
			session.rollback()
コード例 #9
0
	def run(self):
		"""
		Tell the user that this program is outdated and exit.
		
		Writes a notice pointing to Association.py to stderr and terminates through
		sys.exit(0). The analysis code that used to follow the exit call could never be
		reached (and referred to an undefined name, ignore_2nd_column), so it was removed.
		
		Raises:
			SystemExit: always, with exit status 0.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		sys.stderr.write("This program is outdated. Please run Association.py instead.\n")
		sys.exit(0)
コード例 #10
0
	def prepareTwoSNPData(self, db, max_mismatch_rate=0.25, min_no_of_non_NA_pairs=40, report=0):
		"""
		Build the TwoSNPData object used for the cross-match QC.
		
		SNPData1 comes from self.input_fname if that is given, otherwise from the Calls/SNPs
		tables of the database. SNPData2 is SNPData1 itself when self.QC_method_id is 4,
		otherwise it is read from the comparison data file.
		
		Arguments:
			db: database object handed to self.get_data_matrix().
			max_mismatch_rate, min_no_of_non_NA_pairs, report: passed on to TwoSNPData.
		
		Returns:
			the TwoSNPData instance.
		
		2009-9-23
			add arguments max_mismatch_rate & min_no_of_non_NA_pairs, and pass them to twoSNPData.
			However it's useless to control what should be inserted into db because TwoSNPData.qc_cross_match_table is
			not defined and even if it's defined, the table it'll create doesn't concord to the one in 149SNP db.
		2008-09-10
			if self.input_fname is given, get 149SNP data from it , instead of database
		2008-8-28
			split out of run() so that MpiQC149CrossMatch could call this easily
		"""
		import MySQLdb
		conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.db_user, passwd=self.db_passwd)
		curs = conn.cursor()
		if not self.input_fname:
			# no input file: assemble the matrix and its labels from the database
			from dbSNP2data import dbSNP2data
			snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
				curs, StockDB.Calls.table.name, StockDB.SNPs.table.name)
			strain_info_data = self.get_strain_id_info(self.QC_method_id, ignore_strains_with_qc=False)
			data_matrix = self.get_data_matrix(
				db, strain_info_data.strain_id2index, snp_id2index, StockDB.Calls.table.name)
			strain_acc_list = []	#tg_ecotypeid
			for strain_id in strain_info_data.strain_id_list:
				strain_acc_list.append(strain_info_data.strain_id2acc[strain_id])
			category_list = []	#strainid
			for strain_id in strain_info_data.strain_id_list:
				category_list.append(strain_info_data.strain_id2category[strain_id])
			header = ['ecotypeid', 'strainid']
			for snp_id in snp_id_list:
				snp_name, chromosome, position = snp_id2info[snp_id]
				header.append(snp_name)
		else:
			header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
		#snps_table is set to the stock_250k snps_table
		snpData1 = SNPData(
			header=header, strain_acc_list=strain_acc_list, category_list=category_list,
			data_matrix=data_matrix, snps_table='stock.snps')
		if self.QC_method_id != 4:
			self.cmp_data_filename = self.findOutCmpDataFilename(
				self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod)
			header, strain_acc_list, category_list, data_matrix = read_data(self.cmp_data_filename)
			#it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
			strain_acc_list = map(int, strain_acc_list)
			#category_list is not used to facilitate row-id matching
			snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix)
		else:
			# method 4 compares the dataset with itself
			snpData2 = snpData1
		
		twoSNPData = TwoSNPData(
			SNPData1=snpData1, SNPData2=snpData2,
			QC_method_id=self.QC_method_id, user=self.db_user, row_matching_by_which_value=0,
			debug=self.debug, max_mismatch_rate=max_mismatch_rate,
			min_no_of_non_NA_pairs=min_no_of_non_NA_pairs, report=report)
		return twoSNPData
コード例 #11
0
    def run(self):
        """
        Filter the input matrix and write what is left to self.output_fname.

        The three characters of self.filtering_bits switch the individual filters on ("1"):
            [0] rows with too many NAs (cutoff self.row_cutoff)
            [1] columns with too many NAs (cutoff self.col_cutoff)
            [2] identity strains, checked only among the rows/columns that survived [0] and [1]

        self.nt_alphabet_bits[0] is passed (as int) to read_data() and
        self.nt_alphabet_bits[1] (as int) to write_data_matrix() as nt_alphabet.

        2007-02-27
        2007-09-14
            filtering_bits
        -read_data()
        -remove_rows_with_too_many_NAs()
        -remove_cols_with_too_many_NAs()
        -remove_identity_strains()
        -write_data_matrix()
        """
        if self.debug:
            import pdb

            pdb.set_trace()
        delimiter = figureOutDelimiter(self.input_fname, report=self.report)
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname, int(self.nt_alphabet_bits[0]), delimiter=delimiter
        )
        data_matrix = num.array(data_matrix)
        if self.filtering_bits[0] == "1":
            remove_rows_data = self.remove_rows_with_too_many_NAs(data_matrix, self.row_cutoff)
            rows_with_too_many_NAs_set = remove_rows_data.rows_with_too_many_NAs_set
        else:
            rows_with_too_many_NAs_set = Set()
        if self.filtering_bits[1] == "1":
            # the cutoff lives on the instance, like self.row_cutoff above; the bare name
            # col_cutoff used before is not defined in this method
            remove_cols_data = self.remove_cols_with_too_many_NAs(
                data_matrix, self.col_cutoff, rows_with_too_many_NAs_set
            )
            cols_with_too_many_NAs_set = remove_cols_data.cols_with_too_many_NAs_set
        else:
            cols_with_too_many_NAs_set = Set()
        if self.filtering_bits[2] == "1":
            no_of_rows, no_of_cols = data_matrix.shape
            # only rows/columns that were not already discarded take part in the identity check
            rows_to_be_checked = Set(range(no_of_rows)) - rows_with_too_many_NAs_set
            cols_to_be_checked = Set(range(no_of_cols)) - cols_with_too_many_NAs_set
            identity_strains_to_be_removed = self.remove_identity_strains(
                data_matrix, rows_to_be_checked, cols_to_be_checked
            )
        else:
            identity_strains_to_be_removed = Set()
        rows_to_be_tossed_out = rows_with_too_many_NAs_set | identity_strains_to_be_removed
        write_data_matrix(
            data_matrix,
            self.output_fname,
            header,
            strain_acc_list,
            category_list,
            rows_to_be_tossed_out,
            cols_with_too_many_NAs_set,
            nt_alphabet=int(self.nt_alphabet_bits[1]),
            delimiter=delimiter,
        )
コード例 #12
0
	def run(self):
		"""
		MPI entry point; what a process does depends on its rank in the communicator.
		
		rank 0: reads self.input_fname, pickles the SNPData and sends it to every computing
			node, generates params_ls and then runs self.inputNode().
		ranks 1 .. size-2: receive and unpickle the SNPData, then run self.computing_node().
		rank size-1: runs self.output_node(), writing through a tab-delimited csv writer if
			self.output_fname is set.
		
		self.synchronize() is called after the data distribution and again at the very end.
		"""
		self.communicator = MPI.world.duplicate()
		node_rank = self.communicator.rank
		free_computing_nodes = range(1, self.communicator.size-1)	#exclude the 1st and last node
		free_computing_node_set = Set(free_computing_nodes)
		output_node_rank = self.communicator.size-1
		
		# the bare string literal below is a disabled earlier version (every node but the
		# output node read the file itself); it is an expression statement with no effect
		"""
		if node_rank!=output_node_rank:
			header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
			snpData = SNPData(header=header, strain_acc_list=strain_acc_list, \
							data_matrix=data_matrix)	#category_list is not used to facilitate row-id matching
		"""
		if node_rank == 0:
			header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
			snpData = SNPData(header=header, strain_acc_list=strain_acc_list, \
							data_matrix=data_matrix)	#category_list is not used to facilitate row-id matching
			snpData_pickle = cPickle.dumps(snpData, -1)
			for node in free_computing_nodes:	#send it to the computing_node
				sys.stderr.write("passing initial data to nodes from %s to %s ... "%(node_rank, node))
				self.communicator.send(snpData_pickle, node, 0)
				sys.stderr.write(".\n")
			del snpData_pickle
			# params_ls is only defined on rank 0; it is used in the rank-0 branch further down
			params_ls = self.generate_params(len(snpData.col_id_ls), self.block_size)
			del snpData
		elif node_rank in free_computing_node_set:
			# receive from rank 0 with tag 0, matching the send above
			data, source, tag = self.communicator.receiveString(0, 0)
			snpData =  cPickle.loads(data)
			del data
		else:
			# output node: nothing to load
			pass
		
		self.synchronize()
		if node_rank == 0:
			param_obj = PassingData(params_ls=params_ls, output_node_rank=output_node_rank, report=self.report, counter=0)
			self.inputNode(param_obj, free_computing_nodes, param_generator = params_ls)
			#self.input_node(param_obj, free_computing_nodes, input_handler=self.input_handler, message_size=1)
			#self.input_node(param_obj, free_computing_nodes, self.message_size)
		elif node_rank in free_computing_node_set:
			computing_parameter_obj = PassingData(snpData=snpData, min_LD_to_output=self.min_LD_to_output, min_MAF=self.min_MAF, discard_perc=self.discard_perc)
			self.computing_node(computing_parameter_obj, self.computing_node_handler)
		else:
			if getattr(self, 'output_fname', None):
				# NOTE(review): the file object is never closed explicitly; only the writer is deleted below
				writer = csv.writer(open(self.output_fname, 'w'), delimiter='\t')
				#header_row = ['snp1_id', 'snp2_id', 'r2', 'D', "D'", "no_of_pairs"]
				#writer.writerow(header_row)
			else:
				writer = None
			param_obj = PassingData(writer=writer, is_header_written=False)
			self.output_node(free_computing_nodes, param_obj, self.output_node_handler)
			del writer
		self.synchronize()	#to avoid some node early exits
コード例 #13
0
    def run(self):
        """
        Convert the matrix in self.inputFname into a list of RawSnpsData and write it,
        comma-delimited, to self.outputFname.

        When self.snp_id_type is 1 the column ids are looked up through
        db.get_chr_pos_given_db_id2chr_pos(); columns for which the lookup returns None are
        dropped. self.array_id_2nd_column decides whether category_list is kept and whether
        array ids are written.

        2008-5-12
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        #database connection and etc
        db = self.db_250k
        db_session = db.session
        db_session.begin()

        delimiter = figureOutDelimiter(self.inputFname, report=self.report)
        input_tuple = read_data(self.inputFname, delimiter=delimiter)
        header, strain_acc_list, category_list, data_matrix = input_tuple

        if self.snp_id_type == 1:
            #2011-2-27 translate the db_id into chr_pos because the new StrainXSNP dataset uses db_id to identify SNPs.
            # but if col-id is already chr_pos, it's fine.
            translated_header = header[:2]
            kept_col_index_ls = []
            # header[2:] lines up with the data matrix columns, so the enumerate index is the column index
            for col_index, snp_id in enumerate(header[2:]):
                chr_pos = db.get_chr_pos_given_db_id2chr_pos(snp_id)
                if chr_pos is None:
                    continue
                kept_col_index_ls.append(col_index)
                translated_header.append(chr_pos)
            # to remove no-db_id columns from data matrix
            data_matrix = numpy.array(data_matrix)[:, kept_col_index_ls]
            header = translated_header

        snp_data_kwargs = dict(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix)
        if self.array_id_2nd_column:
            # otherwise category_list is ignored
            snp_data_kwargs['category_list'] = category_list
        snpData = SNPData(**snp_data_kwargs)

        rawSnpsData_ls = SNPData2RawSnpsData_ls(snpData, need_transposeSNPData=1, report=self.report)
        chromosomes = []
        for rawSnpsData in rawSnpsData_ls:
            chromosomes.append(rawSnpsData.chromosome)
        snpsdata.writeRawSnpsDatasToFile(
            self.outputFname, rawSnpsData_ls, chromosomes=chromosomes,
            deliminator=',', withArrayIds=self.array_id_2nd_column)
コード例 #14
0
    def main(self):
        """
        Entry point: pick the processing path from self.input_file_format.

        Formats other than 1 are loaded by SNPData itself and handed to self.run().
        For format 1 the file is read with read_data(); if more than one sampling is needed
        (ceil(coverage * no_of_rows / no_of_accessions_per_sampling) > 1) the matrix goes
        through self.samplingImpute() and is written with YuSNPData.tofile(), otherwise
        self.run() is called once per chromosome.
        """
        if self.debug:
            import pdb

            pdb.set_trace()
        if self.input_file_format != 1:
            snpData = SNPData(
                inFile=self.input_fname,
                input_file_format=self.input_file_format,
                lower_case_for_imputation=self.lower_case_for_imputation,
            )
            self.run(snpData)
            return

        header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, turn_into_integer=0)
        snps_name_ls = header[2:]
        no_of_rows = len(strain_acc_list)
        samplings_needed = self.coverage * no_of_rows / float(self.no_of_accessions_per_sampling)
        no_of_samplings = int(math.ceil(samplings_needed))
        if no_of_samplings <= 1:
            # a single sampling covers everything: process chromosome by chromosome
            self.outputHeader(self.output_fname, strain_acc_list, category_list)
            chr2no_of_snps = self.get_chr2no_of_snps(snps_name_ls)
            for chromosome in sorted(chr2no_of_snps.keys()):
                snpData = SNPData(
                    inFile=self.input_fname,
                    snps_name_ls=snps_name_ls,
                    data_matrix=data_matrix,
                    chromosome=chromosome,
                    input_file_format=self.input_file_format,
                    lower_case_for_imputation=self.lower_case_for_imputation,
                )
                self.run(snpData)
            return

        sampling_result = self.samplingImpute(
            snps_name_ls,
            data_matrix,
            input_file_format=1,
            input_NA_char="0",
            lower_case_for_imputation=self.lower_case_for_imputation,
            npute_window_size=self.single_window_size,
            no_of_accessions_per_sampling=self.no_of_accessions_per_sampling,
            coverage=self.coverage,
        )
        imputed_matrix, new_snps_name_ls = sampling_result
        # the output keeps the original column ids (snps_name_ls), as before
        imputedData = YuSNPData(
            strain_acc_list=strain_acc_list,
            category_list=category_list,
            col_id_ls=snps_name_ls,
            data_matrix=imputed_matrix,
        )
        imputedData.tofile(self.output_fname)
コード例 #15
0
	def run(self):
		"""
		Draw the matrix read from self.input_fname and a colour legend as PNG images.
		
		self.drawing_type selects the transformation applied before drawing:
			1: matrix used as is (as a numpy array), labelled with number2nt / number2color
			2: self.transformMatrixIntoTwoAllelesAndHetero() with allele info from the database
			3: self.transformMatrixIntoHomoAndHetero()
			4: self.transformMatrixIntoTwoAllelesAndHetero() without database info
			5: self.transformMatrixIntoFourNucleotides(); every strain label is listed twice
		Any other value writes an error message to stderr and exits with status 2.
		
		Output files: '<self.output_fname_prefix>_legend.png' and '<self.output_fname_prefix>.png'.
		"""
		from pymodule import read_data
		header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
		snp_acc_ls = header[2:]
		if self.debug:
			import pdb
			pdb.set_trace()
		if self.drawing_type==1:
			matrix_value2label= number2nt
			matrix_value2color = number2color
			data_matrix = numpy.array(data_matrix)
		elif self.drawing_type==2:
			import MySQLdb
			conn = MySQLdb.connect(db=self.dbname,host=self.hostname)
			curs = conn.cursor()
			snp_allele2index_ls = self.get_snp_allele2index_ls(curs, snp_acc_ls, self.snps_sequenom_info_table)
			data_matrix = self.transformMatrixIntoTwoAllelesAndHetero(data_matrix, snp_allele2index_ls)
			matrix_value2label= {-1:'deletion', 0:'NA', 1:'allele1', 2:'allele2', 3:'hetero'}
			matrix_value2color = {-1:(0,0,0), 0:(255,255,255), 1:(0,0,255), 2:(0,255,0), 3:(255,0,0)}
		elif self.drawing_type==3:
			data_matrix = self.transformMatrixIntoHomoAndHetero(data_matrix)
			# the legend label was garbled ('h**o'); it is the counterpart of 'hetero'
			matrix_value2label= {-1:'deletion', 0:'NA', 1:'homo', 2:'hetero'}
			matrix_value2color = {-1:(0,0,0), 0:(255,255,255), 1:(0,0,255), 2:(255,0,0)}
		elif self.drawing_type==4:
			data_matrix = self.transformMatrixIntoTwoAllelesAndHetero(data_matrix)
			matrix_value2label= {-1:'deletion', 0:'NA', 1:'allele1', 2:'allele2', 3:'hetero'}
			matrix_value2color = {-1:(0,0,0), 0:(255,255,255), 1:(0,0,255), 2:(0,255,0), 3:(255,0,0)}
		elif self.drawing_type==5:
			data_matrix = self.transformMatrixIntoFourNucleotides(data_matrix)
			matrix_value2label= {-1: '-', 0: 'NA',	1:'A',	2:'C',	3:'G',	4:'T'}
			matrix_value2color = {-1:(0,0,0), 0:(255,255,255), 1:(0,0,255), 2:(0,255,0), 3:(255,0,0), 4:(122,0,122)}
			# NOTE(review): presumably the transformed matrix has two rows per strain, hence the
			# doubled labels; category_list is not doubled -- confirm for row_label_type 2
			new_strain_acc_list = []
			for strain_acc in strain_acc_list:
				new_strain_acc_list.extend([strain_acc, strain_acc])
			strain_acc_list = new_strain_acc_list
		else:
			# report the offending value through the instance attribute; the bare name
			# drawing_type is not defined in this method
			sys.stderr.write("drawing_type %s not supported\n"%self.drawing_type)
			sys.exit(2)
		row_label_type2label_ls = {1:strain_acc_list,
			2:category_list}
		
		font = get_font(self.font_path, font_size=self.font_size)	#2008-08-01
		im = drawLegend(matrix_value2label, matrix_value2color, font)
		im.save('%s_legend.png'%self.output_fname_prefix)
		im = drawMatrix(data_matrix, matrix_value2color, row_label_type2label_ls[self.row_label_type], snp_acc_ls, with_grid=1, font=font)
		im.save('%s.png'%self.output_fname_prefix)
コード例 #16
0
    def run(self):
        """
        Read a phenotype matrix from self.input_fname and put it into the database.

        self.run_type selects the loader: 1 calls self.putPhenotypeIntoDB(), 2 calls
        self.putReplicatePhenotypeIntoDB(); any other value only produces an error message
        on stderr. The session is committed only if self.commit is set.

        2009-5-28
        """
        if self.debug:
            import pdb
            pdb.set_trace()
        db = Stock_250kDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
                          hostname=self.hostname, database=self.dbname, schema=self.schema)
        db.setup(create_tables=False)

        # lookup structures needed to straighten out the ecotype ids of the input rows
        nativename2tg_ecotypeid_set = getNativename2TgEcotypeIDSet(db.metadata.bind, turnUpperCase=True)
        ecotype_id_set_250k_in_pipeline = get_ecotype_id_set_250k_in_pipeline(ArrayInfo)
        ecotypeid2tg_ecotypeid = get_ecotypeid2tg_ecotypeid(db.metadata.bind)

        #turn_into_integer=2 because it's not nucleotides
        phenotype_input = read_data(self.input_fname, turn_into_integer=2, matrix_data_type=float)
        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = phenotype_input
        data_matrix_phen = numpy.array(data_matrix_phen)

        #2009-8-19 the matrix is deliberately NOT re-ordered through
        # Association.get_phenotype_matrix_in_data_matrix_order(): strain_acc_list_phen is not
        # unique for each row, which caused replicates to end up with the same value.

        phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen,
                           data_matrix=data_matrix_phen)

        ecotype_id_ls = self.straightenEcotypeID(
            phenData.row_id_ls, nativename2tg_ecotypeid_set, ecotypeid2tg_ecotypeid,
            ecotype_id_set_250k_in_pipeline)

        db_session = db.session
        db_session.begin()
        if self.run_type == 1:
            self.putPhenotypeIntoDB(db, phenData, ecotype_id_ls)
        elif self.run_type == 2:
            self.putReplicatePhenotypeIntoDB(db, phenData, ecotype_id_ls)
        else:
            sys.stderr.write("Unsupported run type: %s.\n" % (self.run_type))
        if self.commit:
            db_session.commit()
0
	def run(self):
		"""
		Read the SNP matrix in self.input_fname, run it through SNPData.convert2Binary()
		and write the converted matrix to self.output_fname.

		When self.mapping_fname is set, the allele-index mapping returned by
		convert2Binary() is additionally written there via output_allele2index_ls().

		2008-9-7
		"""
		if self.debug:
			import pdb
			pdb.set_trace()

		# the delimiter is detected from the input file itself
		input_delimiter = figureOutDelimiter(self.input_fname, report=self.report)
		matrix_fields = read_data(self.input_fname, delimiter=input_delimiter)
		header, strain_acc_list, category_list, data_matrix = matrix_fields

		snpData = SNPData(
			header=header,
			strain_acc_list=strain_acc_list,
			category_list=category_list,
			data_matrix=data_matrix,
		)
		conversion_result = snpData.convert2Binary(self.report)
		newSnpData, allele_index2allele_ls = conversion_result

		# optional side output: the allele-index mapping
		if self.mapping_fname:
			self.output_allele2index_ls(snpData, allele_index2allele_ls, self.mapping_fname)

		newSnpData.tofile(self.output_fname)
コード例 #18
0
	def run(self):
		"""
		Convert the matrix in self.input_fname into a list of RawSnpsData objects
		(via SNPData2RawSnpsData_ls) and write them to self.output_fname.

		category_list is handed to SNPData only when self.array_id_2nd_column is set;
		the same flag is forwarded as withArrayIds to the writer.

		2008-5-12
		"""
		if self.debug:
			import pdb
			pdb.set_trace()

		input_delimiter = figureOutDelimiter(self.input_fname, report=self.report)
		header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=input_delimiter)

		# build the constructor arguments once; the 2nd column is only kept on request
		snp_data_kwargs = dict(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix)
		if self.array_id_2nd_column:
			snp_data_kwargs['category_list'] = category_list
		snpData = SNPData(**snp_data_kwargs)

		rawSnpsData_ls = SNPData2RawSnpsData_ls(snpData, need_transposeSNPData=1, report=self.report)

		# one chromosome entry per RawSnpsData, in the same order
		chromosomes = []
		for raw_data in rawSnpsData_ls:
			chromosomes.append(raw_data.chromosome)

		snpsdata.writeRawSnpsDatasToFile(
			self.output_fname,
			rawSnpsData_ls,
			chromosomes=chromosomes,
			deliminator=',',
			withArrayIds=self.array_id_2nd_column,
		)
コード例 #19
0
	def run(self):
		"""
		Scatter-plot one phenotype against another and save the figure as PNG and SVG.

		The phenotype matrix is read from self.phenotype_fname. The two columns are located
		with findOutWhichPhenotypeColumn() for self.phenotype_method_id1 (x axis) and
		self.phenotype_method_id2 (y axis); rows in which either value is NaN are skipped.
		Axis labels are the short_name of the matching Stock_250kDB.PhenotypeMethod entries.
		Output files are '<self.output_fname_prefix>.png' (dpi=400) and
		'<self.output_fname_prefix>.svg'.

		If no row carries both phenotypes, a warning is written to stderr and the diagonal
		guide line is skipped; min()/max() on the empty lists used to raise ValueError here.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
				   password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup(create_tables=False)
		# NOTE(review): session is not used below; the attribute read is kept in case it has side effects
		session = db.session

		header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(self.phenotype_fname, turn_into_integer=0)
		phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen, data_matrix=data_matrix_phen)
		# tricky: the matrix is re-ordered against the phenotype file's own strain list
		phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(phenData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)

		phenotype_col_index1 = self.findOutWhichPhenotypeColumn(phenData, Set([self.phenotype_method_id1]))[0]
		phenotype_col_index2 = self.findOutWhichPhenotypeColumn(phenData, Set([self.phenotype_method_id2]))[0]

		# keep only the rows where both phenotypes are available
		x_ls = []
		y_ls = []
		for i in range(phenData.data_matrix.shape[0]):
			x_value = phenData.data_matrix[i][phenotype_col_index1]
			y_value = phenData.data_matrix[i][phenotype_col_index2]
			if numpy.isnan(x_value) or numpy.isnan(y_value):
				continue
			x_ls.append(x_value)
			y_ls.append(y_value)

		pylab.clf()
		pylab.title('Phenotype Contrast')
		pylab.plot(x_ls, y_ls, '.', alpha=0.6)
		pylab.grid(alpha=0.3)
		phenotype_method1 = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id1)
		phenotype_method2 = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id2)
		pylab.xlabel(phenotype_method1.short_name)
		pylab.ylabel(phenotype_method2.short_name)

		if x_ls:
			# draw diagonal line to show perfect correlation, limited to the range both axes share
			max_min_value = max(min(x_ls), min(y_ls))
			min_max_value = min(max(x_ls), max(y_ls))
			pylab.plot([max_min_value, min_max_value], [max_min_value, min_max_value], c='g', alpha=0.7)
		else:
			# nothing to compare: still emit the (empty) figure, but tell the user why
			import sys
			sys.stderr.write("No row has both phenotype %s and phenotype %s. Diagonal line skipped.\n"%\
				(self.phenotype_method_id1, self.phenotype_method_id2))

		png_output_fname = '%s.png'%self.output_fname_prefix
		pylab.savefig(png_output_fname, dpi=400)
		pylab.savefig('%s.svg'%self.output_fname_prefix)
コード例 #20
0
    def run(self):
        """
        Merge the rows of self.input_fname that belong to the same target ecotype and
        write the merged matrix to self.output_fname.

        Rows are grouped by get_tg_ecotypeid2ecotypeid_duplicate_index_ls() (optionally
        using the duplicate -> target-ecotype table named by
        self.ecotype_duplicate2tg_ecotypeid_table) and merged by get_merged_matrix().
        In the output the 2nd column holds the nativename of each target ecotype, and
        the 2nd header cell is renamed accordingly.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname,
                               host=self.hostname,
                               user=self.user,
                               passwd=self.passwd)
        curs = conn.cursor()

        # the duplicate -> target ecotype map is only available when a table is configured
        ecotype_duplicate2tg_ecotypeid = None
        if self.ecotype_duplicate2tg_ecotypeid_table:
            ecotype_duplicate2tg_ecotypeid = self.get_ecotype_duplicate2tg_ecotypeid(
                curs, self.ecotype_duplicate2tg_ecotypeid_table)

        from pymodule import figureOutDelimiter
        delimiter = figureOutDelimiter(self.input_fname)
        input_fields = read_data(self.input_fname, delimiter=delimiter)
        header, strain_acc_list, category_list, data_matrix = input_fields

        tg_ecotypeid2ecotypeid_duplicate_index_ls = self.get_tg_ecotypeid2ecotypeid_duplicate_index_ls(
            strain_acc_list, category_list, ecotype_duplicate2tg_ecotypeid)

        ecotypeid2nativename = get_ecotypeid2nativename(curs, ecotype_table=self.ecotype_table)
        merge_result = self.get_merged_matrix(tg_ecotypeid2ecotypeid_duplicate_index_ls,
                                              data_matrix,
                                              ecotypeid2nativename,
                                              self.stat_output_fname)
        tg_ecotypeid_ls, merge_matrix = merge_result

        # 2nd output column: nativename of every merged ecotype, in row order
        tg_nativename_ls = [ecotypeid2nativename[ecotypeid] for ecotypeid in tg_ecotypeid_ls]
        header[1] = 'nativename'
        write_data_matrix(merge_matrix, self.output_fname, header, tg_ecotypeid_ls, tg_nativename_ls,
                          delimiter=delimiter)
コード例 #21
0
	def run(self):
		"""
		Filter the strain x SNP matrix in self.input_fname and write the survivors to
		self.output_fname.

		self.filtering_bits is a 3-character string; a '1' at each position enables
			[0] remove_rows_with_too_many_NAs() with self.row_cutoff
			[1] remove_cols_with_too_many_NAs() with self.col_cutoff
			[2] remove_identity_strains() on the rows/columns that survived [0] and [1]
		self.nt_alphabet_bits[0] is forwarded to read_data(), self.nt_alphabet_bits[1] to
		write_data_matrix() as nt_alphabet.

		2007-02-27
		2007-09-14
			filtering_bits
		-read_data()
		-remove_rows_with_too_many_NAs()
		-remove_cols_with_too_many_NAs()
		-remove_identity_strains()
		-write_data_matrix()

		Fix: the column cutoff is now read from self.col_cutoff. The bare name col_cutoff
		was undefined in this method and raised NameError whenever filtering_bits[1]=='1'.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		delimiter = figureOutDelimiter(self.input_fname, report=self.report)
		header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, int(self.nt_alphabet_bits[0]), delimiter=delimiter)
		data_matrix = numpy.array(data_matrix)

		# step 1: rows (strains) with too many NAs
		if self.filtering_bits[0]=='1':
			remove_rows_data = self.remove_rows_with_too_many_NAs(data_matrix, self.row_cutoff)
			rows_with_too_many_NAs_set = remove_rows_data.rows_with_too_many_NAs_set
		else:
			rows_with_too_many_NAs_set = set()

		# step 2: columns (SNPs) with too many NAs; the rows dropped in step 1 are passed along
		if self.filtering_bits[1]=='1':
			remove_cols_data = self.remove_cols_with_too_many_NAs(data_matrix, self.col_cutoff, rows_with_too_many_NAs_set)
			cols_with_too_many_NAs_set = remove_cols_data.cols_with_too_many_NAs_set
		else:
			cols_with_too_many_NAs_set = set()

		# step 3: identical strains, checked only among what is left after steps 1 and 2
		if self.filtering_bits[2]=='1':
			no_of_rows, no_of_cols = data_matrix.shape
			total_rows_set = set(range(no_of_rows))
			rows_to_be_checked = total_rows_set - rows_with_too_many_NAs_set
			total_cols_set = set(range(no_of_cols))
			cols_to_be_checked = total_cols_set - cols_with_too_many_NAs_set
			identity_strains_to_be_removed = self.remove_identity_strains(data_matrix, rows_to_be_checked, cols_to_be_checked)
		else:
			identity_strains_to_be_removed = set()

		rows_to_be_tossed_out = rows_with_too_many_NAs_set | identity_strains_to_be_removed
		write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list, rows_to_be_tossed_out, cols_with_too_many_NAs_set, nt_alphabet=int(self.nt_alphabet_bits[1]), delimiter=delimiter)
コード例 #22
0
    def run(self):
        """
        Pass the SNP matrix of self.input_fname through SNPData.convert2Binary() and
        save the result in self.output_fname.

        If self.mapping_fname is given, the allele-index mapping returned by
        convert2Binary() is written to it with output_allele2index_ls().

        2008-9-7
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        # detect the delimiter first, then read the matrix with it
        detected_delimiter = figureOutDelimiter(self.input_fname, report=self.report)
        header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname,
                                                                        delimiter=detected_delimiter)

        snp_data_arguments = dict(header=header,
                                  strain_acc_list=strain_acc_list,
                                  category_list=category_list,
                                  data_matrix=data_matrix)
        snpData = SNPData(**snp_data_arguments)
        newSnpData, allele_index2allele_ls = snpData.convert2Binary(self.report)

        # the mapping file is optional
        mapping_requested = bool(self.mapping_fname)
        if mapping_requested:
            self.output_allele2index_ls(snpData, allele_index2allele_ls, self.mapping_fname)

        newSnpData.tofile(self.output_fname)
コード例 #23
0
	def run(self):
		"""
		Load the phenotype matrix of self.input_fname into the database.

		Row ids are converted to ecotype ids with straightenEcotypeID(). self.run_type
		selects the loader: 1 calls putPhenotypeIntoDB(), 2 calls putReplicatePhenotypeIntoDB();
		anything else is reported on stderr. session.commit() is issued only if self.commit is set.

		2009-5-28
		"""
		if self.debug:
			import pdb
			pdb.set_trace()

		db = Stock_250kDB(
			drivername=self.drivername,
			username=self.db_user,
			password=self.db_passwd,
			hostname=self.hostname,
			database=self.dbname,
			schema=self.schema,
		)
		db.setup(create_tables=False)

		# lookup tables needed to resolve row ids into ecotype ids
		nativename2tg_ecotypeid_set = getNativename2TgEcotypeIDSet(db.metadata.bind, turnUpperCase=True)
		ecotype_id_set_250k_in_pipeline = get_ecotype_id_set_250k_in_pipeline(ArrayInfo)
		ecotypeid2tg_ecotypeid = get_ecotypeid2tg_ecotypeid(db.metadata.bind)

		# turn_into_integer=2 because it's not nucleotides
		header_phen, strain_acc_list_phen, category_list_phen, raw_matrix_phen = read_data(
			self.input_fname, turn_into_integer=2, matrix_data_type=float)
		data_matrix_phen = numpy.array(raw_matrix_phen)

		# 2009-8-19: re-ordering via get_phenotype_matrix_in_data_matrix_order() stays disabled
		# because strain_acc_list_phen is not unique per row and replicates ended up with the same value.
		phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen, data_matrix=data_matrix_phen)

		ecotype_id_ls = self.straightenEcotypeID(
			phenData.row_id_ls,
			nativename2tg_ecotypeid_set,
			ecotypeid2tg_ecotypeid,
			ecotype_id_set_250k_in_pipeline,
		)

		session = db.session
		session.begin()

		run_type = self.run_type
		if run_type not in (1, 2):
			sys.stderr.write("Unsupported run type: %s.\n"%(self.run_type))
		elif run_type==1:
			self.putPhenotypeIntoDB(db, phenData, ecotype_id_ls)
		else:
			self.putReplicatePhenotypeIntoDB(db, phenData, ecotype_id_ls)

		if self.commit:
			session.commit()
コード例 #24
0
 def main(self):
     """
     Drive the imputation, choosing the strategy from self.input_file_format.

     For any format other than 1, SNPData loads self.input_fname itself and is
     passed to run(). For format 1 the matrix is read with read_data() and the
     number of samplings is ceil(coverage * no_of_rows / no_of_accessions_per_sampling):
       - more than one sampling: samplingImpute() is used and the imputed matrix
         is written to self.output_fname as a YuSNPData;
       - otherwise: the header is written first and run() is called once per
         chromosome, chromosomes taken in sorted order.
     """
     if self.debug:
         import pdb
         pdb.set_trace()

     if self.input_file_format != 1:
         whole_file_data = SNPData(inFile=self.input_fname,
                                   input_file_format=self.input_file_format,
                                   lower_case_for_imputation=self.lower_case_for_imputation)
         self.run(whole_file_data)
         return

     header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, turn_into_integer=0)
     # everything after the first two header cells is a SNP name
     snps_name_ls = header[2:]
     no_of_rows = len(strain_acc_list)
     samplings_wanted = self.coverage * no_of_rows / float(self.no_of_accessions_per_sampling)
     no_of_samplings = int(math.ceil(samplings_wanted))

     if no_of_samplings <= 1:
         # a single pass covers the data: impute chromosome by chromosome
         self.outputHeader(self.output_fname, strain_acc_list, category_list)
         chr2no_of_snps = self.get_chr2no_of_snps(snps_name_ls)
         for chromosome in sorted(chr2no_of_snps.keys()):
             per_chromosome_data = SNPData(inFile=self.input_fname,
                                           snps_name_ls=snps_name_ls,
                                           data_matrix=data_matrix,
                                           chromosome=chromosome,
                                           input_file_format=self.input_file_format,
                                           lower_case_for_imputation=self.lower_case_for_imputation)
             self.run(per_chromosome_data)
         return

     sampling_result = self.samplingImpute(snps_name_ls, data_matrix, input_file_format=1,
                                           input_NA_char='0',
                                           lower_case_for_imputation=self.lower_case_for_imputation,
                                           npute_window_size=self.single_window_size,
                                           no_of_accessions_per_sampling=self.no_of_accessions_per_sampling,
                                           coverage=self.coverage)
     imputed_matrix, new_snps_name_ls = sampling_result
     # NOTE(review): the column ids come from snps_name_ls, not the returned new_snps_name_ls -- confirm this is intended
     imputedData = YuSNPData(strain_acc_list=strain_acc_list,
                             category_list=category_list,
                             col_id_ls=snps_name_ls,
                             data_matrix=imputed_matrix)
     imputedData.tofile(self.output_fname)
コード例 #25
0
	def run(self):
		"""
		Drop the SNP columns whose locus log probability is at or below self.min_log_prob,
		write the remaining matrix to self.output_fname and show a histogram of the
		log probabilities.

		self.nt_alphabet_bits[0] is forwarded to read_data(), self.nt_alphabet_bits[1] to
		write_data_matrix() as nt_alphabet. The removed columns (index and header name)
		are printed to stdout in ascending index order.

		2007-04-30
		2007-05-14
			add nt_alphabet_bits

		Fixes:
			- the threshold is read from self.min_log_prob; the bare name min_log_prob was
			  not defined in this method.
			- the built-in set replaces the deprecated sets.Set, and print is written in a
			  form that is valid for both python 2 and 3.
		"""
		header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, int(self.nt_alphabet_bits[0]))
		data_matrix = num.array(data_matrix)
		strain_homo_perc_vector = self.cal_strain_homo_perc_vector(data_matrix)
		snp_locus_log_prob = self.cal_snp_locus_log_prob(data_matrix, strain_homo_perc_vector)

		# collect the column indices that fail the threshold
		cols_to_be_tossed_out_set = set()
		for col_index, log_prob in enumerate(snp_locus_log_prob):
			if log_prob<=self.min_log_prob:
				cols_to_be_tossed_out_set.add(col_index)

		print("%sSNPs removed:"%(len(cols_to_be_tossed_out_set)))
		# sorted so that the report does not depend on set iteration order
		for col_index in sorted(cols_to_be_tossed_out_set):
			# +2 skips the two leading label columns of the header
			print('\t%s\t%s'%(col_index, header[2+col_index]))

		write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list, cols_to_be_tossed_out=cols_to_be_tossed_out_set, nt_alphabet=int(self.nt_alphabet_bits[1]))

		import pylab
		pylab.title("histogram of snp locus log probability")
		pylab.hist(snp_locus_log_prob, 20)
		pylab.show()
コード例 #26
0
    def run(self):
        """
        Entry point: spread the per-gene computation over the MPI nodes.

        Node roles (derived from the communicator rank):
            rank 0              input node: prepares the data and feeds parameters
            rank 1 .. size-2    computing nodes
            rank size-1         output node

        When self.debug is set, none of the MPI code runs. Instead the data
        preparation steps are executed inline under pdb and the process exits
        with status 2.

        2008-09-06
        """
        if self.debug:
            #for one-node testing purpose
            import pdb
            pdb.set_trace()
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.input_fname)
            snpData = SNPData(header=header, strain_acc_list=strain_acc_list, \
                data_matrix=data_matrix, turn_into_array=1) #category_list is not used to facilitate row-id matching

            # NOTE(review): the pickle file is opened with the default mode and is released
            # with del rather than an explicit close()
            picklef = open(self.snps_context_fname)
            snps_context_wrapper = cPickle.load(picklef)
            del picklef
            gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(
                snps_context_wrapper)
            gene_id_ls = gene_id2snps_id_ls.keys()
            gene_id_ls.sort()

            header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
                self.phenotype_fname, turn_into_integer=0)
            phenData = SNPData(
                header=header_phen,
                strain_acc_list=strain_acc_list,
                data_matrix=data_matrix_phen
            )  #row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
            phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
                snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)

            other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls,
                                     gene_id_ls=gene_id_ls,
                                     phenData=phenData)
            # the pickles below are built but not used before the exit;
            # presumably they only exercise pickling while debugging
            other_data_pickle = cPickle.dumps(other_data, -1)
            phenotype_label_ls_pickle = cPickle.dumps(phenData.col_id_ls, -1)
            snpData_pickle = cPickle.dumps(snpData, -1)
            sys.exit(2)

        self.communicator = MPI.world.duplicate()
        node_rank = self.communicator.rank
        free_computing_nodes = range(1, self.communicator.size -
                                     1)  #exclude the 1st and last node
        free_computing_node_set = Set(free_computing_nodes)
        output_node_rank = self.communicator.size - 1

        # phase 1: distribute the initial data
        if node_rank == 0:
            dstruc = self.inputNodePrepare()
            params_ls = dstruc.params_ls
            #send the output node its pickled data (read there as phenotype_label_ls and phenotype_index_ls)
            self.communicator.send(dstruc.output_node_data_pickle,
                                   output_node_rank, 0)
            del dstruc.output_node_data_pickle

            for node in free_computing_nodes:  #send it to the computing_node
                sys.stderr.write(
                    "passing initial data to nodes from %s to %s ... " %
                    (node_rank, node))
                # two messages per computing node: SNP data first, then the other data;
                # the receiving branch below unpickles them in the same order
                self.communicator.send(dstruc.snpData_pickle, node, 0)
                self.communicator.send(dstruc.other_data_pickle, node, 0)
                sys.stderr.write(".\n")
            del dstruc

        elif node_rank in free_computing_node_set:
            # computing node: 1st message is the SNP data, 2nd message is the other data
            data, source, tag = self.communicator.receiveString(0, 0)
            snpData = cPickle.loads(data)
            del data
            data, source, tag = self.communicator.receiveString(0, 0)
            other_data = cPickle.loads(data)
            del data
            self.phenotype_index_ls = other_data.phenotype_index_ls
        else:
            # output node: a single message carrying the phenotype labels and indices
            data, source, tag = self.communicator.receiveString(0, 0)
            output_node_data_pickle = cPickle.loads(data)
            phenotype_label_ls = output_node_data_pickle.phenotype_label_ls
            self.phenotype_index_ls = output_node_data_pickle.phenotype_index_ls

        # phase 2: every node takes up its role once self.synchronize() returns
        self.synchronize()
        if node_rank == 0:
            param_obj = PassingData(params_ls=params_ls,
                                    output_node_rank=output_node_rank,
                                    report=self.report,
                                    counter=0)
            self.inputNode(param_obj,
                           free_computing_nodes,
                           param_generator=params_ls)
            #self.input_node(param_obj, free_computing_nodes, input_handler=self.input_fetch_handler, message_size=1)
        elif node_rank in free_computing_node_set:
            computing_parameter_obj = PassingData(snpData=snpData, gene_id_ls=other_data.gene_id_ls, \
                     gene_id2snps_id_ls=other_data.gene_id2snps_id_ls, phenData=other_data.phenData,
                     phenotype_index_ls=self.phenotype_index_ls, min_data_point=self.min_data_point,
                     test_type=self.test_type)
            self.computing_node(computing_parameter_obj,
                                self.computing_node_handler)
        else:
            self.general_output_node(self.output_dir, self.phenotype_index_ls,
                                     phenotype_label_ls, free_computing_nodes)
        self.synchronize()  #to avoid some node early exits
コード例 #27
0
	def run(self):
		"""
		Draw the strains on a map and on a PCA panel, colored by one phenotype, and save
		the figure as '<self.output_fname_prefix>.png' (dpi=400) and '.svg'. Afterwards
		plotLatLonPhenVsPC() is called with the same data.

		PCA input:
			- if both self.eigen_vector_fname and self.eigen_value_fname are given, the PCs and
			  the explained variance (eigen value / sum of eigen values) are read from them;
			- otherwise the SNP data (randomly down-sampled to at most 10000 columns) is converted
			  with convertSNPAllele2Index() and both explained_var and T are passed as None to
			  getStrainID2PCAPosInfo().
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
				   password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup(create_tables=False)
		# NOTE(review): session is not referenced again in this method
		session = db.session
		
		snpData = SNPData(input_fname=self.input_fname, turn_into_integer=1, turn_into_array=1, ignore_2nd_column=1)
		
		
		if self.eigen_vector_fname and self.eigen_value_fname:
			# pre-computed PCA: read eigen values and eigen vectors from the given files
			eigen_value_ls = self.getEigenValueFromFile(self.eigen_value_fname)
			eigen_value_ls = numpy.array(eigen_value_ls)
			explained_var = eigen_value_ls/numpy.sum(eigen_value_ls)
			PC_data = self.getPCFromFile(self.eigen_vector_fname)
			PC_matrix = PC_data.PC_matrix
		else:
			max_no_of_snps = 10000
			if len(snpData.col_id_ls)>max_no_of_snps:	#2008-12-01 randomly pick max_no_of_snps SNPs
				picked_col_index_ls = random.sample(range(len(snpData.col_id_ls)), max_no_of_snps)
				new_col_id_ls = [snpData.col_id_ls[i] for i in picked_col_index_ls]
				newSnpData = SNPData(row_id_ls=snpData.row_id_ls, col_id_ls=new_col_id_ls, strain_acc_list=snpData.strain_acc_list,\
								category_list=snpData.category_list)
				newSnpData.data_matrix = snpData.data_matrix[:, picked_col_index_ls]
				snpData = newSnpData
		
			snpData, allele_index2allele_ls = snpData.convertSNPAllele2Index()
			# nothing pre-computed: both are handed on as None
			explained_var = None
			PC_matrix = None
		
		header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(self.phenotype_fname, turn_into_integer=0)
		phenData = SNPData(header=header_phen, strain_acc_list=snpData.strain_acc_list, data_matrix=data_matrix_phen)	#row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
		phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(snpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)	#tricky, using strain_acc_list_phen
		
		# column of the requested phenotype in phenData
		phenotype_col_index = self.findOutWhichPhenotypeColumn(phenData, Set([self.phenotype_method_id]))[0]
		
		
		ecotype_info = getEcotypeInfo(db, self.country_order_type)
		
		#the offset below decides where the label of strains/snps should start in axe_snp_matrix
		#2008-11-14 only for PlotGroupOfSNPs.py. you can set it to 1 cuz we dont' draw axe_snp_matrix here.
		snp_id_label_y_offset = 0.95
		StrainID2PCAPosInfo = self.getStrainID2PCAPosInfo(snpData, pca_range=[0,1], snp_id_label_y_offset=snp_id_label_y_offset, explained_var=explained_var, T=PC_matrix)
		
		# figure layout: the values are passed to pylab.axes() as [left, bottom, width, height]
		axe_y_offset1 = 0.03
		axe_height1 = 0.45	#height of axe_chromosome, twice height of axe_map_phenotype_legend
		axe_y_offset2 = axe_y_offset1+axe_height1
		axe_height2 = 0.5	#height of axe_strain_pca, axe_snp_matrix, axe_map
		axe_y_offset3 = axe_y_offset2+axe_height2
		
		axe_x_offset1 = 0.05
		axe_width1 = 0.8	#width of axe_strain_pca
		axe_x_offset2 = axe_x_offset1 + 0.02 + axe_width1
		axe_width2 = 0.05	#width of axe_chromosome, axe_snp_matrix, axe_snp_pca
		axe_x_offset3 = axe_x_offset2 + axe_width2
		axe_width3 = 0.02	#width of axe_phenotype
		
		phenotype_method = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id)
		
		# color scale of the phenotype: the observed range widened by 10% on each side
		phenotype_cmap = mpl.cm.jet
		max_phenotype = numpy.nanmax(phenData.data_matrix[:,phenotype_col_index])	#nanmax ignores the nan elements
		min_phenotype = numpy.nanmin(phenData.data_matrix[:,phenotype_col_index])	#nanmin ignores the nan elements
		phenotype_gap = max_phenotype - min_phenotype
		phenotype_jitter = phenotype_gap/10.
		phenotype_norm = mpl.colors.Normalize(vmin=min_phenotype-phenotype_jitter, vmax=max_phenotype+phenotype_jitter)
		axe_map_phenotype_legend = pylab.axes([axe_x_offset2, axe_y_offset1, axe_width2, 0.3], frameon=False)
		cb = mpl.colorbar.ColorbarBase(axe_map_phenotype_legend, cmap=phenotype_cmap,
									norm=phenotype_norm,
									orientation='vertical')
		cb.set_label('Legend Of Phenotype %s %s'%(phenotype_method.id, phenotype_method.short_name))
		
		# map panel on top, PCA panel below, plus a frameless axes spanning both
		axe_strain_map = pylab.axes([axe_x_offset1, axe_y_offset2, axe_width1, axe_height2], frameon=False)
		axe_strain_pca = pylab.axes([axe_x_offset1, axe_y_offset1, axe_width1, axe_height1], frameon=False)
		axe_strain_map_pca_cover = pylab.axes([axe_x_offset1, axe_y_offset1, axe_width1, axe_height1+axe_height2], frameon=False, \
											sharex=axe_strain_pca)	#cover both axe_strain_map and axe_strain_pca
		axe_strain_map_pca_cover.set_yticks([])
		axe_strain_pca_xlim = [-0.05,1.05]
		axe_strain_pca_ylim = [0, 1.05]
		axe_strain_pca.set_xlim(axe_strain_pca_xlim)
		axe_strain_pca.set_ylim(axe_strain_pca_ylim)
		axe_strain_map_pca_cover_ylim = [0, (axe_height1+axe_height2)/axe_height1]	#set it accordingly
		axe_strain_map_pca_cover.set_ylim(axe_strain_map_pca_cover_ylim)
				
		axe_strain_pca.grid(True, alpha=0.3)
		axe_strain_pca.set_xticks([])
		axe_strain_pca.set_yticks([])
		axe_strain_pca_legend = None	#no pca legend
		self.drawStrainPCA(axe_strain_pca, axe_strain_map, axe_strain_map_pca_cover, axe_strain_pca_legend, StrainID2PCAPosInfo, \
						ecotype_info, phenData, \
					phenotype_col_index, phenotype_cmap, phenotype_norm, rightmost_x_value=axe_strain_pca_xlim[1],\
					strain_color_type=2, pca2map_line_color=None, ecotype_width_on_map=10,\
					draw_lines_to_axe_snp_matrix = False, strain_size_on_axe_strain_pca=14, pic_area=self.pic_area,\
					map_pca_line_alpha=0.2, map_pca_linewidth=0.2)	#customize a couple of things
		
		# the limits are applied a second time after drawing; presumably drawStrainPCA() can change them
		axe_strain_pca.set_xlim(axe_strain_pca_xlim)
		axe_strain_pca.set_ylim(axe_strain_pca_ylim)
		axe_strain_map_pca_cover.set_ylim(axe_strain_map_pca_cover_ylim)
		
		png_output_fname = '%s.png'%self.output_fname_prefix
		pylab.savefig(png_output_fname, dpi=400)
		pylab.savefig('%s.svg'%self.output_fname_prefix)
		
		self.plotLatLonPhenVsPC(ecotype_info, StrainID2PCAPosInfo, phenData, phenotype_col_index, phenotype_cmap, phenotype_norm, 
						self.output_fname_prefix, commit=self.commit)
コード例 #28
0
	def plone_run(self, min_call_info_mismatch_rate=0.1):
		"""
		Cross-match the selected call_info entries (self.call_info_id_ls) against the
		comparison dataset of the QC method self.QC_method_id through TwoSNPData.

		min_call_info_mismatch_rate is forwarded to get_call_info_id2fname() when
		self.input_dir is not set.

		Returns None in all cases: either early when there is nothing to process
		("No good arrays."), or at the end, where row_id2NA_mismatch_rate is still None.
		The work is done by twoSNPData.cal_row_id2pairwise_dist(), which submits to the database.
		The process exits with status 3 if no cmp_data_filename can be determined.

		2009-6-9
			pass self.max_mismatch_rate, self.min_no_of_non_NA_pairs to TwoSNPData to filter entries stored in db.
		2009-4-13
			add min_call_info_mismatch_rate
		2009-2-5
			add "create_tables=False" to db.setup()
		2008-07-02
			fix a bug which causes the program to continue read data even while call_info_id2fname is empty and input_dir is null.
		2008-07-01
			adjust to the newest functions in QC_250k.py
		2008-04-25
			return None if QC_method_id==0
			NOTE(review): no early return for QC_method_id==0 is visible in this method -- confirm
		2008-04-20
			for plone to call it just to get row_id2NA_mismatch_rate
		"""
		
		# raw MySQLdb cursor, used by the helpers below and by TwoSNPData
		import MySQLdb
		conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user = self.user, passwd = self.passwd)
		curs = conn.cursor()
		self.curs = curs
		
		#database connection and etc
		db = Stock_250kDB.Stock_250kDB(username=self.user,
				   password=self.passwd, hostname=self.hostname, database=self.dbname)
		db.setup(create_tables=False)
		session = db.session
		session.begin()
		#transaction = session.create_transaction()
		# if cmp_data_filename not specified, try to find in the data_description column in table QC_method.
		# NOTE(review): qm is dereferenced below without a None check -- confirm the query always finds a row
		qm = QCMethod.query.get(self.QC_method_id)
		if not self.cmp_data_filename and self.QC_method_id!=0:
			if qm.data_description:
				# the filename is taken from the text after the first '=' in data_description
				data_description_ls = qm.data_description.split('=')
				if len(data_description_ls)>1:
					self.cmp_data_filename = qm.data_description.split('=')[1].strip()
		
		#after db query, cmp_data_filename is still nothing, exit program.
		if not self.cmp_data_filename and self.QC_method_id!=0:
			sys.stderr.write("cmp_data_filename is still nothing even after db query. please specify it on the commandline.\n")
			sys.exit(3)
		
		
		#from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
		# snpData2: the comparison dataset
		header, strain_acc_list, category_list, data_matrix = read_data(self.cmp_data_filename)
		strain_acc_list = map(int, strain_acc_list)	#it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
		snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \
						data_matrix=data_matrix, snps_table=self.QC_method_id2snps_table.get(self.QC_method_id), ignore_het=qm.ignore_het)
						#category_list is not used.
		
		if self.input_dir:
			#04/22/08 Watch: call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
			#no submission to db
			call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
		else:
			#call_info_id2fname = self.get_call_info_id2fname(curs, self.call_info_table, self.call_QC_table, self.QC_method_id)
			call_data = self.get_call_info_id2fname(db, self.QC_method_id, self.call_method_id, \
				filter_calls_QCed=0, max_call_info_mismatch_rate=1, min_call_info_mismatch_rate=min_call_info_mismatch_rate,\
				debug=self.debug)
			call_info_id2fname = call_data.call_info_id2fname
			# call_info_ls_to_return is not used further in this method
			call_info_ls_to_return = call_data.call_info_ls_to_return
		
		#2008-07-01 pick the call_info_ids to be handled
		new_call_info_id2fname = {}
		for call_info_id_wanted in self.call_info_id_ls:
			if call_info_id_wanted in call_info_id2fname:
				new_call_info_id2fname[call_info_id_wanted] = call_info_id2fname[call_info_id_wanted]
			elif self.report:
				sys.stderr.write("%s not in call_info_id2fname.\n"%(call_info_id_wanted))
		call_info_id2fname = new_call_info_id2fname
		
		# snpData1 input: either the selected call files, or (if none were selected) self.input_dir read as one file
		if call_info_id2fname:
			pdata = self.read_call_matrix(call_info_id2fname, self.min_probability)
			header = pdata.header
			call_info_id_ls = pdata.call_info_id_ls
			array_id_ls = pdata.array_id_ls
			ecotype_id_ls = pdata.ecotype_id_ls
			data_matrix = pdata.data_matrix
		elif self.input_dir:	#2008-07-02
			#input file is SNP by strain format. double header (1st two lines)
			header, snps_name_ls, category_list, data_matrix = FilterStrainSNPMatrix.read_data(self.input_dir, double_header=1)
			ecotype_id_ls = header[0][2:]
			call_info_id_ls = header[1][2:]
			data_matrix = numpy.array(data_matrix)
			# transpose the SNP-by-strain matrix so that rows are strains
			data_matrix = data_matrix.transpose()
			header = ['', ''] + snps_name_ls	#fake a header for SNPData
		else:	#2008-07-02
			sys.stderr.write("No good arrays.\n")
			return None
		
		snps_name2snps_id = None
		
		#swap the ecotype_id_ls and call_info_id_ls when passing them to SNPData. now strain_acc_list=ecotype_id_ls
		snpData1 = SNPData(header=header, strain_acc_list=ecotype_id_ls, category_list= call_info_id_ls, data_matrix=data_matrix, \
						min_probability=self.min_probability, call_method_id=self.call_method_id, col_id2id=snps_name2snps_id,\
						max_call_info_mismatch_rate=self.max_call_info_mismatch_rate, snps_table='stock_250k.snps')	#snps_table is set to the stock_250k snps_table
		
		twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, curs=curs, \
							QC_method_id=self.QC_method_id, user=self.user, row_matching_by_which_value=0, debug=self.debug,\
							max_mismatch_rate=self.max_mismatch_rate, min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs)
							#2009-6-9 cross-matching results whose mismatch_rates are below max_mismatch_rate would be put into db. 
		
		# never assigned anything else in this method, so the return value below is always None
		row_id2NA_mismatch_rate = None
		
		#2008-05-01 create a cross match table temporarily
		twoSNPData.qc_cross_match_table = 'qc_cross_match'
		twoSNPData.new_QC_cross_match_table = self.new_QC_cross_match_table
		twoSNPData.cal_row_id2pairwise_dist()	#database submission is done along.
		# NOTE(review): the session begun above is neither committed nor rolled back in this method -- confirm
		return row_id2NA_mismatch_rate
コード例 #29
0
ファイル: QC_250k.py プロジェクト: bopopescu/gwasmodules
    def run(self):
        """
		2008-04-25
			return None if QC_method_id==0
		2008-04-20
			for plone to call it just to get row_id2NA_mismatch_rate
		"""
        #database connection and etc
        db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                       username=self.user,
                                       password=self.passwd,
                                       hostname=self.hostname,
                                       database=self.dbname)
        db.setup(create_tables=False)
        session = db.session
        session.begin()
        #transaction = session.create_transaction()

        self.cmp_data_filename = self.findOutCmpDataFilename(
            self.cmp_data_filename, self.QC_method_id, self.QCMethod_class)
        qm = self.QCMethod_class.query.get(self.QC_method_id)  #2009-5-20

        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname,
                               host=self.hostname,
                               user=self.user,
                               passwd=self.passwd)
        curs = conn.cursor()
        self.curs = curs
        if self.debug:
            import pdb
            pdb.set_trace()

        readme = formReadmeObj(sys.argv, self.ad, Stock_250kDB.README)
        session.add(readme)

        QC_method_id2snps_table = self.QC_method_id2snps_table

        if self.QC_method_id == 0:
            self.cal_independent_NA_rate(db, self.min_probability, readme)
            row_id2NA_mismatch_rate = None
        else:
            #from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.cmp_data_filename, ignore_het=qm.ignore_het)
            strain_acc_list = map(
                int, strain_acc_list
            )  #it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
            snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \
                data_matrix=data_matrix, snps_table=QC_method_id2snps_table.get(self.QC_method_id),\
                ignore_het=qm.ignore_het) #category_list is not used. 05/20/09 ignore_het is useless cuz data_matrix is provided.
            """
			if self.input_dir and os.path.isdir(self.input_dir):
				#04/22/08 Watch: call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
				#no submission to db
				call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
			"""
            if self.input_dir and os.path.isfile(self.input_dir):  #it's file
                call_info_id2fname = None
            else:
                if self.run_type == 2:  #no filtering on call_info entries that have been QCed.
                    filter_calls_QCed = 0
                elif self.run_type == 1:
                    filter_calls_QCed = 1
                    self.max_call_info_mismatch_rate = 1  #don't use this when doing accession-wise QC
                else:
                    sys.stderr.write("run_type=%s is not supported.\n" %
                                     self.run_type)
                    sys.exit(5)
                call_data = self.get_call_info_id2fname(db, self.QC_method_id, self.call_method_id, \
                          filter_calls_QCed, self.max_call_info_mismatch_rate, self.debug,\
                          min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs, input_dir=self.input_dir)
                call_info_id2fname = call_data.call_info_id2fname
                call_info_ls_to_return = call_data.call_info_ls_to_return
            if self.run_type == 2:
                snps_name2snps_id = self.get_snps_name2snps_id(db)
            else:
                snps_name2snps_id = None

            if call_info_id2fname:
                db_id2chr_pos = db.getSNPID2ChrPos()  #2011-22
                from DB_250k2data import DB_250k2Data
                db_id2index = DB_250k2Data.getSNPID2index(
                    call_info_id2fname.values()[0][1], db_id2chr_pos)
                if self.one_by_one and self.run_type == 1:  #one_by_one only for QC by accession
                    row_id2NA_mismatch_rate = {}
                    row_id12row_id2 = {}
                    counter = 0
                    for call_info_id, value in call_info_id2fname.iteritems():
                        counter += 1
                        print "No", counter
                        tmp_dict = {}
                        tmp_dict[call_info_id] = value
                        pdata = self.read_call_matrix(
                            tmp_dict,
                            self.min_probability,
                            db_id2chr_pos=db_id2chr_pos,
                            db_id2index=db_id2index)
                        #05/20/09 no need for qm.ignore_het because 250k is all h**o
                        passingdata = self.qcDataMatrixVSsnpData(
                            pdata, snps_name2snps_id, snpData2, curs, session,
                            readme)
                        row_id2NA_mismatch_rate.update(
                            passingdata.row_id2NA_mismatch_rate)
                        row_id12row_id2.update(passingdata.row_id12row_id2)
                        del pdata

                        if self.debug and counter == 10:
                            break
                else:
                    pdata = self.read_call_matrix(call_info_id2fname,
                                                  self.min_probability,
                                                  db_id2chr_pos=db_id2chr_pos,
                                                  db_id2index=db_id2index)
                    #05/20/09 no need for qm.ignore_het because 250k is all h**o
                    passingdata = self.qcDataMatrixVSsnpData(
                        pdata, snps_name2snps_id, snpData2, curs, session,
                        readme)
                    row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
                    row_id12row_id2 = passingdata.row_id12row_id2
                    del pdata
            else:
                #input file is SNP by strain format. double header (1st two lines)
                header, snps_name_ls, category_list, data_matrix = read_data(
                    self.input_dir, double_header=1, ignore_het=qm.ignore_het)
                pdata = PassingData()
                pdata.ecotype_id_ls = header[0][2:]
                pdata.call_info_id_ls = header[1][2:]
                data_matrix = numpy.array(data_matrix)
                pdata.data_matrix = data_matrix.transpose()
                pdata.header = ['', ''
                                ] + snps_name_ls  #fake a header for SNPData
                passingdata = self.qcDataMatrixVSsnpData(
                    pdata, snps_name2snps_id, snpData2, curs, session, readme)
                row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
                row_id12row_id2 = passingdata.row_id12row_id2
                del pdata

        if self.output_fname and self.run_type == 1 and row_id2NA_mismatch_rate:
            self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate,
                                                self.output_fname)

        if self.run_type == 1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
            #if self.input_dir is given, no db submission. call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
            #row_id2NA_mismatch_rate might be None if it's method 0.
            self.submit_to_call_QC(session, row_id2NA_mismatch_rate, self.QC_method_id, self.user, self.min_probability, \
                 row_id12row_id2, self.call_method_id, readme)
        if self.commit:
            curs.execute("commit")
            session.commit()
        else:
            session.rollback()

        self.row_id2NA_mismatch_rate = row_id2NA_mismatch_rate  #for plone to get the data structure
コード例 #30
0
    def load_dstruc(self):
        """
        Load both input matrices and derive the matching structures between them.

        After the parent class' load_dstruc() has run, the four values returned
        by read_data() for self.input_fname1 and self.input_fname2 are stored on
        self with the suffixes 1 and 2. The three structures returned by
        get_col_matching_dstruc() and by get_row_matching_dstruc() are stored on
        self as well.
        """
        if self.debug:
            import pdb
            pdb.set_trace()
        QualityControl.load_dstruc(self)

        first_input = read_data(self.input_fname1)
        (self.header1, self.strain_acc_list1, self.category_list1,
         self.data_matrix1) = first_input
        second_input = read_data(self.input_fname2)
        (self.header2, self.strain_acc_list2, self.category_list2,
         self.data_matrix2) = second_input

        # columns are matched through the two headers
        col_matching = self.get_col_matching_dstruc(self.header1, self.header2)
        (self.col_id2col_index1, self.col_id2col_index2,
         self.col_id12col_id2) = col_matching

        # rows are matched through the strain lists; only input 1 contributes
        # its category list
        row_matching = self.get_row_matching_dstruc(
            self.strain_acc_list1, self.category_list1, self.strain_acc_list2)
        (self.row_id2row_index1, self.row_id2row_index2,
         self.row_id12row_id2) = row_matching
コード例 #31
0
	def run(self):
		"""
		2008-12-02
			Read the SNP matrix in self.input_fname, convert it to binary alleles and
			write three files named after self.output_fname_prefix:
				.geno	one line per SNP, one character per accession ('0', '2' or '9')
				.snp	one tab-delimited row per SNP: id, chromosome, 0.0, position, allele1, allele2
				.ind	one tab-delimited row per accession: id, 'U', 'Case'
			If both self.phenotype_fname and self.phenotype_method_id are set, also write
			'<prefix>_<phenotype label>.pheno' with one phenotype value per line.
			
			NOTE(review): the layout looks like EIGENSTRAT geno/snp/ind input -- confirm
			against whatever program consumes these files.
			
			Every output file is closed explicitly in a 'finally' clause rather than
			relying on 'del' + garbage collection to flush it.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		delimiter = figureOutDelimiter(self.input_fname, report=self.report)
		header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)
		
		if self.array_id_2nd_column:
			snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,\
							data_matrix=data_matrix)
		else:
			snpData = SNPData(header=header, strain_acc_list=strain_acc_list,\
							data_matrix=data_matrix)	#ignore category_list
		
		newSnpData, allele_index2allele_ls = snpData.convert2Binary(self.report)
		
		if self.phenotype_fname and self.phenotype_method_id:
			header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(self.phenotype_fname, turn_into_integer=0)
			phenData = SNPData(header=header_phen, strain_acc_list=newSnpData.strain_acc_list, data_matrix=data_matrix_phen)	#row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
			phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(newSnpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)	#tricky, using strain_acc_list_phen
			
			phenotype_col_index = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(phenData, Set([self.phenotype_method_id]))[0]
			phenotype_label = phenData.col_id_ls[phenotype_col_index]
			#'/' in the label would be taken as a directory separator in the file name
			phenotype_f = open('%s_%s.pheno'%(self.output_fname_prefix, phenotype_label.replace('/', '_')), 'w')
			try:
				for phenotype_value in phenData.data_matrix[:,phenotype_col_index]:
					if self.phenotype_is_binary:	#binary and non-binary have different NA designator
						if numpy.isnan(phenotype_value):
							phenotype_value = 9
						else:
							phenotype_value = int(phenotype_value)
					else:
						if numpy.isnan(phenotype_value):
							phenotype_value = -100.0
					phenotype_f.write('%s\n'%phenotype_value)
			finally:
				phenotype_f.close()
		
		output_f_ls = []	#every file opened below, so that all of them get closed whatever happens
		try:
			genotype_f = open('%s.geno'%self.output_fname_prefix, 'w')
			output_f_ls.append(genotype_f)
			ind_f = open('%s.ind'%self.output_fname_prefix, 'w')
			output_f_ls.append(ind_f)
			snp_f = open('%s.snp'%self.output_fname_prefix, 'w')
			output_f_ls.append(snp_f)
			ind_writer = csv.writer(ind_f, delimiter='\t')
			snp_writer = csv.writer(snp_f, delimiter='\t')
			
			#transpose it
			newSnpData = transposeSNPData(newSnpData)
			
			no_of_rows = len(newSnpData.data_matrix)
			no_of_cols = len(newSnpData.data_matrix[0])
			
			#write out the accessions, once
			for j in range(no_of_cols):
				ind_writer.writerow([newSnpData.col_id_ls[j], 'U', 'Case'])
			
			for i in range(no_of_rows):
				snp_id = newSnpData.row_id_ls[i]
				chromosome, pos = snp_id.split('_')	#a SNP id is expected to be of the form chromosome_position
				allele1 = allele_index2allele_ls[i][0]	#major allele
				allele2 = allele_index2allele_ls[i][1]	#minor allele
				snp_writer.writerow([snp_id, chromosome, 0.0, pos, allele1, allele2])
				
				snp_row = newSnpData.data_matrix[i]
				geno_code_ls = []
				for j in range(no_of_cols):
					allele = snp_row[j]
					if allele==0:
						geno_code_ls.append('0')
					elif allele==1:
						geno_code_ls.append('2')
					else:	#anything that is neither 0 nor 1 is written out as '9'
						geno_code_ls.append('9')
				genotype_f.write(''.join(geno_code_ls) + '\n')
		finally:
			for output_f in output_f_ls:
				output_f.close()
コード例 #32
0
    def run(self):
        """
        Draw the matrix in self.input_fname and a matching legend as PNG files.

        self.drawing_type selects how the matrix is recoded before drawing:
            1 -- values are drawn as they are (number2nt labels, number2color
                 colours)
            2 -- transformMatrixIntoTwoAllelesAndHetero(), fed with what
                 get_snp_allele2index_ls() returns for
                 self.snps_sequenom_info_table (opens a MySQL connection)
            3 -- transformMatrixIntoHomoAndHetero()
            4 -- transformMatrixIntoTwoAllelesAndHetero() without the db lookup
            5 -- transformMatrixIntoFourNucleotides(); every strain label is
                 listed twice
        Any other value writes a message to stderr and exits with status 2.

        Writes '<output_fname_prefix>_legend.png' and
        '<output_fname_prefix>.png'.
        """
        from pymodule import read_data
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname)
        snp_acc_ls = header[2:]  # the first two header fields are not SNP ids
        if self.debug:
            import pdb
            pdb.set_trace()

        # legend shared by the two "two alleles + hetero" modes (types 2 and 4)
        two_allele_value2label = {
            -1: 'deletion',
            0: 'NA',
            1: 'allele1',
            2: 'allele2',
            3: 'hetero'
        }
        two_allele_value2color = {
            -1: (0, 0, 0),
            0: (255, 255, 255),
            1: (0, 0, 255),
            2: (0, 255, 0),
            3: (255, 0, 0)
        }

        if self.drawing_type == 1:
            matrix_value2label = number2nt
            matrix_value2color = number2color
            data_matrix = numpy.array(data_matrix)
        elif self.drawing_type == 2:
            import MySQLdb
            conn = MySQLdb.connect(db=self.dbname, host=self.hostname)
            curs = conn.cursor()
            snp_allele2index_ls = self.get_snp_allele2index_ls(
                curs, snp_acc_ls, self.snps_sequenom_info_table)
            data_matrix = self.transformMatrixIntoTwoAllelesAndHetero(
                data_matrix, snp_allele2index_ls)
            matrix_value2label = two_allele_value2label
            matrix_value2color = two_allele_value2color
        elif self.drawing_type == 3:
            data_matrix = self.transformMatrixIntoHomoAndHetero(data_matrix)
            matrix_value2label = {
                -1: 'deletion',
                0: 'NA',
                1: 'homo',
                2: 'hetero'
            }
            matrix_value2color = {
                -1: (0, 0, 0),
                0: (255, 255, 255),
                1: (0, 0, 255),
                2: (255, 0, 0)
            }
        elif self.drawing_type == 4:
            data_matrix = self.transformMatrixIntoTwoAllelesAndHetero(
                data_matrix)
            matrix_value2label = two_allele_value2label
            matrix_value2color = two_allele_value2color
        elif self.drawing_type == 5:
            data_matrix = self.transformMatrixIntoFourNucleotides(data_matrix)
            matrix_value2label = {
                -1: '-',
                0: 'NA',
                1: 'A',
                2: 'C',
                3: 'G',
                4: 'T'
            }
            matrix_value2color = {
                -1: (0, 0, 0),
                0: (255, 255, 255),
                1: (0, 0, 255),
                2: (0, 255, 0),
                3: (255, 0, 0),
                4: (122, 0, 122)
            }
            # Each strain label is repeated twice in a row.
            # NOTE(review): presumably the transformed matrix has two rows per
            # strain; category_list (row_label_type 2) is NOT doubled here --
            # confirm whether that combination is ever used.
            new_strain_acc_list = []
            for strain_acc in strain_acc_list:
                new_strain_acc_list.append(strain_acc)
                new_strain_acc_list.append(strain_acc)
            strain_acc_list = new_strain_acc_list
        else:
            # self.drawing_type, not a bare drawing_type: the latter is
            # undefined here and raised NameError instead of reporting.
            sys.stderr.write("drawing_type %s not supported\n" %
                             self.drawing_type)
            sys.exit(2)
        row_label_type2label_ls = {1: strain_acc_list, 2: category_list}

        font = get_font(self.font_path, font_size=self.font_size)  #2008-08-01
        im = drawLegend(matrix_value2label, matrix_value2color, font)
        im.save('%s_legend.png' % self.output_fname_prefix)
        im = drawMatrix(data_matrix,
                        matrix_value2color,
                        row_label_type2label_ls[self.row_label_type],
                        snp_acc_ls,
                        with_grid=1,
                        font=font)
        im.save('%s.png' % self.output_fname_prefix)
コード例 #33
0
    def plone_run(self, min_call_info_mismatch_rate=0.1):
        """
        Cross-match a chosen set of call files against the comparison data set.

        The comparison matrix is read from self.cmp_data_filename (looked up in
        the QC method's data_description when not set). The call data comes
        either from the db (get_call_info_id2fname) or from self.input_dir,
        restricted to the ids in self.call_info_id_ls. Both are wrapped in
        SNPData objects and handed to TwoSNPData, whose
        cal_row_id2pairwise_dist() is then invoked.

        min_call_info_mismatch_rate: forwarded to get_call_info_id2fname() when
            the call files are taken from the db.

        Returns None on every path: row_id2NA_mismatch_rate is set to None and
        never reassigned before being returned.
        Exits with status 3 when QC_method_id != 0 and no cmp_data_filename can
        be determined.

        NOTE(review): the 2008-04-25 entry below says "return None if
        QC_method_id==0", but there is no such early return in this body; with
        QC_method_id == 0 the code still goes on to read_data() and
        qm.ignore_het -- confirm whether that case is still expected to work.

        2009-6-9
            pass self.max_mismatch_rate, self.min_no_of_non_NA_pairs to TwoSNPData to filter entries stored in db.
        2009-4-13
            add min_call_info_mismatch_rate
        2009-2-5
            add "create_tables=False" to db.setup()
        2008-07-02
            fix a bug which causes the program to continue read data even while call_info_id2fname is empty and input_dir is null.
        2008-07-01
            adjust to the newest functions in QC_250k.py
        2008-04-25
            return None if QC_method_id==0
        2008-04-20
            for plone to call it just to get row_id2NA_mismatch_rate
        """

        # raw MySQL cursor; kept on self and later handed to TwoSNPData
        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname,
                               host=self.hostname,
                               user=self.user,
                               passwd=self.passwd)
        curs = conn.cursor()
        self.curs = curs

        #database connection and etc
        db = Stock_250kDB.Stock_250kDB(username=self.user,
                                       password=self.passwd,
                                       hostname=self.hostname,
                                       database=self.dbname)
        db.setup(create_tables=False)
        session = db.session
        # NOTE(review): no commit or rollback on this session is visible in this method.
        session.begin()
        #transaction = session.create_transaction()
        # if cmp_data_filename not specified, try to find in the data_description column in table QC_method.
        qm = QCMethod.query.get(self.QC_method_id)
        if not self.cmp_data_filename and self.QC_method_id != 0:
            if qm.data_description:
                # the file name is whatever follows the first '=' in data_description
                data_description_ls = qm.data_description.split('=')
                if len(data_description_ls) > 1:
                    self.cmp_data_filename = qm.data_description.split(
                        '=')[1].strip()

        #after db query, cmp_data_filename is still nothing, exit program.
        if not self.cmp_data_filename and self.QC_method_id != 0:
            sys.stderr.write(
                "cmp_data_filename is still nothing even after db query. please specify it on the commandline.\n"
            )
            sys.exit(3)

        #from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.cmp_data_filename)
        strain_acc_list = map(
            int, strain_acc_list
        )  #it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
        snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \
            data_matrix=data_matrix, snps_table=self.QC_method_id2snps_table.get(self.QC_method_id), ignore_het=qm.ignore_het)
        #category_list is not used.

        if self.input_dir:
            #04/22/08 Watch: call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
            #no submission to db
            call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
        else:
            #call_info_id2fname = self.get_call_info_id2fname(curs, self.call_info_table, self.call_QC_table, self.QC_method_id)
            call_data = self.get_call_info_id2fname(db, self.QC_method_id, self.call_method_id, \
             filter_calls_QCed=0, max_call_info_mismatch_rate=1, min_call_info_mismatch_rate=min_call_info_mismatch_rate,\
             debug=self.debug)
            call_info_id2fname = call_data.call_info_id2fname
            # call_info_ls_to_return is not used again in this method
            call_info_ls_to_return = call_data.call_info_ls_to_return

        #2008-07-01 pick the call_info_ids to be handled
        new_call_info_id2fname = {}
        for call_info_id_wanted in self.call_info_id_ls:
            if call_info_id_wanted in call_info_id2fname:
                new_call_info_id2fname[
                    call_info_id_wanted] = call_info_id2fname[
                        call_info_id_wanted]
            elif self.report:
                sys.stderr.write("%s not in call_info_id2fname.\n" %
                                 (call_info_id_wanted))
        call_info_id2fname = new_call_info_id2fname

        if call_info_id2fname:
            pdata = self.read_call_matrix(call_info_id2fname,
                                          self.min_probability)
            header = pdata.header
            call_info_id_ls = pdata.call_info_id_ls
            # array_id_ls is not used again in this method
            array_id_ls = pdata.array_id_ls
            ecotype_id_ls = pdata.ecotype_id_ls
            data_matrix = pdata.data_matrix
        elif self.input_dir:  #2008-07-02
            #input file is SNP by strain format. double header (1st two lines)
            header, snps_name_ls, category_list, data_matrix = FilterStrainSNPMatrix.read_data(
                self.input_dir, double_header=1)
            ecotype_id_ls = header[0][2:]
            call_info_id_ls = header[1][2:]
            data_matrix = numpy.array(data_matrix)
            # transposed so that strains become the rows
            data_matrix = data_matrix.transpose()
            header = ['', ''] + snps_name_ls  #fake a header for SNPData
        else:  #2008-07-02
            # none of the wanted ids was found and there is no input_dir to fall back on
            sys.stderr.write("No good arrays.\n")
            return None

        snps_name2snps_id = None

        #swap the ecotype_id_ls and call_info_id_ls when passing them to SNPData. now strain_acc_list=ecotype_id_ls
        snpData1 = SNPData(header=header, strain_acc_list=ecotype_id_ls, category_list= call_info_id_ls, data_matrix=data_matrix, \
            min_probability=self.min_probability, call_method_id=self.call_method_id, col_id2id=snps_name2snps_id,\
            max_call_info_mismatch_rate=self.max_call_info_mismatch_rate, snps_table='stock_250k.snps') #snps_table is set to the stock_250k snps_table

        twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, curs=curs, \
             QC_method_id=self.QC_method_id, user=self.user, row_matching_by_which_value=0, debug=self.debug,\
             max_mismatch_rate=self.max_mismatch_rate, min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs)
        #2009-6-9 cross-matching results whose mismatch_rates are below max_mismatch_rate would be put into db.

        # never reassigned below, so the method's return value is always None
        row_id2NA_mismatch_rate = None

        #2008-05-01 create a cross match table temporarily
        twoSNPData.qc_cross_match_table = 'qc_cross_match'
        twoSNPData.new_QC_cross_match_table = self.new_QC_cross_match_table
        twoSNPData.cal_row_id2pairwise_dist(
        )  #database submission is done along.
        return row_id2NA_mismatch_rate
コード例 #34
0
    def run(self):
        """
        2008-12-02
            Read the SNP matrix in self.input_fname, convert it to binary
            alleles and write three files named after self.output_fname_prefix:
                .geno  one line per SNP, one character per accession
                       ('0', '2' or '9')
                .snp   one tab-delimited row per SNP: id, chromosome, 0.0,
                       position, allele1, allele2
                .ind   one tab-delimited row per accession: id, 'U', 'Case'
            If both self.phenotype_fname and self.phenotype_method_id are set,
            also write '<prefix>_<phenotype label>.pheno' with one phenotype
            value per line.

            NOTE(review): the layout looks like EIGENSTRAT geno/snp/ind input
            -- confirm against whatever program consumes these files.

            Every output file is closed explicitly in a 'finally' clause rather
            than relying on 'del' + garbage collection to flush it.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        delimiter = figureOutDelimiter(self.input_fname, report=self.report)
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname, delimiter=delimiter)

        if self.array_id_2nd_column:
            snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,\
                data_matrix=data_matrix)
        else:
            snpData = SNPData(header=header, strain_acc_list=strain_acc_list,\
                data_matrix=data_matrix) #ignore category_list

        newSnpData, allele_index2allele_ls = snpData.convert2Binary(
            self.report)

        if self.phenotype_fname and self.phenotype_method_id:
            header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
                self.phenotype_fname, turn_into_integer=0)
            phenData = SNPData(
                header=header_phen,
                strain_acc_list=newSnpData.strain_acc_list,
                data_matrix=data_matrix_phen
            )  #row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
            phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
                newSnpData.row_id_ls, strain_acc_list_phen,
                phenData.data_matrix)  #tricky, using strain_acc_list_phen

            phenotype_col_index = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
                phenData, Set([self.phenotype_method_id]))[0]
            phenotype_label = phenData.col_id_ls[phenotype_col_index]
            #'/' in the label would be taken as a directory separator in the file name
            phenotype_f = open(
                '%s_%s.pheno' %
                (self.output_fname_prefix, phenotype_label.replace('/', '_')),
                'w')
            try:
                for phenotype_value in phenData.data_matrix[:,
                                                            phenotype_col_index]:
                    if self.phenotype_is_binary:  #binary and non-binary have different NA designator
                        if numpy.isnan(phenotype_value):
                            phenotype_value = 9
                        else:
                            phenotype_value = int(phenotype_value)
                    else:
                        if numpy.isnan(phenotype_value):
                            phenotype_value = -100.0
                    phenotype_f.write('%s\n' % phenotype_value)
            finally:
                phenotype_f.close()

        output_f_ls = []  #every file opened below, so that all of them get closed whatever happens
        try:
            genotype_f = open('%s.geno' % self.output_fname_prefix, 'w')
            output_f_ls.append(genotype_f)
            ind_f = open('%s.ind' % self.output_fname_prefix, 'w')
            output_f_ls.append(ind_f)
            snp_f = open('%s.snp' % self.output_fname_prefix, 'w')
            output_f_ls.append(snp_f)
            ind_writer = csv.writer(ind_f, delimiter='\t')
            snp_writer = csv.writer(snp_f, delimiter='\t')

            #transpose it
            newSnpData = transposeSNPData(newSnpData)

            no_of_rows = len(newSnpData.data_matrix)
            no_of_cols = len(newSnpData.data_matrix[0])

            #write out the accessions, once
            for j in range(no_of_cols):
                ind_writer.writerow([newSnpData.col_id_ls[j], 'U', 'Case'])

            for i in range(no_of_rows):
                snp_id = newSnpData.row_id_ls[i]
                chromosome, pos = snp_id.split('_')  #a SNP id is expected to be of the form chromosome_position
                allele1 = allele_index2allele_ls[i][0]  #major allele
                allele2 = allele_index2allele_ls[i][1]  #minor allele
                snp_writer.writerow(
                    [snp_id, chromosome, 0.0, pos, allele1, allele2])

                snp_row = newSnpData.data_matrix[i]
                geno_code_ls = []
                for j in range(no_of_cols):
                    allele = snp_row[j]
                    if allele == 0:
                        geno_code_ls.append('0')
                    elif allele == 1:
                        geno_code_ls.append('2')
                    else:  #anything that is neither 0 nor 1 is written out as '9'
                        geno_code_ls.append('9')
                genotype_f.write(''.join(geno_code_ls) + '\n')
        finally:
            for output_f in output_f_ls:
                output_f.close()
コード例 #35
0
	def loadDataStructure(self, db_250k=None, association_locus_id=None, association_landscape_type_id=None, \
						locusExtensionDistance=5000,\
						data_dir=None, list_type_id_list=None, gene_annotation_pickleFname=None, \
						snpInfoPickleFname=None, locus_type_id=1, snp_matrix_fname=None, snp_matrix_data_type=None, \
						phenotype_fname=None):
		"""
		2012.11.14
			Collect the data structures belonging to one association locus and return
			them bundled in one PassingData object:
				associationLocus, associationLandscapeType: the two db entries fetched by id
				landscape_gwr_ls: one getResultMethodContent() result per distinct landscape
					found through the locus' association peaks, restricted to the locus
					widened by locusExtensionDistance on either side
				centralLocus: SNPPassingData describing that widened interval
				gene_annotation, snp_info: loaded from the given pickle file names
					(snp_info is None without snpInfoPickleFname)
				candidate_gene_set: union of the gene lists of all list_type_id_list entries
				snpData, phenData, ecotype_info: only loaded when both snp_matrix_fname and
					phenotype_fname are given, otherwise None
				LD_info: always None
			Exits with status 3 if the SNP matrix could not be loaded.
		"""
		sys.stderr.write("Fetching GWAS landscape for association-locus %s, landscape type %s ..."%(association_locus_id, association_landscape_type_id))
		locus = Stock_250kDB.AssociationLocus.get(association_locus_id)
		landscapeType = Stock_250kDB.AssociationLandscapeType.get(association_landscape_type_id)
		
		# the interval of interest: the locus extended on both sides, never starting below 1
		regionStart = max(1, locus.start-locusExtensionDistance)
		regionStop = locus.stop + locusExtensionDistance
		#report controls whether getResultMethodContent() will report progress.
		queryParam = PassingData(min_MAF=landscapeType.min_MAF, data_dir=data_dir, \
						need_chr_pos_ls=0, chromosome=locus.chromosome, \
						start=regionStart, stop=regionStop, report=False)
		
		# several peaks can point to the same landscape; each landscape is fetched only once
		landscape_gwr_ls = []
		seen_landscape_id_set = set()
		for association_peak in locus.association_peak_ls:
			landscape = db_250k.getAssociationLandscape(result_id=association_peak.result_id, association_landscape_type_id=landscapeType.id)
			if not landscape or landscape.id in seen_landscape_id_set:
				continue
			seen_landscape_id_set.add(landscape.id)
			gwr = db_250k.getResultMethodContent(association_landscape=landscape, data_dir=data_dir, \
											construct_chr_pos2index=True, pdata=queryParam)
			landscape_gwr_ls.append(gwr)
			# backspaces rewind the cursor so that the running count overwrites itself
			sys.stderr.write(" %s%s "%('\x08'*80, len(landscape_gwr_ls)))
		sys.stderr.write("%s landscapes.\n"%(len(landscape_gwr_ls)))
		
		centralLocus = SNPPassingData(chromosome=locus.chromosome, position=regionStart, \
						snps_id=locus.id, start=regionStart, stop=regionStop,
						fileNamePrefix="")
		
		LD_info = None
		gene_annotation = DrawSNPRegion.dealWithGeneAnnotation(gene_annotation_pickleFname)
		#2012.3.8
		snp_info = db_250k.dealWithSNPInfo(snpInfoPickleFname, locus_type_id=locus_type_id) if snpInfoPickleFname else None
		
		candidate_gene_set = set()
		for list_type_id in (list_type_id_list or []):
			candidate_gene_set.update(db_250k.getGeneList(list_type_id))
		
		# defaults for the case where no SNP matrix/phenotype pair is given
		snpData = None
		phenData = None
		ecotype_info = None
		if snp_matrix_fname and phenotype_fname:
			#2009-3-23 float is for CNV amplitude file
			matrix_data_type = float if snp_matrix_data_type==3 else int
			snpData = SNPData(input_fname=snp_matrix_fname, turn_into_integer=1, turn_into_array=1, ignore_2nd_column=1,\
							matrix_data_type=matrix_data_type)
			if snpData.data_matrix is None:
				sys.stderr.write("Error. snpData.data_matrix is None.\n")
				sys.exit(3)
			header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(phenotype_fname, turn_into_integer=0)
			
			#row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
			phenData = SNPData(header=header_phen, strain_acc_list=snpData.strain_acc_list, data_matrix=data_matrix_phen)
			
			#tricky, using strain_acc_list_phen
			phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(snpData.row_id_ls, \
																		strain_acc_list_phen, phenData.data_matrix)
			
			#2008-12-05 fake a snp_info for findSNPsInRegion
			DrawSNPRegion.construct_chr_pos2index_forSNPData(snpData, snp_info=snp_info)
			ecotype_info = getEcotypeInfo(db_250k)
		
		return PassingData(associationLocus=locus, associationLandscapeType=landscapeType, \
								landscape_gwr_ls=landscape_gwr_ls, \
								gene_annotation=gene_annotation, snp_info=snp_info, LD_info=LD_info, \
								candidate_gene_set=candidate_gene_set, snpData=snpData, phenData=phenData,\
								ecotype_info=ecotype_info, centralLocus=centralLocus)
コード例 #36
0
ファイル: MpiLD.py プロジェクト: bopopescu/gwasmodules
    def run(self):
        """
        Entry point executed on every MPI node; the node's rank decides its role.

        rank 0 (input node): reads self.input_fname into a SNPData, sends it
            pickled to every computing node, builds the parameter list with
            generate_params() and hands it out through inputNode().
        ranks 1 .. size-2 (computing nodes): receive and unpickle the SNPData,
            then run computing_node() with self.computing_node_handler.
        rank size-1 (output node): runs output_node() with
            self.output_node_handler, passing it a tab-delimited csv writer on
            self.output_fname (or None when no output_fname is set).

        All nodes synchronize once after the data distribution and once more at
        the end, so that no node exits early.

        The output file is closed explicitly as soon as output_node() returns.
        A plain 'del writer' does not release it, because the PassingData
        handed to output_node() still references the writer.
        """
        self.communicator = MPI.world.duplicate()
        node_rank = self.communicator.rank
        free_computing_nodes = range(1, self.communicator.size -
                                     1)  #exclude the 1st and last node
        free_computing_node_set = Set(free_computing_nodes)
        output_node_rank = self.communicator.size - 1

        # phase 1: distribute the SNP data (the output node has nothing to load)
        if node_rank == 0:
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.input_fname)
            snpData = SNPData(header=header, strain_acc_list=strain_acc_list, \
                data_matrix=data_matrix) #category_list is not used to facilitate row-id matching
            snpData_pickle = cPickle.dumps(snpData, -1)
            for node in free_computing_nodes:  #send it to the computing_node
                sys.stderr.write(
                    "passing initial data to nodes from %s to %s ... " %
                    (node_rank, node))
                self.communicator.send(snpData_pickle, node, 0)
                sys.stderr.write(".\n")
            del snpData_pickle
            params_ls = self.generate_params(len(snpData.col_id_ls),
                                             self.block_size)
            # the input node only needs the parameter list from here on
            del snpData
        elif node_rank in free_computing_node_set:
            data, source, tag = self.communicator.receiveString(0, 0)
            snpData = cPickle.loads(data)
            del data

        self.synchronize()

        # phase 2: every node runs the loop that belongs to its role
        if node_rank == 0:
            param_obj = PassingData(params_ls=params_ls,
                                    output_node_rank=output_node_rank,
                                    report=self.report,
                                    counter=0)
            self.inputNode(param_obj,
                           free_computing_nodes,
                           param_generator=params_ls)
        elif node_rank in free_computing_node_set:
            computing_parameter_obj = PassingData(
                snpData=snpData,
                min_LD_to_output=self.min_LD_to_output,
                min_MAF=self.min_MAF,
                discard_perc=self.discard_perc)
            self.computing_node(computing_parameter_obj,
                                self.computing_node_handler)
        else:
            output_fname = getattr(self, 'output_fname', None)
            if output_fname:
                output_f = open(output_fname, 'w')
                writer = csv.writer(output_f, delimiter='\t')
            else:
                output_f = None
                writer = None
            try:
                # is_header_written starts out False; no header row is written here
                param_obj = PassingData(writer=writer, is_header_written=False)
                self.output_node(free_computing_nodes, param_obj,
                                 self.output_node_handler)
            finally:
                if output_f is not None:
                    output_f.close()
        self.synchronize()  #to avoid some node early exits
コード例 #37
0
    def run(self):
        """
        Draw a scatter plot contrasting two phenotypes and save it as PNG and SVG.

        The phenotype matrix is read from self.phenotype_fname. Every row in which
        both the self.phenotype_method_id1 column and the self.phenotype_method_id2
        column are non-NaN contributes one point. The axis labels are the
        short_name of the two PhenotypeMethod entries fetched from the database.
        Output goes to '<self.output_fname_prefix>.png' (dpi=400) and
        '<self.output_fname_prefix>.svg'.

        If no row has both values, the diagonal reference line is skipped (the
        min()/max() calls it needs raise ValueError on an empty list), a warning
        is written to stderr and the empty plot is still saved.
        """
        if self.debug:
            import pdb
            pdb.set_trace()
        db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                       username=self.db_user,
                                       password=self.db_passwd,
                                       hostname=self.hostname,
                                       database=self.dbname,
                                       schema=self.schema)
        db.setup(create_tables=False)
        session = db.session  # not used below; attribute access kept from the original

        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            self.phenotype_fname, turn_into_integer=0)
        phenData = SNPData(header=header_phen,
                           strain_acc_list=strain_acc_list_phen,
                           data_matrix=data_matrix_phen)
        # The phenotype matrix is re-ordered against its own row ids:
        # both phenData.row_id_ls and strain_acc_list_phen come from the phenotype file.
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            phenData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)

        phenotype_col_index1 = self.findOutWhichPhenotypeColumn(
            phenData, Set([self.phenotype_method_id1]))[0]
        phenotype_col_index2 = self.findOutWhichPhenotypeColumn(
            phenData, Set([self.phenotype_method_id2]))[0]

        # keep only the rows in which both phenotypes are available
        x_ls = []
        y_ls = []
        for i in range(phenData.data_matrix.shape[0]):
            row = phenData.data_matrix[i]
            x_value = row[phenotype_col_index1]
            y_value = row[phenotype_col_index2]
            if numpy.isnan(x_value) or numpy.isnan(y_value):
                continue
            x_ls.append(x_value)
            y_ls.append(y_value)

        pylab.clf()
        pylab.title('Phenotype Contrast')
        pylab.plot(x_ls, y_ls, '.', alpha=0.6)
        pylab.grid(alpha=0.3)
        phenotype_method1 = Stock_250kDB.PhenotypeMethod.get(
            self.phenotype_method_id1)
        phenotype_method2 = Stock_250kDB.PhenotypeMethod.get(
            self.phenotype_method_id2)
        pylab.xlabel(phenotype_method1.short_name)
        pylab.ylabel(phenotype_method2.short_name)

        if x_ls:
            #draw diagonal line to show perfect correlation,
            #restricted to the value range covered by both phenotypes
            max_min_value = max(min(x_ls), min(y_ls))
            min_max_value = min(max(x_ls), max(y_ls))
            pylab.plot([max_min_value, min_max_value],
                       [max_min_value, min_max_value],
                       c='g',
                       alpha=0.7)
        else:
            import sys
            sys.stderr.write(
                "Warning: no row has both phenotype %s and phenotype %s. Diagonal line skipped.\n"
                % (self.phenotype_method_id1, self.phenotype_method_id2))

        png_output_fname = '%s.png' % self.output_fname_prefix
        pylab.savefig(png_output_fname, dpi=400)
        pylab.savefig('%s.svg' % self.output_fname_prefix)
コード例 #38
0
    def prepareTwoSNPData(self,
                          db,
                          max_mismatch_rate=0.25,
                          min_no_of_non_NA_pairs=40,
                          report=0):
        """
        Build the TwoSNPData object pairing the 149SNP data with the data it is compared against.

        The first dataset comes from self.input_fname if given, otherwise from the
        database. The second dataset is the first one itself when
        self.QC_method_id == 4, otherwise it is read from the file returned by
        self.findOutCmpDataFilename() (which is also stored in self.cmp_data_filename).

        Arguments:
            db: passed to self.get_data_matrix() when the data is read from the database.
            max_mismatch_rate, min_no_of_non_NA_pairs, report: passed on to TwoSNPData.

        Returns:
            the TwoSNPData instance.

        The MySQLdb connection is opened only when the data is read from the
        database, and it is closed once that data has been fetched.

        2009-9-23
            add arguments max_mismatch_rate & min_no_of_non_NA_pairs, and pass them to twoSNPData.
            However it's useless to control what should be inserted into db because
            TwoSNPData.qc_cross_match_table is not defined and even if it's defined, the table
            it'll create doesn't concord to the one in 149SNP db.
        2008-09-10
            if self.input_fname is given, get 149SNP data from it, instead of database
        2008-8-28
            split out of run() so that MpiQC149CrossMatch could call this easily
        """
        if self.input_fname:
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.input_fname)
        else:
            # the raw MySQL cursor is needed only by get_snp_id2index_m() in this branch
            import MySQLdb
            from dbSNP2data import dbSNP2data
            conn = MySQLdb.connect(db=self.dbname,
                                   host=self.hostname,
                                   user=self.db_user,
                                   passwd=self.db_passwd)
            try:
                curs = conn.cursor()
                snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
                    curs, StockDB.Calls.table.name, StockDB.SNPs.table.name)
                strain_info_data = self.get_strain_id_info(
                    self.QC_method_id, ignore_strains_with_qc=False)
                data_matrix = self.get_data_matrix(
                    db, strain_info_data.strain_id2index, snp_id2index,
                    StockDB.Calls.table.name)
                strain_acc_list = [
                    strain_info_data.strain_id2acc[strain_id]
                    for strain_id in strain_info_data.strain_id_list
                ]  #tg_ecotypeid
                category_list = [
                    strain_info_data.strain_id2category[strain_id]
                    for strain_id in strain_info_data.strain_id_list
                ]  #strainid
                header = ['ecotypeid', 'strainid']
                for snp_id in snp_id_list:
                    snp_name, chromosome, position = snp_id2info[snp_id]
                    header.append(snp_name)
            finally:
                # closed only after everything derived from the cursor has been built
                conn.close()
        snpData1 = SNPData(header=header,
                           strain_acc_list=strain_acc_list,
                           category_list=category_list,
                           data_matrix=data_matrix,
                           snps_table='stock.snps')  #snps_table is set to the stock_250k snps_table
        if self.QC_method_id == 4:
            snpData2 = snpData1
        else:
            self.cmp_data_filename = self.findOutCmpDataFilename(
                self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod)
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.cmp_data_filename)
            #it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
            strain_acc_list = [int(ecotype_id) for ecotype_id in strain_acc_list]
            #category_list is not used to facilitate row-id matching
            snpData2 = SNPData(header=header,
                               strain_acc_list=strain_acc_list,
                               data_matrix=data_matrix)

        twoSNPData = TwoSNPData(SNPData1=snpData1,
                                SNPData2=snpData2,
                                QC_method_id=self.QC_method_id,
                                user=self.db_user,
                                row_matching_by_which_value=0,
                                debug=self.debug,
                                max_mismatch_rate=max_mismatch_rate,
                                min_no_of_non_NA_pairs=min_no_of_non_NA_pairs,
                                report=report)
        return twoSNPData
コード例 #39
0
	def run(self):
		"""
		2008-09-06
			Entry point.
			
			In debug mode: build the SNP data, the gene-to-SNP mapping and the phenotype data on
			a single node, pickle them and exit with status 2 before any MPI call.
			
			Otherwise the MPI ranks are split into three roles:
				rank 0: input node. Calls self.inputNodePrepare(), sends the pickled data to
					the other ranks, then hands out params_ls via self.inputNode().
				ranks 1 .. size-2: computing nodes, run self.computing_node().
				rank size-1: output node, runs self.general_output_node().
			The order of the send calls on rank 0 must match the order of the receiveString
			calls on the other ranks.
		"""
		if self.debug:
			#for one-node testing purpose
			import pdb
			pdb.set_trace()
			header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
			snpData = SNPData(header=header, strain_acc_list=strain_acc_list, \
							data_matrix=data_matrix, turn_into_array=1)	#category_list is not used to facilitate row-id matching
			
			#NOTE(review): the file handle is dropped with del rather than closed explicitly
			picklef = open(self.snps_context_fname)
			snps_context_wrapper = cPickle.load(picklef)
			del picklef
			gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
			gene_id_ls = gene_id2snps_id_ls.keys()
			gene_id_ls.sort()
			
			header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(self.phenotype_fname, turn_into_integer=0)
			phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list, data_matrix=data_matrix_phen)	#row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
			phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
			
			other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls, gene_id_ls=gene_id_ls, phenData=phenData)
			#the pickles below are built and then discarded; the method exits right after
			other_data_pickle = cPickle.dumps(other_data, -1)
			phenotype_label_ls_pickle = cPickle.dumps(phenData.col_id_ls, -1)
			snpData_pickle = cPickle.dumps(snpData, -1)
			sys.exit(2)
		
		self.communicator = MPI.world.duplicate()
		node_rank = self.communicator.rank
		free_computing_nodes = range(1, self.communicator.size-1)	#exclude the 1st and last node
		free_computing_node_set = Set(free_computing_nodes)
		output_node_rank = self.communicator.size-1
		
		#phase 1: rank 0 distributes the initial data, every other rank receives its share
		if node_rank == 0:
			dstruc = self.inputNodePrepare()
			params_ls = dstruc.params_ls
			#send the output node the phenotype_label_ls
			self.communicator.send(dstruc.output_node_data_pickle, output_node_rank, 0)
			del dstruc.output_node_data_pickle
			
			for node in free_computing_nodes:	#send it to the computing_node
				sys.stderr.write("passing initial data to nodes from %s to %s ... "%(node_rank, node))
				#snpData first, other_data second: same order as the receives on the computing nodes
				self.communicator.send(dstruc.snpData_pickle, node, 0)
				self.communicator.send(dstruc.other_data_pickle, node, 0)
				sys.stderr.write(".\n")
			del dstruc
			
		elif node_rank in free_computing_node_set:
			#1st message is unpickled as snpData, 2nd as other_data
			data, source, tag = self.communicator.receiveString(0, 0)
			snpData =  cPickle.loads(data)
			del data
			data, source, tag = self.communicator.receiveString(0, 0)
			other_data = cPickle.loads(data)
			del data
			self.phenotype_index_ls = other_data.phenotype_index_ls
		else:
			#output node: despite its name, output_node_data_pickle holds the unpickled object
			data, source, tag = self.communicator.receiveString(0, 0)
			output_node_data_pickle = cPickle.loads(data)
			phenotype_label_ls = output_node_data_pickle.phenotype_label_ls
			self.phenotype_index_ls = output_node_data_pickle.phenotype_index_ls
			
		self.synchronize()
		#phase 2: each rank enters the loop of its role
		if node_rank == 0:
			param_obj = PassingData(params_ls=params_ls, output_node_rank=output_node_rank, report=self.report, counter=0)
			self.inputNode(param_obj, free_computing_nodes, param_generator = params_ls)
			#self.input_node(param_obj, free_computing_nodes, input_handler=self.input_fetch_handler, message_size=1)
		elif node_rank in free_computing_node_set:
			computing_parameter_obj = PassingData(snpData=snpData, gene_id_ls=other_data.gene_id_ls, \
												gene_id2snps_id_ls=other_data.gene_id2snps_id_ls, phenData=other_data.phenData,
												phenotype_index_ls=self.phenotype_index_ls, min_data_point=self.min_data_point,
												test_type=self.test_type)
			self.computing_node(computing_parameter_obj, self.computing_node_handler)
		else:
			self.general_output_node(self.output_dir, self.phenotype_index_ls, phenotype_label_ls, free_computing_nodes)
		self.synchronize()	#to avoid some node early exits
コード例 #40
0
	def run(self):
		"""
		Compare two SNP datasets (or one dataset against itself) and output NA/mismatch rates.
		
		self.input_fname1_format / self.input_fname2_format:
			1: strain x snp matrix read by read_data()
			2: file parsed by dataParsers.parseCSVData() with withArrayIds=False
			3: same parser with withArrayIds=True (1st input only)
		self.run_type:
			1: row-wise and column-wise mismatch between the two inputs, both written to self.output_fname
			2: column-wise mismatch of the 1st input against itself, between rows with the
				same ecotype_id but a different array_id
			3: column-wise mismatch restricted to the rows of the 1st input whose ecotype_id
				is in self.ecotype_id_ls
		Exits with status 2 on an unsupported input format and with status 3 if run_type is 3
		and self.ecotype_id_ls is empty.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
		
		def _is_compound(row_id):
			#a row_id that is not a string but has a length is handled as a sequence
			return not isinstance(row_id, str) and hasattr(row_id, '__len__')
		
		def _ecotype_id_of(row_id):
			#the 1st element of a compound row_id is its ecotype_id. otherwise the row_id itself.
			if _is_compound(row_id):
				return row_id[0]
			return row_id
		
		#to check whether the two input files are in different orientation:
		#True if one and only one of them is in strain x snp format (1).
		only_one_strain_by_snp = [self.input_fname1_format, self.input_fname2_format].count(1)==1
		
		#2008-05-15 TwoSNPData can handle character matrix/2D-list. but transposeSNPData needs numeric matrix to transpose except when numpy is installed.
		#only numpy works on character matrix. not sure Numeric or numarray is imported. so transform the input matrix to integer.
		use_nt2number = 1 if only_one_strain_by_snp else 0
		
		format1 = self.input_fname1_format
		if format1==1:
			header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname1)
			snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,\
							data_matrix=data_matrix)
		elif format1==2 or format1==3:
			#format 3 differs from format 2 only in withArrayIds
			snpsd_ls = dataParsers.parseCSVData(self.input_fname1, withArrayIds=(format1==3), use_nt2number=use_nt2number)
			snpData1 = RawSnpsData_ls2SNPData(snpsd_ls, report=self.report, use_nt2number=0)	#already nt in number
			del snpsd_ls
		else:
			sys.stderr.write('Error: unsupported input_fname1 format, %s\n' % self.input_fname1_format)
			sys.exit(2)
		
		if self.run_type==2:
			#2008-10-12 pairwise mismatch between same data
			snpData2 = snpData1
			row_matching_by_which_value = None
			col_matching_by_which_value = None
		else:
			format2 = self.input_fname2_format
			if format2==1:
				header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname2)
				snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list,\
								data_matrix=data_matrix)
			elif format2==2:
				snpsd_ls = dataParsers.parseCSVData(self.input_fname2, withArrayIds=False, use_nt2number=use_nt2number)
				snpData2 = RawSnpsData_ls2SNPData(snpsd_ls, report=self.report, use_nt2number=0)
				del snpsd_ls
			else:
				sys.stderr.write('Error: unsupported input_fname2 format, %s\n' % self.input_fname2_format)
				sys.exit(2)
			
			if only_one_strain_by_snp:	#transpose the 2nd snpData
				snpData2 = transposeSNPData(snpData2, report=self.report)
			
			#(row_matching_by_which_value, col_matching_by_which_value) keyed by the format of the 1st file
			#1: row_id for the 1st file = (ecotype_id, duplicate). for 2nd file, row_id=ecotype_id.
			#2: col_id for the 1st file = accession. for 2nd file, col_id=accession.
			#3: col_id for the 1st file = (array_id, accession). for 2nd file, col_id=accession.
			format2matching = {1: (0, None), 2: (None, None), 3: (None, 1)}
			row_matching_by_which_value, col_matching_by_which_value = format2matching[format1]
		
		twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, row_matching_by_which_value=row_matching_by_which_value,\
							col_matching_by_which_value=col_matching_by_which_value, debug=self.debug)
		
		if self.run_type==3:
			#2008-10-12 compare snpData1 and snpData2 only for designated entries from snpData1
			if not self.ecotype_id_ls:
				sys.stderr.write("Run_type %s: ecotype_id_ls (%s) is not specified.\n"%(self.run_type, self.ecotype_id_ls))
				sys.exit(3)
			ecotype_id_set = Set(self.ecotype_id_ls)
			row_id_ls = [row_id for row_id in snpData1.row_id_ls if _ecotype_id_of(row_id) in ecotype_id_set]
			print('%s arrays' % len(row_id_ls))
			for row_id in row_id_ls:
				col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise(row_id=row_id)
				if not col_id2NA_mismatch_rate:
					continue
				if _is_compound(row_id):
					row_id_name = '_'.join(row_id)
				else:
					row_id_name = row_id
				output_fname = '%s_%s'%(self.output_fname, row_id_name)
				twoSNPData.output_col_id2NA_mismatch_rate_InGWRFormat(col_id2NA_mismatch_rate, output_fname)
		elif self.run_type==2:
			#2008-10-12	column-wise mismatch of snpData1 vs snpData1 between rows with same ecotype_id but different array_id
			row_id_pair_set = Set()
			for row_id in snpData1.row_id_ls:
				ecotype_id = _ecotype_id_of(row_id)
				for other_row_id in snpData2.row_id_ls:
					if other_row_id[0]==ecotype_id and other_row_id[1]!=row_id[1]:	#same ecotype_id but different array_id
						row_id_pair_set.add((row_id, other_row_id))
			
			print('%s arrays' % len(row_id_pair_set))
			for row_id1, row_id2 in row_id_pair_set:
				col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise(row_id=row_id1, row_id12row_id2={row_id1:row_id2})
				if not col_id2NA_mismatch_rate:
					continue
				output_fname = '%s_%s_vs_%s'%(self.output_fname, '_'.join(row_id1), '_'.join(row_id2))
				twoSNPData.output_col_id2NA_mismatch_rate_InGWRFormat(col_id2NA_mismatch_rate, output_fname)
		elif self.run_type==1:
			#sys.exit(2)	#2008-10-12 skip all original functions
			row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
			col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise()
			#the row-wise result opens the output file (file_1st_open=1), the column-wise one is written with file_1st_open=0
			if row_id2NA_mismatch_rate:
				QC_250k.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname, file_1st_open=1)
			if col_id2NA_mismatch_rate:
				QC_250k.output_row_id2NA_mismatch_rate(col_id2NA_mismatch_rate, self.output_fname, file_1st_open=0)
コード例 #41
0
    def run(self):
        """
        Run the QC of call data against a comparison dataset.

        Steps, as visible in this method:
            1. connect to the database (Stock_250kDB and a raw MySQLdb cursor, kept in self.curs)
            2. QC_method_id == 0: call self.cal_independent_NA_rate() only
            3. otherwise: read the comparison data (self.cmp_data_filename) into snpData2,
               get the call data either from the call_info files or from the file given as
               self.input_dir, and compare them via self.qcDataMatrixVSsnpData()
            4. run_type == 1: optionally write the result to self.output_fname and
               submit it to the db
            5. commit if self.commit is set, otherwise roll the session back

        The result is stored in self.row_id2NA_mismatch_rate. Nothing is returned.
        Exits with status 5 if self.run_type is neither 1 nor 2 (checked only when
        self.input_dir is not an existing file).

        2008-04-25
            row_id2NA_mismatch_rate is None if QC_method_id==0
        2008-04-20
            for plone to call it just to get row_id2NA_mismatch_rate
        """
        # database connection and etc
        db = Stock_250kDB.Stock_250kDB(
            drivername=self.drivername,
            username=self.user,
            password=self.passwd,
            hostname=self.hostname,
            database=self.dbname,
        )
        db.setup()
        session = db.session
        session.begin()
        # transaction = session.create_transaction()

        self.cmp_data_filename = self.findOutCmpDataFilename(
            self.cmp_data_filename, self.QC_method_id, self.QCMethod_class
        )
        qm = self.QCMethod_class.query.get(self.QC_method_id)  # 2009-5-20

        import MySQLdb

        # 2nd, raw connection in addition to the session above. it's not closed in this method.
        conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
        curs = conn.cursor()
        self.curs = curs
        if self.debug:
            import pdb

            pdb.set_trace()

        readme = formReadmeObj(sys.argv, self.ad, Stock_250kDB.README)
        session.save(readme)

        QC_method_id2snps_table = self.QC_method_id2snps_table

        if self.QC_method_id == 0:
            self.cal_independent_NA_rate(db, self.min_probability, readme)
            row_id2NA_mismatch_rate = None
            # NOTE(review): row_id12row_id2 is not bound in this branch. the submit_to_call_QC() call
            # below is skipped for it only because row_id2NA_mismatch_rate is None.
        else:
            # from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.cmp_data_filename, ignore_het=qm.ignore_het
            )
            strain_acc_list = map(
                int, strain_acc_list
            )  # it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
            snpData2 = SNPData(
                header=header,
                strain_acc_list=strain_acc_list,
                data_matrix=data_matrix,
                snps_table=QC_method_id2snps_table.get(self.QC_method_id),
                ignore_het=qm.ignore_het,
            )  # category_list is not used. 05/20/09 ignore_het is useless cuz data_matrix is provided.
            """
			if self.input_dir and os.path.isdir(self.input_dir):
				#04/22/08 Watch: call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
				#no submission to db
				call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
			"""
            # the bare string above is disabled code, not documentation
            if self.input_dir and os.path.isfile(self.input_dir):  # it's file
                # the file itself is read further below. run_type is not validated on this path.
                call_info_id2fname = None
            else:
                if self.run_type == 2:  # no filtering on call_info entries that have been QCed.
                    filter_calls_QCed = 0
                elif self.run_type == 1:
                    filter_calls_QCed = 1
                    self.max_call_info_mismatch_rate = 1  # don't use this when doing accession-wise QC
                else:
                    sys.stderr.write("run_type=%s is not supported.\n" % self.run_type)
                    sys.exit(5)
                call_data = self.get_call_info_id2fname(
                    db,
                    self.QC_method_id,
                    self.call_method_id,
                    filter_calls_QCed,
                    self.max_call_info_mismatch_rate,
                    self.debug,
                    min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs,
                    input_dir=self.input_dir,
                )
                call_info_id2fname = call_data.call_info_id2fname
                call_info_ls_to_return = call_data.call_info_ls_to_return  # not used further in this method
            if self.run_type == 2:
                snps_name2snps_id = self.get_snps_name2snps_id(db)
            else:
                snps_name2snps_id = None

            if call_info_id2fname:
                if self.one_by_one and self.run_type == 1:  # one_by_one only for QC by accession
                    # read and QC one call_info file at a time, accumulating the results
                    row_id2NA_mismatch_rate = {}
                    row_id12row_id2 = {}
                    counter = 0
                    for call_info_id, value in call_info_id2fname.iteritems():
                        counter += 1
                        print "No", counter
                        tmp_dict = {}
                        tmp_dict[call_info_id] = value
                        pdata = self.read_call_matrix(
                            tmp_dict, self.min_probability
                        )  # 05/20/09 no need for qm.ignore_het because 250k is all h**o
                        passingdata = self.qcDataMatrixVSsnpData(
                            pdata, snps_name2snps_id, snpData2, curs, session, readme
                        )
                        row_id2NA_mismatch_rate.update(passingdata.row_id2NA_mismatch_rate)
                        row_id12row_id2.update(passingdata.row_id12row_id2)
                        del pdata

                        # in debug mode, stop after 10 call_info entries
                        if self.debug and counter == 10:
                            break
                else:
                    # read all call_info files into one matrix and QC them in one go
                    pdata = self.read_call_matrix(
                        call_info_id2fname, self.min_probability
                    )  # 05/20/09 no need for qm.ignore_het because 250k is all h**o
                    passingdata = self.qcDataMatrixVSsnpData(pdata, snps_name2snps_id, snpData2, curs, session, readme)
                    row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
                    row_id12row_id2 = passingdata.row_id12row_id2
                    del pdata
            else:
                # reached when self.input_dir is a file, and also when call_info_id2fname is empty
                # input file is SNP by strain format. double header (1st two lines)
                header, snps_name_ls, category_list, data_matrix = read_data(
                    self.input_dir, double_header=1, ignore_het=qm.ignore_het
                )
                pdata = PassingData()
                # the first two columns of both header lines are skipped
                pdata.ecotype_id_ls = header[0][2:]
                pdata.call_info_id_ls = header[1][2:]
                data_matrix = numpy.array(data_matrix)
                pdata.data_matrix = data_matrix.transpose()
                pdata.header = ["", ""] + snps_name_ls  # fake a header for SNPData
                passingdata = self.qcDataMatrixVSsnpData(pdata, snps_name2snps_id, snpData2, curs, session, readme)
                row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
                row_id12row_id2 = passingdata.row_id12row_id2
                del pdata

        if self.output_fname and self.run_type == 1 and row_id2NA_mismatch_rate:
            self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname)

        if self.run_type == 1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
            # if self.input_dir is given, no db submission. call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
            # row_id2NA_mismatch_rate might be None if it's method 0.
            self.submit_to_call_QC(
                session,
                row_id2NA_mismatch_rate,
                self.QC_method_id,
                self.user,
                self.min_probability,
                row_id12row_id2,
                self.call_method_id,
                readme,
            )
        if self.commit:
            # commit on both the raw MySQLdb cursor and the session
            curs.execute("commit")
            session.commit()
        else:
            session.rollback()

        self.row_id2NA_mismatch_rate = row_id2NA_mismatch_rate  # for plone to get the data structure
コード例 #42
0
    def run(self):
        """
        QC the SNP data stored in the database against a comparison dataset.

        snpData2 is read from the file returned by self.findOutCmpDataFilename().
        snpData1 is assembled from the database (calls and SNPs tables of StockDB).
        run_type 1 computes the row-wise NA/mismatch rate, optionally writes it to
        self.output_fname and submits it to the db. run_type 2 produces no result at
        present. Any other run_type exits with status 5. The session is committed if
        self.commit is set and rolled back otherwise.
        """

        if self.debug:
            import pdb
            pdb.set_trace()

        db = StockDB.StockDB(drivername=self.drivername,
                             username=self.db_user,
                             password=self.db_passwd,
                             hostname=self.hostname,
                             database=self.dbname)
        db.setup(create_tables=False)
        session = db.session
        session.begin()

        #the data to compare against
        self.cmp_data_filename = self.findOutCmpDataFilename(
            self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod)
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.cmp_data_filename)
        strain_acc_list = map(
            int, strain_acc_list
        )  #it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
        snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \
             data_matrix=data_matrix) #category_list is not used.

        readme = formReadmeObj(sys.argv, self.ad, StockDB.README)
        session.save(readme)

        import MySQLdb
        #raw connection in addition to the session above. it's not closed in this method.
        conn = MySQLdb.connect(db=self.dbname,
                               host=self.hostname,
                               user=self.db_user,
                               passwd=self.db_passwd)
        curs = conn.cursor()
        from dbSNP2data import dbSNP2data
        #the data from the database
        snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
            curs, StockDB.Calls.table.name, StockDB.SNPs.table.name)
        strain_info_data = self.get_strain_id_info(self.QC_method_id)
        data_matrix = self.get_data_matrix(db,
                                           strain_info_data.strain_id2index,
                                           snp_id2index,
                                           StockDB.Calls.table.name)
        #row labels, both in the order of strain_info_data.strain_id_list
        strain_acc_list = [
            strain_info_data.strain_id2acc[strain_id]
            for strain_id in strain_info_data.strain_id_list
        ]
        category_list = [
            strain_info_data.strain_id2category[strain_id]
            for strain_id in strain_info_data.strain_id_list
        ]
        #two label columns followed by one column per SNP name
        header = ['ecotypeid', 'strainid']
        for snp_id in snp_id_list:
            snp_name, chromosome, position = snp_id2info[snp_id]
            header.append(snp_name)
        snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list, data_matrix=data_matrix, \
            snps_table='stock.snps') #snps_table is set to the stock_250k snps_table

        twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, curs=curs, \
             QC_method_id=self.QC_method_id, user=self.db_user, row_matching_by_which_value=0, debug=self.debug)
        if self.run_type == 1:
            row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
        elif self.run_type == 2:
            #twoSNPData.save_col_wise(session, readme)	#2008-08-18 need to implement a new one for 149SNP
            #empty result, so nothing is output or submitted below
            row_id2NA_mismatch_rate = {}
        else:
            sys.stderr.write("run_type=%s is not supported.\n" % self.run_type)
            sys.exit(5)
        if self.output_fname and self.run_type == 1 and row_id2NA_mismatch_rate:
            self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate,
                                                self.output_fname)

        #NOTE(review): self.input_dir is read here but not used elsewhere in this method, and the
        #comment below mentions call_info_id2fname and method 0, neither of which appears in it.
        #presumably carried over from another QC class -- confirm.
        if self.run_type == 1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
            #if self.input_dir is given, no db submission. call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
            #row_id2NA_mismatch_rate might be None if it's method 0.
            self.submit_to_call_QC(session, row_id2NA_mismatch_rate, self.QC_method_id, self.db_user, \
                 twoSNPData.row_id12row_id2, readme)
        if self.commit:
            session.commit()
        else:
            session.rollback()