def readInData(cls, phenotype_fname, input_fname, eigen_vector_fname, phenotype_method_id_ls, test_type=1, report=0):
    """
    Read the SNP and phenotype matrices and bundle them for an association run.

    2010-2-25
        call removeUnPhenotypedSNPData() to shrink the snp dataset by removing un-phenotyped ecotypes
    2009-3-20
        refactored out of run(), easy for MpiAssociation.py to call

    Arguments:
        phenotype_fname: phenotype matrix file, read with turn_into_integer=0.
        input_fname: SNP matrix file.
        eigen_vector_fname: when true-ish, PCs are loaded through cls.getPCFromFile().
        phenotype_method_id_ls: phenotype method ids to select; when empty, every
            phenotype column is selected.
        test_type: if 4 and eigen_vector_fname is not given, PCs are computed with
            pca_module.PCA_svd().
        report: handed to SNPData.convertSNPAllele2Index().

    Returns a PassingData with snpData (the allele-index version), phenData,
    PC_matrix (None when neither PC source applies), which_phenotype_ls and
    phenotype_method_id_ls.
    """
    snp_header, strain_acc_list, snp_category_list, snp_matrix = read_data(input_fname)
    snpData = SNPData(header=snp_header, strain_acc_list=strain_acc_list,
                      category_list=snp_category_list, data_matrix=snp_matrix)

    phen_rows = read_data(phenotype_fname, turn_into_integer=0)
    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = phen_rows
    snpData = cls.removeUnPhenotypedSNPData(snpData, header_phen, strain_acc_list_phen,
                                            data_matrix_phen, phenotype_method_id_ls)

    # 0 (NA) or -2 (untouched) is all converted to -2 as 0 is used to denote allele
    newSnpData, allele2index_ls = snpData.convertSNPAllele2Index(report)
    newSnpData.header = snpData.header

    # NOTE(review): the reordering uses the strain list as read from input_fname,
    # while phenData below is labelled with snpData.strain_acc_list (after
    # removeUnPhenotypedSNPData). Confirm the two lists still line up.
    data_matrix_phen = cls.get_phenotype_matrix_in_data_matrix_order(
        strain_acc_list, strain_acc_list_phen, data_matrix_phen)
    phenData = SNPData(header=header_phen, strain_acc_list=snpData.strain_acc_list,
                       data_matrix=data_matrix_phen)

    PC_matrix = None
    if eigen_vector_fname:
        PC_matrix = cls.getPCFromFile(eigen_vector_fname).PC_matrix
    elif test_type == 4:
        # eigen_vector_fname not given for this test_type. calculate PCs.
        import pca_module
        scores, loadings, explained_var = pca_module.PCA_svd(newSnpData.data_matrix, standardize=False)
        PC_matrix = scores

    del snpData  # only the converted copy is passed on

    if not phenotype_method_id_ls:
        # if not available, take all phenotypes
        which_phenotype_ls = range(len(phenData.col_id_ls))
    else:
        which_phenotype_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(phenData, Set(phenotype_method_id_ls))

    return PassingData(
        snpData=newSnpData,
        phenData=phenData,
        PC_matrix=PC_matrix,
        which_phenotype_ls=which_phenotype_ls,
        phenotype_method_id_ls=phenotype_method_id_ls,
    )
def load_dstruc(self):
    """
    Load both input matrices and build the column/row matching structures.

    Calls QualityControl.load_dstruc() first, then reads self.input_fname1 and
    self.input_fname2 and stores, on self, the headers, strain lists, category
    lists and data matrices (suffix 1 / 2) plus the mappings returned by
    get_col_matching_dstruc() and get_row_matching_dstruc().
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    QualityControl.load_dstruc(self)

    first_dataset = read_data(self.input_fname1)
    self.header1, self.strain_acc_list1, self.category_list1, self.data_matrix1 = first_dataset
    second_dataset = read_data(self.input_fname2)
    self.header2, self.strain_acc_list2, self.category_list2, self.data_matrix2 = second_dataset

    col_matching = self.get_col_matching_dstruc(self.header1, self.header2)
    self.col_id2col_index1, self.col_id2col_index2, self.col_id12col_id2 = col_matching

    # rows of dataset 1 are matched using both its strain and category lists
    row_matching = self.get_row_matching_dstruc(
        self.strain_acc_list1, self.category_list1, self.strain_acc_list2)
    self.row_id2row_index1, self.row_id2row_index2, self.row_id12row_id2 = row_matching
def inputNodePrepare(self, snp_info=None):
    """
    Build the pickled payloads and the parameter list used by the input node.

    2009-2-16
        get phenData.phenotype_method_id_ls in the same order as phenData.col_id_ls
    2009-2-11
        refactored out of run()

    Argument:
        snp_info: forwarded unchanged into the PassingData bundles.

    Side effect: sets self.phenotype_index_ls (all phenotype columns when none of
    self.phenotype_method_id_ls is found).

    Returns a PassingData with snpData_pickle, other_data_pickle,
    output_node_data_pickle and params_ls.
    """
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
    # category_list is not used to facilitate row-id matching
    snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                      data_matrix=data_matrix, turn_into_array=1)

    # Close the handle deterministically instead of relying on garbage collection.
    # NOTE: unpickling can execute arbitrary code; snps_context_fname must be trusted.
    picklef = open(self.snps_context_fname)
    try:
        snps_context_wrapper = cPickle.load(picklef)
    finally:
        picklef.close()
    gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
    del snps_context_wrapper  # potentially large, no longer needed
    gene_id_ls = sorted(gene_id2snps_id_ls.keys())

    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
        self.phenotype_fname, turn_into_integer=0)
    phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen,
                       data_matrix=data_matrix_phen)
    # put the phenotype rows into the row order of the SNP matrix
    phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
        snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
    phenData.phenotype_method_id_ls = get_phenotype_method_id_lsFromPhenData(phenData)  # 2009-2-16

    self.phenotype_index_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
        phenData, Set(self.phenotype_method_id_ls))
    if not self.phenotype_index_ls:
        # nothing matched: fall back to every phenotype column
        self.phenotype_index_ls = range(len(phenData.col_id_ls))

    pdata = PassingData(gene_id_ls=gene_id_ls, gene_id2snps_id_ls=gene_id2snps_id_ls,
                        phenotype_index_ls=self.phenotype_index_ls, snp_info=snp_info)
    params_ls = self.generate_params(self.gene_id_fname, pdata, self.block_size)

    other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls, gene_id_ls=pdata.gene_id_ls,
                             phenData=phenData, phenotype_index_ls=self.phenotype_index_ls,
                             snp_info=snp_info)
    other_data_pickle = cPickle.dumps(other_data, -1)
    del other_data

    output_node_data = PassingData(phenotype_label_ls=phenData.col_id_ls,
                                   phenotype_index_ls=self.phenotype_index_ls)
    output_node_data_pickle = cPickle.dumps(output_node_data, -1)

    snpData_pickle = cPickle.dumps(snpData, -1)
    del snpData, data_matrix  # only the pickled copy is returned

    return_data = PassingData(snpData_pickle=snpData_pickle, other_data_pickle=other_data_pickle,
                              output_node_data_pickle=output_node_data_pickle, params_ls=params_ls)
    return return_data
def run(self):
    """
    Read the phenotype and SNP matrices, reorder the phenotype rows to the
    strain order of the SNP matrix, pass the SNP matrix and the phenotype
    column self.which_phenotype to self._kruskal_wallis_whole_matrix() and
    hand the result to self.output_kw_results().
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    phen_rows = read_data(self.phenotype_fname, turn_into_integer=0)
    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = phen_rows
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)

    # read_data() may hand back a plain list; the test works on an array
    if type(data_matrix) == list:
        data_matrix = numpy.array(data_matrix)

    data_matrix_phen = self.get_phenotype_matrix_in_data_matrix_order(
        strain_acc_list, strain_acc_list_phen, data_matrix_phen)
    phenotype_column = data_matrix_phen[:, self.which_phenotype]
    kw_results = self._kruskal_wallis_whole_matrix(data_matrix, phenotype_column, self.min_data_point)

    snp_id_ls = header[2:]  # the first two header fields label the two id columns
    self.output_kw_results(kw_results, snp_id_ls, self.output_fname, self.minus_log_pvalue)
def run(self):
    """
    Read the input matrix, group its rows with
    get_tg_ecotypeid2ecotypeid_duplicate_index_ls(), merge them with
    get_merged_matrix() and write the merged matrix to self.output_fname.

    The second header field is renamed to 'nativename' and the second output
    column carries the nativename looked up for each target ecotype id.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
    curs = conn.cursor()

    # the duplicate -> target ecotype id mapping is optional
    ecotype_duplicate2tg_ecotypeid = None
    if self.ecotype_duplicate2tg_ecotypeid_table:
        ecotype_duplicate2tg_ecotypeid = self.get_ecotype_duplicate2tg_ecotypeid(
            curs, self.ecotype_duplicate2tg_ecotypeid_table)

    from pymodule import figureOutDelimiter
    delimiter = figureOutDelimiter(self.input_fname)
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)

    duplicate_index_map = self.get_tg_ecotypeid2ecotypeid_duplicate_index_ls(
        strain_acc_list, category_list, ecotype_duplicate2tg_ecotypeid)
    ecotypeid2nativename = get_ecotypeid2nativename(curs, ecotype_table=self.ecotype_table)
    tg_ecotypeid_ls, merge_matrix = self.get_merged_matrix(
        duplicate_index_map, data_matrix, ecotypeid2nativename, self.stat_output_fname)

    tg_nativename_ls = [ecotypeid2nativename[ecotypeid] for ecotypeid in tg_ecotypeid_ls]
    header[1] = 'nativename'
    write_data_matrix(merge_matrix, self.output_fname, header, tg_ecotypeid_ls, tg_nativename_ls,
                      delimiter=delimiter)
def run(self):
    """
    2007-04-30
    2007-05-14
        add nt_alphabet_bits

    Compute a log probability per SNP column (cal_snp_locus_log_prob), drop the
    columns whose value is <= min_log_prob when writing the matrix to
    self.output_fname, report the dropped columns on stdout and show a histogram
    of the log probabilities.
    """
    input_alphabet_flag = int(self.nt_alphabet_bits[0])
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, input_alphabet_flag)
    data_matrix = num.array(data_matrix)

    strain_homo_perc_vector = self.cal_strain_homo_perc_vector(data_matrix)
    snp_locus_log_prob = self.cal_snp_locus_log_prob(data_matrix, strain_homo_perc_vector)

    from sets import Set
    # NOTE(review): min_log_prob is a bare name here, not self.min_log_prob;
    # confirm it is defined at module level.
    cols_to_be_tossed_out_set = Set(
        col_index for col_index, log_prob in enumerate(snp_locus_log_prob)
        if log_prob <= min_log_prob)

    # a parenthesised single expression prints identically under the print statement
    print("%sSNPs removed:" % (len(cols_to_be_tossed_out_set)))
    for col_index in cols_to_be_tossed_out_set:
        # SNP names start at the third header field
        print('\t%s\t%s' % (col_index, header[2 + col_index]))

    write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list,
                      cols_to_be_tossed_out=cols_to_be_tossed_out_set,
                      nt_alphabet=int(self.nt_alphabet_bits[1]))

    import pylab
    pylab.title("histogram of snp locus log probability")
    pylab.hist(snp_locus_log_prob, 20)
    pylab.show()
def inputNodePrepare(self, snp_info=None):
    """
    Build the pickled payloads and the parameter list used by the input node.

    2009-2-16
        get phenData.phenotype_method_id_ls in the same order as phenData.col_id_ls
    2009-2-11
        refactored out of run()

    Argument:
        snp_info: forwarded unchanged into the PassingData bundles.

    Side effect: sets self.phenotype_index_ls (all phenotype columns when none of
    self.phenotype_method_id_ls is found).

    Returns a PassingData with snpData_pickle, other_data_pickle,
    output_node_data_pickle and params_ls.
    """
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
    # category_list is not used to facilitate row-id matching
    snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                      data_matrix=data_matrix, turn_into_array=1)

    # Close the handle deterministically instead of relying on garbage collection.
    # NOTE: unpickling can execute arbitrary code; snps_context_fname must be trusted.
    picklef = open(self.snps_context_fname)
    try:
        snps_context_wrapper = cPickle.load(picklef)
    finally:
        picklef.close()
    gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
    del snps_context_wrapper  # potentially large, no longer needed
    gene_id_ls = sorted(gene_id2snps_id_ls.keys())

    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
        self.phenotype_fname, turn_into_integer=0)
    phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen,
                       data_matrix=data_matrix_phen)
    # put the phenotype rows into the row order of the SNP matrix
    phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
        snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
    phenData.phenotype_method_id_ls = get_phenotype_method_id_lsFromPhenData(phenData)  # 2009-2-16

    self.phenotype_index_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
        phenData, Set(self.phenotype_method_id_ls))
    if not self.phenotype_index_ls:
        # nothing matched: fall back to every phenotype column
        self.phenotype_index_ls = range(len(phenData.col_id_ls))

    pdata = PassingData(gene_id_ls=gene_id_ls, gene_id2snps_id_ls=gene_id2snps_id_ls,
                        phenotype_index_ls=self.phenotype_index_ls, snp_info=snp_info)
    params_ls = self.generate_params(self.gene_id_fname, pdata, self.block_size)

    other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls, gene_id_ls=pdata.gene_id_ls,
                             phenData=phenData, phenotype_index_ls=self.phenotype_index_ls,
                             snp_info=snp_info)
    other_data_pickle = cPickle.dumps(other_data, -1)
    del other_data

    output_node_data = PassingData(phenotype_label_ls=phenData.col_id_ls,
                                   phenotype_index_ls=self.phenotype_index_ls)
    output_node_data_pickle = cPickle.dumps(output_node_data, -1)

    snpData_pickle = cPickle.dumps(snpData, -1)
    del snpData, data_matrix  # only the pickled copy is returned

    return_data = PassingData(snpData_pickle=snpData_pickle, other_data_pickle=other_data_pickle,
                              output_node_data_pickle=output_node_data_pickle, params_ls=params_ls)
    return return_data
def run(self):
    """
    Compare the calls stored in the database (snpData1) against the comparison
    dataset read from self.cmp_data_filename (snpData2).

    For run_type 1 the row-wise comparison result is optionally written to
    self.output_fname and, when self.commit is set and self.input_dir is not,
    submitted through submit_to_call_QC(). run_type 2 produces no result.
    Any other run_type writes an error to stderr and exits with status 5.
    The session is committed if self.commit is set, otherwise rolled back.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db = StockDB.StockDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
                         hostname=self.hostname, database=self.dbname)
    db.setup(create_tables=False)
    session = db.session
    session.begin()

    # dataset to compare against
    self.cmp_data_filename = self.findOutCmpDataFilename(
        self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod)
    cmp_header, cmp_strain_acc_list, cmp_category_list, cmp_data_matrix = read_data(self.cmp_data_filename)
    # it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
    cmp_ecotype_id_ls = [int(strain_acc) for strain_acc in cmp_strain_acc_list]
    # category_list is not used.
    snpData2 = SNPData(header=cmp_header, strain_acc_list=cmp_ecotype_id_ls, data_matrix=cmp_data_matrix)

    readme = formReadmeObj(sys.argv, self.ad, StockDB.README)
    session.save(readme)

    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.db_user, passwd=self.db_passwd)
    curs = conn.cursor()

    # dataset assembled from the database
    from dbSNP2data import dbSNP2data
    snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
        curs, StockDB.Calls.table.name, StockDB.SNPs.table.name)
    strain_info_data = self.get_strain_id_info(self.QC_method_id)
    data_matrix = self.get_data_matrix(db, strain_info_data.strain_id2index, snp_id2index,
                                       StockDB.Calls.table.name)
    strain_id_list = strain_info_data.strain_id_list
    strain_acc_list = [strain_info_data.strain_id2acc[strain_id] for strain_id in strain_id_list]
    category_list = [strain_info_data.strain_id2category[strain_id] for strain_id in strain_id_list]

    header = ['ecotypeid', 'strainid']
    for snp_id in snp_id_list:
        snp_name, chromosome, position = snp_id2info[snp_id]
        header.append(snp_name)

    # snps_table is set to the stock_250k snps_table
    snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,
                       data_matrix=data_matrix, snps_table='stock.snps')

    twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, curs=curs,
                            QC_method_id=self.QC_method_id, user=self.db_user,
                            row_matching_by_which_value=0, debug=self.debug)

    if self.run_type == 1:
        row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
    elif self.run_type == 2:
        # twoSNPData.save_col_wise(session, readme) is disabled:
        # 2008-08-18 need to implement a new one for 149SNP
        row_id2NA_mismatch_rate = {}
    else:
        sys.stderr.write("run_type=%s is not supported.\n"%self.run_type)
        sys.exit(5)

    if self.output_fname and self.run_type == 1 and row_id2NA_mismatch_rate:
        self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname)

    # if self.input_dir is given, no db submission. call_info_id2fname here is fake,
    # it's actually keyed by (array_id, ecotypeid).
    # row_id2NA_mismatch_rate might be None if it's method 0.
    if self.run_type == 1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
        self.submit_to_call_QC(session, row_id2NA_mismatch_rate, self.QC_method_id, self.db_user,
                               twoSNPData.row_id12row_id2, readme)

    if not self.commit:
        session.rollback()
    else:
        session.commit()
def run(self):
    """
    Outdated entry point: tell the user to run Association.py and exit with
    status 0.

    Everything after the sys.exit(0) call is unreachable and only kept as the
    historical implementation.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    sys.stderr.write("This program is outdated. Please run Association.py instead.\n")
    sys.exit(0)

    # ---- unreachable from here on ----
    phen_rows = read_data(self.phenotype_fname, turn_into_integer=0)
    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = phen_rows
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
    if type(data_matrix) == list:
        data_matrix = numpy.array(data_matrix)
    # NOTE(review): ignore_2nd_column is a bare name here; it is not defined in this method.
    snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,
                      data_matrix=data_matrix, turn_into_array=1, ignore_2nd_column=ignore_2nd_column)
    data_matrix_phen = self.get_phenotype_matrix_in_data_matrix_order(
        strain_acc_list, strain_acc_list_phen, data_matrix_phen)
    phenotype_column = data_matrix_phen[:, self.which_phenotype]
    kw_results = self._kruskal_wallis_whole_matrix(snpData, phenotype_column, self.min_data_point)
    self.output_kw_results(kw_results, header[2:], self.output_fname, self.minus_log_pvalue)
def prepareTwoSNPData(self, db, max_mismatch_rate=0.25, min_no_of_non_NA_pairs=40, report=0):
    """
    Build the TwoSNPData object holding the two datasets to be cross-compared.

    2009-9-23
        add arguments max_mismatch_rate & min_no_of_non_NA_pairs, and pass them to twoSNPData.
        However it's useless to control what should be inserted into db because
        TwoSNPData.qc_cross_match_table is not defined and even if it's defined,
        the table it'll create doesn't concord to the one in 149SNP db.
    2008-09-10
        if self.input_fname is given, get 149SNP data from it, instead of database
    2008-8-28
        split out of run() so that MpiQC149CrossMatch could call this easily

    Arguments:
        db: database object handed to self.get_data_matrix().
        max_mismatch_rate, min_no_of_non_NA_pairs, report: passed straight to TwoSNPData.

    Returns the TwoSNPData instance. When self.QC_method_id is 4 the first
    dataset is compared against itself.
    """
    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.db_user, passwd=self.db_passwd)
    curs = conn.cursor()

    if not self.input_fname:
        # no input file: assemble the matrix from the database
        from dbSNP2data import dbSNP2data
        snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
            curs, StockDB.Calls.table.name, StockDB.SNPs.table.name)
        strain_info_data = self.get_strain_id_info(self.QC_method_id, ignore_strains_with_qc=False)
        data_matrix = self.get_data_matrix(db, strain_info_data.strain_id2index, snp_id2index,
                                           StockDB.Calls.table.name)
        strain_id_list = strain_info_data.strain_id_list
        # tg_ecotypeid
        strain_acc_list = [strain_info_data.strain_id2acc[strain_id] for strain_id in strain_id_list]
        # strainid
        category_list = [strain_info_data.strain_id2category[strain_id] for strain_id in strain_id_list]
        header = ['ecotypeid', 'strainid']
        for snp_id in snp_id_list:
            snp_name, chromosome, position = snp_id2info[snp_id]
            header.append(snp_name)
    else:
        header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)

    # snps_table is set to the stock_250k snps_table
    snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,
                       data_matrix=data_matrix, snps_table='stock.snps')

    if self.QC_method_id != 4:
        self.cmp_data_filename = self.findOutCmpDataFilename(
            self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod)
        cmp_header, cmp_strain_acc_list, cmp_category_list, cmp_data_matrix = read_data(self.cmp_data_filename)
        # it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
        cmp_ecotype_id_ls = [int(strain_acc) for strain_acc in cmp_strain_acc_list]
        # category_list is not used to facilitate row-id matching
        snpData2 = SNPData(header=cmp_header, strain_acc_list=cmp_ecotype_id_ls,
                           data_matrix=cmp_data_matrix)
    else:
        snpData2 = snpData1

    return TwoSNPData(SNPData1=snpData1, SNPData2=snpData2,
                      QC_method_id=self.QC_method_id, user=self.db_user,
                      row_matching_by_which_value=0, debug=self.debug,
                      max_mismatch_rate=max_mismatch_rate,
                      min_no_of_non_NA_pairs=min_no_of_non_NA_pairs, report=report)
def run(self):
    """
    Filter the input matrix and write the result to self.output_fname.

    2007-02-27
    2007-09-14
        filtering_bits

    -read_data()
    -remove_rows_with_too_many_NAs()
    -remove_cols_with_too_many_NAs()
    -remove_identity_strains()
    -write_data_matrix()

    Each of the first three characters of self.filtering_bits switches one
    filter on when it equals "1": rows with too many NAs, columns with too many
    NAs, identity strains. A filter that is off contributes an empty set.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(
        self.input_fname, int(self.nt_alphabet_bits[0]), delimiter=delimiter)
    data_matrix = num.array(data_matrix)

    if self.filtering_bits[0] == "1":
        remove_rows_data = self.remove_rows_with_too_many_NAs(data_matrix, self.row_cutoff)
        rows_with_too_many_NAs_set = remove_rows_data.rows_with_too_many_NAs_set
    else:
        rows_with_too_many_NAs_set = Set()

    if self.filtering_bits[1] == "1":
        # The cutoff is read from the instance, like self.row_cutoff above; the
        # former bare name col_cutoff is not defined in this method.
        remove_cols_data = self.remove_cols_with_too_many_NAs(
            data_matrix, self.col_cutoff, rows_with_too_many_NAs_set)
        cols_with_too_many_NAs_set = remove_cols_data.cols_with_too_many_NAs_set
    else:
        cols_with_too_many_NAs_set = Set()

    if self.filtering_bits[2] == "1":
        # only rows/columns that survived the NA filters are compared
        no_of_rows, no_of_cols = data_matrix.shape
        rows_to_be_checked = Set(range(no_of_rows)) - rows_with_too_many_NAs_set
        cols_to_be_checked = Set(range(no_of_cols)) - cols_with_too_many_NAs_set
        identity_strains_to_be_removed = self.remove_identity_strains(
            data_matrix, rows_to_be_checked, cols_to_be_checked)
    else:
        identity_strains_to_be_removed = Set()

    rows_to_be_tossed_out = rows_with_too_many_NAs_set | identity_strains_to_be_removed
    write_data_matrix(
        data_matrix, self.output_fname, header, strain_acc_list, category_list,
        rows_to_be_tossed_out, cols_with_too_many_NAs_set,
        nt_alphabet=int(self.nt_alphabet_bits[1]), delimiter=delimiter)
def run(self):
    """
    Run the MPI job.

    Node 0 reads the SNP data, sends a pickled copy to every computing node and
    then feeds parameters through inputNode(). Nodes 1 .. size-2 unpickle the
    data and run computing_node(). The last node runs output_node(), writing
    tab-delimited rows to self.output_fname when that attribute is set.

    The output file is closed explicitly before the final synchronize():
    param_obj keeps a reference to the csv writer, so deleting the local name
    alone does not release the file.
    """
    self.communicator = MPI.world.duplicate()
    node_rank = self.communicator.rank
    free_computing_nodes = range(1, self.communicator.size-1)  # exclude the 1st and last node
    free_computing_node_set = Set(free_computing_nodes)
    output_node_rank = self.communicator.size-1

    if node_rank == 0:
        header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
        # category_list is not used to facilitate row-id matching
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix)
        snpData_pickle = cPickle.dumps(snpData, -1)
        for node in free_computing_nodes:  # send it to the computing_node
            sys.stderr.write("passing initial data to nodes from %s to %s ... "%(node_rank, node))
            self.communicator.send(snpData_pickle, node, 0)
            sys.stderr.write(".\n")
        del snpData_pickle
        params_ls = self.generate_params(len(snpData.col_id_ls), self.block_size)
        del snpData
    elif node_rank in free_computing_node_set:
        data, source, tag = self.communicator.receiveString(0, 0)
        snpData = cPickle.loads(data)
        del data
    # the output node has no data to load

    self.synchronize()

    if node_rank == 0:
        param_obj = PassingData(params_ls=params_ls, output_node_rank=output_node_rank,
                                report=self.report, counter=0)
        self.inputNode(param_obj, free_computing_nodes, param_generator = params_ls)
    elif node_rank in free_computing_node_set:
        computing_parameter_obj = PassingData(snpData=snpData, min_LD_to_output=self.min_LD_to_output,
                                              min_MAF=self.min_MAF, discard_perc=self.discard_perc)
        self.computing_node(computing_parameter_obj, self.computing_node_handler)
    else:
        output_file = None
        if getattr(self, 'output_fname', None):
            output_file = open(self.output_fname, 'w')
            writer = csv.writer(output_file, delimiter='\t')
        else:
            writer = None
        param_obj = PassingData(writer=writer, is_header_written=False)
        try:
            self.output_node(free_computing_nodes, param_obj, self.output_node_handler)
        finally:
            del writer
            if output_file is not None:
                # flush and release the file even if output_node() raised
                output_file.close()

    self.synchronize()  # to avoid some node early exits
def run(self):
    """
    2008-5-12

    Read the matrix in self.inputFname, optionally translate the column ids
    through db.get_chr_pos_given_db_id2chr_pos() (self.snp_id_type == 1), convert
    it with SNPData2RawSnpsData_ls() and write it to self.outputFname with
    snpsdata.writeRawSnpsDatasToFile().
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # database connection and etc
    db = self.db_250k
    session = db.session
    session.begin()

    delimiter = figureOutDelimiter(self.inputFname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(self.inputFname, delimiter=delimiter)

    if self.snp_id_type == 1:
        # 2011-2-27 translate the db_id into chr_pos because the new StrainXSNP dataset
        # uses db_id to identify SNPs. but if col-id is already chr_pos, it's fine.
        translated_header = header[:2]
        kept_col_index_ls = []
        for col_index, snp_id in enumerate(header[2:]):
            chr_pos = db.get_chr_pos_given_db_id2chr_pos(snp_id, )
            if chr_pos is None:
                continue
            kept_col_index_ls.append(col_index)
            translated_header.append(chr_pos)
        # to remove no-db_id columns from data matrix
        data_matrix = numpy.array(data_matrix)[:, kept_col_index_ls]
        header = translated_header

    snp_data_kwargs = dict(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix)
    if self.array_id_2nd_column:
        # otherwise category_list is ignored
        snp_data_kwargs['category_list'] = category_list
    snpData = SNPData(**snp_data_kwargs)

    rawSnpsData_ls = SNPData2RawSnpsData_ls(snpData, need_transposeSNPData=1, report=self.report)
    chromosomes = [rawSnpsData.chromosome for rawSnpsData in rawSnpsData_ls]
    snpsdata.writeRawSnpsDatasToFile(self.outputFname, rawSnpsData_ls, chromosomes=chromosomes,
                                     deliminator=',', withArrayIds=self.array_id_2nd_column)
def main(self):
    """
    Entry point: impute the input data according to self.input_file_format.

    Format 1: the matrix is read with read_data(). If more than one sampling is
    needed to reach self.coverage, samplingImpute() is used and the result is
    written through YuSNPData.tofile(); otherwise the header is written and
    self.run() is called once per chromosome (in sorted order).
    Any other format: a single SNPData is built from the file and passed to
    self.run().
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    if self.input_file_format != 1:
        snpData = SNPData(
            inFile=self.input_fname,
            input_file_format=self.input_file_format,
            lower_case_for_imputation=self.lower_case_for_imputation,
        )
        self.run(snpData)
        return

    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, turn_into_integer=0)
    snps_name_ls = header[2:]
    no_of_rows = len(strain_acc_list)
    samplings_needed = self.coverage * no_of_rows / float(self.no_of_accessions_per_sampling)
    no_of_samplings = int(math.ceil(samplings_needed))

    if no_of_samplings <= 1:
        self.outputHeader(self.output_fname, strain_acc_list, category_list)
        chr2no_of_snps = self.get_chr2no_of_snps(snps_name_ls)
        for chromosome in sorted(chr2no_of_snps.keys()):
            snpData = SNPData(
                inFile=self.input_fname,
                snps_name_ls=snps_name_ls,
                data_matrix=data_matrix,
                chromosome=chromosome,
                input_file_format=self.input_file_format,
                lower_case_for_imputation=self.lower_case_for_imputation,
            )
            self.run(snpData)
        return

    imputed_matrix, new_snps_name_ls = self.samplingImpute(
        snps_name_ls,
        data_matrix,
        input_file_format=1,
        input_NA_char="0",
        lower_case_for_imputation=self.lower_case_for_imputation,
        npute_window_size=self.single_window_size,
        no_of_accessions_per_sampling=self.no_of_accessions_per_sampling,
        coverage=self.coverage,
    )
    # the output keeps the original SNP names, not new_snps_name_ls
    imputedData = YuSNPData(
        strain_acc_list=strain_acc_list,
        category_list=category_list,
        col_id_ls=snps_name_ls,
        data_matrix=imputed_matrix,
    )
    imputedData.tofile(self.output_fname)
def run(self):
    """
    Draw the input matrix and its legend as PNG images.

    self.drawing_type selects how the matrix is transformed and which
    value -> label / value -> colour tables are used (1 to 5). Any other value
    writes an error to stderr and exits with status 2.

    Output files: '<output_fname_prefix>_legend.png' and '<output_fname_prefix>.png'.
    Row labels come from the strain list (self.row_label_type == 1) or the
    category list (== 2).
    """
    from pymodule import read_data
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
    snp_acc_ls = header[2:]
    if self.debug:
        import pdb
        pdb.set_trace()

    if self.drawing_type==1:
        matrix_value2label= number2nt
        matrix_value2color = number2color
        data_matrix = numpy.array(data_matrix)
    elif self.drawing_type==2:
        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname,host=self.hostname)
        curs = conn.cursor()
        snp_allele2index_ls = self.get_snp_allele2index_ls(curs, snp_acc_ls, self.snps_sequenom_info_table)
        data_matrix = self.transformMatrixIntoTwoAllelesAndHetero(data_matrix, snp_allele2index_ls)
        matrix_value2label= {-1:'deletion', 0:'NA', 1:'allele1', 2:'allele2', 3:'hetero'}
        matrix_value2color = {-1:(0,0,0), 0:(255,255,255), 1:(0,0,255), 2:(0,255,0), 3:(255,0,0)}
    elif self.drawing_type==3:
        data_matrix = self.transformMatrixIntoHomoAndHetero(data_matrix)
        # NOTE(review): the label for value 1 looks like a masked string; confirm the intended text.
        matrix_value2label= {-1:'deletion', 0:'NA', 1:'h**o', 2:'hetero'}
        matrix_value2color = {-1:(0,0,0), 0:(255,255,255), 1:(0,0,255), 2:(255,0,0)}
    elif self.drawing_type==4:
        data_matrix = self.transformMatrixIntoTwoAllelesAndHetero(data_matrix)
        matrix_value2label= {-1:'deletion', 0:'NA', 1:'allele1', 2:'allele2', 3:'hetero'}
        matrix_value2color = {-1:(0,0,0), 0:(255,255,255), 1:(0,0,255), 2:(0,255,0), 3:(255,0,0)}
    elif self.drawing_type==5:
        data_matrix = self.transformMatrixIntoFourNucleotides(data_matrix)
        matrix_value2label= {-1: '-', 0: 'NA', 1:'A', 2:'C', 3:'G', 4:'T'}
        matrix_value2color = {-1:(0,0,0), 0:(255,255,255), 1:(0,0,255), 2:(0,255,0), 3:(255,0,0), 4:(122,0,122)}
        # every strain label is listed twice for this drawing type
        new_strain_acc_list = []
        for strain_acc in strain_acc_list:
            new_strain_acc_list.append(strain_acc)
            new_strain_acc_list.append(strain_acc)
        strain_acc_list = new_strain_acc_list
    else:
        # report the instance attribute; a bare drawing_type is not defined in this method
        sys.stderr.write("drawing_type %s not supported\n"%self.drawing_type)
        sys.exit(2)

    row_label_type2label_ls = {1:strain_acc_list,
                               2:category_list}
    font = get_font(self.font_path, font_size=self.font_size)  # 2008-08-01
    im = drawLegend(matrix_value2label, matrix_value2color, font)
    im.save('%s_legend.png'%self.output_fname_prefix)
    im = drawMatrix(data_matrix, matrix_value2color, row_label_type2label_ls[self.row_label_type],
                    snp_acc_ls, with_grid=1, font=font)
    im.save('%s.png'%self.output_fname_prefix)
def run(self):
    """
    2009-5-28

    Read the phenotype matrix in self.input_fname as floats, resolve its row ids
    through straightenEcotypeID() and store it in the database:
    run_type 1 -> putPhenotypeIntoDB(), run_type 2 -> putReplicatePhenotypeIntoDB().
    Any other run_type only writes a message to stderr. The session is committed
    when self.commit is set.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db = Stock_250kDB(drivername=self.drivername, username=self.db_user, password=self.db_passwd,
                      hostname=self.hostname, database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)

    nativename2tg_ecotypeid_set = getNativename2TgEcotypeIDSet(db.metadata.bind, turnUpperCase=True)
    ecotype_id_set_250k_in_pipeline = get_ecotype_id_set_250k_in_pipeline(ArrayInfo)
    ecotypeid2tg_ecotypeid = get_ecotypeid2tg_ecotypeid(db.metadata.bind)

    # turn_into_integer=2 because it's not nucleotides
    phen_rows = read_data(self.input_fname, turn_into_integer=2, matrix_data_type=float)
    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = phen_rows
    data_matrix_phen = numpy.array(data_matrix_phen)

    # 2009-8-19 the matrix is deliberately NOT reordered with
    # Association.get_phenotype_matrix_in_data_matrix_order(): strain_acc_list_phen is
    # not unique for each row, which caused replicates to have the same value.
    phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen,
                       data_matrix=data_matrix_phen)

    ecotype_id_ls = self.straightenEcotypeID(phenData.row_id_ls, nativename2tg_ecotypeid_set,
                                             ecotypeid2tg_ecotypeid, ecotype_id_set_250k_in_pipeline)

    session = db.session
    session.begin()
    if self.run_type == 1:
        self.putPhenotypeIntoDB(db, phenData, ecotype_id_ls)
    elif self.run_type == 2:
        self.putReplicatePhenotypeIntoDB(db, phenData, ecotype_id_ls)
    else:
        sys.stderr.write("Unsupported run type: %s.\n" % (self.run_type))

    if self.commit:
        session.commit()
def run(self):
    """
    2008-9-7

    Read self.input_fname, convert it with SNPData.convert2Binary() and write the
    converted data to self.output_fname. When self.mapping_fname is set, the
    allele mapping returned by convert2Binary() is written there first.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    input_rows = read_data(self.input_fname, delimiter=delimiter)
    header, strain_acc_list, category_list, data_matrix = input_rows
    snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                      category_list=category_list, data_matrix=data_matrix)

    newSnpData, allele_index2allele_ls = snpData.convert2Binary(self.report)

    # output allele_index2allele_ls
    if self.mapping_fname:
        self.output_allele2index_ls(snpData, allele_index2allele_ls, self.mapping_fname)

    newSnpData.tofile(self.output_fname)
def run(self):
    """
    2008-5-12

    Read self.input_fname, convert it with SNPData2RawSnpsData_ls() and write the
    result to self.output_fname via snpsdata.writeRawSnpsDatasToFile().
    category_list is only kept when self.array_id_2nd_column is set.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)

    snp_data_kwargs = dict(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix)
    if self.array_id_2nd_column:
        # otherwise category_list is ignored
        snp_data_kwargs['category_list'] = category_list
    snpData = SNPData(**snp_data_kwargs)

    rawSnpsData_ls = SNPData2RawSnpsData_ls(snpData, need_transposeSNPData=1, report=self.report)
    chromosomes = [rawSnpsData.chromosome for rawSnpsData in rawSnpsData_ls]
    snpsdata.writeRawSnpsDatasToFile(self.output_fname, rawSnpsData_ls, chromosomes=chromosomes,
                                     deliminator=',', withArrayIds=self.array_id_2nd_column)
def run(self):
    """
    Scatter-plot one phenotype against another and save the figure.

    The two phenotype columns are located by self.phenotype_method_id1 and
    self.phenotype_method_id2. Rows where either value is NaN are skipped.
    A green diagonal segment is drawn over the range the two axes share.
    Output: '<output_fname_prefix>.png' (dpi=400) and '<output_fname_prefix>.svg'.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
                                   password=self.db_passwd, hostname=self.hostname,
                                   database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)
    session = db.session

    phen_rows = read_data(self.phenotype_fname, turn_into_integer=0)
    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = phen_rows
    # row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
    phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen,
                       data_matrix=data_matrix_phen)
    # tricky, using strain_acc_list_phen
    phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
        phenData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)

    phenotype_col_index1 = self.findOutWhichPhenotypeColumn(phenData, Set([self.phenotype_method_id1]))[0]
    phenotype_col_index2 = self.findOutWhichPhenotypeColumn(phenData, Set([self.phenotype_method_id2]))[0]

    x_ls = []
    y_ls = []
    for row_index in range(phenData.data_matrix.shape[0]):
        x_value = phenData.data_matrix[row_index][phenotype_col_index1]
        y_value = phenData.data_matrix[row_index][phenotype_col_index2]
        if numpy.isnan(x_value) or numpy.isnan(y_value):
            continue
        x_ls.append(x_value)
        y_ls.append(y_value)

    pylab.clf()
    pylab.title('Phenotype Contrast')
    pylab.plot(x_ls, y_ls, '.', alpha=0.6)
    pylab.grid(alpha=0.3)

    phenotype_method1 = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id1)
    phenotype_method2 = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id2)
    pylab.xlabel(phenotype_method1.short_name)
    pylab.ylabel(phenotype_method2.short_name)

    # draw diagonal line to show perfect correlation
    diagonal_start = max(min(x_ls), min(y_ls))
    diagonal_end = min(max(x_ls), max(y_ls))
    diagonal = [diagonal_start, diagonal_end]
    pylab.plot(diagonal, diagonal, c='g', alpha=0.7)

    png_output_fname = '%s.png'%self.output_fname_prefix
    pylab.savefig(png_output_fname, dpi=400)
    pylab.savefig('%s.svg'%self.output_fname_prefix)
def run(self):
    """
    Read the input matrix, group its rows with
    get_tg_ecotypeid2ecotypeid_duplicate_index_ls(), merge them with
    get_merged_matrix() and write the merged matrix to self.output_fname.

    The second header field is renamed to 'nativename' and the second output
    column carries the nativename looked up for each target ecotype id.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
    curs = conn.cursor()

    # the duplicate -> target ecotype id mapping is optional
    duplicate_table = self.ecotype_duplicate2tg_ecotypeid_table
    ecotype_duplicate2tg_ecotypeid = None
    if duplicate_table:
        ecotype_duplicate2tg_ecotypeid = self.get_ecotype_duplicate2tg_ecotypeid(curs, duplicate_table)

    from pymodule import figureOutDelimiter
    delimiter = figureOutDelimiter(self.input_fname)
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)

    duplicate_index_map = self.get_tg_ecotypeid2ecotypeid_duplicate_index_ls(
        strain_acc_list, category_list, ecotype_duplicate2tg_ecotypeid)
    ecotypeid2nativename = get_ecotypeid2nativename(curs, ecotype_table=self.ecotype_table)
    tg_ecotypeid_ls, merge_matrix = self.get_merged_matrix(
        duplicate_index_map, data_matrix, ecotypeid2nativename, self.stat_output_fname)

    tg_nativename_ls = [ecotypeid2nativename[ecotypeid] for ecotypeid in tg_ecotypeid_ls]
    header[1] = 'nativename'
    write_data_matrix(merge_matrix, self.output_fname, header, tg_ecotypeid_ls, tg_nativename_ls,
                      delimiter=delimiter)
def run(self):
    """
    2007-02-27
    2007-09-14
        filtering_bits

    Filter the strain-by-SNP matrix in self.input_fname and write the result
    to self.output_fname.

    Each character of self.filtering_bits switches one step on when it is '1':
        [0] remove_rows_with_too_many_NAs(), using self.row_cutoff
        [1] remove_cols_with_too_many_NAs(), using self.col_cutoff
        [2] remove_identity_strains(), run on the rows/columns that survived [0] and [1]

    self.nt_alphabet_bits[0] is forwarded to read_data() and
    self.nt_alphabet_bits[1] to write_data_matrix().

    Steps:
        -read_data()
        -remove_rows_with_too_many_NAs()
        -remove_cols_with_too_many_NAs()
        -remove_identity_strains()
        -write_data_matrix()
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(
        self.input_fname, int(self.nt_alphabet_bits[0]), delimiter=delimiter)
    data_matrix = numpy.array(data_matrix)

    if self.filtering_bits[0] == '1':
        remove_rows_data = self.remove_rows_with_too_many_NAs(data_matrix, self.row_cutoff)
        rows_with_too_many_NAs_set = remove_rows_data.rows_with_too_many_NAs_set
    else:
        rows_with_too_many_NAs_set = set()

    if self.filtering_bits[1] == '1':
        # the cutoff is an instance option, like self.row_cutoff above; the bare
        # name col_cutoff is not defined in this method
        remove_cols_data = self.remove_cols_with_too_many_NAs(
            data_matrix, self.col_cutoff, rows_with_too_many_NAs_set)
        cols_with_too_many_NAs_set = remove_cols_data.cols_with_too_many_NAs_set
    else:
        cols_with_too_many_NAs_set = set()

    if self.filtering_bits[2] == '1':
        # only rows/columns that were not already discarded are compared
        no_of_rows, no_of_cols = data_matrix.shape
        rows_to_be_checked = set(range(no_of_rows)) - rows_with_too_many_NAs_set
        cols_to_be_checked = set(range(no_of_cols)) - cols_with_too_many_NAs_set
        identity_strains_to_be_removed = self.remove_identity_strains(
            data_matrix, rows_to_be_checked, cols_to_be_checked)
    else:
        identity_strains_to_be_removed = set()

    rows_to_be_tossed_out = rows_with_too_many_NAs_set | identity_strains_to_be_removed
    write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list,
                      rows_to_be_tossed_out, cols_with_too_many_NAs_set,
                      nt_alphabet=int(self.nt_alphabet_bits[1]), delimiter=delimiter)
def run(self):
    """
    2008-9-7
    Convert the SNP matrix in self.input_fname with SNPData.convert2Binary()
    and write the converted matrix to self.output_fname.

    When self.mapping_fname is set, the allele-index -> allele list returned by
    convert2Binary() is written there through output_allele2index_ls().
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)
    snpData = SNPData(
        header=header,
        strain_acc_list=strain_acc_list,
        category_list=category_list,
        data_matrix=data_matrix,
    )
    conversion_result = snpData.convert2Binary(self.report)
    newSnpData, allele_index2allele_ls = conversion_result

    mapping_fname = self.mapping_fname
    if mapping_fname:
        # output allele_index2allele_ls
        self.output_allele2index_ls(snpData, allele_index2allele_ls, mapping_fname)
    newSnpData.tofile(self.output_fname)
def run(self):
    """
    2009-5-28
    Read the phenotype matrix in self.input_fname and hand it to the database writer.

    self.run_type selects the writer: 1 -> putPhenotypeIntoDB(),
    2 -> putReplicatePhenotypeIntoDB(). Any other value only produces a message
    on stderr. The session is committed only when self.commit is set.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    db = Stock_250kDB(
        drivername=self.drivername, username=self.db_user, password=self.db_passwd,
        hostname=self.hostname, database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)

    # lookup structures used to straighten the ecotype ids of the input rows
    db_bind = db.metadata.bind
    nativename2tg_ecotypeid_set = getNativename2TgEcotypeIDSet(db_bind, turnUpperCase=True)
    ecotype_id_set_250k_in_pipeline = get_ecotype_id_set_250k_in_pipeline(ArrayInfo)
    ecotypeid2tg_ecotypeid = get_ecotypeid2tg_ecotypeid(db.metadata.bind)

    # turn_into_integer=2 because it's not nucleotides
    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
        self.input_fname, turn_into_integer=2, matrix_data_type=float)
    data_matrix_phen = numpy.array(data_matrix_phen)
    # 2009-8-19: the matrix is deliberately not passed through
    # Association.get_phenotype_matrix_in_data_matrix_order(); strain_acc_list_phen
    # is not unique per row, which made replicates end up with the same value.
    phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen,
                       data_matrix=data_matrix_phen)
    ecotype_id_ls = self.straightenEcotypeID(
        phenData.row_id_ls, nativename2tg_ecotypeid_set, ecotypeid2tg_ecotypeid,
        ecotype_id_set_250k_in_pipeline)

    session = db.session
    session.begin()
    run_type = self.run_type
    if run_type == 1:
        self.putPhenotypeIntoDB(db, phenData, ecotype_id_ls)
    elif run_type == 2:
        self.putReplicatePhenotypeIntoDB(db, phenData, ecotype_id_ls)
    else:
        sys.stderr.write("Unsupported run type: %s.\n" % (self.run_type))
    if self.commit:
        session.commit()
def main(self):
    """
    Entry point: build the input data object(s) and run the imputation.

    For any input_file_format other than 1, a single SNPData is built from the
    file and passed to self.run().

    For input_file_format 1, the matrix is read with read_data(). If more than
    one sampling is needed (ceil(coverage * rows / no_of_accessions_per_sampling) > 1)
    samplingImpute() is used and the result is written to self.output_fname.
    Otherwise the header is written and self.run() is called once per chromosome,
    in sorted chromosome order.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    if self.input_file_format != 1:
        snpData = SNPData(
            inFile=self.input_fname,
            input_file_format=self.input_file_format,
            lower_case_for_imputation=self.lower_case_for_imputation)
        self.run(snpData)
        return

    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, turn_into_integer=0)
    snps_name_ls = header[2:]
    no_of_rows = len(strain_acc_list)
    samplings_needed = self.coverage * no_of_rows / float(self.no_of_accessions_per_sampling)
    no_of_samplings = int(math.ceil(samplings_needed))

    if no_of_samplings > 1:
        imputed_matrix, new_snps_name_ls = self.samplingImpute(
            snps_name_ls, data_matrix, input_file_format=1,
            input_NA_char='0', lower_case_for_imputation=self.lower_case_for_imputation,
            npute_window_size=self.single_window_size,
            no_of_accessions_per_sampling=self.no_of_accessions_per_sampling,
            coverage=self.coverage)
        imputedData = YuSNPData(strain_acc_list=strain_acc_list, category_list=category_list,
                                col_id_ls=snps_name_ls, data_matrix=imputed_matrix)
        imputedData.tofile(self.output_fname)
        return

    # single sampling: impute chromosome by chromosome
    self.outputHeader(self.output_fname, strain_acc_list, category_list)
    chr2no_of_snps = self.get_chr2no_of_snps(snps_name_ls)
    for chromosome in sorted(chr2no_of_snps.keys()):
        snpData = SNPData(
            inFile=self.input_fname, snps_name_ls=snps_name_ls, data_matrix=data_matrix,
            chromosome=chromosome, input_file_format=self.input_file_format,
            lower_case_for_imputation=self.lower_case_for_imputation)
        self.run(snpData)
def run(self):
    """
    2007-04-30
    2007-05-14 add nt_alphabet_bits

    Drop the SNP columns whose locus log probability is at or below the
    threshold, write the remaining matrix to self.output_fname, and show a
    histogram of the log probabilities.

    self.nt_alphabet_bits[0] is forwarded to read_data() and
    self.nt_alphabet_bits[1] to write_data_matrix(). The removed columns are
    reported on stdout in ascending column order.
    """
    header, strain_acc_list, category_list, data_matrix = read_data(
        self.input_fname, int(self.nt_alphabet_bits[0]))
    data_matrix = num.array(data_matrix)
    strain_homo_perc_vector = self.cal_strain_homo_perc_vector(data_matrix)
    snp_locus_log_prob = self.cal_snp_locus_log_prob(data_matrix, strain_homo_perc_vector)

    # NOTE(review): min_log_prob is not defined inside this method, so it must
    # resolve to a module-level name; if no such global exists this raises
    # NameError and self.min_log_prob was probably intended -- confirm.
    cols_to_be_tossed_out_set = set()
    for col_index, log_prob in enumerate(snp_locus_log_prob):
        if log_prob <= min_log_prob:
            cols_to_be_tossed_out_set.add(col_index)

    # single-argument print(...) behaves the same as the print statement
    print("%sSNPs removed:" % (len(cols_to_be_tossed_out_set)))
    for col_index in sorted(cols_to_be_tossed_out_set):
        # header carries two leading label columns before the SNP names
        print('\t%s\t%s' % (col_index, header[2 + col_index]))

    write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list,
                      cols_to_be_tossed_out=cols_to_be_tossed_out_set,
                      nt_alphabet=int(self.nt_alphabet_bits[1]))

    import pylab
    pylab.title("histogram of snp locus log probability")
    pylab.hist(snp_locus_log_prob, 20)
    pylab.show()
def run(self):
    """
    2008-09-06
    MPI driver with three roles decided by the node's rank:
        rank 0                  -> input node (prepares and distributes the data and parameters)
        rank 1 .. size-2        -> computing nodes
        rank size-1             -> output node

    The input node sends pickled data to the output node and to every computing
    node, all nodes synchronize, each node runs its role, and all nodes
    synchronize again before returning.
    """
    if self.debug:  #for one-node testing purpose
        # NOTE(review): the indentation of this method was lost in SOURCE. Everything
        # down to sys.exit(2) is placed inside the debug branch because the comment
        # above calls it one-node testing and an unconditional sys.exit(2) would make
        # all the MPI code below unreachable -- confirm against the original layout.
        import pdb
        pdb.set_trace()
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname)
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, \
            data_matrix=data_matrix, turn_into_array=1)  #category_list is not used to facilitate row-id matching
        picklef = open(self.snps_context_fname)
        snps_context_wrapper = cPickle.load(picklef)
        del picklef
        gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(
            snps_context_wrapper)
        gene_id_ls = gene_id2snps_id_ls.keys()
        gene_id_ls.sort()
        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            self.phenotype_fname, turn_into_integer=0)
        phenData = SNPData(
            header=header_phen,
            strain_acc_list=strain_acc_list,
            data_matrix=data_matrix_phen
        )  #row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
        other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls,
                                 gene_id_ls=gene_id_ls,
                                 phenData=phenData)
        # the pickles below are built but not used before the exit
        other_data_pickle = cPickle.dumps(other_data, -1)
        phenotype_label_ls_pickle = cPickle.dumps(phenData.col_id_ls, -1)
        snpData_pickle = cPickle.dumps(snpData, -1)
        sys.exit(2)
    self.communicator = MPI.world.duplicate()
    node_rank = self.communicator.rank
    free_computing_nodes = range(1, self.communicator.size - 1)  #exclude the 1st and last node
    free_computing_node_set = Set(free_computing_nodes)
    output_node_rank = self.communicator.size - 1
    if node_rank == 0:
        # input node: prepare everything, then ship the pickles to the other nodes
        dstruc = self.inputNodePrepare()
        params_ls = dstruc.params_ls
        #send the output node the phenotype_label_ls
        self.communicator.send(dstruc.output_node_data_pickle, output_node_rank, 0)
        del dstruc.output_node_data_pickle
        for node in free_computing_nodes:  #send it to the computing_node
            sys.stderr.write(
                "passing initial data to nodes from %s to %s ... " % (node_rank, node))
            # two messages per computing node: snpData first, then other_data;
            # the receiving branch below reads them in the same order
            self.communicator.send(dstruc.snpData_pickle, node, 0)
            self.communicator.send(dstruc.other_data_pickle, node, 0)
            sys.stderr.write(".\n")
        del dstruc
    elif node_rank in free_computing_node_set:
        # computing node: receive snpData, then other_data, both from rank 0
        data, source, tag = self.communicator.receiveString(0, 0)
        snpData = cPickle.loads(data)
        del data
        data, source, tag = self.communicator.receiveString(0, 0)
        other_data = cPickle.loads(data)
        del data
        self.phenotype_index_ls = other_data.phenotype_index_ls
    else:
        # output node: receive the phenotype labels and indices from rank 0
        data, source, tag = self.communicator.receiveString(0, 0)
        output_node_data_pickle = cPickle.loads(data)
        phenotype_label_ls = output_node_data_pickle.phenotype_label_ls
        self.phenotype_index_ls = output_node_data_pickle.phenotype_index_ls
    self.synchronize()
    if node_rank == 0:
        param_obj = PassingData(params_ls=params_ls,
                                output_node_rank=output_node_rank,
                                report=self.report,
                                counter=0)
        self.inputNode(param_obj,
                       free_computing_nodes,
                       param_generator=params_ls)
        #self.input_node(param_obj, free_computing_nodes, input_handler=self.input_fetch_handler, message_size=1)
    elif node_rank in free_computing_node_set:
        computing_parameter_obj = PassingData(snpData=snpData, gene_id_ls=other_data.gene_id_ls, \
            gene_id2snps_id_ls=other_data.gene_id2snps_id_ls, phenData=other_data.phenData,
            phenotype_index_ls=self.phenotype_index_ls, min_data_point=self.min_data_point,
            test_type=self.test_type)
        self.computing_node(computing_parameter_obj,
                            self.computing_node_handler)
    else:
        self.general_output_node(self.output_dir, self.phenotype_index_ls,
                                 phenotype_label_ls, free_computing_nodes)
    self.synchronize()  #to avoid some node early exits
def run(self):
    """
    Build a figure from the SNP data, the phenotype file and ecotype information,
    draw it through self.drawStrainPCA(), and save it.

    Outputs '<output_fname_prefix>.png' (dpi=400) and '<output_fname_prefix>.svg',
    then calls self.plotLatLonPhenVsPC() with the same prefix.

    If both self.eigen_vector_fname and self.eigen_value_fname are given, the
    principal components and the explained variance are read from those files.
    Otherwise both are passed on as None and the SNP matrix (randomly reduced to
    at most 10000 columns) is converted with convertSNPAllele2Index().
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
        password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
    db.setup(create_tables=False)
    session = db.session  # not used further in this method
    snpData = SNPData(input_fname=self.input_fname, turn_into_integer=1, turn_into_array=1, ignore_2nd_column=1)
    if self.eigen_vector_fname and self.eigen_value_fname:
        eigen_value_ls = self.getEigenValueFromFile(self.eigen_value_fname)
        eigen_value_ls = numpy.array(eigen_value_ls)
        # each eigenvalue as a fraction of the eigenvalue sum
        explained_var = eigen_value_ls/numpy.sum(eigen_value_ls)
        PC_data = self.getPCFromFile(self.eigen_vector_fname)
        PC_matrix = PC_data.PC_matrix
    else:
        max_no_of_snps = 10000
        if len(snpData.col_id_ls)>max_no_of_snps:  #2008-12-01 randomly pick max_no_of_snps SNPs
            picked_col_index_ls = random.sample(range(len(snpData.col_id_ls)), max_no_of_snps)
            new_col_id_ls = [snpData.col_id_ls[i] for i in picked_col_index_ls]
            newSnpData = SNPData(row_id_ls=snpData.row_id_ls, col_id_ls=new_col_id_ls, strain_acc_list=snpData.strain_acc_list,\
                category_list=snpData.category_list)
            newSnpData.data_matrix = snpData.data_matrix[:, picked_col_index_ls]
            snpData = newSnpData
        # NOTE(review): placed at the else level (applies whether or not the SNPs were
        # subsampled); the original indentation was lost in SOURCE -- confirm.
        snpData, allele_index2allele_ls = snpData.convertSNPAllele2Index()
        explained_var = None
        PC_matrix = None
    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(self.phenotype_fname, turn_into_integer=0)
    phenData = SNPData(header=header_phen, strain_acc_list=snpData.strain_acc_list, data_matrix=data_matrix_phen)  #row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
    phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(snpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)  #tricky, using strain_acc_list_phen
    phenotype_col_index = self.findOutWhichPhenotypeColumn(phenData, Set([self.phenotype_method_id]))[0]
    ecotype_info = getEcotypeInfo(db, self.country_order_type)
    #the offset below decides where the label of strains/snps should start in axe_snp_matrix
    #2008-11-14 only for PlotGroupOfSNPs.py. you can set it to 1 because we don't draw axe_snp_matrix here.
    snp_id_label_y_offset = 0.95
    StrainID2PCAPosInfo = self.getStrainID2PCAPosInfo(snpData, pca_range=[0,1], snp_id_label_y_offset=snp_id_label_y_offset, explained_var=explained_var, T=PC_matrix)

    # layout constants passed to pylab.axes() below; several of them
    # (axe_y_offset3, axe_x_offset3, axe_width3) are not used in this method
    axe_y_offset1 = 0.03
    axe_height1 = 0.45  #height of axe_chromosome, twice height of axe_map_phenotype_legend
    axe_y_offset2 = axe_y_offset1+axe_height1
    axe_height2 = 0.5  #height of axe_strain_pca, axe_snp_matrix, axe_map
    axe_y_offset3 = axe_y_offset2+axe_height2
    axe_x_offset1 = 0.05
    axe_width1 = 0.8  #width of axe_strain_pca
    axe_x_offset2 = axe_x_offset1 + 0.02 + axe_width1
    axe_width2 = 0.05  #width of axe_chromosome, axe_snp_matrix, axe_snp_pca
    axe_x_offset3 = axe_x_offset2 + axe_width2
    axe_width3 = 0.02  #width of axe_phenotype

    phenotype_method = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id)
    phenotype_cmap = mpl.cm.jet
    max_phenotype = numpy.nanmax(phenData.data_matrix[:,phenotype_col_index])  #nanmax ignores the nan elements
    min_phenotype = numpy.nanmin(phenData.data_matrix[:,phenotype_col_index])  #nanmin ignores the nan elements
    phenotype_gap = max_phenotype - min_phenotype
    # widen the color range by a tenth of the phenotype range on both ends
    phenotype_jitter = phenotype_gap/10.
    phenotype_norm = mpl.colors.Normalize(vmin=min_phenotype-phenotype_jitter, vmax=max_phenotype+phenotype_jitter)
    axe_map_phenotype_legend = pylab.axes([axe_x_offset2, axe_y_offset1, axe_width2, 0.3], frameon=False)
    cb = mpl.colorbar.ColorbarBase(axe_map_phenotype_legend, cmap=phenotype_cmap, norm=phenotype_norm, orientation='vertical')
    cb.set_label('Legend Of Phenotype %s %s'%(phenotype_method.id, phenotype_method.short_name))

    axe_strain_map = pylab.axes([axe_x_offset1, axe_y_offset2, axe_width1, axe_height2], frameon=False)
    axe_strain_pca = pylab.axes([axe_x_offset1, axe_y_offset1, axe_width1, axe_height1], frameon=False)
    axe_strain_map_pca_cover = pylab.axes([axe_x_offset1, axe_y_offset1, axe_width1, axe_height1+axe_height2], frameon=False, \
        sharex=axe_strain_pca)  #cover both axe_strain_map and axe_strain_pca
    axe_strain_map_pca_cover.set_yticks([])
    axe_strain_pca_xlim = [-0.05,1.05]
    axe_strain_pca_ylim = [0, 1.05]
    axe_strain_pca.set_xlim(axe_strain_pca_xlim)
    axe_strain_pca.set_ylim(axe_strain_pca_ylim)
    axe_strain_map_pca_cover_ylim = [0, (axe_height1+axe_height2)/axe_height1]  #set it accordingly
    axe_strain_map_pca_cover.set_ylim(axe_strain_map_pca_cover_ylim)
    axe_strain_pca.grid(True, alpha=0.3)
    axe_strain_pca.set_xticks([])
    axe_strain_pca.set_yticks([])
    axe_strain_pca_legend = None  #no pca legend
    self.drawStrainPCA(axe_strain_pca, axe_strain_map, axe_strain_map_pca_cover, axe_strain_pca_legend, StrainID2PCAPosInfo, \
        ecotype_info, phenData, \
        phenotype_col_index, phenotype_cmap, phenotype_norm, rightmost_x_value=axe_strain_pca_xlim[1],\
        strain_color_type=2, pca2map_line_color=None, ecotype_width_on_map=10,\
        draw_lines_to_axe_snp_matrix = False, strain_size_on_axe_strain_pca=14, pic_area=self.pic_area,\
        map_pca_line_alpha=0.2, map_pca_linewidth=0.2)
    #customize a couple of things
    # the same limits as before the drawStrainPCA() call are applied again
    axe_strain_pca.set_xlim(axe_strain_pca_xlim)
    axe_strain_pca.set_ylim(axe_strain_pca_ylim)
    axe_strain_map_pca_cover.set_ylim(axe_strain_map_pca_cover_ylim)
    png_output_fname = '%s.png'%self.output_fname_prefix
    pylab.savefig(png_output_fname, dpi=400)
    pylab.savefig('%s.svg'%self.output_fname_prefix)
    self.plotLatLonPhenVsPC(ecotype_info, StrainID2PCAPosInfo, phenData, phenotype_col_index, phenotype_cmap, phenotype_norm,
        self.output_fname_prefix, commit=self.commit)
def plone_run(self, min_call_info_mismatch_rate=0.1):
    """
    Cross-match selected call_info entries against the comparison dataset.

    2009-6-9
        pass self.max_mismatch_rate, self.min_no_of_non_NA_pairs to TwoSNPData to filter entries stored in db.
    2009-4-13
        add min_call_info_mismatch_rate
    2009-2-5
        add "create_tables=False" to db.setup()
    2008-07-02
        fix a bug which causes the program to continue read data even while call_info_id2fname is empty and input_dir is null.
    2008-07-01
        adjust to the newest functions in QC_250k.py
    2008-04-25
        return None if QC_method_id==0
    2008-04-20
        for plone to call it just to get row_id2NA_mismatch_rate

    min_call_info_mismatch_rate is forwarded to get_call_info_id2fname() when
    self.input_dir is not set.

    Returns None in every path visible here: row_id2NA_mismatch_rate is set to
    None and never reassigned before it is returned.
    NOTE(review): the 2008-04-25 entry mentions an early return for
    QC_method_id==0, but no such check is visible in this method -- confirm.
    """
    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user = self.user, passwd = self.passwd)
    curs = conn.cursor()
    self.curs = curs
    #database connection and etc
    db = Stock_250kDB.Stock_250kDB(username=self.user, password=self.passwd, hostname=self.hostname, database=self.dbname)
    db.setup(create_tables=False)
    session = db.session
    session.begin()
    #transaction = session.create_transaction()
    # if cmp_data_filename not specified, try to find in the data_description column in table QC_method.
    qm = QCMethod.query.get(self.QC_method_id)
    if not self.cmp_data_filename and self.QC_method_id!=0:
        if qm.data_description:
            # the filename is taken as the text after the first '=' of data_description
            data_description_ls = qm.data_description.split('=')
            if len(data_description_ls)>1:
                self.cmp_data_filename = qm.data_description.split('=')[1].strip()
    #after db query, cmp_data_filename is still nothing, exit program.
    if not self.cmp_data_filename and self.QC_method_id!=0:
        sys.stderr.write("cmp_data_filename is still nothing even after db query. please specify it on the commandline.\n")
        sys.exit(3)
    #from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
    header, strain_acc_list, category_list, data_matrix = read_data(self.cmp_data_filename)
    strain_acc_list = map(int, strain_acc_list)  #it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
    snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \
        data_matrix=data_matrix, snps_table=self.QC_method_id2snps_table.get(self.QC_method_id), ignore_het=qm.ignore_het)  #category_list is not used.
    if self.input_dir:
        #04/22/08 Watch: call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
        #no submission to db
        call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
    else:
        #call_info_id2fname = self.get_call_info_id2fname(curs, self.call_info_table, self.call_QC_table, self.QC_method_id)
        call_data = self.get_call_info_id2fname(db, self.QC_method_id, self.call_method_id, \
            filter_calls_QCed=0, max_call_info_mismatch_rate=1, min_call_info_mismatch_rate=min_call_info_mismatch_rate,\
            debug=self.debug)
        call_info_id2fname = call_data.call_info_id2fname
        call_info_ls_to_return = call_data.call_info_ls_to_return
    #2008-07-01 pick the call_info_ids to be handled
    # NOTE(review): this selection is placed at function level (applies to both
    # branches above); the original indentation was lost in SOURCE -- confirm.
    new_call_info_id2fname = {}
    for call_info_id_wanted in self.call_info_id_ls:
        if call_info_id_wanted in call_info_id2fname:
            new_call_info_id2fname[call_info_id_wanted] = call_info_id2fname[call_info_id_wanted]
        elif self.report:
            sys.stderr.write("%s not in call_info_id2fname.\n"%(call_info_id_wanted))
    call_info_id2fname = new_call_info_id2fname
    if call_info_id2fname:
        pdata = self.read_call_matrix(call_info_id2fname, self.min_probability)
        header = pdata.header
        call_info_id_ls = pdata.call_info_id_ls
        array_id_ls = pdata.array_id_ls
        ecotype_id_ls = pdata.ecotype_id_ls
        data_matrix = pdata.data_matrix
    elif self.input_dir:  #2008-07-02
        #input file is SNP by strain format. double header (1st two lines)
        header, snps_name_ls, category_list, data_matrix = FilterStrainSNPMatrix.read_data(self.input_dir, double_header=1)
        # first header line: ecotype ids; second header line: call_info ids
        ecotype_id_ls = header[0][2:]
        call_info_id_ls = header[1][2:]
        data_matrix = numpy.array(data_matrix)
        # transpose so that strains become rows
        data_matrix = data_matrix.transpose()
        header = ['', ''] + snps_name_ls  #fake a header for SNPData
    else:  #2008-07-02
        sys.stderr.write("No good arrays.\n")
        return None
    snps_name2snps_id = None
    #swap the ecotype_id_ls and call_info_id_ls when passing them to SNPData. now strain_acc_list=ecotype_id_ls
    snpData1 = SNPData(header=header, strain_acc_list=ecotype_id_ls, category_list= call_info_id_ls, data_matrix=data_matrix, \
        min_probability=self.min_probability, call_method_id=self.call_method_id, col_id2id=snps_name2snps_id,\
        max_call_info_mismatch_rate=self.max_call_info_mismatch_rate, snps_table='stock_250k.snps')  #snps_table is set to the stock_250k snps_table
    twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, curs=curs, \
        QC_method_id=self.QC_method_id, user=self.user, row_matching_by_which_value=0, debug=self.debug,\
        max_mismatch_rate=self.max_mismatch_rate, min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs)
    #2009-6-9 cross-matching results whose mismatch_rates are below max_mismatch_rate would be put into db.
    row_id2NA_mismatch_rate = None
    #2008-05-01 create a cross match table temporarily
    twoSNPData.qc_cross_match_table = 'qc_cross_match'
    twoSNPData.new_QC_cross_match_table = self.new_QC_cross_match_table
    twoSNPData.cal_row_id2pairwise_dist()  #database submission is done along.
    return row_id2NA_mismatch_rate
def run(self):
    """
    Run the QC selected by self.QC_method_id inside one database transaction.

    2008-04-25
        return None if QC_method_id==0
    2008-04-20
        for plone to call it just to get row_id2NA_mismatch_rate

    QC_method_id 0 only calls cal_independent_NA_rate(). Any other method id
    compares call data (taken from the database, or from the file given as
    self.input_dir) against the comparison dataset in self.cmp_data_filename
    through qcDataMatrixVSsnpData().

    The result is stored in self.row_id2NA_mismatch_rate; nothing is returned.
    The transaction is committed when self.commit is set and rolled back otherwise.
    """
    #database connection and etc
    db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.user,
        password=self.passwd, hostname=self.hostname, database=self.dbname)
    db.setup(create_tables=False)
    session = db.session
    session.begin()
    #transaction = session.create_transaction()
    self.cmp_data_filename = self.findOutCmpDataFilename(
        self.cmp_data_filename, self.QC_method_id, self.QCMethod_class)
    qm = self.QCMethod_class.query.get(self.QC_method_id)  #2009-5-20
    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname,
                           host=self.hostname,
                           user=self.user,
                           passwd=self.passwd)
    curs = conn.cursor()
    self.curs = curs
    if self.debug:
        import pdb
        pdb.set_trace()
    readme = formReadmeObj(sys.argv, self.ad, Stock_250kDB.README)
    session.add(readme)
    QC_method_id2snps_table = self.QC_method_id2snps_table
    if self.QC_method_id == 0:
        self.cal_independent_NA_rate(db, self.min_probability, readme)
        row_id2NA_mismatch_rate = None
    else:
        #from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.cmp_data_filename, ignore_het=qm.ignore_het)
        strain_acc_list = map(
            int, strain_acc_list
        )  #it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
        snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, \
            data_matrix=data_matrix, snps_table=QC_method_id2snps_table.get(self.QC_method_id),\
            ignore_het=qm.ignore_het)  #category_list is not used. 05/20/09 ignore_het is useless because data_matrix is provided.
        # the string literal below is disabled code kept by the author; it has no effect
        """
        if self.input_dir and os.path.isdir(self.input_dir):
            #04/22/08 Watch: call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
            #no submission to db
            call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
        """
        if self.input_dir and os.path.isfile(self.input_dir):  #it's file
            call_info_id2fname = None
        else:
            if self.run_type == 2:  #no filtering on call_info entries that have been QCed.
                filter_calls_QCed = 0
            elif self.run_type == 1:
                filter_calls_QCed = 1
                self.max_call_info_mismatch_rate = 1  #don't use this when doing accession-wise QC
            else:
                sys.stderr.write("run_type=%s is not supported.\n" %
                                 self.run_type)
                sys.exit(5)
            call_data = self.get_call_info_id2fname(db, self.QC_method_id, self.call_method_id, \
                filter_calls_QCed, self.max_call_info_mismatch_rate, self.debug,\
                min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs, input_dir=self.input_dir)
            call_info_id2fname = call_data.call_info_id2fname
            call_info_ls_to_return = call_data.call_info_ls_to_return
        if self.run_type == 2:
            snps_name2snps_id = self.get_snps_name2snps_id(db)
        else:
            snps_name2snps_id = None
        if call_info_id2fname:
            db_id2chr_pos = db.getSNPID2ChrPos()  #2011-22
            from DB_250k2data import DB_250k2Data
            # the SNP index is derived from element [1] of the first dictionary value
            db_id2index = DB_250k2Data.getSNPID2index(
                call_info_id2fname.values()[0][1], db_id2chr_pos)
            if self.one_by_one and self.run_type == 1:  #one_by_one only for QC by accession
                row_id2NA_mismatch_rate = {}
                row_id12row_id2 = {}
                counter = 0
                for call_info_id, value in call_info_id2fname.iteritems():
                    counter += 1
                    print "No", counter
                    # wrap the single entry in its own dictionary so that
                    # read_call_matrix() handles one call_info at a time
                    tmp_dict = {}
                    tmp_dict[call_info_id] = value
                    pdata = self.read_call_matrix(
                        tmp_dict,
                        self.min_probability,
                        db_id2chr_pos=db_id2chr_pos,
                        db_id2index=db_id2index)  #05/20/09 no need for qm.ignore_het because 250k is all homozygous
                    passingdata = self.qcDataMatrixVSsnpData(
                        pdata, snps_name2snps_id, snpData2, curs, session,
                        readme)
                    row_id2NA_mismatch_rate.update(
                        passingdata.row_id2NA_mismatch_rate)
                    row_id12row_id2.update(passingdata.row_id12row_id2)
                    del pdata
                    # in debug mode stop after ten entries
                    if self.debug and counter == 10:
                        break
            else:
                pdata = self.read_call_matrix(call_info_id2fname,
                                              self.min_probability,
                                              db_id2chr_pos=db_id2chr_pos,
                                              db_id2index=db_id2index)  #05/20/09 no need for qm.ignore_het because 250k is all homozygous
                passingdata = self.qcDataMatrixVSsnpData(
                    pdata, snps_name2snps_id, snpData2, curs, session, readme)
                row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
                row_id12row_id2 = passingdata.row_id12row_id2
                del pdata
        else:
            # NOTE(review): this branch is also reached when the database query
            # returned an empty call_info_id2fname; self.input_dir is then read as a
            # file even if it was not given -- confirm that this is intended.
            #input file is SNP by strain format. double header (1st two lines)
            header, snps_name_ls, category_list, data_matrix = read_data(
                self.input_dir, double_header=1, ignore_het=qm.ignore_het)
            pdata = PassingData()
            # first header line: ecotype ids; second header line: call_info ids
            pdata.ecotype_id_ls = header[0][2:]
            pdata.call_info_id_ls = header[1][2:]
            data_matrix = numpy.array(data_matrix)
            # transpose so that strains become rows
            pdata.data_matrix = data_matrix.transpose()
            pdata.header = ['', ''
                            ] + snps_name_ls  #fake a header for SNPData
            passingdata = self.qcDataMatrixVSsnpData(
                pdata, snps_name2snps_id, snpData2, curs, session, readme)
            row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
            row_id12row_id2 = passingdata.row_id12row_id2
            del pdata
    if self.output_fname and self.run_type == 1 and row_id2NA_mismatch_rate:
        self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate,
                                            self.output_fname)
    if self.run_type == 1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
        #if self.input_dir is given, no db submission. call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
        #row_id2NA_mismatch_rate might be None if it's method 0.
        self.submit_to_call_QC(session, row_id2NA_mismatch_rate, self.QC_method_id, self.user, self.min_probability, \
            row_id12row_id2, self.call_method_id, readme)
    if self.commit:
        curs.execute("commit")
        session.commit()
    else:
        session.rollback()
    self.row_id2NA_mismatch_rate = row_id2NA_mismatch_rate  #for plone to get the data structure
def load_dstruc(self):
    """
    Load the two input matrices and build the matching structures between them.

    After the parent class's load_dstruc() has run, both input files are read
    with read_data() and stored on the instance (header/strain_acc_list/
    category_list/data_matrix, suffixed 1 and 2), followed by the column and row
    matching dictionaries returned by get_col_matching_dstruc() and
    get_row_matching_dstruc().
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    QualityControl.load_dstruc(self)

    first_dataset = read_data(self.input_fname1)
    self.header1, self.strain_acc_list1, self.category_list1, self.data_matrix1 = first_dataset
    second_dataset = read_data(self.input_fname2)
    self.header2, self.strain_acc_list2, self.category_list2, self.data_matrix2 = second_dataset

    # columns are matched through the two headers
    col_matching = self.get_col_matching_dstruc(self.header1, self.header2)
    self.col_id2col_index1, self.col_id2col_index2, self.col_id12col_id2 = col_matching

    # rows are matched through the accession lists (plus the 1st category list)
    row_matching = self.get_row_matching_dstruc(
        self.strain_acc_list1, self.category_list1, self.strain_acc_list2)
    self.row_id2row_index1, self.row_id2row_index2, self.row_id12row_id2 = row_matching
def run(self):
    """
    2008-12-02
    Convert a strain-by-SNP matrix to binary alleles and write it out as
    '<output_fname_prefix>.geno', '.ind' and '.snp', plus an optional
    '<output_fname_prefix>_<phenotype label>.pheno'.
    (The layout looks like EIGENSTRAT input -- confirm.)

    Output:
        .pheno  one phenotype value per accession, written only when both
                self.phenotype_fname and self.phenotype_method_id are set.
                NaN becomes 9 for binary phenotypes (self.phenotype_is_binary)
                and -100.0 otherwise.
        .ind    one row per accession: id, 'U', 'Case'.
        .snp    one row per SNP: id, chromosome, 0.0, position, allele1, allele2
                (SNP ids are expected to look like '<chromosome>_<position>').
        .geno   one line per SNP and one character per accession:
                '0' for allele index 0, '2' for allele index 1, '9' otherwise.

    All output files are closed explicitly, also when an error occurs.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)
    if self.array_id_2nd_column:
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,
                          data_matrix=data_matrix)
    else:
        # ignore category_list
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix)
    newSnpData, allele_index2allele_ls = snpData.convert2Binary(self.report)

    if self.phenotype_fname and self.phenotype_method_id:
        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            self.phenotype_fname, turn_into_integer=0)
        # row label is that of the SNP matrix, because the phenotype matrix is
        # re-ordered that way right below
        phenData = SNPData(header=header_phen, strain_acc_list=newSnpData.strain_acc_list,
                           data_matrix=data_matrix_phen)
        # tricky, using strain_acc_list_phen
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            newSnpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)
        phenotype_col_index = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
            phenData, Set([self.phenotype_method_id]))[0]
        phenotype_label = phenData.col_id_ls[phenotype_col_index]
        # '/' in the label would otherwise be taken as a path separator
        phenotype_f = open('%s_%s.pheno' % (self.output_fname_prefix, phenotype_label.replace('/', '_')), 'w')
        try:
            for phenotype_value in phenData.data_matrix[:, phenotype_col_index]:
                if self.phenotype_is_binary:
                    # binary and non-binary have different NA designator
                    if numpy.isnan(phenotype_value):
                        phenotype_value = 9
                    else:
                        phenotype_value = int(phenotype_value)
                elif numpy.isnan(phenotype_value):
                    phenotype_value = -100.0
                phenotype_f.write('%s\n' % phenotype_value)
        finally:
            phenotype_f.close()

    opened_files = []
    try:
        genotype_f = open('%s.geno' % self.output_fname_prefix, 'w')
        opened_files.append(genotype_f)
        ind_f = open('%s.ind' % self.output_fname_prefix, 'w')
        opened_files.append(ind_f)
        snp_f = open('%s.snp' % self.output_fname_prefix, 'w')
        opened_files.append(snp_f)
        ind_writer = csv.writer(ind_f, delimiter='\t')
        snp_writer = csv.writer(snp_f, delimiter='\t')

        # transpose it: SNPs become rows, accessions become columns
        newSnpData = transposeSNPData(newSnpData)
        no_of_rows = len(newSnpData.data_matrix)
        no_of_cols = len(newSnpData.data_matrix[0])

        # write out the accessions
        for j in range(no_of_cols):
            ind_writer.writerow([newSnpData.col_id_ls[j], 'U', 'Case'])

        for i in range(no_of_rows):
            snp_id = newSnpData.row_id_ls[i]
            chromosome, pos = snp_id.split('_')
            allele1 = allele_index2allele_ls[i][0]  # major allele
            allele2 = allele_index2allele_ls[i][1]  # minor allele
            snp_writer.writerow([snp_id, chromosome, 0.0, pos, allele1, allele2])

            geno_char_ls = []
            for j in range(no_of_cols):
                allele = newSnpData.data_matrix[i][j]
                if allele == 0:
                    geno_char_ls.append('0')
                elif allele == 1:
                    geno_char_ls.append('2')
                else:
                    geno_char_ls.append('9')
            genotype_f.write(''.join(geno_char_ls) + '\n')
    finally:
        for opened_file in opened_files:
            opened_file.close()
def run(self):
    """
    Draw the matrix in self.input_fname as an image, plus a separate legend image.

    self.drawing_type selects how the matrix values are transformed and labelled:
        1  values used as they are, labelled/colored through number2nt/number2color
        2  transformMatrixIntoTwoAllelesAndHetero() with the allele index looked
           up in the database table self.snps_sequenom_info_table
        3  transformMatrixIntoHomoAndHetero()
        4  transformMatrixIntoTwoAllelesAndHetero() without a database lookup
        5  transformMatrixIntoFourNucleotides(); every strain label is doubled
    Any other value writes a message to stderr and exits with status 2.

    self.row_label_type picks the row labels: 1 -> strain_acc_list,
    2 -> category_list.

    Output: '<output_fname_prefix>_legend.png' and '<output_fname_prefix>.png'.
    """
    from pymodule import read_data
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
    # the first two header columns are row labels, the rest are SNP names
    snp_acc_ls = header[2:]
    if self.debug:
        import pdb
        pdb.set_trace()

    if self.drawing_type == 1:
        matrix_value2label = number2nt
        matrix_value2color = number2color
        data_matrix = numpy.array(data_matrix)
    elif self.drawing_type in (2, 4):
        if self.drawing_type == 2:
            import MySQLdb
            conn = MySQLdb.connect(db=self.dbname, host=self.hostname)
            curs = conn.cursor()
            snp_allele2index_ls = self.get_snp_allele2index_ls(
                curs, snp_acc_ls, self.snps_sequenom_info_table)
            data_matrix = self.transformMatrixIntoTwoAllelesAndHetero(data_matrix, snp_allele2index_ls)
        else:
            data_matrix = self.transformMatrixIntoTwoAllelesAndHetero(data_matrix)
        matrix_value2label = {-1: 'deletion', 0: 'NA', 1: 'allele1', 2: 'allele2', 3: 'hetero'}
        matrix_value2color = {
            -1: (0, 0, 0),
            0: (255, 255, 255),
            1: (0, 0, 255),
            2: (0, 255, 0),
            3: (255, 0, 0),
        }
    elif self.drawing_type == 3:
        data_matrix = self.transformMatrixIntoHomoAndHetero(data_matrix)
        matrix_value2label = {-1: 'deletion', 0: 'NA', 1: 'h**o', 2: 'hetero'}
        matrix_value2color = {
            -1: (0, 0, 0),
            0: (255, 255, 255),
            1: (0, 0, 255),
            2: (255, 0, 0),
        }
    elif self.drawing_type == 5:
        data_matrix = self.transformMatrixIntoFourNucleotides(data_matrix)
        matrix_value2label = {-1: '-', 0: 'NA', 1: 'A', 2: 'C', 3: 'G', 4: 'T'}
        matrix_value2color = {
            -1: (0, 0, 0),
            0: (255, 255, 255),
            1: (0, 0, 255),
            2: (0, 255, 0),
            3: (255, 0, 0),
            4: (122, 0, 122),
        }
        # every strain label appears twice in a row
        # (presumably the transformed matrix has two rows per strain -- confirm)
        new_strain_acc_list = []
        for strain_acc in strain_acc_list:
            new_strain_acc_list.append(strain_acc)
            new_strain_acc_list.append(strain_acc)
        strain_acc_list = new_strain_acc_list
    else:
        # report the instance option; a bare drawing_type is not defined here
        sys.stderr.write("drawing_type %s not supported\n" % self.drawing_type)
        sys.exit(2)

    row_label_type2label_ls = {1: strain_acc_list, 2: category_list}
    font = get_font(self.font_path, font_size=self.font_size)  # 2008-08-01
    im = drawLegend(matrix_value2label, matrix_value2color, font)
    im.save('%s_legend.png' % self.output_fname_prefix)
    im = drawMatrix(data_matrix, matrix_value2color,
                    row_label_type2label_ls[self.row_label_type],
                    snp_acc_ls, with_grid=1, font=font)
    im.save('%s.png' % self.output_fname_prefix)
def plone_run(self, min_call_info_mismatch_rate=0.1):
    """
    Cross-match selected arrays against the comparison dataset (entry point for plone).

    2009-6-9 pass self.max_mismatch_rate, self.min_no_of_non_NA_pairs to TwoSNPData
        to filter entries stored in db.
    2009-4-13 add min_call_info_mismatch_rate
    2009-2-5 add "create_tables=False" to db.setup()
    2008-07-02 fix a bug which causes the program to continue read data even while
        call_info_id2fname is empty and input_dir is null.
    2008-07-01 adjust to the newest functions in QC_250k.py
    2008-04-25 return None if QC_method_id==0
    2008-04-20 for plone to call it just to get row_id2NA_mismatch_rate

    min_call_info_mismatch_rate is forwarded to get_call_info_id2fname() when the
    arrays are looked up in the database (i.e. self.input_dir is not set).

    Returns None in every path visible here: row_id2NA_mismatch_rate is never
    reassigned; the work is done by TwoSNPData.cal_row_id2pairwise_dist().
    Exits with status 3 when no comparison data file can be determined.
    """
    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
    curs = conn.cursor()
    self.curs = curs

    # database connection and etc
    db = Stock_250kDB.Stock_250kDB(
        username=self.user,
        password=self.passwd,
        hostname=self.hostname,
        database=self.dbname,
    )
    db.setup(create_tables=False)
    session = db.session
    session.begin()

    # if cmp_data_filename not specified, try to find it in the data_description
    # column in table QC_method ("<something>=<filename>").
    qm = QCMethod.query.get(self.QC_method_id)
    if not self.cmp_data_filename and self.QC_method_id != 0:
        # qm is None when the QC method row does not exist; fall through to the
        # explicit error below instead of raising AttributeError here.
        if qm is not None and qm.data_description:
            data_description_ls = qm.data_description.split('=')
            if len(data_description_ls) > 1:
                self.cmp_data_filename = data_description_ls[1].strip()

    # after db query, cmp_data_filename is still nothing, exit program.
    if not self.cmp_data_filename and self.QC_method_id != 0:
        sys.stderr.write(
            "cmp_data_filename is still nothing even after db query. "
            "please specify it on the commandline.\n"
        )
        sys.exit(3)

    header, strain_acc_list, category_list, data_matrix = read_data(self.cmp_data_filename)
    # it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
    strain_acc_list = map(int, strain_acc_list)
    # category_list is not used.
    snpData2 = SNPData(
        header=header,
        strain_acc_list=strain_acc_list,
        data_matrix=data_matrix,
        snps_table=self.QC_method_id2snps_table.get(self.QC_method_id),
        ignore_het=qm.ignore_het,
    )

    if self.input_dir:
        # 04/22/08 Watch: call_info_id2fname here is fake, it's actually keyed by
        # (array_id, ecotypeid). no submission to db
        call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
    else:
        call_data = self.get_call_info_id2fname(
            db,
            self.QC_method_id,
            self.call_method_id,
            filter_calls_QCed=0,
            max_call_info_mismatch_rate=1,
            min_call_info_mismatch_rate=min_call_info_mismatch_rate,
            debug=self.debug,
        )
        call_info_id2fname = call_data.call_info_id2fname

    # 2008-07-01 pick the call_info_ids to be handled
    new_call_info_id2fname = {}
    for call_info_id_wanted in self.call_info_id_ls:
        if call_info_id_wanted in call_info_id2fname:
            new_call_info_id2fname[call_info_id_wanted] = call_info_id2fname[call_info_id_wanted]
        elif self.report:
            sys.stderr.write("%s not in call_info_id2fname.\n" % (call_info_id_wanted))
    call_info_id2fname = new_call_info_id2fname

    if call_info_id2fname:
        pdata = self.read_call_matrix(call_info_id2fname, self.min_probability)
        header = pdata.header
        call_info_id_ls = pdata.call_info_id_ls
        ecotype_id_ls = pdata.ecotype_id_ls
        data_matrix = pdata.data_matrix
    elif self.input_dir:  # 2008-07-02
        # input file is SNP by strain format. double header (1st two lines)
        header, snps_name_ls, category_list, data_matrix = FilterStrainSNPMatrix.read_data(
            self.input_dir, double_header=1
        )
        ecotype_id_ls = header[0][2:]
        call_info_id_ls = header[1][2:]
        data_matrix = numpy.array(data_matrix)
        data_matrix = data_matrix.transpose()
        header = ['', ''] + snps_name_ls  # fake a header for SNPData
    else:  # 2008-07-02
        sys.stderr.write("No good arrays.\n")
        return None

    snps_name2snps_id = None
    # swap the ecotype_id_ls and call_info_id_ls when passing them to SNPData.
    # now strain_acc_list=ecotype_id_ls
    snpData1 = SNPData(
        header=header,
        strain_acc_list=ecotype_id_ls,
        category_list=call_info_id_ls,
        data_matrix=data_matrix,
        min_probability=self.min_probability,
        call_method_id=self.call_method_id,
        col_id2id=snps_name2snps_id,
        max_call_info_mismatch_rate=self.max_call_info_mismatch_rate,
        snps_table='stock_250k.snps',  # snps_table is set to the stock_250k snps_table
    )

    # 2009-6-9 cross-matching results whose mismatch_rates are below
    # max_mismatch_rate would be put into db.
    twoSNPData = TwoSNPData(
        SNPData1=snpData1,
        SNPData2=snpData2,
        curs=curs,
        QC_method_id=self.QC_method_id,
        user=self.user,
        row_matching_by_which_value=0,
        debug=self.debug,
        max_mismatch_rate=self.max_mismatch_rate,
        min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs,
    )

    row_id2NA_mismatch_rate = None

    # 2008-05-01 create a cross match table temporarily
    twoSNPData.qc_cross_match_table = 'qc_cross_match'
    twoSNPData.new_QC_cross_match_table = self.new_QC_cross_match_table
    twoSNPData.cal_row_id2pairwise_dist()  # database submission is done along.
    return row_id2NA_mismatch_rate
def run(self):
    """
    2008-12-02
    Convert the SNP matrix in self.input_fname into '<prefix>.geno',
    '<prefix>.snp' and '<prefix>.ind' (prefix = self.output_fname_prefix), and,
    when both self.phenotype_fname and self.phenotype_method_id are given, one
    '<prefix>_<phenotype label>.pheno' file.

    File contents, as written below:
        .pheno: one value per accession; NA is 9 for binary phenotypes
            (other values cast to int) and -100.0 for non-binary ones
        .snp:   snp_id, chromosome, 0.0, position, allele1, allele2 (tab-delimited)
        .ind:   accession id, 'U', 'Case' (tab-delimited)
        .geno:  one line per SNP, one character per accession:
            binary allele 0 -> '0', 1 -> '2', anything else -> '9'
    All output files are closed before returning, also on error.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)
    if self.array_id_2nd_column:
        snpData = SNPData(
            header=header,
            strain_acc_list=strain_acc_list,
            category_list=category_list,
            data_matrix=data_matrix,
        )
    else:
        # ignore category_list
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix)
    newSnpData, allele_index2allele_ls = snpData.convert2Binary(self.report)

    if self.phenotype_fname and self.phenotype_method_id:
        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            self.phenotype_fname, turn_into_integer=0
        )
        # row label is that of the SNP matrix, because the phenotype matrix is
        # gonna be re-ordered in that way
        phenData = SNPData(
            header=header_phen,
            strain_acc_list=newSnpData.strain_acc_list,
            data_matrix=data_matrix_phen,
        )
        # tricky, using strain_acc_list_phen
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            newSnpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix
        )
        phenotype_col_index = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
            phenData, Set([self.phenotype_method_id])
        )[0]
        phenotype_label = phenData.col_id_ls[phenotype_col_index]
        phenotype_f = open(
            '%s_%s.pheno' % (self.output_fname_prefix, phenotype_label.replace('/', '_')), 'w'
        )
        try:
            for phenotype_value in phenData.data_matrix[:, phenotype_col_index]:
                # binary and non-binary have different NA designator
                if self.phenotype_is_binary:
                    if numpy.isnan(phenotype_value):
                        phenotype_value = 9
                    else:
                        phenotype_value = int(phenotype_value)
                elif numpy.isnan(phenotype_value):
                    phenotype_value = -100.0
                phenotype_f.write('%s\n' % phenotype_value)
        finally:
            phenotype_f.close()

    output_files = []  # everything opened below, closed in the finally clause
    try:
        genotype_f = open('%s.geno' % self.output_fname_prefix, 'w')
        output_files.append(genotype_f)
        ind_f = open('%s.ind' % self.output_fname_prefix, 'w')
        output_files.append(ind_f)
        snp_f = open('%s.snp' % self.output_fname_prefix, 'w')
        output_files.append(snp_f)
        ind_writer = csv.writer(ind_f, delimiter='\t')
        snp_writer = csv.writer(snp_f, delimiter='\t')

        # transpose it: rows become SNPs, columns become accessions
        newSnpData = transposeSNPData(newSnpData)
        no_of_rows = len(newSnpData.data_matrix)
        if no_of_rows:
            no_of_cols = len(newSnpData.data_matrix[0])
        else:
            no_of_cols = 0  # empty matrix: leave all three files empty

        # write out the accessions (once, not once per SNP)
        for j in range(no_of_cols):
            ind_writer.writerow([newSnpData.col_id_ls[j], 'U', 'Case'])

        for i in range(no_of_rows):
            snp_id = newSnpData.row_id_ls[i]
            chromosome, position = snp_id.split('_')  # snp_id is '<chr>_<pos>'
            allele1 = allele_index2allele_ls[i][0]  # major allele
            allele2 = allele_index2allele_ls[i][1]  # minor allele
            snp_writer.writerow([snp_id, chromosome, 0.0, position, allele1, allele2])

            geno_code_ls = []
            for j in range(no_of_cols):
                allele = newSnpData.data_matrix[i][j]
                if allele == 0:
                    geno_code_ls.append('0')
                elif allele == 1:
                    geno_code_ls.append('2')
                else:
                    geno_code_ls.append('9')
            genotype_f.write(''.join(geno_code_ls) + '\n')
    finally:
        for output_file in output_files:
            output_file.close()
def loadDataStructure(self, db_250k=None, association_locus_id=None, association_landscape_type_id=None,
                      locusExtensionDistance=5000,
                      data_dir=None, list_type_id_list=None, gene_annotation_pickleFname=None,
                      snpInfoPickleFname=None, locus_type_id=1, snp_matrix_fname=None, snp_matrix_data_type=None,
                      phenotype_fname=None):
    """
    2012.11.14
    Gather everything around one association locus into a single PassingData.

    The returned object carries: associationLocus, associationLandscapeType,
    landscape_gwr_ls (one genome-wide result per distinct landscape found through
    the locus' association peaks, restricted to the locus extended by
    locusExtensionDistance on both sides), gene_annotation, snp_info (None unless
    snpInfoPickleFname is given), LD_info (always None), candidate_gene_set
    (union of the gene lists named in list_type_id_list), centralLocus, and
    snpData/phenData/ecotype_info (all None unless both snp_matrix_fname and
    phenotype_fname are given).

    Exits with status 3 when the SNP matrix could not be loaded.
    """
    sys.stderr.write("Fetching GWAS landscape for association-locus %s, landscape type %s ..."
                     % (association_locus_id, association_landscape_type_id))

    # fetch the associationLocus
    associationLocus = Stock_250kDB.AssociationLocus.get(association_locus_id)
    associationLandscapeType = Stock_250kDB.AssociationLandscapeType.get(association_landscape_type_id)

    # fetch landscape within this interval (start is clamped to 1)
    region_start = max(1, associationLocus.start - locusExtensionDistance)
    region_stop = associationLocus.stop + locusExtensionDistance
    # report controls whether getResultMethodContent() will report progress.
    result_query_data = PassingData(
        min_MAF=associationLandscapeType.min_MAF,
        data_dir=data_dir,
        need_chr_pos_ls=0,
        chromosome=associationLocus.chromosome,
        start=region_start,
        stop=region_stop,
        report=False,
    )

    # fetch all result-peaks; several peaks may share one landscape, load it once
    landscape_gwr_ls = []
    seen_landscape_id_set = set()
    for peak in associationLocus.association_peak_ls:
        landscape = db_250k.getAssociationLandscape(
            result_id=peak.result_id,
            association_landscape_type_id=associationLandscapeType.id,
        )
        if not landscape or landscape.id in seen_landscape_id_set:
            continue
        seen_landscape_id_set.add(landscape.id)
        landscape_gwr_ls.append(
            db_250k.getResultMethodContent(
                association_landscape=landscape,
                data_dir=data_dir,
                construct_chr_pos2index=True,
                pdata=result_query_data,
            )
        )
        sys.stderr.write(" %s%s " % ('\x08' * 80, len(landscape_gwr_ls)))
    sys.stderr.write("%s landscapes.\n" % (len(landscape_gwr_ls)))

    centralLocus = SNPPassingData(
        chromosome=associationLocus.chromosome,
        position=region_start,
        snps_id=associationLocus.id,
        start=region_start,
        stop=region_stop,
        fileNamePrefix="",
    )

    LD_info = None
    gene_annotation = DrawSNPRegion.dealWithGeneAnnotation(gene_annotation_pickleFname)

    snp_info = None
    if snpInfoPickleFname:
        snp_info = db_250k.dealWithSNPInfo(snpInfoPickleFname, locus_type_id=locus_type_id)  # 2012.3.8

    candidate_gene_set = set()
    if list_type_id_list:
        for list_type_id in list_type_id_list:
            candidate_gene_set.update(db_250k.getGeneList(list_type_id))

    snpData = None
    phenData = None
    ecotype_info = None
    if snp_matrix_fname and phenotype_fname:
        # 2009-3-23 data type 3 is the CNV amplitude file, which holds floats
        matrix_data_type = float if snp_matrix_data_type == 3 else int
        snpData = SNPData(
            input_fname=snp_matrix_fname,
            turn_into_integer=1,
            turn_into_array=1,
            ignore_2nd_column=1,
            matrix_data_type=matrix_data_type,
        )
        if snpData.data_matrix is None:
            sys.stderr.write("Error. snpData.data_matrix is None.\n")
            sys.exit(3)

        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            phenotype_fname, turn_into_integer=0
        )
        # row label is that of the SNP matrix, because the phenotype matrix is
        # gonna be re-ordered in that way
        phenData = SNPData(
            header=header_phen,
            strain_acc_list=snpData.strain_acc_list,
            data_matrix=data_matrix_phen,
        )
        # tricky, using strain_acc_list_phen
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            snpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix
        )

        # 2008-12-05 fake a snp_info for findSNPsInRegion
        DrawSNPRegion.construct_chr_pos2index_forSNPData(snpData, snp_info=snp_info)
        ecotype_info = getEcotypeInfo(db_250k)

    return PassingData(
        associationLocus=associationLocus,
        associationLandscapeType=associationLandscapeType,
        landscape_gwr_ls=landscape_gwr_ls,
        gene_annotation=gene_annotation,
        snp_info=snp_info,
        LD_info=LD_info,
        candidate_gene_set=candidate_gene_set,
        snpData=snpData,
        phenData=phenData,
        ecotype_info=ecotype_info,
        centralLocus=centralLocus,
    )
def run(self):
    """
    MPI entry point: distribute the SNP matrix and run the per-block computation.

    Node roles, by rank:
        0                   reads self.input_fname, sends the pickled SNPData to
                            every computing node and then feeds them parameters
                            from generate_params()
        1 .. size-2         computing nodes; receive the SNPData and run
                            computing_node() with computing_node_handler
        size-1 (last node)  output node; writes tab-delimited rows to
                            self.output_fname when that attribute is set
    The output file is closed once output_node() returns, also on error.
    """
    self.communicator = MPI.world.duplicate()
    node_rank = self.communicator.rank
    free_computing_nodes = range(1, self.communicator.size - 1)  # exclude the 1st and last node
    free_computing_node_set = Set(free_computing_nodes)
    output_node_rank = self.communicator.size - 1

    if node_rank == 0:
        header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
        # category_list is not used to facilitate row-id matching
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix)
        snpData_pickle = cPickle.dumps(snpData, -1)
        for node in free_computing_nodes:  # send it to the computing_node
            sys.stderr.write("passing initial data to nodes from %s to %s ... " % (node_rank, node))
            self.communicator.send(snpData_pickle, node, 0)
            sys.stderr.write(".\n")
        del snpData_pickle
        params_ls = self.generate_params(len(snpData.col_id_ls), self.block_size)
        del snpData  # the input node only hands out parameters from here on
    elif node_rank in free_computing_node_set:
        data, source, tag = self.communicator.receiveString(0, 0)
        snpData = cPickle.loads(data)
        del data
    # the output node needs no initial data

    self.synchronize()

    if node_rank == 0:
        param_obj = PassingData(
            params_ls=params_ls,
            output_node_rank=output_node_rank,
            report=self.report,
            counter=0,
        )
        self.inputNode(param_obj, free_computing_nodes, param_generator=params_ls)
    elif node_rank in free_computing_node_set:
        computing_parameter_obj = PassingData(
            snpData=snpData,
            min_LD_to_output=self.min_LD_to_output,
            min_MAF=self.min_MAF,
            discard_perc=self.discard_perc,
        )
        self.computing_node(computing_parameter_obj, self.computing_node_handler)
    else:
        output_f = None
        writer = None
        if getattr(self, 'output_fname', None):
            output_f = open(self.output_fname, 'w')
            writer = csv.writer(output_f, delimiter='\t')
        try:
            # is_header_written=False: presumably the handler emits the header
            # with the first result -- confirm in output_node_handler
            param_obj = PassingData(writer=writer, is_header_written=False)
            self.output_node(free_computing_nodes, param_obj, self.output_node_handler)
        finally:
            del writer
            if output_f is not None:
                output_f.close()

    self.synchronize()  # to avoid some node early exits
def run(self):
    """
    Scatter-plot one phenotype against another and save the figure.

    Reads self.phenotype_fname, picks the columns of self.phenotype_method_id1
    (x axis) and self.phenotype_method_id2 (y axis), keeps the rows in which
    neither value is NaN, and writes '<output_fname_prefix>.png' (dpi=400) and
    '<output_fname_prefix>.svg'. Axis labels are the short names of the two
    phenotype methods fetched from the database.

    A green line spanning the value range shared by both phenotypes marks
    perfect correlation; it is skipped when no row has both values.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    db = Stock_250kDB.Stock_250kDB(
        drivername=self.drivername,
        username=self.db_user,
        password=self.db_passwd,
        hostname=self.hostname,
        database=self.dbname,
        schema=self.schema,
    )
    db.setup(create_tables=False)
    session = db.session  # not used below; kept in case the attribute access matters -- TODO confirm

    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
        self.phenotype_fname, turn_into_integer=0
    )
    # row labels are the phenotype file's own accessions (no SNP matrix here)
    phenData = SNPData(
        header=header_phen,
        strain_acc_list=strain_acc_list_phen,
        data_matrix=data_matrix_phen,
    )
    # Both orderings passed here come from the phenotype file itself, so no
    # accession changes position. NOTE(review): the call presumably also
    # normalises the matrix (e.g. NaN-capable array, as the indexing below
    # requires) -- confirm.
    phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
        phenData.row_id_ls, strain_acc_list_phen, phenData.data_matrix
    )

    phenotype_col_index1 = self.findOutWhichPhenotypeColumn(phenData, Set([self.phenotype_method_id1]))[0]
    phenotype_col_index2 = self.findOutWhichPhenotypeColumn(phenData, Set([self.phenotype_method_id2]))[0]

    # keep only the accessions that have both phenotypes
    x_ls = []
    y_ls = []
    for i in range(phenData.data_matrix.shape[0]):
        x_value = phenData.data_matrix[i][phenotype_col_index1]
        y_value = phenData.data_matrix[i][phenotype_col_index2]
        if not numpy.isnan(x_value) and not numpy.isnan(y_value):
            x_ls.append(x_value)
            y_ls.append(y_value)

    pylab.clf()
    pylab.title('Phenotype Contrast')
    pylab.plot(x_ls, y_ls, '.', alpha=0.6)
    pylab.grid(alpha=0.3)

    phenotype_method1 = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id1)
    phenotype_method2 = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id2)
    pylab.xlabel(phenotype_method1.short_name)
    pylab.ylabel(phenotype_method2.short_name)

    # draw diagonal line to show perfect correlation.
    # min()/max() raise ValueError on empty lists, so only draw it when at least
    # one accession has both phenotypes; the (empty) figure is still saved.
    if x_ls:
        max_min_value = max(min(x_ls), min(y_ls))
        min_max_value = min(max(x_ls), max(y_ls))
        pylab.plot(
            [max_min_value, min_max_value],
            [max_min_value, min_max_value],
            c='g',
            alpha=0.7,
        )

    png_output_fname = '%s.png' % self.output_fname_prefix
    pylab.savefig(png_output_fname, dpi=400)
    pylab.savefig('%s.svg' % self.output_fname_prefix)
def prepareTwoSNPData(self, db, max_mismatch_rate=0.25, min_no_of_non_NA_pairs=40, report=0):
    """
    Build the TwoSNPData object that pairs the 149SNP data with the comparison data.

    2009-9-23 add arguments max_mismatch_rate & min_no_of_non_NA_pairs, and pass
        them to twoSNPData. However it's useless to control what should be
        inserted into db because TwoSNPData.qc_cross_match_table is not defined
        and even if it's defined, the table it'll create doesn't concord to the
        one in 149SNP db.
    2008-09-10 if self.input_fname is given, get 149SNP data from it, instead of
        database
    2008-8-28 split out of run() so that MpiQC149CrossMatch could call this easily

    db: database object handed to get_data_matrix() (only used when
        self.input_fname is not given).
    max_mismatch_rate, min_no_of_non_NA_pairs, report: forwarded to TwoSNPData.

    The first dataset is read from self.input_fname or, failing that, from the
    database. The second one is the first itself when self.QC_method_id is 4,
    otherwise the file resolved by findOutCmpDataFilename().
    A MySQL connection is opened only for the database path and is closed as
    soon as the SNP index has been fetched.
    """
    if self.input_fname:
        header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
    else:
        import MySQLdb
        from dbSNP2data import dbSNP2data
        conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.db_user, passwd=self.db_passwd)
        try:
            curs = conn.cursor()
            snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
                curs, StockDB.Calls.table.name, StockDB.SNPs.table.name
            )
        finally:
            conn.close()
        strain_info_data = self.get_strain_id_info(self.QC_method_id, ignore_strains_with_qc=False)
        data_matrix = self.get_data_matrix(
            db, strain_info_data.strain_id2index, snp_id2index, StockDB.Calls.table.name
        )
        # tg_ecotypeid
        strain_acc_list = [
            strain_info_data.strain_id2acc[strain_id] for strain_id in strain_info_data.strain_id_list
        ]
        # strainid
        category_list = [
            strain_info_data.strain_id2category[strain_id] for strain_id in strain_info_data.strain_id_list
        ]
        header = ['ecotypeid', 'strainid']
        for snp_id in snp_id_list:
            snp_name, chromosome, position = snp_id2info[snp_id]
            header.append(snp_name)

    snpData1 = SNPData(
        header=header,
        strain_acc_list=strain_acc_list,
        category_list=category_list,
        data_matrix=data_matrix,
        snps_table='stock.snps',  # the snps table of the stock (149SNP) database
    )

    if self.QC_method_id == 4:
        snpData2 = snpData1
    else:
        self.cmp_data_filename = self.findOutCmpDataFilename(
            self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod
        )
        header, strain_acc_list, category_list, data_matrix = read_data(self.cmp_data_filename)
        # it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
        strain_acc_list = map(int, strain_acc_list)
        # category_list is not used to facilitate row-id matching
        snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix)

    twoSNPData = TwoSNPData(
        SNPData1=snpData1,
        SNPData2=snpData2,
        QC_method_id=self.QC_method_id,
        user=self.db_user,
        row_matching_by_which_value=0,
        debug=self.debug,
        max_mismatch_rate=max_mismatch_rate,
        min_no_of_non_NA_pairs=min_no_of_non_NA_pairs,
        report=report,
    )
    return twoSNPData
def run(self):
    """
    2008-09-06
    MPI entry point for the gene-based computation.

    With self.debug set, the data preparation (read SNP and phenotype data,
    load the SNP-context pickle, pickle everything) is exercised on a single
    node and the program exits with status 2 without touching MPI.

    Otherwise, node roles by rank:
        0                   inputNodePrepare() builds the data; sends the output
                            node its pickle, sends every computing node the SNP
                            data and the auxiliary data, then hands out
                            parameters
        1 .. size-2         computing nodes
        size-1 (last node)  output node, runs general_output_node()
    """
    if self.debug:
        # for one-node testing purpose
        import pdb
        pdb.set_trace()
        header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
        # category_list is not used to facilitate row-id matching
        snpData = SNPData(
            header=header,
            strain_acc_list=strain_acc_list,
            data_matrix=data_matrix,
            turn_into_array=1,
        )

        # NOTE(review): unpickling executes arbitrary code; snps_context_fname
        # must come from a trusted source.
        picklef = open(self.snps_context_fname)
        try:
            snps_context_wrapper = cPickle.load(picklef)
        finally:
            picklef.close()
        gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
        gene_id_ls = sorted(gene_id2snps_id_ls.keys())

        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            self.phenotype_fname, turn_into_integer=0
        )
        # row label is that of the SNP matrix, because the phenotype matrix is
        # gonna be re-ordered in that way
        phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list, data_matrix=data_matrix_phen)
        # tricky, using strain_acc_list_phen: phenData.row_id_ls already carries
        # the SNP matrix labels, so the phenotype file's own row order has to be
        # passed explicitly.
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            snpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix
        )

        other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls, gene_id_ls=gene_id_ls, phenData=phenData)
        # the pickles are not used further; dumping them checks that the
        # structures can be serialised
        other_data_pickle = cPickle.dumps(other_data, -1)
        phenotype_label_ls_pickle = cPickle.dumps(phenData.col_id_ls, -1)
        snpData_pickle = cPickle.dumps(snpData, -1)
        sys.exit(2)

    self.communicator = MPI.world.duplicate()
    node_rank = self.communicator.rank
    free_computing_nodes = range(1, self.communicator.size - 1)  # exclude the 1st and last node
    free_computing_node_set = Set(free_computing_nodes)
    output_node_rank = self.communicator.size - 1

    if node_rank == 0:
        dstruc = self.inputNodePrepare()
        params_ls = dstruc.params_ls
        # send the output node the phenotype_label_ls
        self.communicator.send(dstruc.output_node_data_pickle, output_node_rank, 0)
        del dstruc.output_node_data_pickle

        for node in free_computing_nodes:  # send it to the computing_node
            sys.stderr.write("passing initial data to nodes from %s to %s ... " % (node_rank, node))
            self.communicator.send(dstruc.snpData_pickle, node, 0)
            self.communicator.send(dstruc.other_data_pickle, node, 0)
            sys.stderr.write(".\n")
        del dstruc
    elif node_rank in free_computing_node_set:
        # two messages, in the order the input node sends them
        data, source, tag = self.communicator.receiveString(0, 0)
        snpData = cPickle.loads(data)
        del data
        data, source, tag = self.communicator.receiveString(0, 0)
        other_data = cPickle.loads(data)
        del data
        self.phenotype_index_ls = other_data.phenotype_index_ls
    else:
        data, source, tag = self.communicator.receiveString(0, 0)
        output_node_data = cPickle.loads(data)
        phenotype_label_ls = output_node_data.phenotype_label_ls
        self.phenotype_index_ls = output_node_data.phenotype_index_ls

    self.synchronize()

    if node_rank == 0:
        param_obj = PassingData(
            params_ls=params_ls,
            output_node_rank=output_node_rank,
            report=self.report,
            counter=0,
        )
        self.inputNode(param_obj, free_computing_nodes, param_generator=params_ls)
    elif node_rank in free_computing_node_set:
        computing_parameter_obj = PassingData(
            snpData=snpData,
            gene_id_ls=other_data.gene_id_ls,
            gene_id2snps_id_ls=other_data.gene_id2snps_id_ls,
            phenData=other_data.phenData,
            phenotype_index_ls=self.phenotype_index_ls,
            min_data_point=self.min_data_point,
            test_type=self.test_type,
        )
        self.computing_node(computing_parameter_obj, self.computing_node_handler)
    else:
        self.general_output_node(self.output_dir, self.phenotype_index_ls, phenotype_label_ls, free_computing_nodes)

    self.synchronize()  # to avoid some node early exits
def run(self):
    """
    Compare two SNP datasets (or one dataset against itself) and write mismatch rates.

    Input formats (self.input_fname1_format / self.input_fname2_format):
        1: strain x SNP matrix, loaded with read_data()
        2: CSV parsed by dataParsers.parseCSVData() without array ids
        3: same parser, with array ids (first file only)
    An unsupported format is reported on stderr and exits with status 2.

    self.run_type:
        1: row-wise and column-wise mismatch rates of file 1 vs file 2, both
            written to self.output_fname
        2: file 1 vs itself, column-wise, for every ordered pair of rows sharing
            an ecotype id but differing in array id; one output file per pair
        3: column-wise mismatch of file 1 vs file 2 for the rows of file 1 whose
            ecotype id is in self.ecotype_id_ls; one output file per row
            (exits with status 3 when self.ecotype_id_ls is empty)
    Any other run_type loads the data and writes nothing.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # to check whether two input file are in different orientation
    file_format_ls = [self.input_fname1_format, self.input_fname2_format]
    only_one_strain_by_snp_file = file_format_ls.count(1) == 1
    # 2008-05-15 TwoSNPData can handle character matrix/2D-list. but
    # transposeSNPData needs numeric matrix to transpose except when numpy is
    # installed. So when exactly one file is strain x SNP (the other one gets
    # transposed), have the parser turn nucleotides into numbers.
    if only_one_strain_by_snp_file:
        use_nt2number = 1
    else:
        use_nt2number = 0

    if self.input_fname1_format == 1:
        header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname1)
        snpData1 = SNPData(
            header=header,
            strain_acc_list=strain_acc_list,
            category_list=category_list,
            data_matrix=data_matrix,
        )
    elif self.input_fname1_format == 2:
        snpsd_ls = dataParsers.parseCSVData(self.input_fname1, withArrayIds=False, use_nt2number=use_nt2number)
        # already nt in number
        snpData1 = RawSnpsData_ls2SNPData(snpsd_ls, report=self.report, use_nt2number=0)
        del snpsd_ls
    elif self.input_fname1_format == 3:
        snpsd_ls = dataParsers.parseCSVData(self.input_fname1, withArrayIds=True, use_nt2number=use_nt2number)
        snpData1 = RawSnpsData_ls2SNPData(snpsd_ls, report=self.report, use_nt2number=0)
        del snpsd_ls
    else:
        sys.stderr.write('Error: unsupported input_fname1 format, %s\n' % self.input_fname1_format)
        sys.exit(2)

    if self.run_type != 2:
        if self.input_fname2_format == 1:
            header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname2)
            snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix)
        elif self.input_fname2_format == 2:
            snpsd_ls = dataParsers.parseCSVData(self.input_fname2, withArrayIds=False, use_nt2number=use_nt2number)
            snpData2 = RawSnpsData_ls2SNPData(snpsd_ls, report=self.report, use_nt2number=0)
            del snpsd_ls
        else:
            sys.stderr.write('Error: unsupported input_fname2 format, %s\n' % self.input_fname2_format)
            sys.exit(2)

        if only_one_strain_by_snp_file:
            # there's one and only one strain x snp format. transpose the 2nd snpData
            snpData2 = transposeSNPData(snpData2, report=self.report)

        if self.input_fname1_format == 1:
            # row_id for the 1st file = (ecotype_id, duplicate). for 2nd file, row_id=ecotype_id.
            row_matching_by_which_value = 0
            col_matching_by_which_value = None
        elif self.input_fname1_format == 2:
            # col_id for the 1st file = accession. for 2nd file, col_id=accession.
            row_matching_by_which_value = None
            col_matching_by_which_value = None
        elif self.input_fname1_format == 3:
            # col_id for the 1st file = (array_id, accession). for 2nd file, col_id=accession.
            row_matching_by_which_value = None
            col_matching_by_which_value = 1
    else:
        # 2008-10-12 pairwise mismatch between same data
        snpData2 = snpData1
        row_matching_by_which_value = None
        col_matching_by_which_value = None

    twoSNPData = TwoSNPData(
        SNPData1=snpData1,
        SNPData2=snpData2,
        row_matching_by_which_value=row_matching_by_which_value,
        col_matching_by_which_value=col_matching_by_which_value,
        debug=self.debug,
    )

    if self.run_type == 3:
        # 2008-10-12 compare snpData1 and snpData2 only for designated entries from snpData1
        if not self.ecotype_id_ls:
            sys.stderr.write(
                "Run_type %s: ecotype_id_ls (%s) is not specified.\n" % (self.run_type, self.ecotype_id_ls)
            )
            sys.exit(3)
        ecotype_id_set = Set(self.ecotype_id_ls)
        row_id_ls = []  # rows of snpData1 to test against
        for row_id in snpData1.row_id_ls:
            # a row id is either a bare ecotype id or a tuple starting with it
            if not isinstance(row_id, str) and hasattr(row_id, '__len__'):
                ecotype_id = row_id[0]
            else:
                ecotype_id = row_id
            if ecotype_id in ecotype_id_set:
                row_id_ls.append(row_id)
        print('%s arrays' % (len(row_id_ls)))
        for row_id in row_id_ls:
            col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise(row_id=row_id)
            if col_id2NA_mismatch_rate:
                if not isinstance(row_id, str) and hasattr(row_id, '__len__'):
                    # str() each part: the ids are not necessarily strings
                    row_id_name = '_'.join(map(str, row_id))
                else:
                    row_id_name = row_id
                output_fname = '%s_%s' % (self.output_fname, row_id_name)
                twoSNPData.output_col_id2NA_mismatch_rate_InGWRFormat(col_id2NA_mismatch_rate, output_fname)
    elif self.run_type == 2:
        # 2008-10-12 column-wise mismatch of snpData1 vs snpData1 between rows
        # with same ecotype_id but different array_id
        row_id_pair_set = Set()
        for row_id in snpData1.row_id_ls:
            if not isinstance(row_id, str) and hasattr(row_id, '__len__'):
                ecotype_id = row_id[0]
            else:
                ecotype_id = row_id
            for row_id2 in snpData2.row_id_ls:
                # same ecotype_id but different array_id
                if row_id2[0] == ecotype_id and row_id2[1] != row_id[1]:
                    row_id_pair_set.add((row_id, row_id2))
        print('%s arrays' % (len(row_id_pair_set)))
        for row_id1, row_id2 in row_id_pair_set:
            row_id12row_id2 = {row_id1: row_id2}
            col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise(row_id=row_id1, row_id12row_id2=row_id12row_id2)
            if col_id2NA_mismatch_rate:
                output_fname = '%s_%s_vs_%s' % (
                    self.output_fname,
                    '_'.join(map(str, row_id1)),
                    '_'.join(map(str, row_id2)),
                )
                twoSNPData.output_col_id2NA_mismatch_rate_InGWRFormat(col_id2NA_mismatch_rate, output_fname)
    elif self.run_type == 1:
        row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
        col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise()
        # both results go into the same file: file_1st_open=1 for the first
        # call, 0 for the second
        if row_id2NA_mismatch_rate:
            QC_250k.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname, file_1st_open=1)
        if col_id2NA_mismatch_rate:
            QC_250k.output_row_id2NA_mismatch_rate(col_id2NA_mismatch_rate, self.output_fname, file_1st_open=0)
def run(self):
    """
    QC the 250k array calls against a comparison dataset and optionally store the result.

    2008-04-25 return None if QC_method_id==0
    2008-04-20 for plone to call it just to get row_id2NA_mismatch_rate

    Flow:
        * QC_method_id == 0: only cal_independent_NA_rate() is run.
        * otherwise the comparison data (self.cmp_data_filename, looked up via
          findOutCmpDataFilename()) is matched against either the call files
          found through get_call_info_id2fname() or, when self.input_dir is a
          file, the SNP-by-strain matrix in that file.
        * self.run_type: 1 = accession-wise QC, 2 = SNP-wise QC; anything else
          exits with status 5 (checked only when arrays are looked up in the db).
        * when no array is available and there is no input file to fall back on,
          "No good arrays." is reported and nothing is compared.
    The mismatch rates are written to self.output_fname and submitted to the
    database for run_type 1 only; the transaction is committed when self.commit
    is set and rolled back otherwise.

    The result is left in self.row_id2NA_mismatch_rate (None for method 0).
    """
    # database connection and etc
    db = Stock_250kDB.Stock_250kDB(
        drivername=self.drivername,
        username=self.user,
        password=self.passwd,
        hostname=self.hostname,
        database=self.dbname,
    )
    db.setup()
    session = db.session
    session.begin()

    self.cmp_data_filename = self.findOutCmpDataFilename(
        self.cmp_data_filename, self.QC_method_id, self.QCMethod_class
    )
    qm = self.QCMethod_class.query.get(self.QC_method_id)  # 2009-5-20

    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
    curs = conn.cursor()
    self.curs = curs

    if self.debug:
        import pdb
        pdb.set_trace()

    readme = formReadmeObj(sys.argv, self.ad, Stock_250kDB.README)
    session.save(readme)

    QC_method_id2snps_table = self.QC_method_id2snps_table

    if self.QC_method_id == 0:
        self.cal_independent_NA_rate(db, self.min_probability, readme)
        row_id2NA_mismatch_rate = None
    else:
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.cmp_data_filename, ignore_het=qm.ignore_het
        )
        # it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
        strain_acc_list = map(int, strain_acc_list)
        # category_list is not used. 05/20/09 ignore_het is useless cuz data_matrix is provided.
        snpData2 = SNPData(
            header=header,
            strain_acc_list=strain_acc_list,
            data_matrix=data_matrix,
            snps_table=QC_method_id2snps_table.get(self.QC_method_id),
            ignore_het=qm.ignore_het,
        )

        if self.input_dir and os.path.isfile(self.input_dir):  # it's file
            call_info_id2fname = None
        else:
            if self.run_type == 2:  # no filtering on call_info entries that have been QCed.
                filter_calls_QCed = 0
            elif self.run_type == 1:
                filter_calls_QCed = 1
                self.max_call_info_mismatch_rate = 1  # don't use this when doing accession-wise QC
            else:
                sys.stderr.write("run_type=%s is not supported.\n" % self.run_type)
                sys.exit(5)
            call_data = self.get_call_info_id2fname(
                db,
                self.QC_method_id,
                self.call_method_id,
                filter_calls_QCed,
                self.max_call_info_mismatch_rate,
                self.debug,
                min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs,
                input_dir=self.input_dir,
            )
            call_info_id2fname = call_data.call_info_id2fname

        if self.run_type == 2:
            snps_name2snps_id = self.get_snps_name2snps_id(db)
        else:
            snps_name2snps_id = None

        if call_info_id2fname:
            if self.one_by_one and self.run_type == 1:  # one_by_one only for QC by accession
                row_id2NA_mismatch_rate = {}
                row_id12row_id2 = {}
                counter = 0
                for call_info_id, value in call_info_id2fname.iteritems():
                    counter += 1
                    print("No %s" % counter)
                    tmp_dict = {call_info_id: value}
                    # 05/20/09 no need for qm.ignore_het because 250k is all homozygous
                    pdata = self.read_call_matrix(tmp_dict, self.min_probability)
                    passingdata = self.qcDataMatrixVSsnpData(
                        pdata, snps_name2snps_id, snpData2, curs, session, readme
                    )
                    row_id2NA_mismatch_rate.update(passingdata.row_id2NA_mismatch_rate)
                    row_id12row_id2.update(passingdata.row_id12row_id2)
                    del pdata
                    if self.debug and counter == 10:  # debugging: stop after 10 arrays
                        break
            else:
                # 05/20/09 no need for qm.ignore_het because 250k is all homozygous
                pdata = self.read_call_matrix(call_info_id2fname, self.min_probability)
                passingdata = self.qcDataMatrixVSsnpData(pdata, snps_name2snps_id, snpData2, curs, session, readme)
                row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
                row_id12row_id2 = passingdata.row_id12row_id2
                del pdata
        elif self.input_dir:
            # input file is SNP by strain format. double header (1st two lines)
            header, snps_name_ls, category_list, data_matrix = read_data(
                self.input_dir, double_header=1, ignore_het=qm.ignore_het
            )
            pdata = PassingData()
            pdata.ecotype_id_ls = header[0][2:]
            pdata.call_info_id_ls = header[1][2:]
            data_matrix = numpy.array(data_matrix)
            pdata.data_matrix = data_matrix.transpose()
            pdata.header = ["", ""] + snps_name_ls  # fake a header for SNPData
            passingdata = self.qcDataMatrixVSsnpData(pdata, snps_name2snps_id, snpData2, curs, session, readme)
            row_id2NA_mismatch_rate = passingdata.row_id2NA_mismatch_rate
            row_id12row_id2 = passingdata.row_id12row_id2
            del pdata
        else:
            # neither arrays from the db nor an input file: nothing to compare
            # (same handling as plone_run) instead of reading a None filename.
            sys.stderr.write("No good arrays.\n")
            row_id2NA_mismatch_rate = None
            row_id12row_id2 = None

    if self.output_fname and self.run_type == 1 and row_id2NA_mismatch_rate:
        self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname)

    if self.run_type == 1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
        # if self.input_dir is given, no db submission. call_info_id2fname here is
        # fake, it's actually keyed by (array_id, ecotypeid)
        # row_id2NA_mismatch_rate might be None if it's method 0.
        self.submit_to_call_QC(
            session,
            row_id2NA_mismatch_rate,
            self.QC_method_id,
            self.user,
            self.min_probability,
            row_id12row_id2,
            self.call_method_id,
            readme,
        )

    if self.commit:
        curs.execute("commit")
        session.commit()
    else:
        session.rollback()

    self.row_id2NA_mismatch_rate = row_id2NA_mismatch_rate  # for plone to get the data structure
def run(self):
    """
    Compare the calls held in the stock database against the comparison
    dataset in self.cmp_data_filename and, for run_type 1, output/submit the
    per-row NA & mismatch rates.

    Outline:
        1. Reject an unsupported self.run_type up front (exit status 5).
        2. Open the StockDB session and begin a transaction.
        3. Read the comparison file into snpData2 (row ids cast to int).
        4. Build snpData1 from the database via get_strain_id_info() and
           get_data_matrix().
        5. run_type 1: row-wise comparison through TwoSNPData.cmp_row_wise().
           run_type 2: nothing is computed (column-wise saving is not
           implemented for this dataset); an empty dict is used.
        6. Optionally write the result to self.output_fname and submit it
           with submit_to_call_QC().
        7. Commit the session if self.commit is set, otherwise roll it back.

    The raw MySQLdb connection opened for the cursor-based helpers is always
    closed before returning, including when an exception or sys.exit()
    interrupts the run.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # Fail fast: no point in loading the comparison file and the whole call
    # matrix just to find out that the run type is unsupported.
    run_type = self.run_type
    if run_type not in (1, 2):
        sys.stderr.write("run_type=%s is not supported.\n" % self.run_type)
        sys.exit(5)

    db = StockDB.StockDB(drivername=self.drivername, username=self.db_user,
                         password=self.db_passwd, hostname=self.hostname,
                         database=self.dbname)
    db.setup(create_tables=False)
    session = db.session
    session.begin()

    self.cmp_data_filename = self.findOutCmpDataFilename(
        self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod)
    header, strain_acc_list, category_list, data_matrix = read_data(self.cmp_data_filename)
    # it's ecotypeid, cast it to integer to be compatible to the later
    # ecotype_id_ls from db. A list comprehension (rather than map) keeps
    # this a real list on every Python version.
    strain_acc_list = [int(strain_acc) for strain_acc in strain_acc_list]
    # category_list is not used.
    snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list,
                       data_matrix=data_matrix)

    readme = formReadmeObj(sys.argv, self.ad, StockDB.README)
    session.save(readme)

    import MySQLdb
    from dbSNP2data import dbSNP2data
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname,
                           user=self.db_user, passwd=self.db_passwd)
    try:
        curs = conn.cursor()
        snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
            curs, StockDB.Calls.table.name, StockDB.SNPs.table.name)
        strain_info_data = self.get_strain_id_info(self.QC_method_id)
        data_matrix = self.get_data_matrix(
            db, strain_info_data.strain_id2index, snp_id2index, StockDB.Calls.table.name)
        strain_acc_list = [strain_info_data.strain_id2acc[strain_id]
                           for strain_id in strain_info_data.strain_id_list]
        category_list = [strain_info_data.strain_id2category[strain_id]
                         for strain_id in strain_info_data.strain_id_list]

        # The first two header columns label the row ids; the rest are SNP names
        # in snp_id_list order.
        header = ['ecotypeid', 'strainid']
        for snp_id in snp_id_list:
            snp_name, _chromosome, _position = snp_id2info[snp_id]
            header.append(snp_name)

        # NOTE(review): the original comment read "snps_table is set to the
        # stock_250k snps_table" although the value passed is 'stock.snps' -- confirm.
        snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list,
                           category_list=category_list, data_matrix=data_matrix,
                           snps_table='stock.snps')

        twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, curs=curs,
                                QC_method_id=self.QC_method_id, user=self.db_user,
                                row_matching_by_which_value=0, debug=self.debug)

        if run_type == 1:
            row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
        else:
            # run_type == 2
            # twoSNPData.save_col_wise(session, readme)
            # 2008-08-18 need to implement a new one for 149SNP
            row_id2NA_mismatch_rate = {}

        if self.output_fname and run_type == 1 and row_id2NA_mismatch_rate:
            self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname)

        if run_type == 1 and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
            # if self.input_dir is given, no db submission. call_info_id2fname here
            # is fake, it's actually keyed by (array_id, ecotypeid)
            # row_id2NA_mismatch_rate might be None if it's method 0.
            self.submit_to_call_QC(session, row_id2NA_mismatch_rate, self.QC_method_id,
                                   self.db_user, twoSNPData.row_id12row_id2, readme)

        if self.commit:
            session.commit()
        else:
            session.rollback()
    finally:
        # The raw connection only backs the cursor handed to the helpers above;
        # release it no matter how the run ended.
        conn.close()