def removeUnPhenotypedSNPData(clf, snpData, header_phen, strain_acc_list_phen, data_matrix_phen,
                              phenotype_method_id_ls):
    """
    2010-2-25
        remove un-phenotyped ecotypes from the SNP data in order to keep the snp dataset small

    An ecotype counts as phenotyped when at least one of the selected
    phenotype columns holds something other than the raw string "NA".
    The selected columns are those matching phenotype_method_id_ls, or every
    column when that list is empty/None.

    Returns the SNPData produced by SNPData.keepRowsByRowID(), restricted to
    rows whose ecotype id (first element of the row id) is phenotyped.
    """
    sys.stderr.write("Removing un-phenotyped ecotypes from the SNP data ...")
    phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen, data_matrix=data_matrix_phen)

    if not phenotype_method_id_ls:
        # nothing requested explicitly: look at all phenotypes
        which_phenotype_ls = range(len(phenData.col_id_ls))
    else:
        which_phenotype_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(phenData, Set(phenotype_method_id_ls))

    # collect ecotypes having a value in at least one selected phenotype column
    phenotyped_ecotype_id_set = set()
    for row_index, ecotype_id in enumerate(phenData.row_id_ls):
        # 2010-2-25 phenotype values are in raw string, hence the "NA" comparison
        if any(phenData.data_matrix[row_index][col_index] != "NA" for col_index in which_phenotype_ls):
            phenotyped_ecotype_id_set.add(ecotype_id)

    no_of_ecotypes_in_total = len(snpData.row_id_ls)
    # 1st element of a SNP row id is ecotype_id, 2nd is array id
    row_ids_to_be_kept = set(
        row_id for row_id in snpData.row_id_ls if row_id[0] in phenotyped_ecotype_id_set
    )
    snpData = SNPData.keepRowsByRowID(snpData, row_ids_to_be_kept)

    sys.stderr.write("%s removed. Done.\n" % (no_of_ecotypes_in_total - len(row_ids_to_be_kept)))
    return snpData
def readInData(cls, phenotype_fname, input_fname, eigen_vector_fname, phenotype_method_id_ls,
               test_type=1, report=0):
    """
    2010-2-25
        call removeUnPhenotypedSNPData() to shrink the snp dataset by removing un-phenotyped ecotypes
    2009-3-20
        refactored out of run(), easy for MpiAssociation.py to call

    Read the SNP matrix (input_fname) and the phenotype matrix
    (phenotype_fname), drop un-phenotyped ecotypes from the SNP data, convert
    alleles to indices and put the phenotype matrix into SNP row order.

    Principal components are taken from eigen_vector_fname when given;
    otherwise they are computed by pca_module.PCA_svd() when test_type is 4,
    and are None for any other test_type.

    Returns a PassingData with attributes snpData (the allele-index
    converted data), phenData, PC_matrix, which_phenotype_ls and
    phenotype_method_id_ls.
    """
    header, strain_acc_list, category_list, data_matrix = read_data(input_fname)
    snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,
                      data_matrix=data_matrix)

    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
        phenotype_fname, turn_into_integer=0)

    snpData = cls.removeUnPhenotypedSNPData(snpData, header_phen, strain_acc_list_phen, data_matrix_phen,
                                            phenotype_method_id_ls)

    # 0 (NA) or -2 (untouched) is all converted to -2 as 0 is used to denote allele
    newSnpData, allele2index_ls = snpData.convertSNPAllele2Index(report)
    newSnpData.header = snpData.header

    # The phenotype matrix has to follow the rows of the *filtered* SNP data.
    # Ordering it by the strain_acc_list read from the file (pre-filter) leaves
    # it with more rows than, and misaligned against, the SNP matrix as soon as
    # one ecotype has been removed. The 1st element of each SNP row id is the
    # ecotype id (same convention removeUnPhenotypedSNPData() relies on).
    filtered_ecotype_id_ls = [row_id[0] for row_id in snpData.row_id_ls]
    data_matrix_phen = cls.get_phenotype_matrix_in_data_matrix_order(
        filtered_ecotype_id_ls, strain_acc_list_phen, data_matrix_phen)
    phenData = SNPData(header=header_phen, strain_acc_list=snpData.strain_acc_list, data_matrix=data_matrix_phen)

    if eigen_vector_fname:
        PC_data = cls.getPCFromFile(eigen_vector_fname)
        PC_matrix = PC_data.PC_matrix
    elif test_type == 4:
        # eigen_vector_fname not given for this test_type. calculate PCs.
        import pca_module
        T, P, explained_var = pca_module.PCA_svd(newSnpData.data_matrix, standardize=False)
        PC_matrix = T
    else:
        PC_matrix = None

    # the unconverted SNP data is no longer needed; drop the reference early
    del snpData

    if phenotype_method_id_ls:
        which_phenotype_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(phenData, Set(phenotype_method_id_ls))
    else:
        # if not available, take all phenotypes
        which_phenotype_ls = range(len(phenData.col_id_ls))

    pdata = PassingData(snpData=newSnpData, phenData=phenData, PC_matrix=PC_matrix,
                        which_phenotype_ls=which_phenotype_ls,
                        phenotype_method_id_ls=phenotype_method_id_ls)
    return pdata
def inputNodePrepare(self, snp_info=None):
    """
    2009-2-16
        get phenData.phenotype_method_id_ls in the same order as phenData.col_id_ls
    2009-2-11
        refactored out of run()

    Load the SNP matrix, the pickled SNP context and the phenotype matrix,
    then package everything the other nodes need.

    Returns a PassingData with attributes snpData_pickle, other_data_pickle,
    output_node_data_pickle (all cPickle strings, highest protocol) and
    params_ls (from self.generate_params()).
    """
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
    # category_list is not used to facilitate row-id matching
    snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                      data_matrix=data_matrix, turn_into_array=1)

    # NOTE(review): unpickling can execute arbitrary code; snps_context_fname
    # must come from a trusted source.
    picklef = open(self.snps_context_fname)
    try:
        snps_context_wrapper = cPickle.load(picklef)
    finally:
        # close explicitly instead of relying on 'del' + garbage collection,
        # so the handle is released even if loading fails
        picklef.close()
    gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
    del snps_context_wrapper
    gene_id_ls = sorted(gene_id2snps_id_ls.keys())

    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
        self.phenotype_fname, turn_into_integer=0)
    phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen, data_matrix=data_matrix_phen)
    # re-order the phenotype rows to follow the SNP matrix rows
    phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
        snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
    phenData.phenotype_method_id_ls = get_phenotype_method_id_lsFromPhenData(phenData)  # 2009-2-16

    self.phenotype_index_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
        phenData, Set(self.phenotype_method_id_ls))
    if not self.phenotype_index_ls:
        # no column matched: fall back to all phenotypes
        self.phenotype_index_ls = range(len(phenData.col_id_ls))

    pdata = PassingData(gene_id_ls=gene_id_ls, gene_id2snps_id_ls=gene_id2snps_id_ls,
                        phenotype_index_ls=self.phenotype_index_ls, snp_info=snp_info)
    params_ls = self.generate_params(self.gene_id_fname, pdata, self.block_size)

    other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls, gene_id_ls=pdata.gene_id_ls,
                             phenData=phenData, phenotype_index_ls=self.phenotype_index_ls,
                             snp_info=snp_info)
    other_data_pickle = cPickle.dumps(other_data, -1)
    del other_data

    output_node_data = PassingData(phenotype_label_ls=phenData.col_id_ls,
                                   phenotype_index_ls=self.phenotype_index_ls)
    output_node_data_pickle = cPickle.dumps(output_node_data, -1)

    snpData_pickle = cPickle.dumps(snpData, -1)
    # the pickled copy is all that is needed from here on
    del snpData, data_matrix

    return_data = PassingData(snpData_pickle=snpData_pickle, other_data_pickle=other_data_pickle,
                              output_node_data_pickle=output_node_data_pickle, params_ls=params_ls)
    return return_data
def inputNodePrepare(self, snp_info=None):
    """
    2009-2-16
        get phenData.phenotype_method_id_ls in the same order as phenData.col_id_ls
    2009-2-11
        refactored out of run()

    Load the SNP matrix, the pickled SNP context and the phenotype matrix,
    then package everything the other nodes need.

    Returns a PassingData with attributes snpData_pickle, other_data_pickle,
    output_node_data_pickle (all cPickle strings, highest protocol) and
    params_ls (from self.generate_params()).
    """
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
    # category_list is not used to facilitate row-id matching
    snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                      data_matrix=data_matrix, turn_into_array=1)

    # NOTE(review): unpickling can execute arbitrary code; snps_context_fname
    # must come from a trusted source.
    picklef = open(self.snps_context_fname)
    try:
        snps_context_wrapper = cPickle.load(picklef)
    finally:
        # close explicitly instead of relying on 'del' + garbage collection,
        # so the handle is released even if loading fails
        picklef.close()
    gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
    del snps_context_wrapper
    gene_id_ls = sorted(gene_id2snps_id_ls.keys())

    header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
        self.phenotype_fname, turn_into_integer=0)
    phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen, data_matrix=data_matrix_phen)
    # re-order the phenotype rows to follow the SNP matrix rows
    phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
        snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
    phenData.phenotype_method_id_ls = get_phenotype_method_id_lsFromPhenData(phenData)  # 2009-2-16

    self.phenotype_index_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
        phenData, Set(self.phenotype_method_id_ls))
    if not self.phenotype_index_ls:
        # no column matched: fall back to all phenotypes
        self.phenotype_index_ls = range(len(phenData.col_id_ls))

    pdata = PassingData(gene_id_ls=gene_id_ls, gene_id2snps_id_ls=gene_id2snps_id_ls,
                        phenotype_index_ls=self.phenotype_index_ls, snp_info=snp_info)
    params_ls = self.generate_params(self.gene_id_fname, pdata, self.block_size)

    other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls, gene_id_ls=pdata.gene_id_ls,
                             phenData=phenData, phenotype_index_ls=self.phenotype_index_ls,
                             snp_info=snp_info)
    other_data_pickle = cPickle.dumps(other_data, -1)
    del other_data

    output_node_data = PassingData(phenotype_label_ls=phenData.col_id_ls,
                                   phenotype_index_ls=self.phenotype_index_ls)
    output_node_data_pickle = cPickle.dumps(output_node_data, -1)

    snpData_pickle = cPickle.dumps(snpData, -1)
    # the pickled copy is all that is needed from here on
    del snpData, data_matrix

    return_data = PassingData(snpData_pickle=snpData_pickle, other_data_pickle=other_data_pickle,
                              output_node_data_pickle=output_node_data_pickle, params_ls=params_ls)
    return return_data
def run(self):
    """
    2008-12-02

    Convert the SNP matrix in self.input_fname (and, optionally, one
    phenotype) into text files named after self.output_fname_prefix:

        <prefix>_<phenotype label>.pheno  one phenotype value per line; only
                                          written when both phenotype_fname
                                          and phenotype_method_id are set.
                                          Missing values become 9 (binary
                                          phenotype) or -100.0 (otherwise).
        <prefix>.geno   one line per SNP, one character per accession:
                        allele index 0 -> '0', 1 -> '2', anything else -> '9'
        <prefix>.ind    tab-delimited: accession id, 'U', 'Case'
        <prefix>.snp    tab-delimited: snp id, chromosome, 0.0, position,
                        major allele, minor allele

    SNP ids are expected to look like "<chromosome>_<position>".
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)
    if self.array_id_2nd_column:
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,
                          data_matrix=data_matrix)
    else:
        # ignore category_list
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix)
    newSnpData, allele_index2allele_ls = snpData.convert2Binary(self.report)

    if self.phenotype_fname and self.phenotype_method_id:
        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            self.phenotype_fname, turn_into_integer=0)
        # row label is that of the SNP matrix, because the phenotype matrix is
        # going to be re-ordered in that way
        phenData = SNPData(header=header_phen, strain_acc_list=newSnpData.strain_acc_list,
                           data_matrix=data_matrix_phen)
        # tricky, using strain_acc_list_phen
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            newSnpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)

        phenotype_col_index = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
            phenData, Set([self.phenotype_method_id]))[0]
        phenotype_label = phenData.col_id_ls[phenotype_col_index]
        phenotype_f = open('%s_%s.pheno' % (self.output_fname_prefix, phenotype_label.replace('/', '_')), 'w')
        try:
            for phenotype_value in phenData.data_matrix[:, phenotype_col_index]:
                if self.phenotype_is_binary:
                    # binary and non-binary have different NA designator
                    if numpy.isnan(phenotype_value):
                        phenotype_value = 9
                    else:
                        phenotype_value = int(phenotype_value)
                else:
                    if numpy.isnan(phenotype_value):
                        phenotype_value = -100.0
                phenotype_f.write('%s\n' % phenotype_value)
        finally:
            # close explicitly so the data is flushed even on error
            phenotype_f.close()

    genotype_f = ind_f = snp_f = None
    try:
        genotype_f = open('%s.geno' % self.output_fname_prefix, 'w')
        # keep the file objects behind the csv writers so they can be closed
        ind_f = open('%s.ind' % self.output_fname_prefix, 'w')
        ind_writer = csv.writer(ind_f, delimiter='\t')
        snp_f = open('%s.snp' % self.output_fname_prefix, 'w')
        snp_writer = csv.writer(snp_f, delimiter='\t')

        # transpose it: rows become SNPs, columns become accessions
        newSnpData = transposeSNPData(newSnpData)
        no_of_rows = len(newSnpData.data_matrix)
        no_of_cols = len(newSnpData.data_matrix[0])
        for i in range(no_of_rows):
            snp_id = newSnpData.row_id_ls[i]
            chromosome, pos = snp_id.split('_')
            allele1 = allele_index2allele_ls[i][0]  # major allele
            allele2 = allele_index2allele_ls[i][1]  # minor allele
            snp_writer.writerow([snp_id, chromosome, 0.0, pos, allele1, allele2])

            if i == 0:
                # write out the accessions once, alongside the first SNP
                for j in range(no_of_cols):
                    ind_writer.writerow([newSnpData.col_id_ls[j], 'U', 'Case'])

            snp_row = newSnpData.data_matrix[i]
            geno_codes = []
            for j in range(no_of_cols):
                allele = snp_row[j]
                if allele == 0:
                    geno_codes.append('0')
                elif allele == 1:
                    geno_codes.append('2')
                else:
                    geno_codes.append('9')
            geno_codes.append('\n')
            genotype_f.write(''.join(geno_codes))
    finally:
        for output_f in (snp_f, ind_f, genotype_f):
            if output_f is not None:
                output_f.close()
def run(self):
    """
    2008-12-02

    Convert the SNP matrix in self.input_fname (and, optionally, one
    phenotype) into text files named after self.output_fname_prefix:

        <prefix>_<phenotype label>.pheno  one phenotype value per line; only
                                          written when both phenotype_fname
                                          and phenotype_method_id are set.
                                          Missing values become 9 (binary
                                          phenotype) or -100.0 (otherwise).
        <prefix>.geno   one line per SNP, one character per accession:
                        allele index 0 -> '0', 1 -> '2', anything else -> '9'
        <prefix>.ind    tab-delimited: accession id, 'U', 'Case'
        <prefix>.snp    tab-delimited: snp id, chromosome, 0.0, position,
                        major allele, minor allele

    SNP ids are expected to look like "<chromosome>_<position>".
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, delimiter=delimiter)
    if self.array_id_2nd_column:
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,
                          data_matrix=data_matrix)
    else:
        # ignore category_list
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list, data_matrix=data_matrix)
    newSnpData, allele_index2allele_ls = snpData.convert2Binary(self.report)

    if self.phenotype_fname and self.phenotype_method_id:
        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            self.phenotype_fname, turn_into_integer=0)
        # row label is that of the SNP matrix, because the phenotype matrix is
        # going to be re-ordered in that way
        phenData = SNPData(header=header_phen, strain_acc_list=newSnpData.strain_acc_list,
                           data_matrix=data_matrix_phen)
        # tricky, using strain_acc_list_phen
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            newSnpData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)

        phenotype_col_index = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
            phenData, Set([self.phenotype_method_id]))[0]
        phenotype_label = phenData.col_id_ls[phenotype_col_index]
        phenotype_f = open('%s_%s.pheno' % (self.output_fname_prefix, phenotype_label.replace('/', '_')), 'w')
        try:
            for phenotype_value in phenData.data_matrix[:, phenotype_col_index]:
                if self.phenotype_is_binary:
                    # binary and non-binary have different NA designator
                    if numpy.isnan(phenotype_value):
                        phenotype_value = 9
                    else:
                        phenotype_value = int(phenotype_value)
                else:
                    if numpy.isnan(phenotype_value):
                        phenotype_value = -100.0
                phenotype_f.write('%s\n' % phenotype_value)
        finally:
            # close explicitly so the data is flushed even on error
            phenotype_f.close()

    genotype_f = ind_f = snp_f = None
    try:
        genotype_f = open('%s.geno' % self.output_fname_prefix, 'w')
        # keep the file objects behind the csv writers so they can be closed
        ind_f = open('%s.ind' % self.output_fname_prefix, 'w')
        ind_writer = csv.writer(ind_f, delimiter='\t')
        snp_f = open('%s.snp' % self.output_fname_prefix, 'w')
        snp_writer = csv.writer(snp_f, delimiter='\t')

        # transpose it: rows become SNPs, columns become accessions
        newSnpData = transposeSNPData(newSnpData)
        no_of_rows = len(newSnpData.data_matrix)
        no_of_cols = len(newSnpData.data_matrix[0])
        for i in range(no_of_rows):
            snp_id = newSnpData.row_id_ls[i]
            chromosome, pos = snp_id.split('_')
            allele1 = allele_index2allele_ls[i][0]  # major allele
            allele2 = allele_index2allele_ls[i][1]  # minor allele
            snp_writer.writerow([snp_id, chromosome, 0.0, pos, allele1, allele2])

            if i == 0:
                # write out the accessions once, alongside the first SNP
                for j in range(no_of_cols):
                    ind_writer.writerow([newSnpData.col_id_ls[j], 'U', 'Case'])

            snp_row = newSnpData.data_matrix[i]
            geno_codes = []
            for j in range(no_of_cols):
                allele = snp_row[j]
                if allele == 0:
                    geno_codes.append('0')
                elif allele == 1:
                    geno_codes.append('2')
                else:
                    geno_codes.append('9')
            geno_codes.append('\n')
            genotype_f.write(''.join(geno_codes))
    finally:
        for output_f in (snp_f, ind_f, genotype_f):
            if output_f is not None:
                output_f.close()