def on_button_filechooserdialog_save_ok_clicked(self, widget, data=None):
    """
    2008-02-12
        to update the no_of_selected rows (have to double click a row to change a cursor if it's multiple selection)
    2008-02-05

    Handler for the OK button of the save dialog. Hides the dialog, reports
    how many rows are selected and, when header, strain_acc_list,
    category_list and data_matrix are all non-empty, writes the data matrix
    to the chosen file keeping only the selected rows (id_is_strain) or the
    selected columns (otherwise).
    """
    output_fname = self.filechooserdialog_save.get_filename()
    self.filechooserdialog_save.hide()

    selected_paths = []
    self.treeselection.selected_foreach(yh_gnome.foreach_cb, selected_paths)
    self.app1_appbar1.push("%s rows selected."%len(selected_paths))

    # Nothing to write unless every piece of the matrix is loaded.
    if not (self.header and self.strain_acc_list and self.category_list and self.data_matrix):
        return

    kept_index_set = Set()
    for tree_path in selected_paths:
        store_row = self.liststore[tree_path[0]]
        row_id = store_row[0]
        matrix_index = store_row[-1]    # last column holds the index into data_matrix
        kept_index_set.add(matrix_index)
        if self.id_is_strain:
            # row_id is a tuple of (ecotypeid,duplicate) rendered as text
            id_fields = row_id[1:-1].split(',')
            self.strain_acc_list[matrix_index] = id_fields[0].strip()    # remove extra space
            self.category_list[matrix_index] = id_fields[1].strip()
        #else:
        #    self.header[matrix_index+2] = row_id

    from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
    matrix_writer = FilterStrainSNPMatrix()

    # Whatever was not selected gets tossed: rows when ids are strains,
    # columns otherwise (the first two header fields are not data columns).
    if self.id_is_strain:
        tossed_rows = Set(range(len(self.strain_acc_list))) - kept_index_set
        tossed_cols = Set()
    else:
        tossed_cols = Set(range(len(self.header)-2)) - kept_index_set
        tossed_rows = Set()
    matrix_writer.write_data_matrix(self.data_matrix, output_fname, self.header,
                                    self.strain_acc_list, self.category_list,
                                    rows_to_be_tossed_out=tossed_rows,
                                    cols_to_be_tossed_out=tossed_cols,
                                    nt_alphabet=0)
def load_dstruc(self):
    """
    Load both input matrices and derive the column/row matching structures.

    Calls QualityControl.load_dstruc first, reads input_fname1 and
    input_fname2 through FilterStrainSNPMatrix.read_data, then stores the
    results of get_col_matching_dstruc and get_row_matching_dstruc on self.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    QualityControl.load_dstruc(self)

    from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
    matrix_reader = FilterStrainSNPMatrix()

    first_input = matrix_reader.read_data(self.input_fname1)
    (self.header1, self.strain_acc_list1,
     self.category_list1, self.data_matrix1) = first_input

    second_input = matrix_reader.read_data(self.input_fname2)
    (self.header2, self.strain_acc_list2,
     self.category_list2, self.data_matrix2) = second_input

    col_matching = self.get_col_matching_dstruc(
        self.header1, self.header2, self.curs,
        self.snp_locus_table_250k, self.snp_locus_table_149snp)
    (self.col_id2col_index1, self.col_id2col_index2,
     self.col_id12col_id2) = col_matching

    row_matching = self.get_row_matching_dstruc(
        self.strain_acc_list1, self.category_list1, self.strain_acc_list2,
        self.curs, self.ecotype_duplicate2tg_ecotypeid_table)
    (self.row_id2row_index1, self.row_id2row_index2,
     self.row_id12row_id2) = row_matching
def read_2010_x_149SNP(input_fname):
    """
    2007-12-30

    Read input_fname with FilterStrainSNPMatrix.read_data and return
    (snp_acc2col_index, accession_id2row_index, data_matrix).

    snp_acc2col_index maps each header entry after the first two to its
    0-based position; accession_id2row_index maps int(strain_acc) to the
    row position.
    """
    from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
    matrix_reader = FilterStrainSNPMatrix()
    header, strain_acc_list, category_list, data_matrix = matrix_reader.read_data(input_fname)

    snp_acc2col_index = {}
    for col_index, snp_acc in enumerate(header[2:]):
        snp_acc2col_index[snp_acc] = col_index

    # Convert every id up front so a bad value fails before any mapping is built.
    accession_id_ls = [int(strain_acc) for strain_acc in strain_acc_list]
    accession_id2row_index = {}
    for row_index, accession_id in enumerate(accession_id_ls):
        accession_id2row_index[accession_id] = row_index

    return snp_acc2col_index, accession_id2row_index, data_matrix
def run(self):
    """
    2008-08-11
        the database interface changed in variation.src.dbsnp
    2008-05-06

    Open a MySQLdb cursor and a DBSNP session, wrap the two input matrices
    in SNPData objects, save a readme record and let TwoSNPData384 figure
    out the A/B mapping. Changes are committed only when self.commit is
    set; otherwise the session is rolled back.
    """
    import MySQLdb
    mysql_conn = MySQLdb.connect(db=self.dbname, host=self.hostname,
                                 user=self.user, passwd=self.passwd)
    cursor = mysql_conn.cursor()
    if self.debug:
        import pdb
        pdb.set_trace()

    db = DBSNP(username=self.user, password=self.passwd,
               hostname=self.hostname, database=self.dbname)
    session = db.session
    session.begin()
    #transaction = session.create_transaction()

    name2possible_mappings, name2snps_id = self.get_snps_name2possible_mappings(db)

    from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix

    # First matrix: column ids are translated through name2snps_id.
    header_a, strains_a, categories_a, matrix_a = FilterStrainSNPMatrix.read_data(self.input_fname1)
    snp_data_a = SNPData(header=header_a, strain_acc_list=strains_a,
                         category_list=categories_a, data_matrix=matrix_a,
                         col_id2id=name2snps_id, snps_table='dbsnp.snps')

    # Second matrix.
    header_b, strains_b, categories_b, matrix_b = FilterStrainSNPMatrix.read_data(self.input_fname2)
    snp_data_b = SNPData(header=header_b, strain_acc_list=strains_b,
                         category_list=categories_b, data_matrix=matrix_b,
                         snps_table='stock_250k.snps')

    two_snp_data = TwoSNPData384(SNPData1=snp_data_a, SNPData2=snp_data_b,
                                 curs=cursor, user=self.user)

    readme = formReadmeObj(sys.argv, self.ad, README)
    session.save(readme)
    session.flush()
    two_snp_data.figureOutABMapping(session, readme, name2possible_mappings)

    if not self.commit:
        session.rollback()
        return
    cursor.execute("commit")
    session.commit()
def load_dstruc(self):
    """
    Load both input matrices and derive the column/row matching structures.

    Calls QualityControl.load_dstruc first. The first matrix comes from
    self.readTina2010In250kSNPs(input_fname1), the second from
    FilterStrainSNPMatrix.read_data(input_fname2). The results of
    get_col_matching_dstruc and get_row_matching_dstruc are stored on self.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    QualityControl.load_dstruc(self)

    from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
    matrix_reader = FilterStrainSNPMatrix()

    first_input = self.readTina2010In250kSNPs(self.input_fname1)
    (self.header1, self.strain_acc_list1,
     self.category_list1, self.data_matrix1) = first_input

    second_input = matrix_reader.read_data(self.input_fname2)
    (self.header2, self.strain_acc_list2,
     self.category_list2, self.data_matrix2) = second_input

    col_matching = self.get_col_matching_dstruc(self.header1, self.header2)
    (self.col_id2col_index1, self.col_id2col_index2,
     self.col_id12col_id2) = col_matching

    row_matching = self.get_row_matching_dstruc(
        self.curs, self.strain_acc_list1, self.strain_acc_list2)
    (self.row_id2row_index1, self.row_id2row_index2,
     self.row_id12row_id2) = row_matching
def load_dstruc(self):
    """
    Load both input matrices and derive the column/row matching structures.

    Calls QualityControl.load_dstruc first, reads input_fname1 and
    input_fname2 through FilterStrainSNPMatrix.read_data, then stores the
    results of get_col_matching_dstruc and get_row_matching_dstruc on self.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    QualityControl.load_dstruc(self)

    from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
    matrix_reader = FilterStrainSNPMatrix()

    first_input = matrix_reader.read_data(self.input_fname1)
    (self.header1, self.strain_acc_list1,
     self.category_list1, self.data_matrix1) = first_input

    second_input = matrix_reader.read_data(self.input_fname2)
    (self.header2, self.strain_acc_list2,
     self.category_list2, self.data_matrix2) = second_input

    col_matching = self.get_col_matching_dstruc(
        self.header1, self.header2, self.curs,
        self.snp_locus_table_250k, self.snp_locus_table_149snp)
    (self.col_id2col_index1, self.col_id2col_index2,
     self.col_id12col_id2) = col_matching

    row_matching = self.get_row_matching_dstruc(
        self.strain_acc_list1, self.category_list1, self.strain_acc_list2,
        self.curs, self.ecotype_duplicate2tg_ecotypeid_table)
    (self.row_id2row_index1, self.row_id2row_index2,
     self.row_id12row_id2) = row_matching
def run(self):
    """
    2008-08-11
        the database interface changed in variation.src.dbsnp
    2008-05-06

    Open a MySQLdb cursor and a DBSNP session, wrap the two input matrices
    in SNPData objects, save a readme record and let TwoSNPData384 figure
    out the A/B mapping. Changes are committed only when self.commit is
    set; otherwise the session is rolled back.
    """
    import MySQLdb
    mysql_conn = MySQLdb.connect(db=self.dbname, host=self.hostname,
                                 user=self.user, passwd=self.passwd)
    cursor = mysql_conn.cursor()
    if self.debug:
        import pdb
        pdb.set_trace()

    db = DBSNP(username=self.user, password=self.passwd,
               hostname=self.hostname, database=self.dbname)
    session = db.session
    session.begin()
    #transaction = session.create_transaction()

    name2possible_mappings, name2snps_id = self.get_snps_name2possible_mappings(db)

    from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix

    # First matrix: column ids are translated through name2snps_id.
    header_a, strains_a, categories_a, matrix_a = FilterStrainSNPMatrix.read_data(self.input_fname1)
    snp_data_a = SNPData(header=header_a, strain_acc_list=strains_a,
                         category_list=categories_a, data_matrix=matrix_a,
                         col_id2id=name2snps_id, snps_table='dbsnp.snps')

    # Second matrix.
    header_b, strains_b, categories_b, matrix_b = FilterStrainSNPMatrix.read_data(self.input_fname2)
    snp_data_b = SNPData(header=header_b, strain_acc_list=strains_b,
                         category_list=categories_b, data_matrix=matrix_b,
                         snps_table='stock_250k.snps')

    two_snp_data = TwoSNPData384(SNPData1=snp_data_a, SNPData2=snp_data_b,
                                 curs=cursor, user=self.user)

    readme = formReadmeObj(sys.argv, self.ad, README)
    session.save(readme)
    session.flush()
    two_snp_data.figureOutABMapping(session, readme, name2possible_mappings)

    if not self.commit:
        session.rollback()
        return
    cursor.execute("commit")
    session.commit()
def load_dstruc(self):
    """
    Load both input matrices and derive the column/row matching structures.

    Calls QualityControl.load_dstruc first. The first matrix comes from
    self.readTina2010In250kSNPs(input_fname1), the second from
    FilterStrainSNPMatrix.read_data(input_fname2). The results of
    get_col_matching_dstruc and get_row_matching_dstruc are stored on self.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    QualityControl.load_dstruc(self)

    from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
    matrix_reader = FilterStrainSNPMatrix()

    first_input = self.readTina2010In250kSNPs(self.input_fname1)
    (self.header1, self.strain_acc_list1,
     self.category_list1, self.data_matrix1) = first_input

    second_input = matrix_reader.read_data(self.input_fname2)
    (self.header2, self.strain_acc_list2,
     self.category_list2, self.data_matrix2) = second_input

    col_matching = self.get_col_matching_dstruc(self.header1, self.header2)
    (self.col_id2col_index1, self.col_id2col_index2,
     self.col_id12col_id2) = col_matching

    row_matching = self.get_row_matching_dstruc(
        self.curs, self.strain_acc_list1, self.strain_acc_list2)
    (self.row_id2row_index1, self.row_id2row_index2,
     self.row_id12row_id2) = row_matching
def run(self):
    """
    Build a SNP data matrix from the database and write it to self.output_fname.

    self.processing_bits selects the layout:
        [0] 1st column: 0 = ecotype_id (rows whose accession id has no
            ecotype id are tossed out), anything else = accession_id
        [1] data type: 0 or 1; for 1 the lookup returned by
            map_perlegen_ecotype_name2accession_id is passed to setup_row_dstruc
        [2] 2nd column: 0 = 'duplicate' (all 1), anything else = 'accession_name'
        [3] source for setup_SNP_dstruc2: 0 = snp_locus_table,
            1 = snp_locus_table cross-linked with the data table of [1],
            2 = the data table of [1] itself

    Exits with status 3 on an unsupported processing_bits[3] and with
    status 2 on an unsupported processing_bits[1].
    """
    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
    curs = conn.cursor()
    if self.debug:
        import pdb
        pdb.set_trace()

    # 2008-02-08 old way to get 2010 data is from raw alignments. didn't realize all SNPs are put into db.
    # alignment_id2positions_to_be_checked_ls, alignment_id2chr_start_end = self.get_alignment_id2positions_to_be_checked_ls(curs, self.alignment_table)
    # SNPpos_snpacc_ls = self.get_SNPpos_snpacc_ls(curs, self.snp_locus_table)
    # SNPpos2col_index, snp_acc_ls = self.setup_SNP_dstruc(SNPpos_snpacc_ls, alignment_id2chr_start_end)
    # ecotype_id2accession_id, ecotype_id2row_index, ecotype_id2info_ls, ecotype_id_ls, accession_id2row_index, accession_id_ls, nativename_ls = self.setup_accession_ecotype_dstruc(curs, self.accession2ecotype_table, self.ecotype_table)
    # accession_X_snp_matrix, accession_X_snp_matrix_touched, snp_index2alignment_id = self.get_accession_X_snp_matrix(curs, accession_id2row_index, SNPpos2col_index, self.sequence_table, self.alignment_table, alignment_id2positions_to_be_checked_ls)

    snp_source_bit = self.processing_bits[3]
    if snp_source_bit == 0:
        # 2009-2-12 will be a problem if snp_locus_table doesn't have field offset
        SNPpos2col_index, snp_acc_ls = self.setup_SNP_dstruc2(
            curs, self.snp_locus_table, offset=self.offset)
    elif snp_source_bit == 1:
        SNPpos2col_index, snp_acc_ls = self.setup_SNP_dstruc2(
            curs, self.snp_locus_table,
            cross_linking_table=self.data_type2data_table[self.processing_bits[1]],
            offset=self.offset)
    elif snp_source_bit == 2:
        SNPpos2col_index, snp_acc_ls = self.setup_SNP_dstruc2(
            curs, self.data_type2data_table[self.processing_bits[1]],
            offset=self.offset)
    else:
        sys.stderr.write("Error: unsupported 3rd bit in processing_bits %s.\n"%self.processing_bits[3])
        sys.exit(3)

    from variation.src.common import get_accession_id2name
    accession_id2name = get_accession_id2name(curs)

    data_type = self.processing_bits[1]
    if data_type == 0:
        row_id2dstruc = self.setup_row_dstruc(curs, SNPpos2col_index, accession_id2name,
                                              offset=self.offset, version=self.version)
    elif data_type == 1:
        from variation.src.common import map_perlegen_ecotype_name2accession_id
        ecotype_name2accession_id = map_perlegen_ecotype_name2accession_id(curs)
        row_id2dstruc = self.setup_row_dstruc(curs, SNPpos2col_index, accession_id2name,
                                              data_type=data_type,
                                              ecotype_name2accession_id=ecotype_name2accession_id,
                                              offset=self.offset, version=self.version)
    else:
        # sys.stderr is a file object, not a callable: the message has to go through write().
        sys.stderr.write("Unsupported data type: %s or no ecotype_name2accession_id specified.\n"%self.processing_bits[1])
        sys.exit(2)

    accession_id_ls, accession_name_ls, data_matrix = self.transform_row_id2dstruc_2_matrix(row_id2dstruc)

    from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
    FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix()

    # 2008-02-08 which type of row id/1st column
    rows_to_be_tossed_out = Set()
    if self.processing_bits[0] == 0:
        from variation.src.common import map_accession_id2ecotype_id
        accession_id2ecotype_id = map_accession_id2ecotype_id(
            curs, accession2ecotype_table=self.accession2ecotype_table)
        accession_id2ecotype_id[99] = 6909    # accession 99 is the reference genome, which col-0 (ecotype_id=6909)
        ecotype_id_ls = []
        for row_index, accession_id in enumerate(accession_id_ls):
            ecotype_id = accession_id2ecotype_id.get(accession_id)
            if not ecotype_id:    # mapping failed
                rows_to_be_tossed_out.add(row_index)
            # Appended even when the mapping failed so positions stay aligned with data_matrix rows.
            ecotype_id_ls.append(ecotype_id)
        strain_acc_list = ecotype_id_ls
        header = ['ecotype_id']    # 1st column in the header
    else:
        strain_acc_list = accession_id_ls
        header = ['accession_id']

    # 2008-02-08 which type of 2nd column
    if self.processing_bits[2] == 0:
        category_list = [1]*len(accession_name_ls)
        header.append('duplicate')    # 2nd column in the header
    else:
        # Every other value (including 1) uses the accession name.
        category_list = accession_name_ls
        header.append('accession_name')

    header += snp_acc_ls
    FilterStrainSNPMatrix_instance.write_data_matrix(data_matrix, self.output_fname, header,
                                                     strain_acc_list, category_list,
                                                     rows_to_be_tossed_out=rows_to_be_tossed_out)
def run(self):
    """
    2008-05-08
        transpose everything if output_matrix_type=1 (bjarni's SNP matrix format)
    2007-02-19
        --db_connect
        --get_snp_id2index()
        --get_strain_id2index()
        --get_strain_id_info()
        --get_snp_id_info()
        --get_data_matrix()
        if self.toss_out_rows:
            --toss_rows_to_make_distance_matrix_NA_free()
                --find_smallest_vertex_set_to_remove_all_edges()
        --write_data_matrix()
        #--sort_file()
    2007-09-22
        for mysql_connection
            add get_nativename_snpid2call_m()
            add fill_in_resolved_duplicated_calls()

    Fetch the strain x SNP matrix through the connection selected by
    self.db_connection_type (1 = MySQLdb, 2 = db_connect) and write it to
    self.output_fname.

    Raises ValueError when self.db_connection_type is neither 1 nor 2.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    if self.db_connection_type == 1:
        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
        curs = conn.cursor()
        snp_id2index, snp_id_list, snp_id2info = self.get_snp_id2index_m(
            curs, self.input_table, self.snp_locus_table)
        strain_id2index, strain_id_list, nativename2strain_id, strain_id2acc, strain_id2category = \
            self.get_strain_id2index_m(curs, self.input_table, self.strain_info_table,
                                       self.only_include_strains_with_GPS,
                                       self.resolve_duplicated_calls,
                                       toss_contaminants=self.toss_contaminants)
        #strain_id2acc, strain_id2category = self.get_strain_id_info_m(curs, strain_id_list, self.strain_info_table)
        #snp_id2info = self.get_snp_id_info_m(curs, snp_id_list, self.snp_locus_table)
        if self.input_table == 'dbsnp.calls':
            from variation.src.FigureOut384IlluminaABMapping import get_snps_id2mapping
            snps_id2mapping = get_snps_id2mapping(self.hostname, dbname='dbsnp',
                                                  user=self.user, passwd=self.passwd)
        else:
            snps_id2mapping = None
        data_matrix = self.get_data_matrix_m(curs, strain_id2index, snp_id2index, nt2number,
                                             self.input_table, self.need_heterozygous_call,
                                             snps_id2mapping)
        # Disabled:
        # if self.resolve_duplicated_calls:
        #     nativename_snpid2call = self.get_nativename_snpid2call_m(curs, self.strain_info_table, self.input_table)
        #     data_matrix = self.fill_in_resolved_duplicated_calls(data_matrix, strain_id2index, snp_id2index, nativename2strain_id, nativename_snpid2call)
        if self.include_other_strain_info:
            strain_id2other_info = self.get_strain_id2other_info(
                curs, strain_id_list, self.strain_info_table, self.input_table)
        else:
            strain_id2other_info = {}
    elif self.db_connection_type == 2:
        (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        snp_id2index, snp_id_list = self.get_snp_id2index(curs, self.input_table, self.snp_locus_table)
        strain_id2index, strain_id_list = self.get_strain_id2index(curs, self.input_table)
        strain_id2acc, strain_id2category = self.get_strain_id_info(
            curs, strain_id_list, self.strain_info_table)
        snp_id2info = self.get_snp_id_info(curs, snp_id_list, self.snp_locus_table)
        data_matrix = self.get_data_matrix(curs, strain_id2index, snp_id2index, nt2number,
                                           self.input_table, self.need_heterozygous_call)
        strain_id2other_info = {}
    else:
        # Without this the code below would die on unbound locals with an
        # unhelpful NameError.
        raise ValueError("Unsupported db_connection_type %s; expected 1 (MySQLdb) or 2 (db_connect)."
                         % self.db_connection_type)

    if self.toss_out_rows:
        rows_to_be_tossed_out = self.toss_rows_to_make_distance_matrix_NA_free(data_matrix)
        rows_to_be_tossed_out = Set(rows_to_be_tossed_out)
    else:
        rows_to_be_tossed_out = Set()

    #05/08/08
    if self.discard_all_NA_strain:
        from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
        remove_rows_data = FilterStrainSNPMatrix.remove_rows_with_too_many_NAs(data_matrix, row_cutoff=1)
        rows_with_too_many_NAs_set = remove_rows_data.rows_with_too_many_NAs_set
        #row_index2no_of_NAs = remove_rows_data.row_index2no_of_NAs
        rows_to_be_tossed_out.update(rows_with_too_many_NAs_set)

    strain_acc_list = [strain_id2acc[strain_id] for strain_id in strain_id_list]
    category_list = [strain_id2category[strain_id] for strain_id in strain_id_list]

    # Re-key the optional per-strain info by accession.
    strain_acc2other_info = {}
    for strain_id in strain_id2other_info:
        strain_acc2other_info[strain_id2acc[strain_id]] = strain_id2other_info[strain_id]

    if self.output_matrix_type == 1:
        #transpose everything
        data_matrix = num.array(data_matrix)
        data_matrix = num.transpose(data_matrix)
        header = ['Chromosomes', 'Positions'] + strain_acc_list
        chromosome_ls = []
        position_ls = []
        for snp_id in snp_id_list:
            snp_name, chromosome, position = snp_id2info[snp_id]
            chromosome_ls.append(chromosome)
            position_ls.append(position)
        # After transposing, SNPs are the rows: the two leading columns
        # carry chromosome and position, and tossed strains become columns.
        strain_acc_list = chromosome_ls
        category_list = position_ls
        cols_to_be_tossed_out = rows_to_be_tossed_out
        rows_to_be_tossed_out = None
        # NOTE(review): this value is never read afterwards; strain_acc2other_info
        # is what gets passed below - confirm whether that one was meant to be reset.
        strain_id2other_info = None    #make up one
    else:
        header = ['strain', 'category']
        for snp_id in snp_id_list:
            snp_name, chromosome, position = snp_id2info[snp_id]
            header.append(snp_name)
        cols_to_be_tossed_out = None

    write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list,
                      rows_to_be_tossed_out=rows_to_be_tossed_out,
                      cols_to_be_tossed_out=cols_to_be_tossed_out,
                      nt_alphabet=self.nt_alphabet,
                      strain_acc2other_info=strain_acc2other_info,
                      delimiter=self.delimiter)
def run(self):
    """
    2008-05-08
        transpose everything if output_matrix_type=1 (bjarni's SNP matrix format)
    2007-02-19
        --db_connect
        --get_snp_id2index()
        --get_strain_id2index()
        --get_strain_id_info()
        --get_snp_id_info()
        --get_data_matrix()
        if self.toss_out_rows:
            --toss_rows_to_make_distance_matrix_NA_free()
                --find_smallest_vertex_set_to_remove_all_edges()
        --write_data_matrix()
        #--sort_file()
    2007-09-22
        for mysql_connection
            add get_nativename_snpid2call_m()
            add fill_in_resolved_duplicated_calls()

    Fetch the strain x SNP matrix through the connection selected by
    self.db_connection_type (1 = MySQLdb, 2 = db_connect) and write it to
    self.output_fname.

    Raises ValueError when self.db_connection_type is neither 1 nor 2.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    if self.db_connection_type == 1:
        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
        curs = conn.cursor()
        snp_id2index, snp_id_list, snp_id2info = self.get_snp_id2index_m(
            curs, self.input_table, self.snp_locus_table)
        strain_id2index, strain_id_list, nativename2strain_id, strain_id2acc, strain_id2category = \
            self.get_strain_id2index_m(curs, self.input_table, self.strain_info_table,
                                       self.only_include_strains_with_GPS,
                                       self.resolve_duplicated_calls,
                                       toss_contaminants=self.toss_contaminants)
        #strain_id2acc, strain_id2category = self.get_strain_id_info_m(curs, strain_id_list, self.strain_info_table)
        #snp_id2info = self.get_snp_id_info_m(curs, snp_id_list, self.snp_locus_table)
        if self.input_table == 'dbsnp.calls':
            from variation.src.FigureOut384IlluminaABMapping import get_snps_id2mapping
            snps_id2mapping = get_snps_id2mapping(self.hostname, dbname='dbsnp',
                                                  user=self.user, passwd=self.passwd)
        else:
            snps_id2mapping = None
        data_matrix = self.get_data_matrix_m(curs, strain_id2index, snp_id2index, nt2number,
                                             self.input_table, self.need_heterozygous_call,
                                             snps_id2mapping)
        # Disabled:
        # if self.resolve_duplicated_calls:
        #     nativename_snpid2call = self.get_nativename_snpid2call_m(curs, self.strain_info_table, self.input_table)
        #     data_matrix = self.fill_in_resolved_duplicated_calls(data_matrix, strain_id2index, snp_id2index, nativename2strain_id, nativename_snpid2call)
        if self.include_other_strain_info:
            strain_id2other_info = self.get_strain_id2other_info(
                curs, strain_id_list, self.strain_info_table, self.input_table)
        else:
            strain_id2other_info = {}
    elif self.db_connection_type == 2:
        (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        snp_id2index, snp_id_list = self.get_snp_id2index(curs, self.input_table, self.snp_locus_table)
        strain_id2index, strain_id_list = self.get_strain_id2index(curs, self.input_table)
        strain_id2acc, strain_id2category = self.get_strain_id_info(
            curs, strain_id_list, self.strain_info_table)
        snp_id2info = self.get_snp_id_info(curs, snp_id_list, self.snp_locus_table)
        data_matrix = self.get_data_matrix(curs, strain_id2index, snp_id2index, nt2number,
                                           self.input_table, self.need_heterozygous_call)
        strain_id2other_info = {}
    else:
        # Without this the code below would die on unbound locals with an
        # unhelpful NameError.
        raise ValueError("Unsupported db_connection_type %s; expected 1 (MySQLdb) or 2 (db_connect)."
                         % self.db_connection_type)

    if self.toss_out_rows:
        rows_to_be_tossed_out = self.toss_rows_to_make_distance_matrix_NA_free(data_matrix)
        rows_to_be_tossed_out = Set(rows_to_be_tossed_out)
    else:
        rows_to_be_tossed_out = Set()

    #05/08/08
    if self.discard_all_NA_strain:
        from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
        remove_rows_data = FilterStrainSNPMatrix.remove_rows_with_too_many_NAs(data_matrix, row_cutoff=1)
        rows_with_too_many_NAs_set = remove_rows_data.rows_with_too_many_NAs_set
        #row_index2no_of_NAs = remove_rows_data.row_index2no_of_NAs
        rows_to_be_tossed_out.update(rows_with_too_many_NAs_set)

    strain_acc_list = [strain_id2acc[strain_id] for strain_id in strain_id_list]
    category_list = [strain_id2category[strain_id] for strain_id in strain_id_list]

    # Re-key the optional per-strain info by accession.
    strain_acc2other_info = {}
    for strain_id in strain_id2other_info:
        strain_acc2other_info[strain_id2acc[strain_id]] = strain_id2other_info[strain_id]

    if self.output_matrix_type == 1:
        #transpose everything
        data_matrix = num.array(data_matrix)
        data_matrix = num.transpose(data_matrix)
        header = ['Chromosomes', 'Positions'] + strain_acc_list
        chromosome_ls = []
        position_ls = []
        for snp_id in snp_id_list:
            snp_name, chromosome, position = snp_id2info[snp_id]
            chromosome_ls.append(chromosome)
            position_ls.append(position)
        # After transposing, SNPs are the rows: the two leading columns
        # carry chromosome and position, and tossed strains become columns.
        strain_acc_list = chromosome_ls
        category_list = position_ls
        cols_to_be_tossed_out = rows_to_be_tossed_out
        rows_to_be_tossed_out = None
        # NOTE(review): this value is never read afterwards; strain_acc2other_info
        # is what gets passed below - confirm whether that one was meant to be reset.
        strain_id2other_info = None    #make up one
    else:
        header = ['strain', 'category']
        for snp_id in snp_id_list:
            snp_name, chromosome, position = snp_id2info[snp_id]
            header.append(snp_name)
        cols_to_be_tossed_out = None

    write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list,
                      rows_to_be_tossed_out=rows_to_be_tossed_out,
                      cols_to_be_tossed_out=cols_to_be_tossed_out,
                      nt_alphabet=self.nt_alphabet,
                      strain_acc2other_info=strain_acc2other_info,
                      delimiter=self.delimiter)