def run(self):
    """
    Load or rebuild the SNP data matrix, then display two views of it.

    When self.draw_only is set, the matrix is read from self.output_fname
    and converted to a Numeric array. Otherwise the header and accession
    lists are read from self.input_fname, mapped to indices through
    self.get_id2index, the matrix is fetched from self.data_table with
    need_heterozygous_call=1 and written to self.output_fname.

    Afterwards the heterozygous and coarse matrices derived from it are
    displayed and the method blocks until the user presses enter.

    2007-03-20
    2007-04-03
    """
    from FilterStrainSNPMatrix import FilterStrainSNPMatrix
    matrix_io = FilterStrainSNPMatrix()

    if not self.draw_only:
        (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        header, strain_acc_list, category_list, data_matrix = matrix_io.read_data(self.input_fname)
        # The first two header fields are skipped
        # (presumably the strain/category column labels -- confirm).
        snp_acc_ls = header[2:]
        strain_id2index = self.get_id2index(curs, self.strain_info_table, strain_acc_list)
        snp_id2index = self.get_id2index(curs, self.snp_locus_table, snp_acc_ls)
        from dbSNP2data import dbSNP2data
        db_reader = dbSNP2data(report=self.report)
        # The matrix read from the input file is replaced by the one fetched from the database.
        data_matrix = db_reader.get_data_matrix(
            curs,
            strain_id2index,
            snp_id2index,
            nt2number,
            self.data_table,
            need_heterozygous_call=1,
        )
        matrix_io.write_data_matrix(
            data_matrix, self.output_fname, header, strain_acc_list, category_list
        )
    else:
        header, strain_acc_list, category_list, data_matrix = matrix_io.read_data(self.output_fname)
        data_matrix = Numeric.array(data_matrix)

    matrix_pair = self.get_heterozygous_and_coarse_data_matrix(data_matrix)
    heterozygous_data_matrix, coarse_data_matrix = matrix_pair
    self.displayDataMatrix(
        heterozygous_data_matrix,
        title='heterozygous_data_matrix, 5-10=hetero, else=0',
    )
    self.displayDataMatrix(
        coarse_data_matrix,
        title='coarse_data_matrix, 0=NA, 1=h**o, 2=hetero',
    )
    # Wait for the user before returning.
    raw_input("enter")
def run(self):
    """
    Fetch the SNP data matrix from MySQL and hand it to the selected writer.

    Steps:
      1. look up SNP ids/indices for self.input_table and collect the first
         field of each SNP's info entry as its accession;
      2. build the population -> (strains, SNPs) mapping and the strain
         index via this class's own helpers;
      3. fetch strain info and the data matrix (need_heterozygous_call=1);
      4. call the function registered under self.output_type in
         self.OutputPop_dict to produce self.output_fname.

    The MySQL connection is closed when the method finishes, whether it
    succeeds or raises.

    2007-07-12
    2007-07-17
    """
    from dbSNP2data import dbSNP2data
    dbSNP2data_instance = dbSNP2data(user=self.user, passwd=self.passwd, output_fname='whatever')
    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
    try:
        curs = conn.cursor()
        snp_id2index, snp_id_list, snp_id2info = dbSNP2data_instance.get_snp_id2index_m(
            curs, self.input_table, self.snp_locus_table)
        # Element 0 of each info entry is used as the SNP accession.
        snp_acc_list = [snp_id2info[snp_id][0] for snp_id in snp_id_list]

        popid2strain_id_snp_id_ls = self.get_popid2strain_id_snp_id_ls(
            curs, self.population_table, self.popid2snpid_table)
        strain_id2index, strain_id_list = self.get_strain_id2index(
            popid2strain_id_snp_id_ls, self.min_no_of_strains_per_pop)
        strain_id2acc, strain_id2category = dbSNP2data_instance.get_strain_id_info_m(
            curs, strain_id_list, self.strain_info_table)
        data_matrix = dbSNP2data_instance.get_data_matrix_m(
            curs, strain_id2index, snp_id2index, nt2number, self.input_table,
            need_heterozygous_call=1)

        # Dispatch to the writer chosen by self.output_type.
        output_func = self.OutputPop_dict[self.output_type]
        output_func(
            data_matrix, popid2strain_id_snp_id_ls, strain_id2index, self.output_fname,
            snp_id2index, strain_id2acc, strain_id2category, snp_acc_list,
            self.with_header_line, self.nt_alphabet)
    finally:
        # Previously the connection was never released.
        conn.close()
def remove_identity_strains(self, data_matrix, rows_to_be_checked, cols_to_be_checked):
    """
    Return the set of row indices to remove because they duplicate other rows.

    Two rows are treated as identical when, for every column in
    cols_to_be_checked, their values are equal or at least one of the two
    values is 0 (0 presumably encodes a missing call -- confirm). Every
    identical pair becomes an edge of an undirected graph, and
    dbSNP2data.find_smallest_vertex_set_to_remove_all_edges picks the
    vertices (row indices) that are returned.

    Arguments:
      data_matrix -- indexable as data_matrix[row][col]
      rows_to_be_checked -- iterable of row indices to compare pairwise
      cols_to_be_checked -- sized collection of column indices to compare

    Returns a set of row indices.

    2009-2-18
        class "dbSNP2data" has a few non-null arguments. feed it during initialization
    2007-04-16
        the similarity graph structure complicated the issue
        bug found by Chris Toomajian
        Now use the greedy graph algorithm to remove identity strains.
    2007-09-13
        remove parameter strain_index2no_of_NAs
    """
    sys.stderr.write("Searching for identity strains ...")
    rows_to_be_checked_ls = sorted(rows_to_be_checked)  # from small to big
    if self.debug:
        import pdb
        pdb.set_trace()

    identity_pair_ls = []
    for position, row1_index in enumerate(rows_to_be_checked_ls):
        first_row = data_matrix[row1_index]
        # Only compare against later rows so each unordered pair is visited once.
        for row2_index in rows_to_be_checked_ls[position + 1:]:
            second_row = data_matrix[row2_index]
            # all() stops at the first mismatching column instead of scanning
            # every column; an empty column list still yields "identical".
            if all(
                first_row[col] == second_row[col] or first_row[col] == 0 or second_row[col] == 0
                for col in cols_to_be_checked
            ):
                identity_pair_ls.append([row1_index, row2_index])
    if self.debug:
        import pdb
        pdb.set_trace()
    sys.stderr.write("done.\n")

    sys.stderr.write("Removing identity strains ...")
    import networkx as nx
    g = nx.Graph()
    g.add_edges_from(identity_pair_ls)
    from dbSNP2data import dbSNP2data
    # dbSNP2data has a few non-null arguments; these placeholder values only
    # satisfy its constructor.
    dbSNP2data_instance = dbSNP2data(user='******', passwd='secret', output_fname='/tmp/nothing')
    vertex_list_to_be_deleted = dbSNP2data_instance.find_smallest_vertex_set_to_remove_all_edges(g)
    identity_strains_to_be_removed = set(vertex_list_to_be_deleted)
    if self.debug:
        print('')
        print('identity_strains_to_be_removed')
        print(identity_strains_to_be_removed)
    sys.stderr.write("%s identity strains, done.\n" % (len(identity_strains_to_be_removed)))
    return identity_strains_to_be_removed
def test_find_smallest_vertex_set_to_remove_all_edges(self):
    """
    Exercise dbSNP2data.find_smallest_vertex_set_to_remove_all_edges by hand.

    Builds a five-vertex graph from a fixed edge list, asks dbSNP2data for
    the vertices to delete and prints both the edge list and the answer.
    Nothing is asserted; the output is meant to be inspected manually.
    """
    from dbSNP2data import dbSNP2data
    import networkx as nx

    edge_ls = [[1, 2], [2, 3], [2, 4], [4, 5]]
    graph = nx.Graph()
    graph.add_edges_from(edge_ls)

    # NOTE(review): other callers in this file pass user/passwd/output_fname
    # to dbSNP2data -- confirm the no-argument constructor still works.
    solver = dbSNP2data()
    vertices_to_delete = solver.find_smallest_vertex_set_to_remove_all_edges(graph)

    print('graph')
    print(edge_ls)
    print('vertex_list_to_be_deleted')
    print(vertices_to_delete)
def test_find_smallest_vertex_set_to_remove_all_edges(self):
    """
    Manual check of dbSNP2data.find_smallest_vertex_set_to_remove_all_edges.

    A small graph is built from a hard-coded list of edges, the method is
    run on it, and the input edges plus the returned vertex list are
    printed for visual inspection. No assertions are made.
    """
    from dbSNP2data import dbSNP2data
    import networkx as nx

    pair_ls = [[1, 2], [2, 3], [2, 4], [4, 5]]
    test_graph = nx.Graph()
    test_graph.add_edges_from(pair_ls)

    # NOTE(review): dbSNP2data is built without arguments here, whereas
    # other callers in this file supply user/passwd/output_fname -- confirm.
    instance = dbSNP2data()
    deleted_vertices = instance.find_smallest_vertex_set_to_remove_all_edges(test_graph)

    for label, value in (('graph', pair_ls), ('vertex_list_to_be_deleted', deleted_vertices)):
        print(label)
        print(value)
def run(self):
    """
    Read SNP data from MySQL and write it out through the chosen output function.

    The method
      - resolves SNP ids and indices for self.input_table and takes the
        first field of each SNP info entry as the SNP accession,
      - derives the population -> (strains, SNPs) mapping and the strain
        index using self.get_popid2strain_id_snp_id_ls and
        self.get_strain_id2index,
      - loads strain info and the data matrix (need_heterozygous_call=1),
      - calls self.OutputPop_dict[self.output_type] to write
        self.output_fname.

    The database connection is always closed before the method returns or
    propagates an exception.

    2007-07-12
    2007-07-17
    """
    from dbSNP2data import dbSNP2data
    dbSNP2data_instance = dbSNP2data(user=self.user, passwd=self.passwd, output_fname='whatever')
    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
    try:
        curs = conn.cursor()
        snp_id2index, snp_id_list, snp_id2info = dbSNP2data_instance.get_snp_id2index_m(
            curs, self.input_table, self.snp_locus_table)
        # The accession is the first element of each SNP's info entry.
        snp_acc_list = [snp_id2info[snp_id][0] for snp_id in snp_id_list]

        popid2strain_id_snp_id_ls = self.get_popid2strain_id_snp_id_ls(
            curs, self.population_table, self.popid2snpid_table)
        strain_id2index, strain_id_list = self.get_strain_id2index(
            popid2strain_id_snp_id_ls, self.min_no_of_strains_per_pop)
        strain_id2acc, strain_id2category = dbSNP2data_instance.get_strain_id_info_m(
            curs, strain_id_list, self.strain_info_table)
        data_matrix = dbSNP2data_instance.get_data_matrix_m(
            curs, strain_id2index, snp_id2index, nt2number, self.input_table,
            need_heterozygous_call=1)

        # self.output_type selects which registered writer produces the file.
        self.OutputPop_dict[self.output_type](
            data_matrix, popid2strain_id_snp_id_ls, strain_id2index, self.output_fname,
            snp_id2index, strain_id2acc, strain_id2category, snp_acc_list,
            self.with_header_line, self.nt_alphabet)
    finally:
        # The original code leaked the connection; release it explicitly.
        conn.close()
def run(self):
    """
    Select strains and SNPs per population and optionally store the result.

    The full data matrix is loaded from self.input_table. For every
    population with at least self.min_no_of_strains_per_pop ecotypes, a
    sub-matrix is built with self.create_sub_data_matrix; if enough
    ecotypes remain, self.cleanup_one_population chooses the strains and
    SNPs to keep. Populations for which both selections are non-empty are
    collected, and when self.commit is set they are written to the
    database and the transaction is committed.

    The MySQL connection is closed on exit, including on error.
    """
    import MySQLdb
    conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
    try:
        curs = conn.cursor()
        from dbSNP2data import dbSNP2data
        dbSNP2data_instance = dbSNP2data(user=self.user, passwd=self.passwd, output_fname='whatever')
        snp_id2index, snp_id_list, snp_id2info = dbSNP2data_instance.get_snp_id2index_m(
            curs, self.input_table, self.snp_locus_table)
        # 2008-06-02 stuff returned by get_strain_id2index_m is totally changed.
        strain_id2index, strain_id_list, nativename2strain_id, strain_id2acc, strain_id2category = \
            dbSNP2data_instance.get_strain_id2index_m(curs, self.input_table, self.strain_info_table)

        # Map each strain's accession to its row in the data matrix.
        ecotype_id2row_index = dict(
            (acc, strain_id2index[strain_id]) for strain_id, acc in strain_id2acc.iteritems()
        )

        # NOTE(review): snp_id2acc is not used below; the call is kept in case
        # get_snp_id_info_m has side effects -- confirm and remove if not.
        snp_id2acc = dbSNP2data_instance.get_snp_id_info_m(curs, snp_id_list, self.snp_locus_table)
        data_matrix = dbSNP2data_instance.get_data_matrix_m(
            curs, strain_id2index, snp_id2index, nt2number, self.input_table,
            need_heterozygous_call=1)

        from OutputPopulation import OutputPopulation
        popid2ecotypeid_ls = OutputPopulation.get_popid2ecotypeid_ls(curs, self.population_table)
        from FilterStrainSNPMatrix import FilterStrainSNPMatrix
        FilterStrainSNPMatrix_instance = FilterStrainSNPMatrix()
        from RemoveBadSNPs import RemoveBadSNPs
        RemoveBadSNPs_instance = RemoveBadSNPs()

        popid2strain_id_snp_id_ls = {}
        for popid, ecotypeid_ls in popid2ecotypeid_ls.iteritems():
            # Skip populations that are too small to begin with.
            if len(ecotypeid_ls) < self.min_no_of_strains_per_pop:
                continue
            sys.stderr.write("Population %s\n" % popid)
            sub_data_matrix, new_ecotypeid_ls = self.create_sub_data_matrix(
                popid, data_matrix, ecotypeid_ls, ecotype_id2row_index)
            # The sub-matrix may hold fewer ecotypes; apply the threshold again.
            if len(new_ecotypeid_ls) < self.min_no_of_strains_per_pop:
                continue
            sys.stderr.write("\tPopulation %s has %s strains\n" % (popid, len(new_ecotypeid_ls)))
            strain_id_selected, snp_id_selected = self.cleanup_one_population(
                FilterStrainSNPMatrix_instance, RemoveBadSNPs_instance, sub_data_matrix,
                new_ecotypeid_ls, snp_id_list, self.min_no_of_strains_per_pop,
                self.row_cutoff, self.col_cutoff, self.min_log_prob)
            if strain_id_selected and snp_id_selected:
                popid2strain_id_snp_id_ls[popid] = [strain_id_selected, snp_id_selected]

        if self.commit:
            self.create_popid2snpid_table(curs, self.output_table)
            self.mark_strain_id_selected(curs, popid2strain_id_snp_id_ls, self.population_table)
            self.submit_popid2snpid_list(
                curs, popid2strain_id_snp_id_ls, self.population_table, self.output_table)
            conn.commit()
    finally:
        # The original code never released the connection.
        conn.close()
def run(self):
    """
    Produce the SNP data matrix and show its heterozygous and coarse forms.

    In draw-only mode (self.draw_only) the matrix previously saved in
    self.output_fname is read and turned into a Numeric array. Otherwise
    the accessions read from self.input_fname are translated to indices by
    self.get_id2index, the matrix is pulled from self.data_table with
    need_heterozygous_call=1, and it is saved to self.output_fname.

    Both derived matrices are passed to self.displayDataMatrix, after which
    the method waits for the user to press enter.

    2007-03-20
    2007-04-03
    """
    from FilterStrainSNPMatrix import FilterStrainSNPMatrix
    snp_matrix_file = FilterStrainSNPMatrix()

    if self.draw_only:
        saved = snp_matrix_file.read_data(self.output_fname)
        header, strain_acc_list, category_list, data_matrix = saved
        data_matrix = Numeric.array(data_matrix)
    else:
        (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        loaded = snp_matrix_file.read_data(self.input_fname)
        header, strain_acc_list, category_list, data_matrix = loaded
        # Everything after the first two header fields is taken as SNP
        # accessions (the two skipped fields are presumably row labels -- confirm).
        snp_acc_ls = header[2:]
        strain_id2index = self.get_id2index(curs, self.strain_info_table, strain_acc_list)
        snp_id2index = self.get_id2index(curs, self.snp_locus_table, snp_acc_ls)
        from dbSNP2data import dbSNP2data
        fetcher = dbSNP2data(report=self.report)
        # From here on data_matrix is the database copy, not the one from the file.
        data_matrix = fetcher.get_data_matrix(
            curs, strain_id2index, snp_id2index, nt2number, self.data_table,
            need_heterozygous_call=1)
        snp_matrix_file.write_data_matrix(
            data_matrix, self.output_fname, header, strain_acc_list, category_list)

    heterozygous_data_matrix, coarse_data_matrix = \
        self.get_heterozygous_and_coarse_data_matrix(data_matrix)
    views = (
        (heterozygous_data_matrix, 'heterozygous_data_matrix, 5-10=hetero, else=0'),
        (coarse_data_matrix, 'coarse_data_matrix, 0=NA, 1=h**o, 2=hetero'),
    )
    for matrix, caption in views:
        self.displayDataMatrix(matrix, title=caption)
    # Block until the user presses enter.
    raw_input("enter")
def remove_identity_strains(self, data_matrix, rows_to_be_checked, cols_to_be_checked):
    """
    Find rows that duplicate other rows and return the indices to remove.

    A pair of rows is considered identical when every column listed in
    cols_to_be_checked holds equal values in both rows, or a 0 in at least
    one of them (0 presumably means a missing call -- confirm). Identical
    pairs are added as edges of an undirected graph and
    dbSNP2data.find_smallest_vertex_set_to_remove_all_edges decides which
    vertices (row indices) end up in the result.

    Arguments:
      data_matrix -- indexable as data_matrix[row][col]
      rows_to_be_checked -- iterable of row indices to compare pairwise
      cols_to_be_checked -- sized collection of column indices to compare

    Returns a Set of row indices.

    2009-2-18
        class "dbSNP2data" has a few non-null arguments. feed it during initialization
    2007-04-16
        the similarity graph structure complicated the issue
        bug found by Chris Toomajian
        Now use the greedy graph algorithm to remove identity strains.
    2007-09-13
        remove parameter strain_index2no_of_NAs
    """
    sys.stderr.write("Searching for identity strains ...")
    rows_to_be_checked_ls = sorted(rows_to_be_checked)  # from small to big
    if self.debug:
        import pdb
        pdb.set_trace()

    identity_pair_ls = []
    for offset, row1_index in enumerate(rows_to_be_checked_ls):
        row1 = data_matrix[row1_index]
        # Compare only with rows further down the sorted list: one visit per pair.
        for j in rows_to_be_checked_ls[offset + 1:]:
            row2 = data_matrix[j]
            # Short-circuits on the first differing column; with no columns
            # to check the pair still counts as identical, as before.
            if all(
                row1[k] == row2[k] or row1[k] == 0 or row2[k] == 0
                for k in cols_to_be_checked
            ):
                identity_pair_ls.append([row1_index, j])
    if self.debug:
        import pdb
        pdb.set_trace()
    sys.stderr.write("done.\n")

    sys.stderr.write("Removing identity strains ...")
    import networkx as nx
    g = nx.Graph()
    g.add_edges_from(identity_pair_ls)
    from dbSNP2data import dbSNP2data
    # dbSNP2data has a few non-null arguments; placeholder values are
    # supplied only to satisfy its constructor.
    dbSNP2data_instance = dbSNP2data(
        user="******", passwd="secret", output_fname="/tmp/nothing"
    )
    vertex_list_to_be_deleted = dbSNP2data_instance.find_smallest_vertex_set_to_remove_all_edges(g)
    # NOTE(review): Set is presumably sets.Set imported at file level; it is
    # kept so the returned type does not change for callers.
    identity_strains_to_be_removed = Set(vertex_list_to_be_deleted)
    if self.debug:
        print('')
        print("identity_strains_to_be_removed")
        print(identity_strains_to_be_removed)
    sys.stderr.write("%s identity strains, done.\n" % (len(identity_strains_to_be_removed)))
    return identity_strains_to_be_removed