def run(self):
    """
    Connect to the database, run the parser registered for self.type on
    self.inputfile, and issue "end" on the cursor if self.commit is set.

    The parser is looked up in self.parser_dict and its output table in
    self.output_table_dict, both keyed by self.type.
    """
    # The connection object itself is not used afterwards; only the cursor is.
    _conn, cursor = db_connect(self.hostname, self.dbname, self.schema)

    parse = self.parser_dict[self.type]
    target_table = self.output_table_dict[self.type]
    parse(cursor, self.inputfile, target_table, self.organism,
          self.sequence_type)

    # "end" presumably closes (commits) the open transaction -- confirm
    # against the database db_connect() talks to.
    if self.commit:
        cursor.execute("end")
def run(self):
    """
    2007-02-17
        Build the strain_acc2id and snp_acc2id lookups for self.tax_id,
        pass them to parse_file() together with the input file, and issue
        "end" on the cursor if self.commit is set.

        -db_connect()
        -get_strain_acc2id()
        -get_snp_acc2id()
        -parse_file()
            (the original outline also lists expand_snp_locus_table(),
            submit_to_strain_info_table() and submit_to_snp_table(); none
            of them is called directly here -- presumably parse_file()
            invokes them, confirm there)
    """
    # Only the cursor is used below; the connection object is kept unused.
    _conn, cursor = db_connect(self.hostname, self.dbname, self.schema)

    strain_lookup = self.get_strain_acc2id(
        cursor, self.strain_info_table, self.tax_id)
    snp_lookup = self.get_snp_acc2id(
        cursor, self.snp_locus_table, self.tax_id)

    self.parse_file(
        self.input_fname,
        cursor,
        self.output_table,
        self.tax_id,
        self.snp_acc_category_pattern,
        strain_lookup,
        snp_lookup,
        self.snp_locus_table,
    )

    # "end" presumably closes (commits) the open transaction -- confirm.
    if self.commit:
        cursor.execute("end")
def run(self):
    """
    12-11-05
        Pick an id mapping according to self.type, run the parser
        registered for that type, and issue "end" on the cursor if
        self.commit is set.

        --db_connect()
        --get_gene_symbol2gene_id()     (self.type == 1)
        --get_ensembl_id2gene_id()      (self.type == 2)
        --parser_dict[]
            --harbison2004_parse()      (per the original outline)
            --cisred_parse()            (per the original outline)

        Any other self.type passes None as the mapping.
    """
    # Only the cursor is used below; the connection object is kept unused.
    _conn, cursor = db_connect(self.hostname, self.dbname, self.schema)

    # Default to "no mapping"; types 1 and 2 override it.
    id_mapping = None
    if self.type == 1:
        id_mapping = get_gene_symbol2gene_id(cursor, self.tax_id)
    elif self.type == 2:
        id_mapping = get_ensembl_id2gene_id(cursor, self.tax_id)

    parse = self.parser_dict[self.type]
    parse(cursor, self.inputfile, id_mapping, self.tax_id)

    # "end" presumably closes (commits) the open transaction -- confirm.
    if self.commit:
        cursor.execute("end")
def run(self):
    """
    Read the strain/SNP call matrix from the database and write it to
    self.output_fname through write_data_matrix().

    2008-05-08
        transpose everything if output_matrix_type=1 (bjarni's SNP matrix format)
    2007-09-22
        for mysql_connection (db_connection_type=1)
        add get_nativename_snpid2call_m()
        add fill_in_resolved_duplicated_calls()
        (that separate fill-in step is currently disabled;
        self.resolve_duplicated_calls is passed to get_strain_id2index_m())
    2007-02-19
        --db_connect
        --get_snp_id2index()
        --get_strain_id2index()
        --get_strain_id_info()
        --get_snp_id_info()
        --get_data_matrix()
        if self.toss_out_rows:
            --toss_rows_to_make_distance_matrix_NA_free()
                --find_smallest_vertex_set_to_remove_all_edges()
        --write_data_matrix()
        #--sort_file()

    Raises ValueError if self.db_connection_type is neither 1 (MySQLdb)
    nor 2 (the db_connect() helper).
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    if self.db_connection_type == 1:
        # MySQL path: the *_m() helpers return the id->info dictionaries
        # together with the index maps.
        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname, host=self.hostname,
                               user=self.user, passwd=self.passwd)
        curs = conn.cursor()
        snp_id2index, snp_id_list, snp_id2info = self.get_snp_id2index_m(
            curs, self.input_table, self.snp_locus_table)
        (strain_id2index, strain_id_list, nativename2strain_id,
         strain_id2acc, strain_id2category) = self.get_strain_id2index_m(
            curs, self.input_table, self.strain_info_table,
            self.only_include_strains_with_GPS,
            self.resolve_duplicated_calls,
            toss_contaminants=self.toss_contaminants)
        # Only the 'dbsnp.calls' table gets a snps_id2mapping; every other
        # input table passes None to get_data_matrix_m().
        if self.input_table == 'dbsnp.calls':
            from variation.src.FigureOut384IlluminaABMapping import get_snps_id2mapping
            snps_id2mapping = get_snps_id2mapping(
                self.hostname, dbname='dbsnp', user=self.user,
                passwd=self.passwd)
        else:
            snps_id2mapping = None
        data_matrix = self.get_data_matrix_m(
            curs, strain_id2index, snp_id2index, nt2number,
            self.input_table, self.need_heterozygous_call, snps_id2mapping)
        if self.include_other_strain_info:
            strain_id2other_info = self.get_strain_id2other_info(
                curs, strain_id_list, self.strain_info_table,
                self.input_table)
        else:
            strain_id2other_info = {}
    elif self.db_connection_type == 2:
        # db_connect() path: strain and SNP info are fetched by separate
        # helpers, and no extra strain info is collected.
        (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        snp_id2index, snp_id_list = self.get_snp_id2index(
            curs, self.input_table, self.snp_locus_table)
        strain_id2index, strain_id_list = self.get_strain_id2index(
            curs, self.input_table)
        strain_id2acc, strain_id2category = self.get_strain_id_info(
            curs, strain_id_list, self.strain_info_table)
        snp_id2info = self.get_snp_id_info(
            curs, snp_id_list, self.snp_locus_table)
        data_matrix = self.get_data_matrix(
            curs, strain_id2index, snp_id2index, nt2number,
            self.input_table, self.need_heterozygous_call)
        strain_id2other_info = {}
    else:
        # Without this guard the method only failed further down with an
        # UnboundLocalError on the first variable the branches should have
        # set, which hides the real cause.
        raise ValueError(
            "Unsupported db_connection_type %r: expected 1 (MySQLdb) or "
            "2 (db_connect)." % (self.db_connection_type,))

    if self.toss_out_rows:
        rows_to_be_tossed_out = Set(
            self.toss_rows_to_make_distance_matrix_NA_free(data_matrix))
    else:
        rows_to_be_tossed_out = Set()

    # 05/08/08
    if self.discard_all_NA_strain:
        from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
        remove_rows_data = FilterStrainSNPMatrix.remove_rows_with_too_many_NAs(
            data_matrix, row_cutoff=1)
        rows_to_be_tossed_out.update(
            remove_rows_data.rows_with_too_many_NAs_set)

    strain_acc_list = [strain_id2acc[strain_id]
                       for strain_id in strain_id_list]
    category_list = [strain_id2category[strain_id]
                     for strain_id in strain_id_list]

    # Re-key the optional per-strain info by accession.
    strain_acc2other_info = {}
    for strain_id in strain_id2other_info:
        strain_acc2other_info[strain_id2acc[strain_id]] = \
            strain_id2other_info[strain_id]

    if self.output_matrix_type == 1:
        # Transpose everything: strain accessions move into the header and
        # each SNP's chromosome/position become the two leading columns.
        data_matrix = num.transpose(num.array(data_matrix))
        header = ['Chromosomes', 'Positions'] + strain_acc_list
        chromosome_ls = []
        position_ls = []
        for snp_id in snp_id_list:
            _snp_name, chromosome, position = snp_id2info[snp_id]
            chromosome_ls.append(chromosome)
            position_ls.append(position)
        strain_acc_list = chromosome_ls
        category_list = position_ls
        # Rows to drop in the original orientation are columns after the
        # transpose.
        cols_to_be_tossed_out = rows_to_be_tossed_out
        rows_to_be_tossed_out = None
        # NOTE(review): the original reset strain_id2other_info to None here
        # ("make up one"), which had no effect because the variable is not
        # read again. strain_acc2other_info may have been the intended
        # target; it is still passed through unchanged -- confirm.
    else:
        header = ['strain', 'category']
        for snp_id in snp_id_list:
            snp_name, _chromosome, _position = snp_id2info[snp_id]
            header.append(snp_name)
        cols_to_be_tossed_out = None

    write_data_matrix(data_matrix, self.output_fname, header,
                      strain_acc_list, category_list,
                      rows_to_be_tossed_out=rows_to_be_tossed_out,
                      cols_to_be_tossed_out=cols_to_be_tossed_out,
                      nt_alphabet=self.nt_alphabet,
                      strain_acc2other_info=strain_acc2other_info,
                      delimiter=self.delimiter)
def run(self):
    """
    Read the strain/SNP call matrix from the database and write it to
    self.output_fname through write_data_matrix().

    2008-05-08
        transpose everything if output_matrix_type=1 (bjarni's SNP matrix format)
    2007-09-22
        for mysql_connection (db_connection_type=1)
        add get_nativename_snpid2call_m()
        add fill_in_resolved_duplicated_calls()
        (that separate fill-in step is currently disabled;
        self.resolve_duplicated_calls is passed to get_strain_id2index_m())
    2007-02-19
        --db_connect
        --get_snp_id2index()
        --get_strain_id2index()
        --get_strain_id_info()
        --get_snp_id_info()
        --get_data_matrix()
        if self.toss_out_rows:
            --toss_rows_to_make_distance_matrix_NA_free()
                --find_smallest_vertex_set_to_remove_all_edges()
        --write_data_matrix()
        #--sort_file()

    Raises ValueError if self.db_connection_type is neither 1 (MySQLdb)
    nor 2 (the db_connect() helper).
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    if self.db_connection_type == 1:
        # MySQL path: the *_m() helpers return the id->info dictionaries
        # together with the index maps.
        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname, host=self.hostname,
                               user=self.user, passwd=self.passwd)
        curs = conn.cursor()
        snp_id2index, snp_id_list, snp_id2info = self.get_snp_id2index_m(
            curs, self.input_table, self.snp_locus_table)
        (strain_id2index, strain_id_list, nativename2strain_id,
         strain_id2acc, strain_id2category) = self.get_strain_id2index_m(
            curs, self.input_table, self.strain_info_table,
            self.only_include_strains_with_GPS,
            self.resolve_duplicated_calls,
            toss_contaminants=self.toss_contaminants)
        # Only the 'dbsnp.calls' table gets a snps_id2mapping; every other
        # input table passes None to get_data_matrix_m().
        if self.input_table == 'dbsnp.calls':
            from variation.src.FigureOut384IlluminaABMapping import get_snps_id2mapping
            snps_id2mapping = get_snps_id2mapping(
                self.hostname, dbname='dbsnp', user=self.user,
                passwd=self.passwd)
        else:
            snps_id2mapping = None
        data_matrix = self.get_data_matrix_m(
            curs, strain_id2index, snp_id2index, nt2number,
            self.input_table, self.need_heterozygous_call, snps_id2mapping)
        if self.include_other_strain_info:
            strain_id2other_info = self.get_strain_id2other_info(
                curs, strain_id_list, self.strain_info_table,
                self.input_table)
        else:
            strain_id2other_info = {}
    elif self.db_connection_type == 2:
        # db_connect() path: strain and SNP info are fetched by separate
        # helpers, and no extra strain info is collected.
        (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        snp_id2index, snp_id_list = self.get_snp_id2index(
            curs, self.input_table, self.snp_locus_table)
        strain_id2index, strain_id_list = self.get_strain_id2index(
            curs, self.input_table)
        strain_id2acc, strain_id2category = self.get_strain_id_info(
            curs, strain_id_list, self.strain_info_table)
        snp_id2info = self.get_snp_id_info(
            curs, snp_id_list, self.snp_locus_table)
        data_matrix = self.get_data_matrix(
            curs, strain_id2index, snp_id2index, nt2number,
            self.input_table, self.need_heterozygous_call)
        strain_id2other_info = {}
    else:
        # Without this guard the method only failed further down with an
        # UnboundLocalError on the first variable the branches should have
        # set, which hides the real cause.
        raise ValueError(
            "Unsupported db_connection_type %r: expected 1 (MySQLdb) or "
            "2 (db_connect)." % (self.db_connection_type,))

    if self.toss_out_rows:
        rows_to_be_tossed_out = Set(
            self.toss_rows_to_make_distance_matrix_NA_free(data_matrix))
    else:
        rows_to_be_tossed_out = Set()

    # 05/08/08
    if self.discard_all_NA_strain:
        from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
        remove_rows_data = FilterStrainSNPMatrix.remove_rows_with_too_many_NAs(
            data_matrix, row_cutoff=1)
        rows_to_be_tossed_out.update(
            remove_rows_data.rows_with_too_many_NAs_set)

    strain_acc_list = [strain_id2acc[strain_id]
                       for strain_id in strain_id_list]
    category_list = [strain_id2category[strain_id]
                     for strain_id in strain_id_list]

    # Re-key the optional per-strain info by accession.
    strain_acc2other_info = {}
    for strain_id in strain_id2other_info:
        strain_acc2other_info[strain_id2acc[strain_id]] = \
            strain_id2other_info[strain_id]

    if self.output_matrix_type == 1:
        # Transpose everything: strain accessions move into the header and
        # each SNP's chromosome/position become the two leading columns.
        data_matrix = num.transpose(num.array(data_matrix))
        header = ['Chromosomes', 'Positions'] + strain_acc_list
        chromosome_ls = []
        position_ls = []
        for snp_id in snp_id_list:
            _snp_name, chromosome, position = snp_id2info[snp_id]
            chromosome_ls.append(chromosome)
            position_ls.append(position)
        strain_acc_list = chromosome_ls
        category_list = position_ls
        # Rows to drop in the original orientation are columns after the
        # transpose.
        cols_to_be_tossed_out = rows_to_be_tossed_out
        rows_to_be_tossed_out = None
        # NOTE(review): the original reset strain_id2other_info to None here
        # ("make up one"), which had no effect because the variable is not
        # read again. strain_acc2other_info may have been the intended
        # target; it is still passed through unchanged -- confirm.
    else:
        header = ['strain', 'category']
        for snp_id in snp_id_list:
            snp_name, _chromosome, _position = snp_id2info[snp_id]
            header.append(snp_name)
        cols_to_be_tossed_out = None

    write_data_matrix(data_matrix, self.output_fname, header,
                      strain_acc_list, category_list,
                      rows_to_be_tossed_out=rows_to_be_tossed_out,
                      cols_to_be_tossed_out=cols_to_be_tossed_out,
                      nt_alphabet=self.nt_alphabet,
                      strain_acc2other_info=strain_acc2other_info,
                      delimiter=self.delimiter)