Beispiel #1
0
 def run(self):
     """
     Connect to the database, run the parser registered for self.type on
     self.inputfile, and issue the SQL statement "end" when self.commit
     is set (presumably closing the open transaction -- confirm against
     db_connect()).
     """
     connection, cursor = db_connect(self.hostname, self.dbname, self.schema)

     # Both the parser and its destination table are looked up by self.type.
     parse = self.parser_dict[self.type]
     parse(cursor, self.inputfile, self.output_table_dict[self.type],
           self.organism, self.sequence_type)

     if self.commit:
         cursor.execute("end")
Beispiel #2
0
	def run(self):
		"""
		2007-02-17
			Build the strain and SNP accession-to-id lookups, hand them to
			parse_file() together with the input file and target tables, and
			issue "end" when self.commit is set.

		Call outline:
			-db_connect()
			-get_strain_acc2id()
			-get_snp_acc2id()
			-parse_file()
				-expand_snp_locus_table()
				-submit_to_strain_info_table()
				-submit_to_snp_table()
		"""
		connection, cursor = db_connect(self.hostname, self.dbname, self.schema)

		# Lookups are restricted to self.tax_id.
		strain_lookup = self.get_strain_acc2id(
			cursor, self.strain_info_table, self.tax_id)
		snp_lookup = self.get_snp_acc2id(
			cursor, self.snp_locus_table, self.tax_id)

		self.parse_file(
			self.input_fname,
			cursor,
			self.output_table,
			self.tax_id,
			self.snp_acc_category_pattern,
			strain_lookup,
			snp_lookup,
			self.snp_locus_table)

		if self.commit:
			cursor.execute("end")
Beispiel #3
0
	def run(self):
		"""
		12-11-05
			Pick the id-mapping helper that matches self.type, run the parser
			registered for that type, and issue "end" when self.commit is set.

			--db_connect()
			--get_gene_symbol2gene_id()	(self.type == 1)
			--get_ensembl_id2gene_id()	(self.type == 2)
			--parser_dict[]
				--harbison2004_parse()
				--cisred_parse()
		"""
		connection, cursor = db_connect(self.hostname, self.dbname, self.schema)

		# Any type other than 1 or 2 is parsed without a mapping.
		id_mapping = None
		if self.type == 1:
			id_mapping = get_gene_symbol2gene_id(cursor, self.tax_id)
		elif self.type == 2:
			id_mapping = get_ensembl_id2gene_id(cursor, self.tax_id)

		parse = self.parser_dict[self.type]
		parse(cursor, self.inputfile, id_mapping, self.tax_id)

		if self.commit:
			cursor.execute("end")
Beispiel #4
0
    def run(self):
        """
        2007-02-17
            Build the strain and SNP accession-to-id lookups, hand them to
            parse_file() together with the input file and target tables, and
            issue "end" when self.commit is set.

        Call outline:
            -db_connect()
            -get_strain_acc2id()
            -get_snp_acc2id()
            -parse_file()
                -expand_snp_locus_table()
                -submit_to_strain_info_table()
                -submit_to_snp_table()
        """
        connection, cursor = db_connect(self.hostname, self.dbname, self.schema)

        # Lookups are restricted to self.tax_id.
        strain_lookup = self.get_strain_acc2id(
            cursor, self.strain_info_table, self.tax_id)
        snp_lookup = self.get_snp_acc2id(
            cursor, self.snp_locus_table, self.tax_id)

        self.parse_file(
            self.input_fname,
            cursor,
            self.output_table,
            self.tax_id,
            self.snp_acc_category_pattern,
            strain_lookup,
            snp_lookup,
            self.snp_locus_table)

        if self.commit:
            cursor.execute("end")
Beispiel #5
0
    def run(self):
        """
        Read SNP calls from the database and write them out as a data matrix.

        2008-05-08
            transpose everything if output_matrix_type=1 (bjarni's SNP matrix format)
        2007-09-22
            for mysql_connection
                add get_nativename_snpid2call_m()
                add fill_in_resolved_duplicated_calls()
        2007-02-19
            --db_connect
            --get_snp_id2index()
            --get_strain_id2index()
            --get_strain_id_info()
            --get_snp_id_info()
            --get_data_matrix()
            if self.toss_out_rows:
                --toss_rows_to_make_distance_matrix_NA_free()
                    --find_smallest_vertex_set_to_remove_all_edges()
            --write_data_matrix()

        self.db_connection_type selects the backend:
            1: MySQLdb connection, using the *_m() helper methods
            2: db_connect(), using the plain helper methods

        Raises:
            ValueError: if self.db_connection_type is neither 1 nor 2.
                (Previously this case fell through both branches and failed
                later with an unrelated unbound-local error.)
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        if self.db_connection_type == 1:
            import MySQLdb
            conn = MySQLdb.connect(db=self.dbname,
                                   host=self.hostname,
                                   user=self.user,
                                   passwd=self.passwd)
            curs = conn.cursor()
            snp_id2index, snp_id_list, snp_id2info = self.get_snp_id2index_m(
                curs, self.input_table, self.snp_locus_table)
            # nativename2strain_id is returned by the helper but only the
            # disabled duplicated-call resolution (see NOTE below) consumed it.
            (strain_id2index, strain_id_list, nativename2strain_id,
             strain_id2acc, strain_id2category) = self.get_strain_id2index_m(
                curs, self.input_table, self.strain_info_table,
                self.only_include_strains_with_GPS,
                self.resolve_duplicated_calls,
                toss_contaminants=self.toss_contaminants)

            # Only the 'dbsnp.calls' table gets an extra SNP id mapping.
            if self.input_table == 'dbsnp.calls':
                from variation.src.FigureOut384IlluminaABMapping import get_snps_id2mapping
                snps_id2mapping = get_snps_id2mapping(self.hostname,
                                                      dbname='dbsnp',
                                                      user=self.user,
                                                      passwd=self.passwd)
            else:
                snps_id2mapping = None
            data_matrix = self.get_data_matrix_m(curs, strain_id2index,
                                                 snp_id2index, nt2number,
                                                 self.input_table,
                                                 self.need_heterozygous_call,
                                                 snps_id2mapping)
            # NOTE: post-filling data_matrix via get_nativename_snpid2call_m()
            # and fill_in_resolved_duplicated_calls() is intentionally not
            # done here; it was already disabled in the previous revision.
            if self.include_other_strain_info:
                strain_id2other_info = self.get_strain_id2other_info(
                    curs, strain_id_list, self.strain_info_table,
                    self.input_table)
            else:
                strain_id2other_info = {}
        elif self.db_connection_type == 2:
            (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
            snp_id2index, snp_id_list = self.get_snp_id2index(
                curs, self.input_table, self.snp_locus_table)
            strain_id2index, strain_id_list = self.get_strain_id2index(
                curs, self.input_table)

            strain_id2acc, strain_id2category = self.get_strain_id_info(
                curs, strain_id_list, self.strain_info_table)
            snp_id2info = self.get_snp_id_info(curs, snp_id_list,
                                               self.snp_locus_table)
            data_matrix = self.get_data_matrix(curs, strain_id2index,
                                               snp_id2index, nt2number,
                                               self.input_table,
                                               self.need_heterozygous_call)
            strain_id2other_info = {}
        else:
            # Nothing below can work without the data loaded by one of the
            # two branches above, so fail with an explicit message.
            raise ValueError(
                "unsupported db_connection_type %r (expected 1 or 2)"
                % (self.db_connection_type,))

        if self.toss_out_rows:
            rows_to_be_tossed_out = Set(
                self.toss_rows_to_make_distance_matrix_NA_free(data_matrix))
        else:
            rows_to_be_tossed_out = Set()

        # 05/08/08: additionally drop the rows flagged by
        # remove_rows_with_too_many_NAs() with row_cutoff=1.
        if self.discard_all_NA_strain:
            from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
            remove_rows_data = FilterStrainSNPMatrix.remove_rows_with_too_many_NAs(
                data_matrix, row_cutoff=1)
            rows_to_be_tossed_out.update(
                remove_rows_data.rows_with_too_many_NAs_set)

        strain_acc_list = [
            strain_id2acc[strain_id] for strain_id in strain_id_list
        ]
        category_list = [
            strain_id2category[strain_id] for strain_id in strain_id_list
        ]

        # Re-key the extra strain info by accession instead of strain id.
        strain_acc2other_info = dict(
            (strain_id2acc[strain_id], strain_id2other_info[strain_id])
            for strain_id in strain_id2other_info)

        if self.output_matrix_type == 1:
            # Transposed layout: strain accessions move into the header, and
            # each row is labelled with chromosome and position instead of
            # strain accession and category.
            data_matrix = num.transpose(num.array(data_matrix))

            header = ['Chromosomes', 'Positions'] + strain_acc_list
            chromosome_ls = []
            position_ls = []
            for snp_id in snp_id_list:
                _snp_name, chromosome, position = snp_id2info[snp_id]
                chromosome_ls.append(chromosome)
                position_ls.append(position)

            strain_acc_list = chromosome_ls
            category_list = position_ls
            # What used to be rows to drop are columns after the transpose.
            cols_to_be_tossed_out = rows_to_be_tossed_out
            rows_to_be_tossed_out = None
            # NOTE(review): the previous revision reset strain_id2other_info
            # to None here, which had no effect; strain_acc2other_info is
            # still passed through unchanged. Confirm whether it should be
            # suppressed for the transposed layout.
        else:
            header = ['strain', 'category']
            for snp_id in snp_id_list:
                snp_name, _chromosome, _position = snp_id2info[snp_id]
                header.append(snp_name)
            cols_to_be_tossed_out = None

        write_data_matrix(data_matrix, self.output_fname, header,
                          strain_acc_list, category_list,
                          rows_to_be_tossed_out=rows_to_be_tossed_out,
                          cols_to_be_tossed_out=cols_to_be_tossed_out,
                          nt_alphabet=self.nt_alphabet,
                          strain_acc2other_info=strain_acc2other_info,
                          delimiter=self.delimiter)
Beispiel #6
0
	def run(self):
		"""
		Read SNP calls from the database and write them out as a data matrix.

		2008-05-08
			transpose everything if output_matrix_type=1 (bjarni's SNP matrix format)
		2007-09-22
			for mysql_connection
				add get_nativename_snpid2call_m()
				add fill_in_resolved_duplicated_calls()
		2007-02-19
			--db_connect
			--get_snp_id2index()
			--get_strain_id2index()
			--get_strain_id_info()
			--get_snp_id_info()
			--get_data_matrix()
			if self.toss_out_rows:
				--toss_rows_to_make_distance_matrix_NA_free()
					--find_smallest_vertex_set_to_remove_all_edges()
			--write_data_matrix()

		self.db_connection_type selects the backend:
			1: MySQLdb connection, using the *_m() helper methods
			2: db_connect(), using the plain helper methods

		Raises:
			ValueError: if self.db_connection_type is neither 1 nor 2.
				(Previously this case fell through both branches and failed
				later with an unrelated unbound-local error.)
		"""
		if self.debug:
			import pdb
			pdb.set_trace()

		if self.db_connection_type == 1:
			import MySQLdb
			conn = MySQLdb.connect(db=self.dbname, host=self.hostname, user=self.user, passwd=self.passwd)
			curs = conn.cursor()
			snp_id2index, snp_id_list, snp_id2info = self.get_snp_id2index_m(curs, self.input_table, self.snp_locus_table)
			# nativename2strain_id is returned by the helper but only the
			# disabled duplicated-call resolution (see NOTE below) consumed it.
			(strain_id2index, strain_id_list, nativename2strain_id,
				strain_id2acc, strain_id2category) = self.get_strain_id2index_m(
				curs, self.input_table, self.strain_info_table,
				self.only_include_strains_with_GPS,
				self.resolve_duplicated_calls,
				toss_contaminants=self.toss_contaminants)

			# Only the 'dbsnp.calls' table gets an extra SNP id mapping.
			if self.input_table == 'dbsnp.calls':
				from variation.src.FigureOut384IlluminaABMapping import get_snps_id2mapping
				snps_id2mapping = get_snps_id2mapping(self.hostname, dbname='dbsnp', user=self.user, passwd=self.passwd)
			else:
				snps_id2mapping = None
			data_matrix = self.get_data_matrix_m(curs, strain_id2index, snp_id2index, nt2number, self.input_table, self.need_heterozygous_call, snps_id2mapping)
			# NOTE: post-filling data_matrix via get_nativename_snpid2call_m()
			# and fill_in_resolved_duplicated_calls() is intentionally not
			# done here; it was already disabled in the previous revision.
			if self.include_other_strain_info:
				strain_id2other_info = self.get_strain_id2other_info(curs, strain_id_list, self.strain_info_table, self.input_table)
			else:
				strain_id2other_info = {}
		elif self.db_connection_type == 2:
			(conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
			snp_id2index, snp_id_list = self.get_snp_id2index(curs, self.input_table, self.snp_locus_table)
			strain_id2index, strain_id_list = self.get_strain_id2index(curs, self.input_table)

			strain_id2acc, strain_id2category = self.get_strain_id_info(curs, strain_id_list, self.strain_info_table)
			snp_id2info = self.get_snp_id_info(curs, snp_id_list, self.snp_locus_table)
			data_matrix = self.get_data_matrix(curs, strain_id2index, snp_id2index, nt2number, self.input_table, self.need_heterozygous_call)
			strain_id2other_info = {}
		else:
			# Nothing below can work without the data loaded by one of the
			# two branches above, so fail with an explicit message.
			raise ValueError(
				"unsupported db_connection_type %r (expected 1 or 2)"
				% (self.db_connection_type,))

		if self.toss_out_rows:
			rows_to_be_tossed_out = Set(self.toss_rows_to_make_distance_matrix_NA_free(data_matrix))
		else:
			rows_to_be_tossed_out = Set()

		# 05/08/08: additionally drop the rows flagged by
		# remove_rows_with_too_many_NAs() with row_cutoff=1.
		if self.discard_all_NA_strain:
			from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
			remove_rows_data = FilterStrainSNPMatrix.remove_rows_with_too_many_NAs(data_matrix, row_cutoff=1)
			rows_to_be_tossed_out.update(remove_rows_data.rows_with_too_many_NAs_set)

		strain_acc_list = [strain_id2acc[strain_id] for strain_id in strain_id_list]
		category_list = [strain_id2category[strain_id] for strain_id in strain_id_list]

		# Re-key the extra strain info by accession instead of strain id.
		strain_acc2other_info = dict(
			(strain_id2acc[strain_id], strain_id2other_info[strain_id])
			for strain_id in strain_id2other_info)

		if self.output_matrix_type == 1:
			# Transposed layout: strain accessions move into the header, and
			# each row is labelled with chromosome and position instead of
			# strain accession and category.
			data_matrix = num.transpose(num.array(data_matrix))

			header = ['Chromosomes', 'Positions'] + strain_acc_list
			chromosome_ls = []
			position_ls = []
			for snp_id in snp_id_list:
				_snp_name, chromosome, position = snp_id2info[snp_id]
				chromosome_ls.append(chromosome)
				position_ls.append(position)

			strain_acc_list = chromosome_ls
			category_list = position_ls
			# What used to be rows to drop are columns after the transpose.
			cols_to_be_tossed_out = rows_to_be_tossed_out
			rows_to_be_tossed_out = None
			# NOTE(review): the previous revision reset strain_id2other_info
			# to None here, which had no effect; strain_acc2other_info is
			# still passed through unchanged. Confirm whether it should be
			# suppressed for the transposed layout.
		else:
			header = ['strain', 'category']
			for snp_id in snp_id_list:
				snp_name, _chromosome, _position = snp_id2info[snp_id]
				header.append(snp_name)
			cols_to_be_tossed_out = None

		write_data_matrix(data_matrix, self.output_fname, header, strain_acc_list, category_list,
			rows_to_be_tossed_out=rows_to_be_tossed_out,
			cols_to_be_tossed_out=cols_to_be_tossed_out, nt_alphabet=self.nt_alphabet,
			strain_acc2other_info=strain_acc2other_info, delimiter=self.delimiter)
Beispiel #7
0
	def run(self):
		"""
		Connect to the database, run the parser registered for self.type on
		self.inputfile, and issue the SQL statement "end" when self.commit
		is set (presumably closing the open transaction -- confirm against
		db_connect()).
		"""
		connection, cursor = db_connect(self.hostname, self.dbname, self.schema)

		# Both the parser and its destination table are looked up by self.type.
		parse = self.parser_dict[self.type]
		parse(
			cursor,
			self.inputfile,
			self.output_table_dict[self.type],
			self.organism,
			self.sequence_type)

		if self.commit:
			cursor.execute("end")