def run(self):
        """
        Reorder the rows of the 2nd SNP dataset to follow the 1st one and save it.

        Both input files are loaded as arrays. If
        self.row_matching_by_which_value is 0, the 1st file is loaded with
        ignore_2nd_column=1. Values 1 and 2 are handed to TwoSNPData shifted
        down by one (0 and 1); any other value is handed over as None.
        The reordered dataset is written to self.output_fname.

        2008-06-02
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        matching_choice = self.row_matching_by_which_value

        # only the "match by row id as a whole" mode (0) drops the 2nd column
        first_kwargs = dict(input_fname=self.input_fname1, turn_into_array=1)
        if matching_choice == 0:
            first_kwargs['ignore_2nd_column'] = 1
        snpData1 = SNPData(**first_kwargs)
        snpData2 = SNPData(input_fname=self.input_fname2, turn_into_array=1)

        if matching_choice in (1, 2):
            row_matching_by_which_value = matching_choice - 1
        else:
            row_matching_by_which_value = None

        pairedData = TwoSNPData(
            SNPData1=snpData1,
            SNPData2=snpData2,
            debug=self.debug,
            row_matching_by_which_value=row_matching_by_which_value)
        reorderedData = pairedData.order2ndSNPDataRowsSameAs1stSNPData()
        reorderedData.tofile(self.output_fname)
    def run(self):
        """
        Convert the SNP matrix in self.inputFname into a VCF file (self.outputFname).

        Steps:
            1. Load the matrix as an array with the 2nd column ignored.
            2. Pass it through SNPData.removeMonomorphicCols() and, when
               self.min_MAF is positive, SNPData.removeColsByMAF().
            3. Write one VCF record per remaining column.

        Record layout:
            - The column id is split on '_'; its first two fields become
              CHROM and POS.
            - Alleles are numbered per site in order of first appearance.
              Allele 0 is REF. All further alleles are written comma-separated
              in ALT, so every index used in a GT field has a matching allele
              (previously only allele 1 was listed). A site with a single
              allele repeats REF in ALT, as before.
            - A genotype that is 'NA' or not exactly two characters long is
              written as "./.".
            - ID, QUAL, FILTER, INFO and FORMAT are the fixed placeholders
              ".", 999, 'PASS', "DP=100" and "GT".

        A site without any callable genotype has no REF allele; it is skipped
        and counted on stderr instead of aborting with a KeyError.
        The writer is closed even if an error interrupts the conversion.
        """

        if self.debug:
            import pdb
            pdb.set_trace()

        snpData = SNPData(input_fname=self.inputFname,
                          turn_into_array=1,
                          ignore_2nd_column=1)
        snpData = SNPData.removeMonomorphicCols(snpData, NA_set=set([]))
        if self.min_MAF and self.min_MAF > 0:
            snpData = SNPData.removeColsByMAF(snpData,
                                              min_MAF=self.min_MAF,
                                              NA_set=set([]))

        self.writer = VCFFile(outputFname=self.outputFname, openMode='w')
        try:
            self.writer.makeupHeaderFromSampleIDList(
                sampleIDList=snpData.row_id_ls)
            self.writer.writeMetaAndHeader()

            counter = 0
            no_of_skipped = 0
            for j, snp_id in enumerate(snpData.col_id_ls):
                chromosome, start = snp_id.split('_')[:2]
                genotype_ls = utils.dict_map(number2di_nt,
                                             snpData.data_matrix[:, j])
                genotype_ls_vcf = []
                allele_ls = []  # position in this list == VCF allele number
                alleleNucleotide2Number = {}
                for genotype in genotype_ls:
                    if genotype == 'NA' or len(genotype) != 2:
                        genotype_ls_vcf.append("./.")
                        continue
                    for allele in genotype:
                        if allele not in alleleNucleotide2Number:
                            alleleNucleotide2Number[allele] = len(allele_ls)
                            allele_ls.append(allele)
                    genotype_ls_vcf.append(
                        "%s/%s" % (alleleNucleotide2Number[genotype[0]],
                                   alleleNucleotide2Number[genotype[1]]))

                if not allele_ls:
                    # nothing callable at this site => no REF allele to report
                    no_of_skipped += 1
                    continue

                refAllele = allele_ls[0]
                if len(allele_ls) > 1:
                    # VCF expects every alternate allele, comma-separated
                    altAllele = ",".join(allele_ls[1:])
                else:
                    altAllele = refAllele
                row = [
                    chromosome, start, ".", refAllele, altAllele, 999, 'PASS',
                    "DP=100", "GT"
                ] + genotype_ls_vcf
                self.writer.writerow(row)
                counter += 1
            sys.stderr.write("  %s records.\n" % (counter))
            if no_of_skipped:
                sys.stderr.write(
                    "  %s sites skipped (no callable genotype).\n" %
                    (no_of_skipped))
        finally:
            self.writer.close()
Beispiel #3
0
    def readInData(
        cls, phenotype_fname, input_fname, eigen_vector_fname, phenotype_method_id_ls, test_type=1, report=0
    ):
        """
        Load SNP data, phenotype data and (optionally) principal components.

        Arguments:
            phenotype_fname: phenotype matrix file, read with turn_into_integer=0.
            input_fname: SNP matrix file.
            eigen_vector_fname: when given, PCs come from cls.getPCFromFile().
            phenotype_method_id_ls: phenotype method ids to use; when empty,
                all phenotype columns are selected.
            test_type: 4 makes this method compute PCs itself
                (pca_module.PCA_svd) if eigen_vector_fname is not given.
            report: passed to snpData.convertSNPAllele2Index().

        Returns:
            PassingData with attributes snpData (the allele-index version),
            phenData, PC_matrix (None when not available), which_phenotype_ls
            and phenotype_method_id_ls.

        The phenotype matrix is re-ordered against the row list of the SNP
        data *after* un-phenotyped rows were removed, so that it lines up
        with the strain_acc_list given to phenData. (It used to be re-ordered
        against the list read from input_fname, i.e. before the removal.)

        2010-2-25
            call removeUnPhenotypedSNPData() to shrink the snp dataset by removing un-phenotyped ecotypes
        2009-3-20
            refactored out of run(), easy for MpiAssociation.py to call
        """
        header, strain_acc_list, category_list, data_matrix = read_data(input_fname)
        snpData = SNPData(
            header=header, strain_acc_list=strain_acc_list, category_list=category_list, data_matrix=data_matrix
        )

        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            phenotype_fname, turn_into_integer=0
        )
        snpData = cls.removeUnPhenotypedSNPData(
            snpData, header_phen, strain_acc_list_phen, data_matrix_phen, phenotype_method_id_ls
        )

        newSnpData, allele2index_ls = snpData.convertSNPAllele2Index(
            report
        )  # 0 (NA) or -2 (untouched) is all converted to -2 as 0 is used to denote allele
        newSnpData.header = snpData.header

        # use the row list of the shrunken dataset: it is the same list phenData is labelled with below
        data_matrix_phen = cls.get_phenotype_matrix_in_data_matrix_order(
            snpData.strain_acc_list, strain_acc_list_phen, data_matrix_phen
        )
        phenData = SNPData(header=header_phen, strain_acc_list=snpData.strain_acc_list, data_matrix=data_matrix_phen)

        if eigen_vector_fname:
            PC_data = cls.getPCFromFile(eigen_vector_fname)
            PC_matrix = PC_data.PC_matrix
        elif test_type == 4:  # eigen_vector_fname not given for this test_type. calculate PCs.
            import pca_module

            T, P, explained_var = pca_module.PCA_svd(newSnpData.data_matrix, standardize=False)
            PC_matrix = T
        else:
            PC_matrix = None

        del snpData  # the original-allele version is no longer needed
        if phenotype_method_id_ls:
            which_phenotype_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(phenData, Set(phenotype_method_id_ls))
        else:  # if not available, take all phenotypes
            which_phenotype_ls = range(len(phenData.col_id_ls))
        pdata = PassingData(
            snpData=newSnpData,
            phenData=phenData,
            PC_matrix=PC_matrix,
            which_phenotype_ls=which_phenotype_ls,
            phenotype_method_id_ls=phenotype_method_id_ls,
        )
        return pdata
Beispiel #4
0
    def inputNodePrepare(self, snp_info=None):
        """
        Load all input data and package it for the other nodes.

        Reads the SNP matrix (self.input_fname), the pickled SNP context
        (self.snps_context_fname) and the phenotype matrix
        (self.phenotype_fname). The phenotype matrix is re-ordered to follow
        the SNP matrix rows. self.phenotype_index_ls is set to the columns
        matching self.phenotype_method_id_ls, or to all columns if none match.

        Argument:
            snp_info: passed through unchanged into the PassingData objects.

        Returns:
            PassingData with snpData_pickle, other_data_pickle,
            output_node_data_pickle (all cPickle strings, highest protocol)
            and params_ls (from self.generate_params()).

        The pickle file handle is now closed explicitly (also on a failed
        load) rather than being left to garbage collection.

        2009-2-16
            get phenData.phenotype_method_id_ls in the same order as phenData.col_id_ls
        2009-2-11
            refactored out of run()
        """
        header, strain_acc_list, category_list, data_matrix = read_data(
            self.input_fname)
        # category_list is not used to facilitate row-id matching
        snpData = SNPData(header=header, strain_acc_list=strain_acc_list,
                          data_matrix=data_matrix, turn_into_array=1)

        picklef = open(self.snps_context_fname)
        try:
            snps_context_wrapper = cPickle.load(picklef)
        finally:
            picklef.close()
        gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
        del snps_context_wrapper
        gene_id_ls = sorted(gene_id2snps_id_ls.keys())

        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
            self.phenotype_fname, turn_into_integer=0)
        phenData = SNPData(header=header_phen,
                           strain_acc_list=strain_acc_list_phen,
                           data_matrix=data_matrix_phen)
        phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
        phenData.phenotype_method_id_ls = get_phenotype_method_id_lsFromPhenData(
            phenData)  #2009-2-16

        self.phenotype_index_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(
            phenData, Set(self.phenotype_method_id_ls))

        if not self.phenotype_index_ls:
            # nothing matched => fall back to every phenotype column
            self.phenotype_index_ls = list(range(len(phenData.col_id_ls)))

        pdata = PassingData(gene_id_ls=gene_id_ls,
                            gene_id2snps_id_ls=gene_id2snps_id_ls,
                            phenotype_index_ls=self.phenotype_index_ls,
                            snp_info=snp_info)
        params_ls = self.generate_params(self.gene_id_fname, pdata,
                                         self.block_size)

        other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls,
                                 gene_id_ls=pdata.gene_id_ls,
                                 phenData=phenData,
                                 phenotype_index_ls=self.phenotype_index_ls,
                                 snp_info=snp_info)
        other_data_pickle = cPickle.dumps(other_data, -1)
        del other_data

        output_node_data = PassingData(
            phenotype_label_ls=phenData.col_id_ls,
            phenotype_index_ls=self.phenotype_index_ls)
        output_node_data_pickle = cPickle.dumps(output_node_data, -1)

        snpData_pickle = cPickle.dumps(snpData, -1)
        del snpData, data_matrix  # only the pickled copy is handed on
        return_data = PassingData(
            snpData_pickle=snpData_pickle,
            other_data_pickle=other_data_pickle,
            output_node_data_pickle=output_node_data_pickle,
            params_ls=params_ls)
        return return_data
    def run(self):
        """
        Work out the A/B allele mapping between two SNP datasets and store it.

        Opens a raw MySQLdb connection plus a DBSNP session with the same
        credentials, loads both input matrices, lets
        TwoSNPData384.figureOutABMapping() do the work and finally commits
        (self.commit true) or rolls back the session.

        2008-08-11
            the database interface changed in variation.src.dbsnp
        2008-05-06
        """
        import MySQLdb
        # raw cursor, handed to TwoSNPData384 below
        conn = MySQLdb.connect(db=self.dbname,
                               host=self.hostname,
                               user=self.user,
                               passwd=self.passwd)
        curs = conn.cursor()
        if self.debug:
            import pdb
            pdb.set_trace()

        # second access path to the same database, used for the session
        db = DBSNP(username=self.user,
                   password=self.passwd,
                   hostname=self.hostname,
                   database=self.dbname)
        session = db.session
        session.begin()
        #transaction = session.create_transaction()

        snps_name2possible_mappings, snps_name2snps_id = self.get_snps_name2possible_mappings(
            db)

        from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
        # 1st dataset: column ids are translated via snps_name2snps_id
        header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix.read_data(
            self.input_fname1)
        snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list, data_matrix=data_matrix, \
             col_id2id=snps_name2snps_id, snps_table='dbsnp.snps')

        # 2nd dataset: the header/... variables above are re-used (overwritten) here
        header, strain_acc_list, category_list, data_matrix = FilterStrainSNPMatrix.read_data(
            self.input_fname2)
        snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list, data_matrix=data_matrix,\
            snps_table='stock_250k.snps')

        twoSNPData = TwoSNPData384(SNPData1=snpData1,
                                   SNPData2=snpData2,
                                   curs=curs,
                                   user=self.user)

        # the readme object is saved and flushed before the mapping is computed
        readme = formReadmeObj(sys.argv, self.ad, README)
        session.save(readme)
        session.flush()
        twoSNPData.figureOutABMapping(session, readme,
                                      snps_name2possible_mappings)
        if self.commit:
            # commit on both access paths: the raw cursor and the session
            curs.execute("commit")
            session.commit()
        else:
            # NOTE(review): only the session is rolled back here; the raw connection is left as is
            session.rollback()
Beispiel #6
0
	def getPhenotypeData(cls, curs, phenotype_avg_table=None, phenotype_method_table=None, ecotype_table='stock.ecotype', get_raw_data=1):
		"""
		Fetch phenotype data from the database and return it as one SNPData object.

		Combines cls.get_phenotype_method_id_info(), cls.get_ecotype_id2info()
		and cls.get_matrix(). In the returned object rows are ecotype ids
		(labelled with ecotype names) and columns are phenotype ids (labelled
		with method_id_name_ls).

		2009-2-2
			wrap up all other 3 methods
		"""
		method_info = cls.get_phenotype_method_id_info(curs, phenotype_avg_table, phenotype_method_table)
		ecotype_info = cls.get_ecotype_id2info(curs, phenotype_avg_table, ecotype_table)
		ecotype_id2index, ecotype_id_ls, ecotype_name_ls = ecotype_info
		phenotype_matrix = cls.get_matrix(curs, phenotype_avg_table, ecotype_id2index, method_info, get_raw_data)

		result = SNPData(col_id_ls=method_info.phenotype_id_ls, row_id_ls=ecotype_id_ls, data_matrix=phenotype_matrix)
		# human-readable labels next to the ids
		result.row_label_ls = ecotype_name_ls
		result.col_label_ls = method_info.method_id_name_ls
		return result
Beispiel #7
0
    def run(self):
        """
        Convert a Strain X SNP matrix into the raw-SNPs file format.

        The input (self.inputFname, delimiter auto-detected) is optionally
        re-labelled: with self.snp_id_type == 1 every column id is looked up
        through db.get_chr_pos_given_db_id2chr_pos(); columns without a
        result are dropped. The matrix is then turned into a list of
        raw-SNPs objects and written to self.outputFname with ',' as
        delimiter. self.array_id_2nd_column decides whether category_list is
        kept and whether array ids are written.

        2008-5-12
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        # database connection and etc
        db = self.db_250k
        db.session.begin()

        header, strain_acc_list, category_list, data_matrix = read_data(
            self.inputFname,
            delimiter=figureOutDelimiter(self.inputFname, report=self.report))

        if self.snp_id_type == 1:
            # 2011-2-27 translate the db_id into chr_pos because the new StrainXSNP dataset uses db_id to identify SNPs.
            # but if col-id is already chr_pos, it's fine.
            translated_header = header[:2]
            kept_col_index_ls = []
            for col_index, snp_id in enumerate(header[2:]):
                chr_pos = db.get_chr_pos_given_db_id2chr_pos(snp_id, )
                if chr_pos is None:
                    continue
                kept_col_index_ls.append(col_index)
                translated_header.append(chr_pos)
            # to remove no-db_id columns from data matrix
            data_matrix = numpy.array(data_matrix)[:, kept_col_index_ls]
            header = translated_header

        snpData_kwargs = dict(header=header,
                              strain_acc_list=strain_acc_list,
                              data_matrix=data_matrix)
        if self.array_id_2nd_column:
            # otherwise category_list is ignored
            snpData_kwargs['category_list'] = category_list
        snpData = SNPData(**snpData_kwargs)

        rawSnpsData_ls = SNPData2RawSnpsData_ls(snpData,
                                                need_transposeSNPData=1,
                                                report=self.report)
        chromosome_ls = []
        for rawSnpsData in rawSnpsData_ls:
            chromosome_ls.append(rawSnpsData.chromosome)
        snpsdata.writeRawSnpsDatasToFile(self.outputFname,
                                         rawSnpsData_ls,
                                         chromosomes=chromosome_ls,
                                         deliminator=',',
                                         withArrayIds=self.array_id_2nd_column)
Beispiel #8
0
    def run(self):
        """
        Run the CNV quality check against SNP data and write the result tables.

        Steps:
            1. Get chr2CNV_probe_ls, either from the cache file
               /tmp/chr2CNV_probe_ls.pickle or from self.probes_table
               (the cache file is then created).
            2. Load the SNP matrix (self.input_fname) and the CNV intensity
               matrix (self.cnv_input_fname, as floats).
            3. Call self.getCNVQCMatrix() and write its plot data to
               /tmp/CNV_plot_data.pickle and its four tables to
               <self.output_fname_prefix>_{mismatch,insertion,deletion,qc}.tsv.

        Pickle files are opened in binary mode, as required for the binary
        protocol selected by -1, and are closed explicitly instead of being
        left to garbage collection.

        2009-2-12
        """
        if self.debug:
            import pdb
            pdb.set_trace()
        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname,
                               host=self.hostname,
                               user=self.db_user,
                               passwd=self.db_passwd)
        curs = conn.cursor()

        chr2CNV_probe_ls_pickle_fname = '/tmp/chr2CNV_probe_ls.pickle'
        if not os.path.isfile(chr2CNV_probe_ls_pickle_fname):
            # cache miss: query the database, then store the result for the next run
            chr2CNV_probe_ls = self.get_chr2CNV_probe_ls(
                curs, self.probes_table)
            picklef = open(chr2CNV_probe_ls_pickle_fname, 'wb')
            try:
                cPickle.dump(chr2CNV_probe_ls, picklef, -1)
            finally:
                picklef.close()
        else:
            picklef = open(chr2CNV_probe_ls_pickle_fname, 'rb')
            try:
                chr2CNV_probe_ls = cPickle.load(picklef)
            finally:
                picklef.close()

        snpData = SNPData(input_fname=self.input_fname,
                          turn_into_array=1,
                          ignore_2nd_column=1)

        probeData = self.get_probe_id2snp_id_ls(chr2CNV_probe_ls,
                                                snpData.col_id_ls)
        SNP2Col_allele = self.get_SNP2Col_allele(snpData)

        cnvIntensityData = SNPData(input_fname=self.cnv_input_fname,
                                   turn_into_array=1,
                                   ignore_2nd_column=1,
                                   matrix_data_type=float)

        cnvQCData = self.getCNVQCMatrix(probeData.probe_id2snp_id_ls,
                                        probeData.snp_id2tup, snpData,
                                        SNP2Col_allele, cnvIntensityData)

        plotdata_pickle_fname = '/tmp/CNV_plot_data.pickle'
        picklef = open(plotdata_pickle_fname, 'wb')
        try:
            cPickle.dump(cnvQCData.plotData, picklef, -1)
        finally:
            picklef.close()

        cnvQCData.mismatchData.tofile('%s_mismatch.tsv' %
                                      self.output_fname_prefix)
        cnvQCData.insertionData.tofile('%s_insertion.tsv' %
                                       self.output_fname_prefix)
        cnvQCData.deletionData.tofile('%s_deletion.tsv' %
                                      self.output_fname_prefix)
        cnvQCData.qcData.tofile('%s_qc.tsv' % self.output_fname_prefix)
Beispiel #9
0
    def run(self):
        """
        Post-process a CNV intensity matrix (self.input_fname, read as floats).

        self.run_type == 1:
            write a matrix of the same shape to self.output_fname that holds
            -1 wherever the intensity is <= self.max_del_intensity and 1
            everywhere else.
        self.run_type == 2:
            plot the per-probe intensity sum (over all rows) against the
            probe position in blocks of 1000 probes, one PNG per block named
            <self.output_fname>_<1st position>_<end position>.png.

        Column ids are split on '_' and every field must be an integer; the
        2nd field is used as the probe position.

        Fixes over the previous version:
            - the last block no longer raises IndexError when the number of
              probes is an exact multiple of the block size (the end label is
              clamped to the last probe);
            - a trailing partial block is plotted instead of being dropped;
            - no reliance on numpy.int (removed from NumPy) or on map()
              returning a list;
            - the deletion mask is computed by one array comparison instead
              of a Python loop over every cell.
        """
        cnvIntensityData = SNPData(input_fname=self.input_fname,
                                   turn_into_array=1,
                                   ignore_2nd_column=1,
                                   matrix_data_type=float)
        data_matrix = cnvIntensityData.data_matrix

        probe_pos_ls = []
        sum_intensity_ls = []
        for j in range(data_matrix.shape[1]):
            probe_id = [
                int(field) for field in cnvIntensityData.col_id_ls[j].split('_')
            ]
            probe_pos_ls.append(probe_id[1])
            sum_intensity_ls.append(numpy.sum(data_matrix[:, j]))

        if self.run_type == 1:
            newDataMatrix = numpy.ones(data_matrix.shape, int)
            newDataMatrix[data_matrix <= self.max_del_intensity] = -1
            newData = SNPData(row_id_ls=cnvIntensityData.row_id_ls,
                              col_id_ls=cnvIntensityData.col_id_ls,
                              data_matrix=newDataMatrix)
            newData.tofile(self.output_fname)
        elif self.run_type == 2:
            block_size = 1000
            no_of_probes = len(probe_pos_ls)
            # stepping by block_size also covers a trailing partial block
            for start_index in range(0, no_of_probes, block_size):
                end_index = min(start_index + block_size, no_of_probes)
                # end_index is exclusive and equals no_of_probes in the last
                # block, so clamp it before using it as a label
                label_index = min(end_index, no_of_probes - 1)
                fname = '%s_%s_%s.png' % (self.output_fname,
                                          probe_pos_ls[start_index],
                                          probe_pos_ls[label_index])
                pylab.clf()
                pylab.plot(probe_pos_ls[start_index:end_index],
                           sum_intensity_ls[start_index:end_index],
                           '.',
                           markersize=4,
                           alpha=0.4)
                pylab.xlabel('chromosome position')
                pylab.ylabel('sum intensity')
                pylab.savefig(fname, dpi=300)
Beispiel #10
0
    def main(self):
        """
        Entry point: impute the dataset in self.input_fname.

        For any self.input_file_format other than 1 a single SNPData object
        is built from the file and passed to self.run().

        For format 1 the matrix is read up front. The number of samplings is
        ceil(coverage * no_of_rows / no_of_accessions_per_sampling):
            - more than one sampling: self.samplingImpute() is run and the
              result is written to self.output_fname as a YuSNPData matrix;
            - otherwise: the header is written and self.run() is called once
              per chromosome, in sorted chromosome order.
        """
        if self.debug:
            import pdb

            pdb.set_trace()

        if self.input_file_format != 1:
            self.run(
                SNPData(
                    inFile=self.input_fname,
                    input_file_format=self.input_file_format,
                    lower_case_for_imputation=self.lower_case_for_imputation,
                )
            )
            return

        header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname, turn_into_integer=0)
        snps_name_ls = header[2:]
        no_of_rows = len(strain_acc_list)
        no_of_samplings = int(math.ceil(self.coverage * no_of_rows / float(self.no_of_accessions_per_sampling)))

        if no_of_samplings > 1:
            # NOTE(review): new_snps_name_ls is returned but the original snps_name_ls is used below - confirm
            imputed_matrix, new_snps_name_ls = self.samplingImpute(
                snps_name_ls,
                data_matrix,
                input_file_format=1,
                input_NA_char="0",
                lower_case_for_imputation=self.lower_case_for_imputation,
                npute_window_size=self.single_window_size,
                no_of_accessions_per_sampling=self.no_of_accessions_per_sampling,
                coverage=self.coverage,
            )
            YuSNPData(
                strain_acc_list=strain_acc_list,
                category_list=category_list,
                col_id_ls=snps_name_ls,
                data_matrix=imputed_matrix,
            ).tofile(self.output_fname)
            return

        self.outputHeader(self.output_fname, strain_acc_list, category_list)
        chr2no_of_snps = self.get_chr2no_of_snps(snps_name_ls)
        for chromosome in sorted(chr2no_of_snps.keys()):
            per_chromosome_data = SNPData(
                inFile=self.input_fname,
                snps_name_ls=snps_name_ls,
                data_matrix=data_matrix,
                chromosome=chromosome,
                input_file_format=self.input_file_format,
                lower_case_for_imputation=self.lower_case_for_imputation,
            )
            self.run(per_chromosome_data)
	def loadDataStructure(self, gene_annotation_picklef, min_MAF=0, min_distance=20000, \
						list_type_id=None, snp_matrix_fname=None, snp_matrix_data_type=1):
		"""
		Set up the database connection and load the shared data structures.

		Connects to Stock_250kDB (stored as self.db), then collects SNP info,
		gene annotation, the candidate gene set for list_type_id (empty Set
		if not given) and, when snp_matrix_fname is given, the SNP matrix.
		snp_matrix_data_type 3 loads the matrix as float, anything else as int.
		min_MAF and min_distance are accepted but not used in this method.

		Returns a PassingData with gene_annotation, snp_info,
		candidate_gene_set and snpData (None without snp_matrix_fname).

		2009-5-30
			add argument snp_matrix_fname
		2008-11-25
		2008-10-01
			wrap a few functions up, convenient for both run() and drawSNPRegion()
		"""
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
				   password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup(create_tables=False)
		self.db = db
		snp_info = self.getSNPInfo(db)
		gene_annotation = self.dealWithGeneAnnotation(gene_annotation_picklef)

		if not list_type_id:
			candidate_gene_set = Set()
		else:
			candidate_gene_set = Set(self.getGeneList(list_type_id))

		snpData = None
		if snp_matrix_fname:
			# 2009-3-23 float is for the CNV amplitude file
			matrix_data_type = float if snp_matrix_data_type == 3 else int
			snpData = SNPData(input_fname=snp_matrix_fname, turn_into_integer=1, turn_into_array=1, \
				ignore_2nd_column=1, matrix_data_type=matrix_data_type)
			# 2008-12-05 fake a snp_info for findSNPsInRegion
			self.construct_chr_pos2index_forSNPData(snpData)

		return PassingData(gene_annotation=gene_annotation, snp_info=snp_info, \
			candidate_gene_set=candidate_gene_set, snpData=snpData)
Beispiel #12
0
	def run(self):
		"""
		Build the results-method by results-method matrix and output it.

		The matrix comes from self.getDataMatrix(). It is written to
		self.output_fname (if set) and drawn as a coloured grid with a
		continuous legend into self.fig_fname (if set). The program exits
		with status 3 when the matrix is empty.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		db = Stock_250kDB(drivername=self.drivername, username=self.db_user,
						password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup()
		session = db.session

		method_info = self.getResultsMethodIDInfo(db, self.call_method_id_ls, self.min_distance, self.get_closest, self.min_MAF)
		method_id2gene_set = self.getResultsMethodID2GeneSet(db, method_info, self.results_directory, self.max_rank)
		rdata = self.getDataMatrix(method_id2gene_set, method_info)

		# the same labels serve as column header and as row labels
		label_ls = method_info.results_method_id_label_ls
		header = ['', ''] + label_ls
		category_list = method_info.results_method_id_ls

		if SNPData.isDataMatrixEmpty(rdata.data_matrix):
			sys.stderr.write("Nothing fetched from database.\n")
			sys.exit(3)

		if self.output_fname:
			write_data_matrix(rdata.data_matrix, self.output_fname, header, label_ls, category_list)

		if not self.fig_fname:
			return

		font = get_font(self.font_path, font_size=self.font_size)	#2008-08-01

		def value2color_func(x):
			# colour scale spans the observed value range
			return Value2Color.value2HSLcolor(x, rdata.min_value, rdata.max_value)

		legend_image = drawContinousLegend(rdata.min_value, rdata.max_value, self.no_of_ticks, value2color_func, font)
		matrix_image = drawMatrix(rdata.data_matrix, value2color_func, label_ls,\
					label_ls, with_grid=1, font=font)
		combined_image = combineTwoImages(matrix_image, legend_image, font=font)
		combined_image.save(self.fig_fname)
Beispiel #13
0
    def removeUnPhenotypedSNPData(
        clf, snpData, header_phen, strain_acc_list_phen, data_matrix_phen, phenotype_method_id_ls
    ):
        """
        Return snpData restricted to rows whose ecotype has a phenotype value.

        An ecotype counts as phenotyped if at least one of the selected
        phenotype columns (those matching phenotype_method_id_ls, or all
        columns if that list is empty) holds something other than the string
        "NA". The ecotype id of a SNP row is the first element of its row id.
        Progress and the number of removed rows go to stderr.

        2010-2-25
            remove un-phenotyped ecotypes from the SNP data in order to keep the snp dataset small
        """
        sys.stderr.write("Removing un-phenotyped ecotypes from the SNP data ...")
        phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen, data_matrix=data_matrix_phen)
        if not phenotype_method_id_ls:
            # if not available, take all phenotypes
            which_phenotype_ls = range(len(phenData.col_id_ls))
        else:
            which_phenotype_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(phenData, Set(phenotype_method_id_ls))

        # 2010-2-25 phenotype values are in raw string.
        phenotyped_ecotype_id_set = set()
        for row_index, ecotype_id in enumerate(phenData.row_id_ls):
            has_value = any(
                phenData.data_matrix[row_index][col_index] != "NA" for col_index in which_phenotype_ls
            )
            if has_value:
                phenotyped_ecotype_id_set.add(ecotype_id)

        # 2010-2-21; 1st column of a row id is ecotype_id, 2nd is array id
        row_ids_to_be_kept = set(
            row_id for row_id in snpData.row_id_ls if row_id[0] in phenotyped_ecotype_id_set
        )
        no_of_removed = len(snpData.row_id_ls) - len(row_ids_to_be_kept)
        snpData = SNPData.keepRowsByRowID(snpData, row_ids_to_be_kept)
        sys.stderr.write("%s removed. Done.\n" % (no_of_removed))
        return snpData
Beispiel #14
0
    def qcDataMatrixVSsnpData(self, pdata, snps_name2snps_id, snpData2, curs,
                              session, readme):
        """
        Compare the data matrix in pdata against snpData2.

        self.run_type 1 compares row-wise and returns the per-row rates;
        run_type 2 saves the column-wise comparison through session/readme
        and returns an empty rate dictionary. Any other run_type ends the
        program with exit status 5.

        Returns a PassingData with row_id2NA_mismatch_rate and
        row_id12row_id2 (taken from the TwoSNPData object).

        2008-08-16
            split from run() to enable one_by_one option
        """
        # ecotype_id_ls and call_info_id_ls are swapped on purpose: strain_acc_list=ecotype_id_ls.
        # snps_table is set to the stock_250k snps_table.
        snpData1 = SNPData(header=pdata.header,
                           strain_acc_list=pdata.ecotype_id_ls,
                           category_list=pdata.call_info_id_ls,
                           data_matrix=pdata.data_matrix,
                           min_probability=self.min_probability,
                           call_method_id=self.call_method_id,
                           col_id2id=snps_name2snps_id,
                           max_call_info_mismatch_rate=self.max_call_info_mismatch_rate,
                           snps_table='stock_250k.snps')

        comparison = TwoSNPData(SNPData1=snpData1,
                                SNPData2=snpData2,
                                curs=curs,
                                QC_method_id=self.QC_method_id,
                                user=self.user,
                                row_matching_by_which_value=0,
                                debug=self.debug)

        if self.run_type == 1:
            mismatch_rate_by_row = comparison.cmp_row_wise()
        elif self.run_type == 2:
            comparison.save_col_wise(session, readme)
            mismatch_rate_by_row = {}
        else:
            sys.stderr.write("run_type=%s is not supported.\n" % self.run_type)
            sys.exit(5)

        result = PassingData()
        result.row_id2NA_mismatch_rate = mismatch_rate_by_row
        result.row_id12row_id2 = comparison.row_id12row_id2
        return result
Beispiel #15
0
	def inputNodePrepare(self, snp_info=None):
		"""
		Load all input data and package it for the other nodes.

		Reads the SNP matrix (self.input_fname), the pickled SNP context
		(self.snps_context_fname) and the phenotype matrix
		(self.phenotype_fname). The phenotype matrix is re-ordered to follow
		the SNP matrix rows. self.phenotype_index_ls is set to the columns
		matching self.phenotype_method_id_ls, or to all columns if none match.

		Argument:
			snp_info: passed through unchanged into the PassingData objects.

		Returns:
			PassingData with snpData_pickle, other_data_pickle,
			output_node_data_pickle (all cPickle strings, highest protocol)
			and params_ls (from self.generate_params()).

		The pickle file handle is now closed explicitly (also on a failed
		load) rather than being left to garbage collection.

		2009-2-16
			get phenData.phenotype_method_id_ls in the same order as phenData.col_id_ls
		2009-2-11
			refactored out of run()
		"""
		header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
		# category_list is not used to facilitate row-id matching
		snpData = SNPData(header=header, strain_acc_list=strain_acc_list, \
						data_matrix=data_matrix, turn_into_array=1)

		picklef = open(self.snps_context_fname)
		try:
			snps_context_wrapper = cPickle.load(picklef)
		finally:
			picklef.close()
		gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
		del snps_context_wrapper
		gene_id_ls = sorted(gene_id2snps_id_ls.keys())

		header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(self.phenotype_fname, turn_into_integer=0)
		phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen, data_matrix=data_matrix_phen)
		phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
		phenData.phenotype_method_id_ls = get_phenotype_method_id_lsFromPhenData(phenData)	#2009-2-16

		self.phenotype_index_ls = PlotGroupOfSNPs.findOutWhichPhenotypeColumn(phenData, Set(self.phenotype_method_id_ls))

		if not self.phenotype_index_ls:
			# nothing matched => fall back to every phenotype column
			self.phenotype_index_ls = list(range(len(phenData.col_id_ls)))

		pdata = PassingData(gene_id_ls=gene_id_ls, gene_id2snps_id_ls=gene_id2snps_id_ls, \
						phenotype_index_ls=self.phenotype_index_ls, snp_info=snp_info)
		params_ls = self.generate_params(self.gene_id_fname, pdata, self.block_size)

		other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls, gene_id_ls=pdata.gene_id_ls, phenData=phenData, \
								phenotype_index_ls=self.phenotype_index_ls, snp_info=snp_info)
		other_data_pickle = cPickle.dumps(other_data, -1)
		del other_data

		output_node_data = PassingData(phenotype_label_ls=phenData.col_id_ls, \
								phenotype_index_ls=self.phenotype_index_ls)
		output_node_data_pickle = cPickle.dumps(output_node_data, -1)

		snpData_pickle = cPickle.dumps(snpData, -1)
		del snpData, data_matrix	# only the pickled copy is handed on
		return_data = PassingData(snpData_pickle=snpData_pickle, other_data_pickle=other_data_pickle,\
								output_node_data_pickle=output_node_data_pickle, params_ls=params_ls)
		return return_data
Beispiel #16
0
	def run(self):
		"""
		Convert the SNP matrix in self.input_fname with convert2Binary() and save it.

		The input delimiter is auto-detected. The converted matrix goes to
		self.output_fname; if self.mapping_fname is set, the allele-index to
		allele mapping is written there via self.output_allele2index_ls().

		2008-9-7
		"""
		if self.debug:
			import pdb
			pdb.set_trace()

		matrix_parts = read_data(self.input_fname,
			delimiter=figureOutDelimiter(self.input_fname, report=self.report))
		header, strain_acc_list, category_list, data_matrix = matrix_parts

		originalData = SNPData(header=header, strain_acc_list=strain_acc_list, \
			category_list=category_list, data_matrix=data_matrix)
		binaryData, allele_index2allele_ls = originalData.convert2Binary(self.report)

		# output allele_index2allele_ls
		if self.mapping_fname:
			self.output_allele2index_ls(originalData, allele_index2allele_ls, self.mapping_fname)

		binaryData.tofile(self.output_fname)
Beispiel #17
0
    def getPhenotypeDataInSNPDataOrder(cls, snpData):
        """
        Return the phenotype matrix (accession by phenotype) in the row order of snpData.

        The result is cached on the `model` object as
        model.phenoData_inSNPDataOrder; later calls return the cached object
        without looking at snpData again.

        2009-4-30
            get data from all the phenotypes into one matrix (accession by phenotype)
        """
        cached = getattr(model, "phenoData_inSNPDataOrder", None)
        if cached is not None:
            return cached

        phenoData = cls.getPhenotypeData()
        # row label is that of the SNP matrix
        orderedData = SNPData(
            col_id_ls=phenoData.col_id_ls, strain_acc_list=snpData.row_id_ls, data_matrix=phenoData.data_matrix
        )
        orderedData.col_label_ls = phenoData.col_label_ls
        # phenoData.row_id_ls is a list of integer ecotype ids, need to convert
        phenotype_row_id_ls = [str(row_id) for row_id in phenoData.row_id_ls]
        orderedData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
            snpData.row_id_ls, phenotype_row_id_ls, orderedData.data_matrix
        )
        model.phenoData_inSNPDataOrder = orderedData
        return orderedData
    def getPhenotypeData(cls, curs, phenotype_avg_table=None, phenotype_method_table=None, ecotype_table='stock.ecotype', get_raw_data=1,\
         getPublicPhenotype=False):
        """
        Fetch phenotype data from the database and return it as one SNPData object.

        Combines cls.get_phenotype_method_id_info(), cls.get_ecotype_id2info()
        and cls.get_matrix(); getPublicPhenotype is forwarded to all three.
        In the returned object rows are ecotype ids (labelled with ecotype
        names) and columns are phenotype ids (labelled with method_id_name_ls).

        2012.9.28
            add argument getPublicPhenotype
        2009-2-2
            wrap up all other 3 methods
        """
        method_info = cls.get_phenotype_method_id_info(
            curs,
            phenotype_avg_table=phenotype_avg_table,
            phenotype_method_table=phenotype_method_table,
            getPublicPhenotype=getPublicPhenotype)
        ecotype_info = cls.get_ecotype_id2info(
            curs,
            phenotype_avg_table=phenotype_avg_table,
            ecotype_table=ecotype_table,
            getPublicPhenotype=getPublicPhenotype)
        ecotype_id2index, ecotype_id_ls, ecotype_name_ls = ecotype_info
        phenotype_matrix = cls.get_matrix(
            curs,
            phenotype_avg_table,
            ecotype_id2index=ecotype_id2index,
            phenotype_info=method_info,
            get_raw_data=get_raw_data,
            phenotype_method_table=phenotype_method_table,
            getPublicPhenotype=getPublicPhenotype)

        result = SNPData(col_id_ls=method_info.phenotype_id_ls,
                         row_id_ls=ecotype_id_ls,
                         data_matrix=phenotype_matrix)
        # human-readable labels next to the ids
        result.row_label_ls = ecotype_name_ls
        result.col_label_ls = method_info.method_id_name_ls
        return result
Beispiel #19
0
	def run(self):
		"""
		Scatter-plot two phenotypes against each other.
		
		Reads the phenotype matrix from self.phenotype_fname, picks the columns belonging to
		self.phenotype_method_id1 / self.phenotype_method_id2, plots every row in which both
		values are non-NaN, adds a diagonal reference line and saves the figure as
		<self.output_fname_prefix>.png (dpi=400) and <self.output_fname_prefix>.svg.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		db = Stock_250kDB.Stock_250kDB(
			drivername=self.drivername,
			username=self.db_user,
			password=self.db_passwd,
			hostname=self.hostname,
			database=self.dbname,
			schema=self.schema)
		db.setup(create_tables=False)
		session = db.session
		
		header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
			self.phenotype_fname, turn_into_integer=0)
		#row label is that of the SNP matrix, because the phenotype matrix is going to be re-ordered that way
		phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen, data_matrix=data_matrix_phen)
		#tricky: strain_acc_list_phen is passed as the phenotype-side row ids
		phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
			phenData.row_id_ls, strain_acc_list_phen, phenData.data_matrix)
		
		x_col_index = self.findOutWhichPhenotypeColumn(phenData, Set([self.phenotype_method_id1]))[0]
		y_col_index = self.findOutWhichPhenotypeColumn(phenData, Set([self.phenotype_method_id2]))[0]
		
		#keep only the rows where both phenotypes are available
		x_ls = []
		y_ls = []
		for matrix_row in phenData.data_matrix:
			if numpy.isnan(matrix_row[x_col_index]) or numpy.isnan(matrix_row[y_col_index]):
				continue
			x_ls.append(matrix_row[x_col_index])
			y_ls.append(matrix_row[y_col_index])
		
		pylab.clf()
		pylab.title('Phenotype Contrast')
		pylab.plot(x_ls, y_ls, '.', alpha=0.6)
		pylab.grid(alpha=0.3)
		x_method = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id1)
		y_method = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id2)
		pylab.xlabel(x_method.short_name)
		pylab.ylabel(y_method.short_name)
		
		#diagonal line (perfect correlation) over the value range shared by both axes
		diagonal_start = max(min(x_ls), min(y_ls))
		diagonal_end = min(max(x_ls), max(y_ls))
		diagonal = [diagonal_start, diagonal_end]
		pylab.plot(diagonal, diagonal, c='g', alpha=0.7)
		
		pylab.savefig('%s.png'%self.output_fname_prefix, dpi=400)
		pylab.savefig('%s.svg'%self.output_fname_prefix)
Beispiel #20
0
    def run(self):
        """
        2008-9-7
            Read the matrix in self.input_fname (delimiter auto-detected), pass it
            through SNPData.convert2Binary() and write the converted matrix to
            self.output_fname. When self.mapping_fname is set, the
            allele_index2allele_ls mapping is written there as well.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        input_delimiter = figureOutDelimiter(self.input_fname,
                                             report=self.report)
        matrix_parts = read_data(self.input_fname, delimiter=input_delimiter)
        header, strain_acc_list, category_list, data_matrix = matrix_parts

        snpData = SNPData(header=header,
                          strain_acc_list=strain_acc_list,
                          category_list=category_list,
                          data_matrix=data_matrix)
        binarySnpData, allele_index2allele_ls = snpData.convert2Binary(self.report)

        # output allele_index2allele_ls only when a mapping file was requested
        if self.mapping_fname:
            self.output_allele2index_ls(snpData, allele_index2allele_ls, self.mapping_fname)

        binarySnpData.tofile(self.output_fname)
Beispiel #21
0
    def create_init_data(self):
        """
        2009-6-5
            snpData_2010_149_384 & snpData_perlegen are loaded with ignore_het=1
        2008-05-12
            initial data loading on node 0

        Returns a PassingData carrying the three SNP data sets
        (snpData_250k, snpData_2010_149_384, snpData_perlegen) and param_d,
        the result of self.generate_parameters(self.parameter_names).
        """
        # options shared by the two data sets loaded with ignore_2nd_column/ignore_het
        shared_load_options = dict(turn_into_array=1,
                                   ignore_2nd_column=1,
                                   ignore_het=1)

        init_data = PassingData()
        init_data.snpData_250k = SNPData(input_fname=self.input_fname, turn_into_array=1)
        init_data.snpData_2010_149_384 = SNPData(input_fname=self.fname_2010_149_384, **shared_load_options)
        init_data.snpData_perlegen = SNPData(input_fname=self.fname_perlegen, **shared_load_options)
        init_data.param_d = self.generate_parameters(self.parameter_names)
        return init_data
Beispiel #22
0
 def main(self):
     """
     Entry point: impute the SNP data in self.input_fname.

     For input_file_format 1 the matrix is read with read_data(). The number
     of samplings is ceil(coverage * no_of_rows / no_of_accessions_per_sampling):
       - more than one sampling: self.samplingImpute() is run and the imputed
         matrix is written to self.output_fname;
       - otherwise: the header is written and self.run() is called once per
         chromosome, in sorted chromosome order.
     For any other input format a single SNPData is built from the file and
     handed to self.run().
     """
     if self.debug:
         import pdb
         pdb.set_trace()
     if self.input_file_format == 1:
         header, strain_acc_list, category_list, data_matrix = read_data(
             self.input_fname, turn_into_integer=0)
         # the first two header fields label the two leading id columns
         snps_name_ls = header[2:]
         no_of_rows = len(strain_acc_list)
         no_of_samplings = int(
             math.ceil(self.coverage * no_of_rows /
                       float(self.no_of_accessions_per_sampling)))
         if no_of_samplings > 1:
             imputed_matrix, new_snps_name_ls = self.samplingImpute(
                 snps_name_ls,
                 data_matrix,
                 input_file_format=1,
                 input_NA_char='0',
                 lower_case_for_imputation=self.lower_case_for_imputation,
                 npute_window_size=self.single_window_size,
                 no_of_accessions_per_sampling=self.no_of_accessions_per_sampling,
                 coverage=self.coverage)
             # label the imputed matrix with the SNP names that samplingImpute()
             # returned alongside it, not with the pre-imputation list
             imputedData = YuSNPData(strain_acc_list=strain_acc_list,
                                     category_list=category_list,
                                     col_id_ls=new_snps_name_ls,
                                     data_matrix=imputed_matrix)
             imputedData.tofile(self.output_fname)
         else:
             self.outputHeader(self.output_fname, strain_acc_list,
                               category_list)
             chr2no_of_snps = self.get_chr2no_of_snps(snps_name_ls)
             # sorted() also works where dict.keys() is not a sortable list
             for chromosome in sorted(chr2no_of_snps):
                 snpData = SNPData(
                     inFile=self.input_fname,
                     snps_name_ls=snps_name_ls,
                     data_matrix=data_matrix,
                     chromosome=chromosome,
                     input_file_format=self.input_file_format,
                     lower_case_for_imputation=self.lower_case_for_imputation)
                 self.run(snpData)
     else:
         snpData = SNPData(
             inFile=self.input_fname,
             input_file_format=self.input_file_format,
             lower_case_for_imputation=self.lower_case_for_imputation)
         self.run(snpData)
Beispiel #23
0
	def run(self):
		"""
		Load the CNV probe-intensity matrix from self.input_fname and process it according
		to self.run_type.
		
		run_type 1: write to self.output_fname a matrix of the same shape as the input in which
			every entry with intensity <= self.max_del_intensity is -1 and every other entry is 1.
		run_type 2: for each block of 1000 consecutive probes (including a shorter trailing block),
			save a PNG plotting the per-probe intensity sum over all rows against the probe position.
			Files are named <output_fname>_<first position>_<end position>.png.
		
		Every column id must consist of "_"-separated integers; the second one is taken as
		the probe position.
		"""
		cnvIntensityData = SNPData(input_fname=self.input_fname, turn_into_array=1, ignore_2nd_column=1, matrix_data_type=float)
		intensity_matrix = cnvIntensityData.data_matrix
		probe_pos_ls = []
		avg_intensity_ls = []	#despite the name this holds the column sums
		
		for j in range(intensity_matrix.shape[1]):
			#all fields are converted, so a non-integer field anywhere in the id raises ValueError
			probe_id_fields = [int(field) for field in cnvIntensityData.col_id_ls[j].split('_')]
			probe_pos_ls.append(probe_id_fields[1])
			avg_intensity_ls.append(numpy.sum(intensity_matrix[:,j]))
		
		if self.run_type==1:
			#builtin int instead of the numpy.int alias, which newer NumPy releases no longer provide
			newDataMatrix = numpy.ones(intensity_matrix.shape, int)
			#one boolean-mask assignment replaces the element-by-element Python loop
			newDataMatrix[intensity_matrix<=self.max_del_intensity] = -1
			newData = SNPData(row_id_ls=cnvIntensityData.row_id_ls, col_id_ls=cnvIntensityData.col_id_ls, data_matrix=newDataMatrix)
			newData.tofile(self.output_fname)
		elif self.run_type==2:
			block_size = 1000
			no_of_probes = len(probe_pos_ls)
			#ceiling division, so the probes left over after the last full block are plotted as well
			no_of_blocks = (no_of_probes + block_size - 1)//block_size
			for i in range(no_of_blocks):
				start_index = i*block_size
				end_index = min((i+1)*block_size, no_of_probes)	#exclusive slice end
				#end_index equals no_of_probes for the final block; clamp it so the label lookup stays in range
				end_label_index = min(end_index, no_of_probes-1)
				fname = '%s_%s_%s.png'%(self.output_fname, probe_pos_ls[start_index], probe_pos_ls[end_label_index])
				pylab.clf()
				pylab.plot(probe_pos_ls[start_index:end_index], avg_intensity_ls[start_index:end_index], '.', markersize=4, alpha=0.4)
				pylab.xlabel('chromosome position')
				pylab.ylabel('sum intensity')
				pylab.savefig(fname, dpi=300)
    def run(self):
        """
        2009-5-28
            Read the phenotype matrix in self.input_fname, translate its row ids
            into ecotype ids with self.straightenEcotypeID() and store the data
            through putPhenotypeIntoDB() (run_type 1) or
            putReplicatePhenotypeIntoDB() (run_type 2). Any other run_type is
            reported on stderr. The session is committed only when
            self.commit is set.
        """
        if self.debug:
            import pdb
            pdb.set_trace()
        db = Stock_250kDB(
            drivername=self.drivername,
            username=self.db_user,
            password=self.db_passwd,
            hostname=self.hostname,
            database=self.dbname,
            schema=self.schema,
        )
        db.setup(create_tables=False)

        # lookup structures consumed by straightenEcotypeID()
        nativename2tg_ecotypeid_set = getNativename2TgEcotypeIDSet(db.metadata.bind, turnUpperCase=True)
        ecotype_id_set_250k_in_pipeline = get_ecotype_id_set_250k_in_pipeline(ArrayInfo)
        ecotypeid2tg_ecotypeid = get_ecotypeid2tg_ecotypeid(db.metadata.bind)

        # turn_into_integer=2 because the values are not nucleotides
        phen_parts = read_data(self.input_fname, turn_into_integer=2, matrix_data_type=float)
        header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = phen_parts
        data_matrix_phen = numpy.array(data_matrix_phen)

        # 2009-8-19 the matrix is deliberately NOT re-ordered via
        # Association.get_phenotype_matrix_in_data_matrix_order(): strain_acc_list_phen
        # is not unique per row, which made replicates end up with the same value.

        phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list_phen, data_matrix=data_matrix_phen)

        ecotype_id_ls = self.straightenEcotypeID(
            phenData.row_id_ls,
            nativename2tg_ecotypeid_set,
            ecotypeid2tg_ecotypeid,
            ecotype_id_set_250k_in_pipeline,
        )

        session = db.session
        session.begin()
        run_type = self.run_type
        if run_type == 1:
            self.putPhenotypeIntoDB(db, phenData, ecotype_id_ls)
        elif run_type == 2:
            self.putReplicatePhenotypeIntoDB(db, phenData, ecotype_id_ls)
        else:
            sys.stderr.write("Unsupported run type: %s.\n" % (self.run_type))
        if self.commit:
            session.commit()
Beispiel #25
0
	def run(self):
		"""
		For every probe (column id) of the QC matrix that also exists in the CNV intensity
		matrix, plot QC count against CNV probe intensity over the rows shared by both
		matrices and save the figure as <self.output_dir>/<probe_id>.png.
		
		QC entries below 0 are left out; probes whose remaining counts are empty or all 0
		produce no figure. self.output_dir is created when missing.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		cnvIntensityData = self.getBeforeGADAIntensityData(self.input_fname)
		qcData = SNPData(input_fname=self.qc_fname, turn_into_array=1, ignore_2nd_column=1)
		
		if not os.path.isdir(self.output_dir):
			os.makedirs(self.output_dir)
		
		for probe_id in qcData.col_id_ls:
			if probe_id not in cnvIntensityData.col_id2col_index:
				continue
			cnv_col_index = cnvIntensityData.col_id2col_index[probe_id]
			qc_col_index = qcData.col_id2col_index[probe_id]
			
			#collect (count, intensity) pairs for the rows present in both data sets
			count_ls = []
			intensity_ls = []
			for i, row_id in enumerate(qcData.row_id_ls):
				count = qcData.data_matrix[i][qc_col_index]
				if count>=0 and row_id in cnvIntensityData.row_id2row_index:
					cnv_row_index = cnvIntensityData.row_id2row_index[row_id]
					count_ls.append(count)
					intensity_ls.append(cnvIntensityData.data_matrix[cnv_row_index][cnv_col_index])
			
			distinct_counts = set(count_ls)
			if not distinct_counts or distinct_counts==set([0]):
				continue
			
			pylab.clf()
			ax = pylab.axes([0.1, 0.1, 0.8, 0.8], frameon=False)
			ax.grid(True, alpha=0.3)
			pylab.plot(count_ls, intensity_ls, '.', markersize=5, alpha=0.4)
			pylab.xlabel('count')
			pylab.ylabel('CNV probe intensity')
			pylab.ylim([-1,1])
			#widen the x range by one unit on each side
			x_lower, x_upper = list(ax.get_xlim())
			ax.set_xlim([x_lower - 1, x_upper + 1])
			pylab.title(probe_id)
			figure_fname = os.path.join(self.output_dir, '%s.png'%probe_id)
			pylab.savefig(figure_fname, dpi=300)
    def getHaploGroupSNPMatrix(self):
        """
        2009-4-18
            Build a SNPData with one row per StockDB.HaploGroup and one column
            per StockDB.SNPs entry (columns ordered by chromosome, then position).
            Each cell holds the nt2number code of the FilteredCalls allele found
            for the haplogroup's ref_ecotypeid; cells without a call stay 0.

            In debug mode the matrix is limited to 10 rows.
        """
        sys.stderr.write("Getting HaploGroup SNP matrix ...")

        no_of_rows = 10 if self.debug else StockDB.HaploGroup.query.count()

        # column ids and their positions in the matrix
        col_id_ls = []
        col_id2col_index = {}
        ordered_snps = StockDB.SNPs.query.order_by(StockDB.SNPs.chromosome).order_by(StockDB.SNPs.position)
        for snp in ordered_snps:
            col_id_ls.append(snp.id)
            col_id2col_index[snp.id] = len(col_id2col_index)
        no_of_cols = len(col_id2col_index)

        data_matrix = numpy.zeros([no_of_rows, no_of_cols], numpy.int8)
        row_id_ls = []
        for row_index, haplo_group in enumerate(StockDB.HaploGroup.query.all()):
            calls = StockDB.FilteredCalls.query.filter_by(ecotypeid=haplo_group.ref_ecotypeid)
            for one_call in calls:
                allele_number = nt2number[one_call.allele]
                snp_col_index = col_id2col_index[one_call.snpid]
                data_matrix[row_index][snp_col_index] = allele_number
            row_id_ls.append(haplo_group.id)
            # debug mode stops once the reduced matrix is full
            if self.debug and row_index == no_of_rows - 1:
                break

        snpData = SNPData(col_id_ls=col_id_ls, row_id_ls=row_id_ls, data_matrix=data_matrix)
        sys.stderr.write("Done.\n")
        return snpData
Beispiel #27
0
	def getBeforeGADAIntensityData(self, input_fname):
		"""
		Read the intensity data in input_fname via CNVNormalize.get_input() and return it as a SNPData.
		
		Column ids are "<chr_pos[0]>_<chr_pos[1]>" for each entry of the returned chr_pos_ls.
		Row ids come from header[1:-2], each treated as an ArrayInfo id and replaced by the string
		form of that array's maternal_ecotype_id ("-1" when the array is not found).
		The matrix from get_input() is transposed before it goes into the SNPData.
		"""
		db = Stock_250kDB.Stock_250kDB(
			drivername=self.drivername,
			username=self.db_user,
			password=self.db_passwd,
			hostname=self.hostname,
			database=self.dbname,
			schema=self.schema)
		db.setup(create_tables=False)
		session = db.session
		
		data_matrix, probe_id_ls, chr_pos_ls, header = CNVNormalize.get_input(input_fname)
		
		col_id_ls = ['%s_%s'%(chr_pos[0], chr_pos[1]) for chr_pos in chr_pos_ls]
		
		row_id_ls = []
		for array_id in header[1:-2]:
			array = Stock_250kDB.ArrayInfo.get(int(array_id))
			#-1 marks an array id that is unknown to the database
			ecotype_id = array.maternal_ecotype_id if array else -1
			row_id_ls.append('%s'%ecotype_id)
		
		return SNPData(row_id_ls=row_id_ls, col_id_ls=col_id_ls, data_matrix=data_matrix.transpose())
Beispiel #28
0
	def run(self):
		"""
		Convert the SNP matrix in self.inputFname into an HDF5 file at self.outputFname.
		
		Monomorphic columns are removed first; when self.min_MAF is set and positive, columns are
		additionally filtered through SNPData.removeColsByMAF(). Row and column ids are converted
		to int (a non-numeric id raises ValueError). The output holds three resizable datasets:
		"data_matrix", "col_id_ls" and "row_id_ls".
		"""
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		snpData = SNPData(input_fname=self.inputFname, turn_into_array=1, ignore_2nd_column=1)
		snpData = SNPData.removeMonomorphicCols(snpData, NA_set=set([]))
		#truthiness check first so an unset (None) min_MAF is skipped instead of being compared to 0
		if self.min_MAF and self.min_MAF>0:
			snpData = SNPData.removeColsByMAF(snpData,min_MAF=self.min_MAF, NA_set=set([]))
		#concrete lists, so create_dataset() never receives a lazy map object
		snpData.col_id_ls = [int(col_id) for col_id in snpData.col_id_ls]
		snpData.row_id_ls = [int(row_id) for row_id in snpData.row_id_ls]
		f = h5py.File(self.outputFname, 'w')
		try:
			f.create_dataset("data_matrix", data=snpData.data_matrix, maxshape=(None, None))
			f.create_dataset('col_id_ls', data=snpData.col_id_ls, maxshape=(None,))
			f.create_dataset('row_id_ls', data=snpData.row_id_ls, maxshape=(None,))
		finally:
			#close the file even if writing one of the datasets fails
			f.close()
    def run(self):
        """
        2008-11-08
            generate combinations of results_id, list_type_id and generate plots one after another
            save the plots into database if commit=1

        For every (results_id, list_type_id) pair from
        MpiGeneListRankTest.generate_params() the matching test-result matrix is
        fetched, optionally written to self.output_fname, and passed to
        self.plotCurve(). With self.commit set, pairs that already have a
        CandidateVsNonRatioPlot row are skipped and new plots are saved to the
        database; otherwise a file-name prefix derived from self.fig_fname is
        handed to plotCurve().

        Exits with status 3 when the results type is invalid or when no
        TopSNPTestType matches the requested parameters.
        """
        if self.debug:
            import pdb
            pdb.set_trace()
        db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                       username=self.db_user,
                                       password=self.db_passwd,
                                       hostname=self.hostname,
                                       database=self.dbname,
                                       schema=self.schema)
        db.setup()
        session = db.session

        param_obj = PassingData(
            call_method_id=self.call_method_id,
            analysis_method_id=getattr(self, 'analysis_method_id', None),
            analysis_method_id_ls=getattr(self, 'analysis_method_id_ls', None),
            phenotype_method_id_ls=getattr(self, 'phenotype_method_id_ls', None),
            list_type_id_ls=self.list_type_id_ls,
            results_type=self.results_type)
        params_ls = MpiGeneListRankTest.generate_params(param_obj)

        ResultsClass, TestResultClass = db.getResultsAndTestResultsClass(
            results_type=self.results_type)

        if ResultsClass is None or TestResultClass is None:
            # report self.results_type; the former reference to an undefined
            # name raised NameError before the message could be written
            sys.stderr.write("Invalid results type : %s.\n" % self.results_type)
            sys.exit(3)

        for results_id, list_type_id in params_ls:
            rm = ResultsClass.get(results_id)
            list_type = Stock_250kDB.GeneListType.get(list_type_id)
            title = 'result(%s) of %s on %s with %s(%s) list' % (
                results_id, rm.analysis_method.short_name,
                rm.phenotype_method.short_name, list_type.short_name,
                list_type.id)

            TopSNPTestType_id_ls = self.getTopSNPTestType_id_ls(
                self.get_closest, self.min_MAF,
                self.allow_two_sample_overlapping, self.results_type,
                self.test_type_id, self.null_distribution_type_id)
            # validate before the first TopSNPTestType_id_ls[0] access below
            if not TopSNPTestType_id_ls:
                sys.stderr.write(
                    "No TopSNPTestType matches the input requirements. Exit.\n"
                )
                sys.exit(3)

            if self.commit:
                # skip combinations whose plot is already stored
                rows = Stock_250kDB.CandidateVsNonRatioPlot.query.filter_by(type_id=TopSNPTestType_id_ls[0]).\
                    filter_by(results_id=results_id).filter_by(list_type_id=list_type_id)
                if rows.count() > 0:
                    row = rows.first()
                    sys.stderr.write(
                        '%s already in db (%s of them) with first id=%s.\n' %
                        (title, rows.count(), row.id))
                    continue

            TopSNPTestType_id_ls_str = map(str, TopSNPTestType_id_ls)
            from_where_clause = "from %s t, %s y where t.type_id=y.id and t.results_id=%s and t.list_type_id=%s and y.id in (%s)" % (
                TestResultClass.table.name,
                Stock_250kDB.CandidateGeneTopSNPTestRMType.table.name,
                results_id, list_type_id, ','.join(TopSNPTestType_id_ls_str))

            no_of_top_snps_info = self.get_no_of_top_snps_info(
                db, from_where_clause)
            min_distance_info = self.get_min_distance_info(
                db, from_where_clause)
            rdata = self.get_data_matrix(
                db,
                no_of_top_snps_info,
                min_distance_info,
                from_where_clause,
                need_other_values=True,
                null_distribution_type_id=self.null_distribution_type_id)

            header = ['no_of_top_snps', ''] + min_distance_info.label_ls
            strain_acc_list = no_of_top_snps_info.label_ls
            category_list = no_of_top_snps_info.label_ls

            if SNPData.isDataMatrixEmpty(rdata.data_matrix):
                sys.stderr.write("Nothing fetched from database.\n")
                continue

            if self.output_fname:
                write_data_matrix(rdata.data_matrix, self.output_fname, header,
                                  strain_acc_list, category_list)

            # NOTE: heat-map rendering of rdata (drawMatrix / combineTwoImages
            # into self.fig_fname) used to sit here and is disabled.

            if self.commit:
                output_fname_prefix = None
            else:
                # '/' in the title would be read as a path separator
                title_cp = title.replace('/', '_')
                output_fname_prefix = '%s_%s_type_%s.png' % (os.path.splitext(
                    self.fig_fname)[0], title_cp, TopSNPTestType_id_ls[0])

            # A preset x range of [0, 8] for analysis methods 1 and 7 was
            # overridden with None in the original code; no preset is used.
            preset_xlim = None
            return_data = self.plotCurve(rdata,
                                         no_of_top_snps_info,
                                         min_distance_info,
                                         output_fname_prefix,
                                         title=title,
                                         commit=self.commit,
                                         preset_xlim=preset_xlim)

            if self.commit and return_data.png_data:
                # re-check right before saving
                rows = Stock_250kDB.CandidateVsNonRatioPlot.query.filter_by(type_id=TopSNPTestType_id_ls[0]).\
                    filter_by(results_id=results_id).filter_by(list_type_id=list_type_id)
                if rows.count() > 0:
                    row = rows.first()
                    sys.stderr.write(
                        '%s already in db (%s of them) with first id=%s.\n' %
                        (title, rows.count(), row.id))
                    continue
                plot = Stock_250kDB.CandidateVsNonRatioPlot(
                    type_id=TopSNPTestType_id_ls[0],
                    results_id=results_id,
                    list_type_id=list_type_id)
                plot.png_thumbnail = return_data.png_thumbnail.getvalue()
                plot.png_data = return_data.png_data.getvalue()
                plot.svg_data = return_data.svg_data.getvalue()
                db.session.save(plot)
                db.session.flush()
Beispiel #30
0
	def run(self):	
		"""
		Fetch candidate-gene test results as a matrix and output it as a table and/or an image.
		
		A SQL from/where fragment is assembled from self.test_result_type (1, 2 or 3; anything else
		exits with status 2) plus the optional filters call_method_id_ls, analysis_method_id_ls,
		super_type_id and test_type. The fragment is handed to the get*Info()/get_data_matrix()
		helpers. An empty matrix exits with status 3. The matrix is written to self.output_fname
		when that is set and drawn into self.fig_fname when that is set.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
						password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup()
		session = db.session
		
		#pick the table that holds the test results
		if self.test_result_type==1:
			test_result_class_table = CandidateGeneRankSumTestResult.table.name
			#NOTE(review): the hard-coded table name below overrides the line above -- confirm this is still intended
			test_result_class_table = 'candidate_gene_rank_sum_test_result_2008_09_15'
		elif self.test_result_type==2:
			test_result_class_table = CandidateGeneTopSNPTest.table.name
		elif self.test_result_type==3:
			test_result_class_table = Stock_250kDB.CandidateGeneRankSumTestResultMethod.table.name
		else:
			sys.stderr.write(" test_result_type %s not supported.\n"%(self.test_result_type))
			sys.exit(2)

		#the condition for min_MAF is tricky because of the floating precision.
		#(hence abs(c.min_MAF-%s)<0.00001 rather than an equality test)
		#NOTE: the backslash continuations sit inside the string literals, so the leading tabs of the
		#continuation lines are part of the SQL text
		if self.test_result_type==1:
			where_condition = "%s r, %s c, %s g where g.id=c.list_type_id and r.analysis_method_id is not null \
				and c.results_id=r.id and c.get_closest=%s and c.min_distance=%s and abs(c.min_MAF-%s)<0.00001"\
				%(ResultsMethod.table.name, test_result_class_table, GeneListType.table.name, self.get_closest, self.min_distance, self.min_MAF)
		elif self.test_result_type==2:
			#type 2 joins through the ResultsByGene table (alias rg)
			where_condition = "%s r, %s rg, %s c, %s g where g.id=c.list_type_id and r.analysis_method_id is not null and r.id=rg.results_method_id \
				and c.results_id=rg.id and c.get_closest=%s and c.min_distance=%s and abs(c.min_MAF-%s)<0.00001"\
				%(ResultsMethod.table.name, ResultsByGene.table.name, test_result_class_table, GeneListType.table.name, self.get_closest, self.min_distance, self.min_MAF)
		elif self.test_result_type==3:
			where_condition = "%s r, %s c, %s g where g.id=c.list_type_id and r.analysis_method_id is not null \
				and c.results_id=r.id and c.get_closest=%s and c.min_distance=%s and abs(c.min_MAF-%s)<0.00001"\
				%(ResultsMethod.table.name, test_result_class_table, GeneListType.table.name, self.get_closest, self.min_distance, self.min_MAF)
		#optional filters; each value is interpolated into the SQL text as is
		if self.call_method_id_ls:
			where_condition += " and r.call_method_id in (%s)"%self.call_method_id_ls
		
		if self.analysis_method_id_ls:
			where_condition += " and r.analysis_method_id in (%s)"%self.analysis_method_id_ls
		if self.super_type_id:
			where_condition += " and g.super_type_id=%s"%self.super_type_id
		
		if self.test_type:
			where_condition += " and c.test_type=%s"%self.test_type
		
		#filter specific to the result type (none is added for type 3)
		if self.test_result_type==1:
			pass
			where_condition += " and c.max_pvalue_per_gene=%s"%(self.max_pvalue_per_gene)
		elif self.test_result_type==2:
			where_condition += " and c.no_of_top_snps=%s"%(self.no_of_top_snps)		
		
		#row/column descriptors and the data matrix, all fetched with the same condition
		list_type_id_ls = self.getListTypeInfo(db, where_condition)
		analysis_method_id_ls = self.getAnalysisMethodInfo(db, where_condition)
		list_type_analysis_method_info = self.orderListTypeAnalysisMethodID(list_type_id_ls, analysis_method_id_ls)
		phenotype_info = self.getPhenotypeInfo(db, where_condition)
		rdata = self.get_data_matrix(db, phenotype_info, list_type_analysis_method_info, where_condition)
		
		rdata.data_matrix = self.markDataMatrixBoundary(rdata.data_matrix, phenotype_info, list_type_analysis_method_info)
		
		#labels for the tabular output: columns are phenotype methods, rows are (list type, analysis method) combinations
		header = ['list_type_analysis_method', ''] + phenotype_info.phenotype_method_label_ls
		strain_acc_list = list_type_analysis_method_info.list_type_analysis_method_label_ls
		category_list = list_type_analysis_method_info.list_type_id_analysis_method_id_ls
		if SNPData.isDataMatrixEmpty(rdata.data_matrix):
			sys.stderr.write("Nothing fetched from database.\n")
			sys.exit(3)
		if self.output_fname:
			write_data_matrix(rdata.data_matrix, self.output_fname, header, strain_acc_list, category_list)
		
		#draw the matrix with a continuous colour legend and save the combined image
		if self.fig_fname:
			font = get_font(self.font_path, font_size=self.font_size)	#2008-08-01
			value2color_func = lambda x: Value2Color.value2HSLcolor(x, rdata.min_value, rdata.max_value)
			im_legend = drawContinousLegend(rdata.min_value, rdata.max_value, self.no_of_ticks, value2color_func, font)
			#im.save('%s_legend.png'%self.fig_fname_prefix)
			im = drawMatrix(rdata.data_matrix, value2color_func, list_type_analysis_method_info.list_type_analysis_method_label_ls,\
						phenotype_info.phenotype_method_label_ls, with_grid=1, font=font)
			im = combineTwoImages(im, im_legend, font=font)
			im.save(self.fig_fname)
Beispiel #31
0
    def doFilter(
        self,
        snpData,
        snpData_qc_strain,
        snpData_qc_snp,
        min_call_probability,
        max_call_mismatch_rate,
        max_call_NA_rate,
        max_snp_mismatch_rate,
        max_snp_NA_rate,
        npute_window_size,
        output_dir=None,
    ):
        """
		2009-10-11
			replace imputeData() with NPUTE.samplingImpute(..., no_of_accessions_per_sampling=300, coverage=3) to avoid memory blowup. 
		2008-12-22
			replace '=' and ',' with '_' in the output filename
		2008-05-19
			matrix_ls has to be of length >0 before concatenation
		2008-05-19
			use SNPData structure
		2008-05-18
			add onlyCommon=True to FilterAccessions.filterByError()
		2008-05-17
			add argument output_dir. if it's available, output data matrix before and after imputation
		2008-05-12
			add
			qcdata.no_of_accessions_filtered_by_mismatch
			qcdata.no_of_accessions_filtered_by_na
			qcdata.no_of_snps_filtered_by_mismatch
			qcdata.no_of_snps_filtered_by_na
			qcdata.no_of_monomorphic_snps_removed
		
		2008-05-11
			split up from computing_node_handler
		"""
        qcdata = PassingData()
        twoSNPData = TwoSNPData(
            SNPData1=snpData, SNPData2=snpData_qc_strain, row_matching_by_which_value=0, debug=self.debug
        )
        row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
        del twoSNPData

        newSnpData = SNPData.removeRowsByMismatchRate(snpData, row_id2NA_mismatch_rate, max_call_mismatch_rate)
        qcdata.no_of_accessions_filtered_by_mismatch = newSnpData.no_of_rows_filtered_by_mismatch

        newSnpData = SNPData.removeRowsByNARate(newSnpData, max_call_NA_rate)
        qcdata.no_of_accessions_filtered_by_na = newSnpData.no_of_rows_filtered_by_na

        twoSNPData = TwoSNPData(
            SNPData1=newSnpData, SNPData2=snpData_qc_snp, row_matching_by_which_value=0, debug=self.debug
        )
        col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise()
        del twoSNPData
        newSnpData = SNPData.removeColsByMismatchRate(newSnpData, col_id2NA_mismatch_rate, max_snp_mismatch_rate)
        qcdata.no_of_snps_filtered_by_mismatch = newSnpData.no_of_cols_filtered_by_mismatch

        newSnpData = SNPData.removeColsByNARate(newSnpData, max_snp_NA_rate)
        qcdata.no_of_snps_filtered_by_na = newSnpData.no_of_cols_filtered_by_na

        twoSNPData = TwoSNPData(
            SNPData1=newSnpData, SNPData2=snpData_qc_snp, row_matching_by_which_value=0, debug=self.debug
        )
        newSnpData = twoSNPData.mergeTwoSNPData(priority=2)
        del twoSNPData
        # MergeSnpsData.merge(snpsd_250k_tmp, snpsd_ls_qc_snp, unionType=0, priority=2)

        newSnpData = SNPData.removeMonomorphicCols(newSnpData)
        qcdata.no_of_monomorphic_snps_removed = newSnpData.no_of_monomorphic_cols

        # FilterSnps.filterMonomorphic(snpsd_250k_tmp)

        if output_dir:
            # output data here
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)
            output_fname_prefix_ls = [
                "min_oligo_call_probability_%s" % min_call_probability,
                "max_array_mismatch_rate_%s" % max_call_mismatch_rate,
                "max_array_NA_rate_%s" % max_call_NA_rate,
                "max_snp_mismatch_rate_%s" % max_snp_mismatch_rate,
                "max_snp_NA_rate_%s" % max_snp_NA_rate,
                "npute_window_size_%s" % npute_window_size,
            ]
            output_fname = os.path.join(output_dir, "_".join(output_fname_prefix_ls + ["before_imputation.tsv"]))
            newSnpData.tofile(output_fname)
            # chromosomes = [snpsd_250k_tmp[i].chromosome for i in range(len(snpsd_250k_tmp))]
            # snpsdata.writeRawSnpsDatasToFile(output_fname, snpsd_250k_tmp, chromosomes=chromosomes, deliminator=',', withArrayIds = True)

        """
		qcdata.no_of_snps_filtered_by_mismatch = 0
		qcdata.no_of_snps_filtered_by_na = 0
		qcdata.no_of_monomorphic_snps_removed = 0
		for snpsd in snpsd_250k_tmp:
			qcdata.no_of_snps_filtered_by_mismatch += snpsd.no_of_snps_filtered_by_mismatch
			qcdata.no_of_snps_filtered_by_na += snpsd.no_of_snps_filtered_by_na
			qcdata.no_of_monomorphic_snps_removed += snpsd.no_of_monomorphic_snps_removed
		"""

        # snpData0 = RawSnpsData_ls2SNPData(snpsd_250k_tmp)

        twoSNPData0 = TwoSNPData(SNPData1=newSnpData, SNPData2=snpData_qc_strain, row_matching_by_which_value=0)
        row_id2NA_mismatch_rate0 = twoSNPData0.cmp_row_wise()
        col_id2NA_mismatch_rate0 = twoSNPData0.cmp_col_wise()
        del twoSNPData0

        result = []
        # for npute_window_size in npute_window_size_ls:
        # snpsd_250k_tmp_1 = copy.deepcopy(snpsd_250k_tmp)	#deepcopy, otherwise snpsd_250k_tmp_1[i].snps = [] would clear snpsd_250k_tmp up as well
        if len(newSnpData.row_id_ls) > 5:
            snps_name_ls = newSnpData.col_id_ls
            ## 2009-10-8 use NPUTE.samplingImpute()
            imputed_matrix, new_snps_name_ls = NPUTE.samplingImpute(
                snps_name_ls,
                newSnpData.data_matrix,
                input_file_format=1,
                input_NA_char=0,
                lower_case_for_imputation=False,
                npute_window_size=int(npute_window_size),
                no_of_accessions_per_sampling=300,
                coverage=3,
            )
            snpData_imputed = SNPData(
                row_id_ls=newSnpData.row_id_ls, col_id_ls=new_snps_name_ls, data_matrix=imputed_matrix
            )
            """
			## 2009-10-8 use NPUTE.samplingImpute() instead. comment out below
			chr2no_of_snps = NPUTE.get_chr2no_of_snps(snps_name_ls)
			chr_ls = chr2no_of_snps.keys()
			chr_ls.sort()
			snpData_imputed = SNPData(row_id_ls = newSnpData.row_id_ls, col_id_ls=[])
			matrix_ls = []
			for chromosome in chr_ls:
				if chr2no_of_snps[chromosome]>5:	#enough for imputation
					npute_data_struc = NPUTESNPData(snps_name_ls=snps_name_ls, data_matrix=newSnpData.data_matrix, chromosome=chromosome, \
									input_file_format=1, input_NA_char=0)
					imputeData(npute_data_struc, int(npute_window_size))
					matrix_ls.append(npute_data_struc.snps)
					snpData_imputed.col_id_ls += npute_data_struc.chosen_snps_name_ls
			if len(matrix_ls)>0:
				snpData_imputed.data_matrix = num.transpose(num.concatenate(matrix_ls))
			"""
            if output_dir:  # 2008-05-16 write the data out if output_fname is available
                # chromosomes = [snpsd_250k_tmp[i].chromosome for i in range(len(snpsd_250k_tmp))]	#already produced in the previous before_imputation output
                output_fname = os.path.join(output_dir, "_".join(output_fname_prefix_ls + ["after_imputation.tsv"]))
                # snpsdata.writeRawSnpsDatasToFile(output_fname, snpsd_250k_tmp, chromosomes=chromosomes, deliminator=',', withArrayIds = True)
                snpData_imputed.tofile(output_fname)

            twoSNPData1 = TwoSNPData(
                SNPData1=snpData_imputed, SNPData2=snpData_qc_strain, row_matching_by_which_value=0
            )
            qcdata.row_id2NA_mismatch_rate1 = twoSNPData1.cmp_row_wise()
            qcdata.col_id2NA_mismatch_rate1 = twoSNPData1.cmp_col_wise()
            del twoSNPData1, snpData_imputed
        else:
            snpData_imputed = None
            # qcdata.row_id2NA_mismatch_rate1 = {}
            # qcdata.col_id2NA_mismatch_rate1 = {}
        del newSnpData
        """
		for i in range(len(snpsd_250k_tmp)):
			#snpsd_250k_tmp_1[i].snps = []	#clear it up
			
			if len(snpsd_250k_tmp[i].accessions)>5 and len(snpsd_250k_tmp[i].positions)>5:	#not enough for imputation
				npute_data_struc = NPUTESNPData(inFile=snpsd_250k_tmp[i], input_NA_char='NA', input_file_format=4, lower_case_for_imputation=0)
				imputeData(npute_data_struc, int(npute_window_size))
				snpsd_250k_tmp[i].snps = npute_data_struc.snps
				del npute_data_struc
			"""
        qcdata.row_id2NA_mismatch_rate0 = row_id2NA_mismatch_rate0
        qcdata.col_id2NA_mismatch_rate0 = col_id2NA_mismatch_rate0

        qcdata.min_call_probability = min_call_probability
        qcdata.max_call_mismatch_rate = max_call_mismatch_rate
        qcdata.max_call_NA_rate = max_call_NA_rate
        qcdata.max_snp_mismatch_rate = max_snp_mismatch_rate
        qcdata.max_snp_NA_rate = max_snp_NA_rate
        qcdata.npute_window_size = npute_window_size
        result.append(qcdata)
        return result
Beispiel #32
0
	def run(self):
		"""
		Draw strains in PCA space next to their map positions, colored by one phenotype.

		Steps, in order:
			1. connect to the Stock_250k db and load the SNP matrix from self.input_fname.
			2. if both self.eigen_vector_fname and self.eigen_value_fname are given, read the
				eigen values/vectors from them; otherwise randomly down-sample to at most 10000
				SNP columns, convert alleles to indices and hand None for both the explained
				variance and the PC matrix to getStrainID2PCAPosInfo().
			3. load the phenotype matrix, put it into the row order of the SNP matrix and find
				the column of self.phenotype_method_id.
			4. lay out the axes, draw through self.drawStrainPCA(), save
				<output_fname_prefix>.png (dpi=400) and .svg, then call self.plotLatLonPhenVsPC().
		"""
		if self.debug:
			import pdb
			pdb.set_trace()

		db = Stock_250kDB.Stock_250kDB(
			drivername=self.drivername,
			username=self.db_user,
			password=self.db_passwd,
			hostname=self.hostname,
			database=self.dbname,
			schema=self.schema)
		db.setup(create_tables=False)
		session = db.session

		snp_data = SNPData(
			input_fname=self.input_fname,
			turn_into_integer=1,
			turn_into_array=1,
			ignore_2nd_column=1)

		if not (self.eigen_vector_fname and self.eigen_value_fname):
			# no pre-computed PCA: cap the number of SNPs by random sampling (2008-12-01)
			snp_cap = 10000
			no_of_cols = len(snp_data.col_id_ls)
			if no_of_cols > snp_cap:
				sampled_col_indices = random.sample(range(no_of_cols), snp_cap)
				sampled_col_ids = [snp_data.col_id_ls[i] for i in sampled_col_indices]
				sampled_snp_data = SNPData(
					row_id_ls=snp_data.row_id_ls,
					col_id_ls=sampled_col_ids,
					strain_acc_list=snp_data.strain_acc_list,
					category_list=snp_data.category_list)
				sampled_snp_data.data_matrix = snp_data.data_matrix[:, sampled_col_indices]
				snp_data = sampled_snp_data

			snp_data, _allele_index2allele_ls = snp_data.convertSNPAllele2Index()
			explained_var = None
			PC_matrix = None
		else:
			eigen_values = numpy.array(self.getEigenValueFromFile(self.eigen_value_fname))
			explained_var = eigen_values/numpy.sum(eigen_values)
			PC_matrix = self.getPCFromFile(self.eigen_vector_fname).PC_matrix

		header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(
			self.phenotype_fname, turn_into_integer=0)
		# rows are labelled with the SNP matrix's strains because the matrix is re-ordered right below
		phen_data = SNPData(
			header=header_phen,
			strain_acc_list=snp_data.strain_acc_list,
			data_matrix=data_matrix_phen)
		# the phenotype file's own strain list (strain_acc_list_phen) is what drives the re-ordering
		phen_data.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(
			snp_data.row_id_ls, strain_acc_list_phen, phen_data.data_matrix)

		phenotype_col_index = self.findOutWhichPhenotypeColumn(
			phen_data, Set([self.phenotype_method_id]))[0]

		ecotype_info = getEcotypeInfo(db, self.country_order_type)

		# label offset only matters for PlotGroupOfSNPs.py (2008-11-14); no axe_snp_matrix is drawn here
		strain_pca_pos_info = self.getStrainID2PCAPosInfo(
			snp_data,
			pca_range=[0,1],
			snp_id_label_y_offset=0.95,
			explained_var=explained_var,
			T=PC_matrix)

		# axes rectangles: [x, y, width, height] as handed to pylab.axes()
		pca_y = 0.03
		pca_height = 0.45	#height of axe_strain_pca
		map_y = pca_y+pca_height
		map_height = 0.5	#height of axe_strain_map
		left_x = 0.05
		left_width = 0.8	#width of axe_strain_pca / axe_strain_map
		legend_x = left_x + 0.02 + left_width
		legend_width = 0.05	#width of the phenotype colorbar

		phenotype_method = Stock_250kDB.PhenotypeMethod.get(self.phenotype_method_id)

		# color scale: phenotype range padded by a tenth of its span on both ends
		phenotype_cmap = mpl.cm.jet
		max_phenotype = numpy.nanmax(phen_data.data_matrix[:,phenotype_col_index])	#NaNs ignored
		min_phenotype = numpy.nanmin(phen_data.data_matrix[:,phenotype_col_index])	#NaNs ignored
		phenotype_jitter = (max_phenotype - min_phenotype)/10.
		phenotype_norm = mpl.colors.Normalize(
			vmin=min_phenotype-phenotype_jitter,
			vmax=max_phenotype+phenotype_jitter)
		axe_map_phenotype_legend = pylab.axes([legend_x, pca_y, legend_width, 0.3], frameon=False)
		colorbar = mpl.colorbar.ColorbarBase(
			axe_map_phenotype_legend,
			cmap=phenotype_cmap,
			norm=phenotype_norm,
			orientation='vertical')
		colorbar.set_label('Legend Of Phenotype %s %s'%(phenotype_method.id, phenotype_method.short_name))

		axe_strain_map = pylab.axes([left_x, map_y, left_width, map_height], frameon=False)
		axe_strain_pca = pylab.axes([left_x, pca_y, left_width, pca_height], frameon=False)
		# one more axes spanning both the map and the pca axes, sharing x with the pca axes
		axe_cover = pylab.axes(
			[left_x, pca_y, left_width, pca_height+map_height],
			frameon=False,
			sharex=axe_strain_pca)
		axe_cover.set_yticks([])

		pca_xlim = [-0.05,1.05]
		pca_ylim = [0, 1.05]
		cover_ylim = [0, (pca_height+map_height)/pca_height]	#y range scaled by the height ratio

		def apply_limits():
			# the same limits are applied before and after drawing
			axe_strain_pca.set_xlim(pca_xlim)
			axe_strain_pca.set_ylim(pca_ylim)
			axe_cover.set_ylim(cover_ylim)

		apply_limits()

		axe_strain_pca.grid(True, alpha=0.3)
		axe_strain_pca.set_xticks([])
		axe_strain_pca.set_yticks([])
		self.drawStrainPCA(
			axe_strain_pca, axe_strain_map, axe_cover,
			None,	#no pca legend
			strain_pca_pos_info, ecotype_info, phen_data,
			phenotype_col_index, phenotype_cmap, phenotype_norm,
			rightmost_x_value=pca_xlim[1],
			strain_color_type=2,
			pca2map_line_color=None,
			ecotype_width_on_map=10,
			draw_lines_to_axe_snp_matrix=False,
			strain_size_on_axe_strain_pca=14,
			pic_area=self.pic_area,
			map_pca_line_alpha=0.2,
			map_pca_linewidth=0.2)

		apply_limits()

		pylab.savefig('%s.png'%self.output_fname_prefix, dpi=400)
		pylab.savefig('%s.svg'%self.output_fname_prefix)

		self.plotLatLonPhenVsPC(
			ecotype_info, strain_pca_pos_info, phen_data,
			phenotype_col_index, phenotype_cmap, phenotype_norm,
			self.output_fname_prefix, commit=self.commit)
Beispiel #33
0
    def plone_run(self, min_call_info_mismatch_rate=0.1):
        """
        Cross-match the call_info entries listed in self.call_info_id_ls against the
        comparison dataset of self.QC_method_id (entry point used by plone).

        Arguments:
            min_call_info_mismatch_rate: forwarded to self.get_call_info_id2fname(); only
                used when self.input_dir is not set.

        Returns:
            None in every case (row_id2NA_mismatch_rate is never filled in here). Per the
            original author's notes the cross-match results are submitted to the database
            from within TwoSNPData.cal_row_id2pairwise_dist().

        Exits with status 3 if QC_method_id is non-zero and no comparison data file is
        given or found in QC_method.data_description.

        2009-6-9
            pass self.max_mismatch_rate, self.min_no_of_non_NA_pairs to TwoSNPData to filter entries stored in db.
        2009-4-13
            add min_call_info_mismatch_rate
        2009-2-5
            add "create_tables=False" to db.setup()
        2008-07-02
            fix a bug which causes the program to continue read data even while call_info_id2fname is empty and input_dir is null.
        2008-07-01
            adjust to the newest functions in QC_250k.py
        2008-04-25
            return None if QC_method_id==0
        2008-04-20
            for plone to call it just to get row_id2NA_mismatch_rate
        """

        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname,
                               host=self.hostname,
                               user=self.user,
                               passwd=self.passwd)
        curs = conn.cursor()
        self.curs = curs  # kept on the instance; also handed to TwoSNPData below

        # database connection and etc
        db = Stock_250kDB.Stock_250kDB(username=self.user,
                                       password=self.passwd,
                                       hostname=self.hostname,
                                       database=self.dbname)
        db.setup(create_tables=False)
        session = db.session
        session.begin()

        # if cmp_data_filename is not specified, try the "...=<filename>" part of
        # the data_description column in table QC_method.
        qm = QCMethod.query.get(self.QC_method_id)
        if not self.cmp_data_filename and self.QC_method_id != 0 and qm.data_description:
            data_description_ls = qm.data_description.split('=')
            if len(data_description_ls) > 1:
                self.cmp_data_filename = data_description_ls[1].strip()

        # after db query, cmp_data_filename is still nothing, exit program.
        if not self.cmp_data_filename and self.QC_method_id != 0:
            sys.stderr.write(
                "cmp_data_filename is still nothing even after db query. please specify it on the commandline.\n"
            )
            sys.exit(3)

        header, strain_acc_list, category_list, data_matrix = read_data(
            self.cmp_data_filename)
        # it's ecotypeid; cast to integer (as a real list) to be compatible with
        # the later ecotype_id_ls from db. category_list is not used.
        strain_acc_list = [int(ecotype_id) for ecotype_id in strain_acc_list]
        snpData2 = SNPData(
            header=header,
            strain_acc_list=strain_acc_list,
            data_matrix=data_matrix,
            snps_table=self.QC_method_id2snps_table.get(self.QC_method_id),
            ignore_het=qm.ignore_het)

        if self.input_dir:
            # 04/22/08 Watch: call_info_id2fname here is fake, it's actually keyed by (array_id, ecotypeid)
            # no submission to db
            call_info_id2fname = self.get_array_id2fname(curs, self.input_dir)
        else:
            call_data = self.get_call_info_id2fname(
                db, self.QC_method_id, self.call_method_id,
                filter_calls_QCed=0,
                max_call_info_mismatch_rate=1,
                min_call_info_mismatch_rate=min_call_info_mismatch_rate,
                debug=self.debug)
            call_info_id2fname = call_data.call_info_id2fname

        # 2008-07-01 pick the call_info_ids to be handled
        new_call_info_id2fname = {}
        for call_info_id_wanted in self.call_info_id_ls:
            if call_info_id_wanted in call_info_id2fname:
                new_call_info_id2fname[call_info_id_wanted] = \
                    call_info_id2fname[call_info_id_wanted]
            elif self.report:
                # one-element tuple: a tuple-valued id (e.g. an (array_id, ecotypeid)
                # key) must be formatted as a single %s argument, not unpacked.
                sys.stderr.write("%s not in call_info_id2fname.\n" %
                                 (call_info_id_wanted,))
        call_info_id2fname = new_call_info_id2fname

        if call_info_id2fname:
            pdata = self.read_call_matrix(call_info_id2fname,
                                          self.min_probability)
            header = pdata.header
            call_info_id_ls = pdata.call_info_id_ls
            ecotype_id_ls = pdata.ecotype_id_ls
            data_matrix = pdata.data_matrix
        elif self.input_dir:  # 2008-07-02
            # input file is SNP by strain format. double header (1st two lines)
            header, snps_name_ls, category_list, data_matrix = FilterStrainSNPMatrix.read_data(
                self.input_dir, double_header=1)
            ecotype_id_ls = header[0][2:]
            call_info_id_ls = header[1][2:]
            # transpose the SNP-by-strain matrix that was just read
            data_matrix = numpy.array(data_matrix).transpose()
            header = ['', ''] + snps_name_ls  # fake a header for SNPData
        else:  # 2008-07-02
            sys.stderr.write("No good arrays.\n")
            return None

        snps_name2snps_id = None

        # swap the ecotype_id_ls and call_info_id_ls when passing them to SNPData.
        # now strain_acc_list=ecotype_id_ls
        snpData1 = SNPData(
            header=header,
            strain_acc_list=ecotype_id_ls,
            category_list=call_info_id_ls,
            data_matrix=data_matrix,
            min_probability=self.min_probability,
            call_method_id=self.call_method_id,
            col_id2id=snps_name2snps_id,
            max_call_info_mismatch_rate=self.max_call_info_mismatch_rate,
            snps_table='stock_250k.snps')  # snps_table is set to the stock_250k snps_table

        # 2009-6-9 cross-matching results whose mismatch_rates are below
        # max_mismatch_rate would be put into db.
        twoSNPData = TwoSNPData(
            SNPData1=snpData1,
            SNPData2=snpData2,
            curs=curs,
            QC_method_id=self.QC_method_id,
            user=self.user,
            row_matching_by_which_value=0,
            debug=self.debug,
            max_mismatch_rate=self.max_mismatch_rate,
            min_no_of_non_NA_pairs=self.min_no_of_non_NA_pairs)

        # 2008-05-01 create a cross match table temporarily
        twoSNPData.qc_cross_match_table = 'qc_cross_match'
        twoSNPData.new_QC_cross_match_table = self.new_QC_cross_match_table
        twoSNPData.cal_row_id2pairwise_dist()  # database submission is done along.

        # nothing is computed into row_id2NA_mismatch_rate by this code path
        row_id2NA_mismatch_rate = None
        return row_id2NA_mismatch_rate
    def prepareTwoSNPData(self,
                          db,
                          max_mismatch_rate=0.25,
                          min_no_of_non_NA_pairs=40,
                          report=0):
        """
        Build the TwoSNPData object that pairs the 149SNP data with its comparison dataset.

        Arguments:
            db: database handle passed on to self.get_data_matrix().
            max_mismatch_rate, min_no_of_non_NA_pairs, report: passed through to TwoSNPData.

        Returns:
            a TwoSNPData whose SNPData1 comes from self.input_fname (if given) or from the
            database, and whose SNPData2 is SNPData1 itself when self.QC_method_id == 4,
            otherwise the data read from the comparison file.

        The MySQL connection is only opened when the data has to come from the database,
        and it is closed again as soon as the SNP index has been fetched.

        2009-9-23
            add arguments max_mismatch_rate & min_no_of_non_NA_pairs, and pass them to twoSNPData.
            However it's useless to control what should be inserted into db because TwoSNPData.qc_cross_match_table is
            not defined and even if it's defined, the table it'll create doesn't concord to the one in 149SNP db.
        2008-09-10
            if self.input_fname is given, get 149SNP data from it , instead of database
        2008-8-28
            split out of run() so that MpiQC149CrossMatch could call this easily
        """
        if self.input_fname:
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.input_fname)
        else:
            import MySQLdb
            from dbSNP2data import dbSNP2data
            conn = MySQLdb.connect(db=self.dbname,
                                   host=self.hostname,
                                   user=self.db_user,
                                   passwd=self.db_passwd)
            try:
                # the raw cursor is needed for this one lookup only
                snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
                    conn.cursor(), StockDB.Calls.table.name,
                    StockDB.SNPs.table.name)
            finally:
                conn.close()
            strain_info_data = self.get_strain_id_info(
                self.QC_method_id, ignore_strains_with_qc=False)
            data_matrix = self.get_data_matrix(
                db, strain_info_data.strain_id2index, snp_id2index,
                StockDB.Calls.table.name)
            strain_acc_list = [
                strain_info_data.strain_id2acc[strain_id]
                for strain_id in strain_info_data.strain_id_list
            ]  # tg_ecotypeid
            category_list = [
                strain_info_data.strain_id2category[strain_id]
                for strain_id in strain_info_data.strain_id_list
            ]  # strainid
            # snp_id2info values start with the snp name; only the name goes into the header
            header = ['ecotypeid', 'strainid'] + [
                snp_id2info[snp_id][0] for snp_id in snp_id_list
            ]
        # snps_table names the 'stock.snps' table (not the stock_250k one)
        snpData1 = SNPData(header=header,
                           strain_acc_list=strain_acc_list,
                           category_list=category_list,
                           data_matrix=data_matrix,
                           snps_table='stock.snps')
        if self.QC_method_id == 4:
            # QC method 4: the dataset is compared against itself
            snpData2 = snpData1
        else:
            self.cmp_data_filename = self.findOutCmpDataFilename(
                self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod)
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.cmp_data_filename)
            # it's ecotypeid; cast to integer (as a real list) to be compatible
            # with the ecotype ids from db
            strain_acc_list = [int(ecotype_id) for ecotype_id in strain_acc_list]
            # category_list is left out to facilitate row-id matching
            snpData2 = SNPData(header=header,
                               strain_acc_list=strain_acc_list,
                               data_matrix=data_matrix)

        twoSNPData = TwoSNPData(SNPData1=snpData1,
                                SNPData2=snpData2,
                                QC_method_id=self.QC_method_id,
                                user=self.db_user,
                                row_matching_by_which_value=0,
                                debug=self.debug,
                                max_mismatch_rate=max_mismatch_rate,
                                min_no_of_non_NA_pairs=min_no_of_non_NA_pairs,
                                report=report)
        return twoSNPData
Beispiel #35
0
    def run(self):
        """
        Fetch the strain-by-target QC cross-match matrix and write and/or draw it.

        Steps, in order:
            1. connect to StockDB and assemble the SQL that lists strains (rows), ordered
               by country/strain longitude & latitude and nativename.
            2. get the row strains: by sequencing-plate info when self.how_to_group_strains
               is 2 or 3, otherwise by the query (restricted to the ids found in
               self.input_fname, if that is given).
            3. get the column targets: same as the rows for QC_method_id 4 (re-queried when
               how_to_group_strains is 3), otherwise the targets of the cross-match table.
            4. get the data matrix (from self.input_fname or db), mark its boundaries,
               exit with status 3 if it is empty, then write it to self.output_fname
               and/or draw it into self.fig_fname when those are set.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        db = StockDB.StockDB(drivername=self.drivername, username=self.db_user,
                             password=self.db_passwd, hostname=self.hostname,
                             database=self.dbname, schema=self.schema)
        db.setup(create_tables=False)
        session = db.session

        # how to order strains.
        order_by_sentence = " order by c.longitude, c.latitude, e.longitude, e.latitude, e.nativename "
        if self.QC_method_id != 4:
            # rows are the strains present in the cross-match table that pass the thresholds
            sql_table_str = "from %s q, %s e, %s s, %s a, %s c" % (
                StockDB.QCCrossMatch.table.name, StockDB.Ecotype.table.name,
                StockDB.Site.table.name, StockDB.Address.table.name,
                StockDB.Country.table.name)
            qc_filter = " and q.qc_method_id=%s and q.no_of_non_NA_pairs>=%s and q.mismatch_rate<=%s " % (
                self.QC_method_id, self.min_no_of_non_NAs, self.max_mismatch_rate)
            # the remaining %s is the slot for the strain/target join condition
            common_where_condition = "where e.siteid=s.id and s.addressid=a.id and a.countryid=c.id %s" + qc_filter + order_by_sentence
            strain_join_condition = " and e.id=st.ecotypeid and st.id=q.strainid"
            strain_query_template = "select distinct q.strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s, %s st %s"
        else:
            sql_table_str = "from %s e, %s s, %s a, %s c" % (
                StockDB.Ecotype.table.name, StockDB.Site.table.name,
                StockDB.Address.table.name, StockDB.Country.table.name)
            common_where_condition = "where e.siteid=s.id and s.addressid=a.id and a.countryid=c.id %s " + order_by_sentence
            strain_join_condition = " and e.id=st.ecotypeid"
            strain_query_template = "select distinct st.id as strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s, %s st %s"
        strain_where_condition = common_where_condition % strain_join_condition
        strain_id_info_query = strain_query_template % (
            sql_table_str, StockDB.Strain.table.name, strain_where_condition)

        def unrestricted_id_sets():
            # no restriction on either strain ids or target ids
            blank = PassingData()
            blank.strain_id_set = None
            blank.target_id_set = None
            return blank

        if self.how_to_group_strains in (2, 3):
            plate_info = self.alignStrainsAccordingToSeqPlate(db)
            id_set_data = unrestricted_id_sets()
            strain_id_info = self.getStrainInfoGivenPlateInfo(
                db, plate_info, strain_id_info_query, strain_id_set=None)
        else:
            if self.input_fname:
                id_set_data = self.getStrainidTargetidFromFile(
                    db, self.QC_method_id, self.input_fname,
                    self.max_mismatch_rate, self.min_no_of_non_NAs)
            else:
                id_set_data = unrestricted_id_sets()
            strain_id_info = self.getStrainIDInfo(db, strain_id_info_query,
                                                  id_set_data.strain_id_set)

        if self.QC_method_id != 4:
            target_where_condition = common_where_condition % " and e.id=q.target_id"
            target_id_info_query = "select distinct e.id as strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s %s" % (
                sql_table_str, target_where_condition)
            target_id_info = self.getStrainIDInfo(db, target_id_info_query)
        elif self.how_to_group_strains == 3:
            # 2008-09-15 column strain id is in country, strain-longitude order
            target_id_info = self.getStrainIDInfo(db, strain_id_info_query,
                                                  id_set_data.strain_id_set)
        else:
            target_id_info = strain_id_info

        if self.input_fname:
            rdata = self.get_data_matrixFromFile(
                db, strain_id_info, target_id_info, self.QC_method_id,
                self.input_fname, self.max_mismatch_rate,
                self.min_no_of_non_NAs)
        else:
            rdata = self.get_data_matrix(
                db, strain_id_info, target_id_info, self.QC_method_id,
                self.max_mismatch_rate, self.min_no_of_non_NAs)

        rdata.data_matrix = self.markDataMatrixBoundary(
            rdata.data_matrix, strain_id_info, target_id_info)

        # output labels: targets across the top, strains down the side
        header = ['strain info', ''] + target_id_info.strain_label_ls
        strain_acc_list = strain_id_info.strain_label_ls
        category_list = [1] * len(strain_acc_list)
        if SNPData.isDataMatrixEmpty(rdata.data_matrix):
            sys.stderr.write("Nothing fetched from database.\n")
            sys.exit(3)
        if self.output_fname:
            write_data_matrix(rdata.data_matrix, self.output_fname, header,
                              strain_acc_list, category_list)

        if self.fig_fname:
            font = get_font(self.font_path, font_size=self.font_size)  # 2008-08-01

            def value2color_func(x):
                # color a value relative to the matrix's min/max
                return Value2Color.value2HSLcolor(x, rdata.min_value,
                                                  rdata.max_value)

            im_legend = drawContinousLegend(rdata.min_value, rdata.max_value,
                                            self.no_of_ticks, value2color_func,
                                            font)
            matrix_image = drawMatrix(rdata.data_matrix, value2color_func,
                                      strain_id_info.strain_label_ls,
                                      target_id_info.strain_label_ls,
                                      with_grid=1, font=font)
            combined_image = combineTwoImages(matrix_image, im_legend, font=font)
            combined_image.save(self.fig_fname)
Beispiel #36
0
	def run(self):
		"""
		Fetch the strain-by-target QC cross-match matrix, then output it as text and/or image.

		The row strains come from sequencing-plate info when self.how_to_group_strains is
		2 or 3, otherwise from a db query (limited to the ids found in self.input_fname if
		given). The column targets equal the rows for QC_method_id 4 (re-queried when
		how_to_group_strains is 3); for other QC methods they are the cross-match targets.
		Exits with status 3 when the resulting matrix is empty. self.output_fname and
		self.fig_fname decide which outputs are produced.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()

		db = StockDB.StockDB(
			drivername=self.drivername,
			username=self.db_user,
			password=self.db_passwd,
			hostname=self.hostname,
			database=self.dbname,
			schema=self.schema)
		db.setup(create_tables=False)
		session = db.session

		#how to order strains.
		order_by_sentence = " order by c.longitude, c.latitude, e.longitude, e.latitude, e.nativename "
		if self.QC_method_id != 4:
			#strains are taken from the cross-match table, filtered by the thresholds
			sql_table_str = "from %s q, %s e, %s s, %s a, %s c"%(
				StockDB.QCCrossMatch.table.name, StockDB.Ecotype.table.name,
				StockDB.Site.table.name, StockDB.Address.table.name,
				StockDB.Country.table.name)
			threshold_clause = " and q.qc_method_id=%s and q.no_of_non_NA_pairs>=%s and q.mismatch_rate<=%s "%(
				self.QC_method_id, self.min_no_of_non_NAs, self.max_mismatch_rate)
			#the %s left in here takes the strain/target join condition later
			common_where_condition = "where e.siteid=s.id and s.addressid=a.id and a.countryid=c.id %s" + threshold_clause + order_by_sentence
			strain_link = " and e.id=st.ecotypeid and st.id=q.strainid"
			strain_select = "select distinct q.strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s, %s st %s"
		else:
			sql_table_str = "from %s e, %s s, %s a, %s c"%(
				StockDB.Ecotype.table.name, StockDB.Site.table.name,
				StockDB.Address.table.name, StockDB.Country.table.name)
			common_where_condition = "where e.siteid=s.id and s.addressid=a.id and a.countryid=c.id %s " + order_by_sentence
			strain_link = " and e.id=st.ecotypeid"
			strain_select = "select distinct st.id as strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s, %s st %s"
		strain_where_condition = common_where_condition%strain_link
		strain_id_info_query = strain_select%(sql_table_str, StockDB.Strain.table.name, strain_where_condition)

		def make_open_id_sets():
			#neither strain ids nor target ids are restricted
			open_sets = PassingData()
			open_sets.strain_id_set = None
			open_sets.target_id_set = None
			return open_sets

		if self.how_to_group_strains in (2, 3):
			plate_info = self.alignStrainsAccordingToSeqPlate(db)
			id_set_data = make_open_id_sets()
			strain_id_info = self.getStrainInfoGivenPlateInfo(
				db, plate_info, strain_id_info_query, strain_id_set=None)
		else:
			if self.input_fname:
				id_set_data = self.getStrainidTargetidFromFile(
					db, self.QC_method_id, self.input_fname,
					self.max_mismatch_rate, self.min_no_of_non_NAs)
			else:
				id_set_data = make_open_id_sets()
			strain_id_info = self.getStrainIDInfo(
				db, strain_id_info_query, id_set_data.strain_id_set)

		if self.QC_method_id != 4:
			target_where_condition = common_where_condition%" and e.id=q.target_id"
			target_id_info_query = "select distinct e.id as strainid, e.id as ecotypeid, e.nativename, s.name as sitename, c.abbr %s %s"%(sql_table_str, target_where_condition)
			target_id_info = self.getStrainIDInfo(db, target_id_info_query)
		elif self.how_to_group_strains == 3:
			#2008-09-15 column strain id is in country, strain-longitude order
			target_id_info = self.getStrainIDInfo(
				db, strain_id_info_query, id_set_data.strain_id_set)
		else:
			target_id_info = strain_id_info

		if self.input_fname:
			rdata = self.get_data_matrixFromFile(
				db, strain_id_info, target_id_info, self.QC_method_id,
				self.input_fname, self.max_mismatch_rate, self.min_no_of_non_NAs)
		else:
			rdata = self.get_data_matrix(
				db, strain_id_info, target_id_info, self.QC_method_id,
				self.max_mismatch_rate, self.min_no_of_non_NAs)

		rdata.data_matrix = self.markDataMatrixBoundary(
			rdata.data_matrix, strain_id_info, target_id_info)

		#labels for the text output: targets across the top, strains down the side
		header = ['strain info', ''] + target_id_info.strain_label_ls
		strain_acc_list = strain_id_info.strain_label_ls
		category_list = [1]*len(strain_acc_list)
		if SNPData.isDataMatrixEmpty(rdata.data_matrix):
			sys.stderr.write("Nothing fetched from database.\n")
			sys.exit(3)
		if self.output_fname:
			write_data_matrix(
				rdata.data_matrix, self.output_fname, header,
				strain_acc_list, category_list)

		if self.fig_fname:
			font = get_font(self.font_path, font_size=self.font_size)	#2008-08-01

			def value2color_func(x):
				#color of a value relative to the matrix's min & max
				return Value2Color.value2HSLcolor(x, rdata.min_value, rdata.max_value)

			im_legend = drawContinousLegend(
				rdata.min_value, rdata.max_value, self.no_of_ticks,
				value2color_func, font)
			matrix_im = drawMatrix(
				rdata.data_matrix, value2color_func,
				strain_id_info.strain_label_ls,
				target_id_info.strain_label_ls,
				with_grid=1, font=font)
			final_im = combineTwoImages(matrix_im, im_legend, font=font)
			final_im.save(self.fig_fname)
Beispiel #37
0
    def run(self):
        """
        Plot overlaid phenotype histograms for the ancestral and the derived allele.

        Looks up the results_method matching self.call_method_id, self.analysis_method_id
        and self.phenotype_method_id, hands it together with the phenotype/genotype data and
        the ancestral-allele map to self.get_phenotype_ls(), and draws one 100-bin histogram
        per allele class that has more than 2 values. The figure is saved to
        <output_fname_prefix>.svg when self.output_fname_prefix is set.

        Exits with status 3 if no results_method matches; if several match, a warning is
        written to stderr and the first one is used.
        """
        db = Stock_250kDB.Stock_250kDB(drivername=self.drivername,
                                       username=self.user,
                                       password=self.passwd,
                                       hostname=self.hostname,
                                       database=self.dbname)
        db.setup(create_tables=False)
        if self.debug:
            import pdb
            pdb.set_trace()
        chr_pos2ancestral_allele = self.get_chr_pos2ancestral_allele(
            self.ancestral_allele_fname)
        pheno_data = SNPData(input_fname=self.phenotype_fname,
                             turn_into_integer=0,
                             ignore_2nd_column=1)
        pheno_data = self.process_phenotype_data(pheno_data)

        geno_data = SNPData(input_fname=self.genotype_fname,
                            turn_into_array=1,
                            matrix_data_type=int,
                            ignore_2nd_column=1)

        query = Stock_250kDB.ResultsMethod.query.filter_by(
            call_method_id=self.call_method_id).filter_by(
                analysis_method_id=self.analysis_method_id).filter_by(
                    phenotype_method_id=self.phenotype_method_id)
        # count once: each count() is a separate db round-trip, and asking twice
        # could give two different answers.
        no_of_results = query.count()
        method_ids = (self.call_method_id, self.analysis_method_id,
                      self.phenotype_method_id)
        if no_of_results < 1:
            sys.stderr.write(
                "Error: no results_method for call_method_id=%s, analysis_method_id=%s, phenotype_method_id=%s.\n"
                % method_ids)
            sys.exit(3)
        if no_of_results > 1:
            sys.stderr.write(
                "Warning: more than 1 results_method for call_method_id=%s, analysis_method_id=%s, phenotype_method_id=%s.\n"
                % method_ids)
        rm = query.first()

        phenotype_ls_data = self.get_phenotype_ls(
            rm, self.no_of_top_snps, chr_pos2ancestral_allele, pheno_data,
            geno_data, self.min_MAF, results_directory=self.input_dir)

        import pylab
        pylab.clf()
        hist_patch_ls = []
        legend_ls = []
        # (values, legend label, extra hist() keywords); the ancestral histogram keeps
        # the default face color, the derived one is red.
        histogram_specs = (
            (phenotype_ls_data.ancestral_allele_phenotype_ls,
             'ancestral allele', {}),
            (phenotype_ls_data.derived_allele_phenotype_ls,
             'derived allele', {'facecolor': 'r'}),
        )
        for phenotype_ls, legend_label, extra_hist_kwargs in histogram_specs:
            if len(phenotype_ls) > 2:
                hist_result = pylab.hist(phenotype_ls,
                                         100,
                                         alpha=0.4,
                                         normed=1,
                                         **extra_hist_kwargs)
                # first patch in all patches of a histogram stands in for it in the legend
                hist_patch_ls.append(hist_result[2][0])
                legend_ls.append(legend_label)
        pylab.legend(hist_patch_ls, legend_ls)
        if self.output_fname_prefix:
            pylab.savefig('%s.svg' % self.output_fname_prefix, dpi=300)
Beispiel #38
0
    def outputArray(cls, session, curs, output_dir=None, array_info_table=None, snps=None, \
       probes=None, array_id_ls=[], \
       xy_ls=[], chr_pos_ls=[], probes_id_ls=[],\
       call_method_id=0, run_type=1, array_file_directory=None, outputCNVIntensity=True,\
       returnArrayIntensityData=False):
        """
        Read the array files selected from the database and extract probe intensities.

        The arrays are chosen with the SQL statement built by
        cls.generateSQLQueryToGetArrays() and are read through R (rpy with the
        'affy' library). What is done with each array depends on run_type:

            1 - SNP probes: write <output_dir>/<array_id>_array_intensity.tsv
                holding, for every SNP of `snps`, the intensity of each of its
                probes. An array whose output file already exists is skipped.
            2 - CNV probes: store the log10 intensity found at every
                (xpos, ypos) of xy_ls in a probes-by-arrays matrix. Once all
                arrays are read, the matrix is written to
                <output_dir>/call_method_<call_method_id>_CNV_intensity.tsv
                if outputCNVIntensity is true.
            3 - store the median intensity of the whole array in its
                Stock_250kDB.ArrayInfo entry and add that entry to `session`.

        Any other run_type writes an error message and exits with status 3
        (only detected once the first array has been read).

        Arguments:
            session: database session, used by run_type 3 only.
            curs: object whose execute() runs the query. If it has fetchall()
                (MySQLdb style) the rows are (array_id, filename, ecotype_id, ...)
                tuples; otherwise (elixirdb.metadata.bind style) the rows expose
                array_id, filename and maternal_ecotype_id attributes.
            output_dir: output directory; created if missing unless run_type is 3.
            array_info_table, array_id_ls, call_method_id: passed on to
                cls.generateSQLQueryToGetArrays().
            snps: run_type 1; provides snps_id_ls and get_one_snp().
            probes: run_type 1; provides get_one_probe().
            xy_ls, chr_pos_ls, probes_id_ls: run_type 2; parallel lists
                describing the CNV probes.
            array_file_directory: if it is an existing directory, array files
                are looked up there by base name instead of the stored path.
            outputCNVIntensity: run_type 2; whether to write the matrix to file.
            returnArrayIntensityData: whether to return the run_type 2 matrix.

        Returns:
            With returnArrayIntensityData and run_type 2, an SNPData whose rows
            are labelled by xy_ls and whose columns are labelled
            '<array_id>_<ecotype_id>'. Otherwise None (a warning is written if
            the data was requested for a run_type that does not produce it).

        NOTE: the list defaults in the signature are shared between calls; this
        method only reads them, keep it that way.

        History:
        2010-5-10
            curs could be elixirdb.metadata.bind or MySQLdb.connect
        2010-5-5
            changed to classmethod
            add arguments outputCNVIntensity and returnArrayIntensityData
        2009-10-9
            add argument array_file_directory.
        2009-3-11
            add run_type=3 (median intensity of all probes, stored in db);
            array_id_ls is a list of array_ids in str type
        2009-3-5
            skip a SNP without probes (one_snp.probes_id_ls == [-1]*4, a fake
            SNP in the SNP table)
        2008-12-09
            add option run_type
        2008-07-12
            add option array_id
        2008-04-08
        """
        sys.stderr.write("Outputting arrays ... \n")
        import rpy
        rpy.r.library('affy')
        array_width = None
        if run_type != 3 and output_dir and not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        sql_query = cls.generateSQLQueryToGetArrays(array_info_table, array_id_ls=array_id_ls, \
                  call_method_id=call_method_id, run_type=run_type)
        # Parenthesised so the line is valid as a print statement and as a call.
        print(sql_query)
        rows = curs.execute(sql_query)
        is_elixirdb = 1  # by default, assume curs is elixirdb.metadata.bind
        if hasattr(curs, 'fetchall'):  # MySQLdb style cursor
            rows = curs.fetchall()
            is_elixirdb = 0
            no_of_objects = len(rows)
        else:
            no_of_objects = int(rows.rowcount)

        # Only run_type 2 (CNV probe) fills a matrix; every other run type
        # leaves it as None so the final return can tell the difference.
        data_matrix = None
        if run_type == 2:
            data_matrix = numpy.zeros([len(probes_id_ls), no_of_objects],
                                      numpy.float32)
        array_id_avail_ls = []
        array_label_ls = []
        # enumerate() keeps the progress counter (and the matrix column of
        # run_type 2) tied to the row even when an array is skipped below.
        for i, row in enumerate(rows):
            if is_elixirdb:
                array_id = row.array_id
                filename = row.filename
                ecotype_id = row.maternal_ecotype_id
            else:
                array_id, filename, ecotype_id = row[:3]
            array_id_avail_ls.append(array_id)
            array_label_ls.append('%s_%s' % (array_id, ecotype_id))

            if array_file_directory and os.path.isdir(array_file_directory):
                filename = os.path.join(array_file_directory,
                                        os.path.split(filename)[1])

            sys.stderr.write("\t%d/%d: Extracting intensity from %s ... \n" %
                             (i + 1, no_of_objects, filename))

            if run_type == 1:  # one output file per array, written inside the loop
                output_fname = os.path.join(
                    output_dir, '%s_array_intensity.tsv' % (array_id))
                if os.path.isfile(output_fname):
                    sys.stderr.write("\tFile %s already exists. Ignore.\n" %
                                     (output_fname))
                    continue

            # Read the array by calling R. The first array that is read also
            # supplies the array width used to turn (x, y) into a flat index.
            if array_width is None:
                returnData = cls.getArrayWidth(filename)
                intensity_array = returnData.intensity_array
                array = returnData.array
                array_width = returnData.array_width
            else:
                array = rpy.r.read_affybatch(filenames=filename)
                # a length x 1 two-dimensional array
                intensity_array = rpy.r.intensity(array)

            if run_type == 2:  # CNV probe
                for j, (xpos, ypos) in enumerate(xy_ls):
                    intensity_array_index = array_width * (array_width - xpos -
                                                           1) + ypos
                    data_matrix[j][i] = math.log10(
                        intensity_array[intensity_array_index][0])
            elif run_type == 1:  # SNP probe intensity
                outf = open(output_fname, 'w')
                try:
                    writer = csv.writer(outf, delimiter='\t')
                    header = ['SNP_ID'] + [
                        '%s_%s' % (array_id, probe_label)
                        for probe_label in ('sense1', 'sense2', 'antisense1',
                                            'antisense2')
                    ]
                    writer.writerow(header)
                    for snps_id in snps.snps_id_ls:
                        one_snp = snps.get_one_snp(snps_id)
                        output_row = [one_snp.snpid]
                        # skip a SNP without probes (fake SNP in the SNP table)
                        if one_snp.probes_id_ls == [-1] * 4:
                            continue
                        for probes_id in one_snp.probes_id_ls:
                            one_probe = probes.get_one_probe(probes_id)
                            intensity_array_index = array_width * (
                                array_width - one_probe.xpos -
                                1) + one_probe.ypos
                            output_row.append(
                                intensity_array[intensity_array_index][0])
                        writer.writerow(output_row)
                finally:
                    # close explicitly instead of relying on garbage collection
                    outf.close()
            elif run_type == 3:  # median intensity of all probes, stored into db
                median_intensity = numpy.median(intensity_array)
                array_info_entry = Stock_250kDB.ArrayInfo.get(array_id)
                array_info_entry.median_intensity = median_intensity
                session.add(array_info_entry)
            else:
                sys.stderr.write("Error: run_type %s is not supported.\n" %
                                 run_type)
                sys.exit(3)

            # release the per-array data before the next array is read
            del intensity_array, array

        if run_type == 2 and outputCNVIntensity:
            # multi-sample format: one row per probe, one column per array
            header = ['probes_id'
                      ] + array_id_avail_ls + ['chromosome', 'position']
            output_fname = os.path.join(
                output_dir,
                'call_method_%s_CNV_intensity.tsv' % (call_method_id))

            outf = open(output_fname, 'w')
            try:
                writer = csv.writer(outf, delimiter='\t')
                writer.writerow(header)
                for row_index in range(data_matrix.shape[0]):
                    data_row = [probes_id_ls[row_index]] + list(
                        data_matrix[row_index]) + list(chr_pos_ls[row_index])
                    writer.writerow(data_row)
            finally:
                outf.close()
        sys.stderr.write("Done.\n")
        if returnArrayIntensityData:
            if data_matrix is None:
                # Previously this path died with a NameError after all the
                # work had been done.
                sys.stderr.write(
                    "Warning: array intensity data is only collected for run_type 2, not run_type %s. Nothing to return.\n"
                    % run_type)
                return None
            arrayIntensityData = SNPData(row_id_ls=xy_ls,
                                         col_id_ls=array_label_ls,
                                         data_matrix=data_matrix)
            return arrayIntensityData
Beispiel #39
0
    def getCNVQCMatrix(self, probe_id2snp_id_ls, snp_id2tup, snpData,
                       SNP2Col_allele, cnvIntensityData):
        """
        Count, for every accession and CNV probe, how the SNPs under the probe
        differ from the reference allele.

        Arguments:
            probe_id2snp_id_ls: dict mapping a probe id (indexable; items 0 and
                1 form the '<a>_<b>' column label of cnvIntensityData) to a list
                of (snp_id, disp_pos) pairs.
            snp_id2tup: dict mapping snp_id to a tuple whose item 2 is the
                offset; a non-zero offset makes the SNP count as an insertion.
            snpData: SNP matrix (row_id_ls, col_id2col_index, data_matrix).
            SNP2Col_allele: dict mapping snp_id to the reference allele.
            cnvIntensityData: intensity matrix (row_id2row_index,
                col_id2col_index, data_matrix).

        How one allele is counted:
            -2 or 0         ignored; a probe with only such alleles stays at -2
            offset != 0     insertion, unless the allele is -1
            -1              deletion
            reference allele is -2 or 0
                            a message is written to stderr, nothing is counted
            differs from the reference allele
                            mismatch

        Rows of snpData that are absent from cnvIntensityData are left at -2.

        Returns:
            PassingData with mismatchData, insertionData, deletionData and
            qcData (sum of the three counts), each an SNPData with the rows of
            snpData and the sorted probe ids as columns, plus plotData, a
            PassingData of parallel lists (mean displacement position,
            intensity and the individual counts) with one entry per non-NA
            accession/probe pair.

        2009-2-12
        """
        sys.stderr.write("Getting CNV QC matricies ...")
        matrix_shape = (len(snpData.row_id_ls), len(probe_id2snp_id_ls))
        # numpy.int was a plain alias of the builtin int and has been removed
        # from NumPy, so the builtin is used as dtype. -2 marks "no data".
        mismatch_matrix = numpy.zeros(matrix_shape, int)
        mismatch_matrix[:] = -2
        insertion_matrix = numpy.zeros(matrix_shape, int)
        insertion_matrix[:] = -2
        deletion_matrix = numpy.zeros(matrix_shape, int)
        deletion_matrix[:] = -2
        qc_matrix = numpy.zeros(matrix_shape, int)
        qc_matrix[:] = -2

        # column order of the result matrices: probe ids in sorted order
        cnv_probe_ls = sorted(probe_id2snp_id_ls)
        cnv_probe2index = dict(
            (probe_id, index) for index, probe_id in enumerate(cnv_probe_ls))
        # materialised once so the same pairs are walked for every accession
        probe_snp_items = list(probe_id2snp_id_ls.items())

        total_disp_pos_ls = []
        total_intensity_ls = []
        total_mismatch_ls = []
        total_insertion_ls = []
        total_deletion_ls = []
        total_mis_ls = []
        for i, row_id in enumerate(snpData.row_id_ls):
            if row_id not in cnvIntensityData.row_id2row_index:
                continue
            cnv_row_index = cnvIntensityData.row_id2row_index[row_id]
            for probe_id, snp_id_ls in probe_snp_items:
                col_index = cnv_probe2index[probe_id]
                probe_id_label = '%s_%s' % (probe_id[0], probe_id[1])
                cnv_col_index = cnvIntensityData.col_id2col_index[
                    probe_id_label]

                no_of_mismatches = 0
                no_of_deletions = 0
                no_of_insertions = 0
                is_this_probe_NA = 1
                disp_pos_ls = []
                for snp_id, disp_pos in snp_id_ls:
                    snp_id_tup = snp_id2tup[snp_id]
                    disp_pos_ls.append(disp_pos)
                    snp_col_index = snpData.col_id2col_index[snp_id]
                    allele = snpData.data_matrix[i][snp_col_index]
                    col_allele = SNP2Col_allele[snp_id]
                    if allele == -2 or allele == 0:
                        continue
                    is_this_probe_NA = 0
                    if snp_id_tup[2] != 0:  # the offset is not 0
                        if allele != -1:  # if it's deleted, then it's nothing
                            no_of_insertions += 1
                    elif allele == -1:
                        no_of_deletions += 1
                    elif col_allele == -2 or col_allele == 0:
                        sys.stderr.write("allele for this accession %s at snp %s is %s while reference allele is NA: %s.\n"%\
                            (snpData.row_id_ls[i], snp_id, allele, col_allele))
                    elif allele != col_allele:
                        no_of_mismatches += 1
                if is_this_probe_NA:
                    continue
                total_mis_count = no_of_mismatches + no_of_insertions + no_of_deletions
                mismatch_matrix[i][col_index] = no_of_mismatches
                insertion_matrix[i][col_index] = no_of_insertions
                deletion_matrix[i][col_index] = no_of_deletions
                qc_matrix[i][col_index] = total_mis_count
                total_disp_pos_ls.append(numpy.mean(disp_pos_ls))
                total_intensity_ls.append(
                    cnvIntensityData.data_matrix[cnv_row_index][cnv_col_index])
                total_mismatch_ls.append(no_of_mismatches)
                total_insertion_ls.append(no_of_insertions)
                total_deletion_ls.append(no_of_deletions)
                total_mis_ls.append(total_mis_count)
        plotData = PassingData(total_disp_pos_ls=total_disp_pos_ls, total_intensity_ls=total_intensity_ls,\
             total_mismatch_ls=total_mismatch_ls, total_insertion_ls=total_insertion_ls, total_deletion_ls=total_deletion_ls,\
             total_mis_ls=total_mis_ls)
        mismatchData = SNPData(row_id_ls=snpData.row_id_ls,
                               col_id_ls=cnv_probe_ls,
                               data_matrix=mismatch_matrix)
        insertionData = SNPData(row_id_ls=snpData.row_id_ls,
                                col_id_ls=cnv_probe_ls,
                                data_matrix=insertion_matrix)
        deletionData = SNPData(row_id_ls=snpData.row_id_ls,
                               col_id_ls=cnv_probe_ls,
                               data_matrix=deletion_matrix)
        qcData = SNPData(row_id_ls=snpData.row_id_ls,
                         col_id_ls=cnv_probe_ls,
                         data_matrix=qc_matrix)
        sys.stderr.write("Done.\n")
        return PassingData(mismatchData=mismatchData,
                           insertionData=insertionData,
                           deletionData=deletionData,
                           qcData=qcData,
                           plotData=plotData)
Beispiel #40
0
    def doFilter(self, snpData, snpData_qc_strain, snpData_qc_snp, min_call_probability, max_call_mismatch_rate, max_call_NA_rate,\
       max_snp_mismatch_rate, max_snp_NA_rate, npute_window_size , output_dir=None):
        """
        Filter a call matrix with the given thresholds, impute it and collect
        the QC statistics of the whole procedure.

        Steps, in order:
            1. compare snpData with snpData_qc_strain row-wise, then call
               SNPData.removeRowsByMismatchRate() and SNPData.removeRowsByNARate()
            2. compare the result with snpData_qc_snp column-wise, then call
               SNPData.removeColsByMismatchRate() and SNPData.removeColsByNARate()
            3. merge with snpData_qc_snp (priority=2) and call
               SNPData.removeMonomorphicCols()
            4. if output_dir is given, write the matrix to
               <thresholds>_before_imputation.tsv (directory created if missing)
            5. record the row-/column-wise comparison against snpData_qc_strain
               (attributes ending in 0)
            6. if more than 5 rows are left, impute with NPUTE.samplingImpute(),
               optionally write <thresholds>_after_imputation.tsv and record the
               same comparison again (attributes ending in 1)

        Returns a list with a single PassingData carrying:
            no_of_accessions_filtered_by_mismatch, no_of_accessions_filtered_by_na,
            no_of_snps_filtered_by_mismatch, no_of_snps_filtered_by_na,
            no_of_monomorphic_snps_removed,
            row_id2NA_mismatch_rate0, col_id2NA_mismatch_rate0,
            row_id2NA_mismatch_rate1, col_id2NA_mismatch_rate1 (only when
            imputation was run),
            and the thresholds passed in (min_call_probability is only recorded
            and used in the file name).

        Intermediate comparison objects are deleted as soon as they have been
        used to keep memory usage down.

        History: split up from computing_node_handler (2008-05-11); output_dir
        added (2008-05-17); SNPData structure (2008-05-19); imputeData()
        replaced by NPUTE.samplingImpute() to avoid memory blowup (2009-10-11).
        """
        qcdata = PassingData()

        # -- accessions (rows) --
        strainComparison = TwoSNPData(SNPData1=snpData,
                                      SNPData2=snpData_qc_strain,
                                      row_matching_by_which_value=0,
                                      debug=self.debug)
        row_id2NA_mismatch_rate = strainComparison.cmp_row_wise()
        del strainComparison

        filteredData = SNPData.removeRowsByMismatchRate(
            snpData, row_id2NA_mismatch_rate, max_call_mismatch_rate)
        qcdata.no_of_accessions_filtered_by_mismatch = filteredData.no_of_rows_removed

        filteredData = SNPData.removeRowsByNARate(filteredData,
                                                  max_call_NA_rate)
        qcdata.no_of_accessions_filtered_by_na = filteredData.no_of_rows_removed

        # -- SNPs (columns) --
        snpComparison = TwoSNPData(SNPData1=filteredData,
                                   SNPData2=snpData_qc_snp,
                                   row_matching_by_which_value=0,
                                   debug=self.debug)
        col_id2NA_mismatch_rate = snpComparison.cmp_col_wise()
        del snpComparison

        filteredData = SNPData.removeColsByMismatchRate(
            filteredData, col_id2NA_mismatch_rate, max_snp_mismatch_rate)
        qcdata.no_of_snps_filtered_by_mismatch = filteredData.no_of_cols_filtered_by_mismatch

        filteredData = SNPData.removeColsByNARate(filteredData,
                                                  max_snp_NA_rate)
        qcdata.no_of_snps_filtered_by_na = filteredData.no_of_cols_filtered_by_na

        # -- merge with the QC SNP data, then drop monomorphic columns --
        merger = TwoSNPData(SNPData1=filteredData,
                            SNPData2=snpData_qc_snp,
                            row_matching_by_which_value=0,
                            debug=self.debug)
        filteredData = merger.mergeTwoSNPData(priority=2)
        del merger

        filteredData = SNPData.removeMonomorphicCols(filteredData)
        qcdata.no_of_monomorphic_snps_removed = filteredData.no_of_monomorphic_cols

        if output_dir:
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)
            # the thresholds make up the common prefix of both output files
            fname_prefix = '_'.join([
                'min_oligo_call_probability_%s' % min_call_probability,
                'max_array_mismatch_rate_%s' % max_call_mismatch_rate,
                'max_array_NA_rate_%s' % max_call_NA_rate,
                'max_snp_mismatch_rate_%s' % max_snp_mismatch_rate,
                'max_snp_NA_rate_%s' % max_snp_NA_rate,
                'npute_window_size_%s' % npute_window_size,
            ])
            filteredData.tofile(
                os.path.join(output_dir,
                             fname_prefix + '_before_imputation.tsv'))

        # -- QC statistics before imputation --
        comparisonBefore = TwoSNPData(SNPData1=filteredData,
                                      SNPData2=snpData_qc_strain,
                                      row_matching_by_which_value=0)
        row_id2NA_mismatch_rate0 = comparisonBefore.cmp_row_wise()
        col_id2NA_mismatch_rate0 = comparisonBefore.cmp_col_wise()
        del comparisonBefore

        # -- imputation, only with more than 5 accessions left --
        if len(filteredData.row_id_ls) > 5:
            imputed_matrix, imputed_col_id_ls = NPUTE.samplingImpute(
                filteredData.col_id_ls,
                filteredData.data_matrix,
                input_file_format=1,
                input_NA_char=0,
                lower_case_for_imputation=False,
                npute_window_size=int(npute_window_size),
                no_of_accessions_per_sampling=300,
                coverage=3)
            snpData_imputed = SNPData(row_id_ls=filteredData.row_id_ls,
                                      col_id_ls=imputed_col_id_ls,
                                      data_matrix=imputed_matrix)
            if output_dir:
                snpData_imputed.tofile(
                    os.path.join(output_dir,
                                 fname_prefix + '_after_imputation.tsv'))

            comparisonAfter = TwoSNPData(SNPData1=snpData_imputed,
                                         SNPData2=snpData_qc_strain,
                                         row_matching_by_which_value=0)
            qcdata.row_id2NA_mismatch_rate1 = comparisonAfter.cmp_row_wise()
            qcdata.col_id2NA_mismatch_rate1 = comparisonAfter.cmp_col_wise()
            del comparisonAfter, snpData_imputed
        del filteredData

        qcdata.row_id2NA_mismatch_rate0 = row_id2NA_mismatch_rate0
        qcdata.col_id2NA_mismatch_rate0 = col_id2NA_mismatch_rate0

        # keep the thresholds next to the statistics they produced
        qcdata.min_call_probability = min_call_probability
        qcdata.max_call_mismatch_rate = max_call_mismatch_rate
        qcdata.max_call_NA_rate = max_call_NA_rate
        qcdata.max_snp_mismatch_rate = max_snp_mismatch_rate
        qcdata.max_snp_NA_rate = max_snp_NA_rate
        qcdata.npute_window_size = npute_window_size
        return [qcdata]
	def loadDataStructure(self, db_250k=None, association_locus_id=None, association_landscape_type_id=None, \
						locusExtensionDistance=5000,\
						data_dir=None, list_type_id_list=None, gene_annotation_pickleFname=None, \
						snpInfoPickleFname=None, locus_type_id=1, snp_matrix_fname=None, snp_matrix_data_type=None, \
						phenotype_fname=None):
		"""
		Gather everything that is needed to work on one association locus.

		The landscapes are fetched for the interval of the locus extended by
		locusExtensionDistance on both sides (the start is never below 1), one
		per distinct association landscape found through the peaks of the locus.

		The SNP matrix, the phenotype matrix (re-ordered to the row order of the
		SNP matrix) and the ecotype info are loaded only if both snp_matrix_fname
		and phenotype_fname are given; otherwise they are None. The SNP matrix is
		read as float if snp_matrix_data_type is 3 (CNV amplitude file), as int
		otherwise. The program exits with status 3 if the SNP matrix is empty.

		Returns a PassingData with associationLocus, associationLandscapeType,
		landscape_gwr_ls, gene_annotation, snp_info (None without
		snpInfoPickleFname), LD_info (always None), candidate_gene_set (union of
		the gene lists of list_type_id_list), snpData, phenData, ecotype_info
		and centralLocus.

		2012.11.14
		"""
		sys.stderr.write("Fetching GWAS landscape for association-locus %s, landscape type %s ..."%(association_locus_id, association_landscape_type_id))
		associationLocus = Stock_250kDB.AssociationLocus.get(association_locus_id)
		associationLandscapeType = Stock_250kDB.AssociationLandscapeType.get(association_landscape_type_id)

		# the interval to fetch: the locus plus the extension on both sides
		start = max(1, associationLocus.start-locusExtensionDistance)
		stop = associationLocus.stop + locusExtensionDistance
		# report controls whether getResultMethodContent() will report progress.
		fetch_params = PassingData(min_MAF=associationLandscapeType.min_MAF, data_dir=data_dir, \
						need_chr_pos_ls=0, chromosome=associationLocus.chromosome, \
						start=start, stop=stop, report=False)

		landscape_gwr_ls = []
		seen_landscape_id_set = set()
		for peak in associationLocus.association_peak_ls:
			landscape = db_250k.getAssociationLandscape(result_id=peak.result_id, \
								association_landscape_type_id=associationLandscapeType.id)
			# several peaks can share one landscape, fetch each of them once
			if not landscape or landscape.id in seen_landscape_id_set:
				continue
			seen_landscape_id_set.add(landscape.id)
			landscape_gwr_ls.append(db_250k.getResultMethodContent(association_landscape=landscape, \
								data_dir=data_dir, construct_chr_pos2index=True, pdata=fetch_params))
			sys.stderr.write(" %s%s "%('\x08'*80, len(landscape_gwr_ls)))
		sys.stderr.write("%s landscapes.\n"%(len(landscape_gwr_ls)))

		centralLocus = SNPPassingData(chromosome=associationLocus.chromosome, position=start, \
						snps_id=associationLocus.id, start=start, stop=stop,
						fileNamePrefix="")

		gene_annotation = DrawSNPRegion.dealWithGeneAnnotation(gene_annotation_pickleFname)
		snp_info = None
		if snpInfoPickleFname:
			snp_info = db_250k.dealWithSNPInfo(snpInfoPickleFname, locus_type_id=locus_type_id)

		candidate_gene_set = set()
		for list_type_id in list_type_id_list or []:
			candidate_gene_set.update(db_250k.getGeneList(list_type_id))

		snpData = None
		phenData = None
		ecotype_info = None
		if snp_matrix_fname and phenotype_fname:
			matrix_data_type = int
			if snp_matrix_data_type==3:
				matrix_data_type = float	# CNV amplitude file
			snpData = SNPData(input_fname=snp_matrix_fname, turn_into_integer=1, turn_into_array=1, ignore_2nd_column=1,\
							matrix_data_type=matrix_data_type)
			if snpData.data_matrix is None:
				sys.stderr.write("Error. snpData.data_matrix is None.\n")
				sys.exit(3)
			phen_header, phen_strain_acc_list, phen_category_list, phen_data_matrix = read_data(phenotype_fname, turn_into_integer=0)

			# Row labels are those of the SNP matrix, because the phenotype matrix
			# is re-ordered into that order right below.
			phenData = SNPData(header=phen_header, strain_acc_list=snpData.strain_acc_list, data_matrix=phen_data_matrix)
			# tricky: the original phenotype row ids (phen_strain_acc_list) are
			# needed here, not the ones just assigned to phenData
			phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(snpData.row_id_ls, \
																		phen_strain_acc_list, phenData.data_matrix)

			# fake a snp_info for findSNPsInRegion
			DrawSNPRegion.construct_chr_pos2index_forSNPData(snpData, snp_info=snp_info)
			ecotype_info = getEcotypeInfo(db_250k)

		return PassingData(associationLocus=associationLocus, associationLandscapeType=associationLandscapeType, \
								landscape_gwr_ls=landscape_gwr_ls, \
								gene_annotation=gene_annotation, snp_info=snp_info, LD_info=None, \
								candidate_gene_set=candidate_gene_set, snpData=snpData, phenData=phenData,\
								ecotype_info=ecotype_info, centralLocus=centralLocus)
Beispiel #42
0
	def run(self):
		"""
		Entry point of the MPI program.
		
		Node roles by rank:
			0: input node. Prepares the data (self.inputNodePrepare()), ships it to the
				other nodes and then hands out the parameters (self.inputNode()).
			1 .. size-2: computing nodes. Receive the SNP data and the other data, then run
				self.computing_node() with self.computing_node_handler.
			size-1: output node. Receives the phenotype labels/indices, then runs
				self.general_output_node().
		
		Every send on node 0 is matched by a receive on the target node, in the same order,
		so the order of the statements below must not be changed.
		
		In debug mode nothing is sent: the input data is loaded and pickled on the current
		node (for stepping through with pdb) and the program exits with status 2.
		
		2008-09-06
		"""
		if self.debug:
			#for one-node testing purpose
			import pdb
			pdb.set_trace()
			header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname)
			snpData = SNPData(header=header, strain_acc_list=strain_acc_list, \
							data_matrix=data_matrix, turn_into_array=1)	#category_list is not used to facilitate row-id matching
			
			# NOTE(review): unpickling runs arbitrary code if the file is not trusted.
			picklef = open(self.snps_context_fname)
			snps_context_wrapper = cPickle.load(picklef)
			del picklef
			gene_id2snps_id_ls = self.get_gene_id2snps_id_ls(snps_context_wrapper)
			gene_id_ls = gene_id2snps_id_ls.keys()
			gene_id_ls.sort()
			
			header_phen, strain_acc_list_phen, category_list_phen, data_matrix_phen = read_data(self.phenotype_fname, turn_into_integer=0)
			phenData = SNPData(header=header_phen, strain_acc_list=strain_acc_list, data_matrix=data_matrix_phen)	#row label is that of the SNP matrix, because the phenotype matrix is gonna be re-ordered in that way
			phenData.data_matrix = Kruskal_Wallis.get_phenotype_matrix_in_data_matrix_order(snpData.row_id_ls, phenData.row_id_ls, phenData.data_matrix)
			
			# the pickles are built but not used any further; the debug run stops here
			other_data = PassingData(gene_id2snps_id_ls=gene_id2snps_id_ls, gene_id_ls=gene_id_ls, phenData=phenData)
			other_data_pickle = cPickle.dumps(other_data, -1)
			phenotype_label_ls_pickle = cPickle.dumps(phenData.col_id_ls, -1)
			snpData_pickle = cPickle.dumps(snpData, -1)
			sys.exit(2)
		
		self.communicator = MPI.world.duplicate()
		node_rank = self.communicator.rank
		free_computing_nodes = range(1, self.communicator.size-1)	#exclude the 1st and last node
		free_computing_node_set = Set(free_computing_nodes)
		output_node_rank = self.communicator.size-1
		
		# phase 1: distribute the initial data
		if node_rank == 0:
			dstruc = self.inputNodePrepare()
			params_ls = dstruc.params_ls
			#send the output node the phenotype_label_ls
			self.communicator.send(dstruc.output_node_data_pickle, output_node_rank, 0)
			del dstruc.output_node_data_pickle
			
			for node in free_computing_nodes:	#send it to the computing_node
				sys.stderr.write("passing initial data to nodes from %s to %s ... "%(node_rank, node))
				# two messages per computing node: SNP data first, other data second
				self.communicator.send(dstruc.snpData_pickle, node, 0)
				self.communicator.send(dstruc.other_data_pickle, node, 0)
				sys.stderr.write(".\n")
			del dstruc
			
		elif node_rank in free_computing_node_set:
			# receive in the same order as node 0 sends: SNP data, then other data
			data, source, tag = self.communicator.receiveString(0, 0)
			snpData =  cPickle.loads(data)
			del data
			data, source, tag = self.communicator.receiveString(0, 0)
			other_data = cPickle.loads(data)
			del data
			self.phenotype_index_ls = other_data.phenotype_index_ls
		else:
			# output node (the only rank left): a single message from node 0
			data, source, tag = self.communicator.receiveString(0, 0)
			output_node_data_pickle = cPickle.loads(data)
			phenotype_label_ls = output_node_data_pickle.phenotype_label_ls
			self.phenotype_index_ls = output_node_data_pickle.phenotype_index_ls
			
		# phase 2: every node waits here, then takes up its role
		self.synchronize()
		if node_rank == 0:
			param_obj = PassingData(params_ls=params_ls, output_node_rank=output_node_rank, report=self.report, counter=0)
			self.inputNode(param_obj, free_computing_nodes, param_generator = params_ls)
			#self.input_node(param_obj, free_computing_nodes, input_handler=self.input_fetch_handler, message_size=1)
		elif node_rank in free_computing_node_set:
			computing_parameter_obj = PassingData(snpData=snpData, gene_id_ls=other_data.gene_id_ls, \
												gene_id2snps_id_ls=other_data.gene_id2snps_id_ls, phenData=other_data.phenData,
												phenotype_index_ls=self.phenotype_index_ls, min_data_point=self.min_data_point,
												test_type=self.test_type)
			self.computing_node(computing_parameter_obj, self.computing_node_handler)
		else:
			self.general_output_node(self.output_dir, self.phenotype_index_ls, phenotype_label_ls, free_computing_nodes)
		self.synchronize()	#to avoid some node early exits
Beispiel #43
0
	def run(self):
		"""
		2008-11-08
			generate combinations of results_id, list_type_id and generate plots one after another
			save the plots into database if commit=1
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		db = Stock_250kDB.Stock_250kDB(drivername=self.drivername, username=self.db_user,
				   password=self.db_passwd, hostname=self.hostname, database=self.dbname, schema=self.schema)
		db.setup()
		session = db.session
		
		param_obj = PassingData(call_method_id=self.call_method_id, \
								analysis_method_id=getattr(self, 'analysis_method_id', None),\
								analysis_method_id_ls=getattr(self, 'analysis_method_id_ls', None),\
								phenotype_method_id_ls=getattr(self, 'phenotype_method_id_ls', None),\
								list_type_id_ls=self.list_type_id_ls, \
								results_type=self.results_type)
		params_ls = MpiGeneListRankTest.generate_params(param_obj)
		
		for results_id, list_type_id in params_ls:
			rm = Stock_250kDB.ResultsMethod.get(results_id)
			list_type = Stock_250kDB.GeneListType.get(list_type_id)
			title = 'result(%s) of %s on %s with %s(%s) list'%\
				(results_id, rm.analysis_method.short_name, rm.phenotype_method.short_name, list_type.short_name, list_type.id)
			
			TopSNPTestType_id_ls = self.getTopSNPTestType_id_ls(self.get_closest, self.min_MAF, self.allow_two_sample_overlapping, self.results_type, \
								self.test_type_id, self.null_distribution_type_id)
			if self.commit:
				rows = Stock_250kDB.CandidateVsNonRatioPlot.query.filter_by(type_id=TopSNPTestType_id_ls[0]).\
					filter_by(results_id=results_id).filter_by(list_type_id=list_type_id)
				if rows.count()>0:
					row = rows.first()
					sys.stderr.write('%s already in db (%s of them) with first id=%s.\n'%(title, rows.count(), row.id))
					continue
			
			if not TopSNPTestType_id_ls:
				sys.stderr.write("No TopSNPTestType matches the input requirements. Exit.\n")
				sys.exit(3)
			TopSNPTestType_id_ls_str = map(str, TopSNPTestType_id_ls)
			from_where_clause = "from %s t, %s y where t.type_id=y.id and t.results_id=%s and t.list_type_id=%s and y.id in (%s)"%\
				(Stock_250kDB.CandidateGeneTopSNPTestRM.table.name, Stock_250kDB.CandidateGeneTopSNPTestRMType.table.name,\
				results_id, list_type_id, ','.join(TopSNPTestType_id_ls_str))
			
			no_of_top_snps_info = self.get_no_of_top_snps_info(db, from_where_clause)
			min_distance_info = self.get_min_distance_info(db, from_where_clause)
			rdata = self.get_data_matrix(db, no_of_top_snps_info, min_distance_info, from_where_clause, need_other_values=True, \
										null_distribution_type_id=self.null_distribution_type_id)
			
			header = ['no_of_top_snps', ''] + min_distance_info.label_ls
			strain_acc_list = no_of_top_snps_info.label_ls
			category_list = no_of_top_snps_info.label_ls
			
			if SNPData.isDataMatrixEmpty(rdata.data_matrix):
				sys.stderr.write("Nothing fetched from database.\n")
				#sys.exit(3)
				continue
			
			if self.output_fname:
				write_data_matrix(rdata.data_matrix, self.output_fname, header, strain_acc_list, category_list)
			
			"""
			if self.fig_fname:
				font = get_font(self.font_path, font_size=self.font_size)	#2008-08-01
				value2color_func = lambda x: Value2Color.value2HSLcolor(x, rdata.min_value, rdata.max_value)
				im_legend = drawContinousLegend(rdata.min_value, rdata.max_value, self.no_of_ticks, value2color_func, font)
				#im.save('%s_legend.png'%self.fig_fname_prefix)
				im = drawMatrix(rdata.data_matrix, value2color_func, no_of_top_snps_info.label_ls,\
							min_distance_info.label_ls, with_grid=1, font=font)
				im = combineTwoImages(im, im_legend, font=font)
				im.save(self.fig_fname)
			"""
			if self.commit:
				output_fname_prefix = None
			else:
				title_cp = title
				title_cp = title_cp.replace('/', '_')
				output_fname_prefix='%s_%s_type_%s.png'%(os.path.splitext(self.fig_fname)[0], title_cp, TopSNPTestType_id_ls[0])
			
			if rm.analysis_method_id ==1 or rm.analysis_method_id==7:
				preset_xlim = [0,8]
			else:
				preset_xlim = None
			return_data = self.plotCurve(rdata, no_of_top_snps_info, min_distance_info, output_fname_prefix, title=title, commit=self.commit, preset_xlim=preset_xlim)
			
			if self.commit and return_data.png_data:
				rows = Stock_250kDB.CandidateVsNonRatioPlot.query.filter_by(type_id=TopSNPTestType_id_ls[0]).\
					filter_by(results_id=results_id).filter_by(list_type_id=list_type_id)
				if rows.count()>0:
					row = rows.first()
					sys.stderr.write('%s already in db (%s of them) with first id=%s.\n'%(title, rows.count(), row.id))
					continue
				plot = Stock_250kDB.CandidateVsNonRatioPlot(type_id=TopSNPTestType_id_ls[0], results_id=results_id, list_type_id=list_type_id)
				plot.png_thumbnail = return_data.png_thumbnail.getvalue()
				plot.png_data = return_data.png_data.getvalue()
				plot.svg_data = return_data.svg_data.getvalue()
				db.session.save(plot)
				db.session.flush()
Beispiel #44
0
    def run(self):
        """
        Entry point of the MPI program; what a process does depends on its rank.

        - rank 0 reads self.input_fname, pickles the resulting SNPData, sends it to every
          computing node and then passes the parameters from self.generate_params() to
          self.inputNode().
        - ranks 1 .. size-2 receive the pickled SNPData from rank 0 and run
          self.computing_node() with self.computing_node_handler.
        - the last rank (size-1) runs self.output_node() with self.output_node_handler,
          using a tab-delimited csv writer on self.output_fname if that attribute is set
          (otherwise the writer is None).

        The output file is closed explicitly once self.output_node() returns or raises,
        so buffered rows are flushed deterministically instead of at garbage collection.
        """
        self.communicator = MPI.world.duplicate()
        node_rank = self.communicator.rank
        # exclude the 1st and last node
        free_computing_nodes = range(1, self.communicator.size - 1)
        free_computing_node_set = Set(free_computing_nodes)
        output_node_rank = self.communicator.size - 1

        # Only rank 0 reads the input file; computing nodes get the data from it.
        if node_rank == 0:
            header, strain_acc_list, category_list, data_matrix = read_data(
                self.input_fname)
            # category_list is not used to facilitate row-id matching
            snpData = SNPData(header=header,
                              strain_acc_list=strain_acc_list,
                              data_matrix=data_matrix)
            snpData_pickle = cPickle.dumps(snpData, -1)
            for node in free_computing_nodes:  # send it to the computing_node
                sys.stderr.write(
                    "passing initial data to nodes from %s to %s ... " %
                    (node_rank, node))
                self.communicator.send(snpData_pickle, node, 0)
                sys.stderr.write(".\n")
            del snpData_pickle
            params_ls = self.generate_params(len(snpData.col_id_ls),
                                             self.block_size)
            del snpData
        elif node_rank in free_computing_node_set:
            # The pickled payload comes from rank 0 of this same job.
            data, source, tag = self.communicator.receiveString(0, 0)
            snpData = cPickle.loads(data)
            del data
        else:
            pass

        self.synchronize()
        if node_rank == 0:
            param_obj = PassingData(params_ls=params_ls,
                                    output_node_rank=output_node_rank,
                                    report=self.report,
                                    counter=0)
            self.inputNode(param_obj,
                           free_computing_nodes,
                           param_generator=params_ls)
        elif node_rank in free_computing_node_set:
            computing_parameter_obj = PassingData(
                snpData=snpData,
                min_LD_to_output=self.min_LD_to_output,
                min_MAF=self.min_MAF,
                discard_perc=self.discard_perc)
            self.computing_node(computing_parameter_obj,
                                self.computing_node_handler)
        else:
            # Keep a handle on the file itself: csv.writer offers no way to close it.
            output_file = None
            writer = None
            if getattr(self, 'output_fname', None):
                output_file = open(self.output_fname, 'w')
                writer = csv.writer(output_file, delimiter='\t')
                #header_row = ['snp1_id', 'snp2_id', 'r2', 'D', "D'", "no_of_pairs"]
                #writer.writerow(header_row)
            try:
                param_obj = PassingData(writer=writer, is_header_written=False)
                self.output_node(free_computing_nodes, param_obj,
                                 self.output_node_handler)
            finally:
                if output_file is not None:
                    output_file.close()
        self.synchronize()  # to avoid some node early exits
Beispiel #45
0
    def run(self):
        """
        Compare the calls in the stock database against an external dataset.

        Steps:
        1. open a StockDB session and begin a transaction;
        2. load the comparison dataset (file resolved by self.findOutCmpDataFilename())
           as the 2nd SNPData;
        3. build the 1st SNPData from the database via self.get_strain_id_info() and
           self.get_data_matrix();
        4. for run_type 1, call TwoSNPData.cmp_row_wise(); a non-empty result is written
           to self.output_fname (if set) and, when self.commit is set and self.input_dir
           is not, handed to self.submit_to_call_QC();
        5. commit the session if self.commit is set, otherwise roll it back.

        run_type 2 performs no comparison. Any other run_type writes a message to stderr
        and exits with status 5.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        db = StockDB.StockDB(drivername=self.drivername,
                             username=self.db_user,
                             password=self.db_passwd,
                             hostname=self.hostname,
                             database=self.dbname)
        db.setup(create_tables=False)
        session = db.session
        session.begin()

        # --- 2nd dataset: the one to compare against ---
        self.cmp_data_filename = self.findOutCmpDataFilename(
            self.cmp_data_filename, self.QC_method_id, StockDB.QCMethod)
        cmp_header, cmp_row_ids, cmp_categories, cmp_matrix = read_data(
            self.cmp_data_filename)
        # it's ecotypeid, cast it to integer to be compatible to the later ecotype_id_ls from db
        cmp_row_ids = [int(row_id) for row_id in cmp_row_ids]
        # the category list of the comparison dataset is not used
        snpData2 = SNPData(header=cmp_header,
                           strain_acc_list=cmp_row_ids,
                           data_matrix=cmp_matrix)

        readme = formReadmeObj(sys.argv, self.ad, StockDB.README)
        session.save(readme)

        # --- 1st dataset: pulled from the database ---
        import MySQLdb
        conn = MySQLdb.connect(db=self.dbname,
                               host=self.hostname,
                               user=self.db_user,
                               passwd=self.db_passwd)
        curs = conn.cursor()
        from dbSNP2data import dbSNP2data
        calls_table = StockDB.Calls.table.name
        snp_id2index, snp_id_list, snp_id2info = dbSNP2data.get_snp_id2index_m(
            curs, calls_table, StockDB.SNPs.table.name)
        strain_info_data = self.get_strain_id_info(self.QC_method_id)
        db_matrix = self.get_data_matrix(db,
                                         strain_info_data.strain_id2index,
                                         snp_id2index,
                                         StockDB.Calls.table.name)
        strain_ids = strain_info_data.strain_id_list
        db_row_ids = [strain_info_data.strain_id2acc[sid] for sid in strain_ids]
        db_categories = [
            strain_info_data.strain_id2category[sid] for sid in strain_ids
        ]
        # each snp_id2info value is unpacked as (snp_name, chromosome, position);
        # only the name goes into the header
        snp_names = [
            snp_name for snp_name, _chromosome, _position in (
                snp_id2info[snp_id] for snp_id in snp_id_list)
        ]
        db_header = ['ecotypeid', 'strainid'] + snp_names
        snpData1 = SNPData(header=db_header,
                           strain_acc_list=db_row_ids,
                           category_list=db_categories,
                           data_matrix=db_matrix,
                           snps_table='stock.snps')

        twoSNPData = TwoSNPData(SNPData1=snpData1,
                                SNPData2=snpData2,
                                curs=curs,
                                QC_method_id=self.QC_method_id,
                                user=self.db_user,
                                row_matching_by_which_value=0,
                                debug=self.debug)

        # --- comparison ---
        is_row_wise = self.run_type == 1
        if not (is_row_wise or self.run_type == 2):
            sys.stderr.write("run_type=%s is not supported.\n" % self.run_type)
            sys.exit(5)
        if is_row_wise:
            row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
        else:
            # twoSNPData.save_col_wise(session, readme)
            # 2008-08-18 need to implement a new one for 149SNP
            row_id2NA_mismatch_rate = {}

        if self.output_fname and is_row_wise and row_id2NA_mismatch_rate:
            self.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate,
                                                self.output_fname)

        # if self.input_dir is given, no db submission. call_info_id2fname here is fake,
        # it's actually keyed by (array_id, ecotypeid).
        # row_id2NA_mismatch_rate might be None if it's method 0.
        if is_row_wise and self.commit and not self.input_dir and row_id2NA_mismatch_rate:
            self.submit_to_call_QC(session, row_id2NA_mismatch_rate,
                                   self.QC_method_id, self.db_user,
                                   twoSNPData.row_id12row_id2, readme)

        if self.commit:
            session.commit()
        else:
            session.rollback()