def boxcoxTransformation(data=None):
    """Box-Cox transform a vector, keeping its original layout.

    ``data`` may be a 1-D array, a single-column 2-D array or a single-row
    2-D array. The values are passed to ``boxcox`` and the transformed
    values are returned in an array of the same shape.

    The input is returned unchanged when it contains any value ``<= 0`` or
    when it is a 2-D array that is neither a single row nor a single column.
    """
    # Any non-positive entry: hand the input back untouched.
    if (data.flatten() <= 0.0).any():
        return data

    if data.ndim == 1:
        transformed, _ = boxcox(data)
        return transformed

    if data.shape[1] == 1:
        # Column vector: transform the column, then restore the (n, 1) shape.
        column, _ = boxcox(data[:, 0])
        result = sp.zeros((column.shape[0], 1))
        result[:, 0] = column
        return result

    if data.shape[0] == 1:
        # Row vector: transform the row, then restore the (1, n) shape.
        row, _ = boxcox(data[0, :])
        result = sp.zeros((1, row.shape[0]))
        result[0, :] = row
        return result

    return data
def boxcoxTransformation(data=None):
    """Return the Box-Cox transform of a 1-D, (n, 1) or (1, n) array.

    The result has the same shape as the input. Nothing is transformed,
    and ``data`` itself is returned, if any entry is ``<= 0`` or if the
    array is 2-D with more than one row and more than one column.
    """
    non_positive = sp.where(data.flatten()<=0.0)[0]
    if non_positive.shape[0]==0:
        if data.ndim==1:
            data = boxcox(data)[0]
        elif data.shape[1]==1:
            # single column: transform it and rebuild an (n, 1) array
            values = boxcox(data[:,0])[0]
            data = sp.zeros((values.shape[0],1))
            data[:,0] = values
        elif data.shape[0]==1:
            # single row: transform it and rebuild a (1, n) array
            values = boxcox(data[0,:])[0]
            data = sp.zeros((1,values.shape[0]))
            data[0,:] = values
    return data
Beispiel #3
0
	def treatdata(self, data):
		"""
		Apply the Box-Cox transform to data when self.use_bc is set.

		With the flag set, the first element of boxcox's result (the
		transformed values) is returned; otherwise data is handed back
		untouched.
		"""
		if not self.use_bc:
			return data
		transformed, _lmbda = boxcox(data)
		return transformed
Beispiel #4
0
 def doBoxCoxTransformation(self):
     """Box-Cox transform the columns of ``self.Y`` in place.

     For each of the first ``self.n_t`` columns, ``MS.boxcox`` is called
     once on the finite entries of that column and the transformed values
     are written back to the same positions. Non-finite entries are left
     as they are.
     """
     for col in range(self.n_t):
         finite = SP.isfinite(self.Y[:, col])
         self.Y[finite, col] = MS.boxcox(self.Y[finite, col])[0]
Beispiel #5
0
 def doBoxCoxTransformation(self):
     """Replace the finite values of each column of ``self.Y`` by their
     Box-Cox transform (in place).

     Columns ``0 .. self.n_t - 1`` are processed one at a time; entries
     that are not finite are skipped and keep their value.
     """
     for t in range(self.n_t):
         values = self.Y[:,t]
         mask = SP.isfinite(values)
         transformed = MS.boxcox(values[mask])[0]
         self.Y[mask,t] = transformed
def plotHistogram(y=None,phenotype_name=None,transform='sqrt',outdir=None):
	"""
	Plot a phenotype's histogram before and after a transformation.

	Two side-by-side histograms are drawn (original values on the left,
	transformed values on the right), each titled with the phenotype name
	and its Shapiro test p-value, and the figure is saved as
	<outdir>/<phenotype_name>.pdf.

	Parameters:
	y -- array of phenotype values; NaN entries are dropped first.
	phenotype_name -- used for the titles (underscores shown as spaces)
		and for the output file name.
	transform -- 'sqrt', 'boxcox', 'log', 'log10', or 'all' to pick the
		candidate with the largest Shapiro p-value. Any other value
		leaves the data untransformed. If y contains a zero the
		transformation is always switched to 'sqrt'.
	outdir -- directory the PDF is written to.

	Returns "original" when the untransformed data has the strictly larger
	Shapiro p-value, otherwise the name of the transformation that was used.
	"""
	# Only the non-missing observations are plotted and tested
	y = y[sp.where(~sp.isnan(y))]
	# Computed once: y is not modified until the transformation is applied
	has_zeros = sp.where(y==0)[0].shape[0]>0
	# Supported transformations, keyed by the names accepted for `transform`
	transform_funcs = {
		'sqrt': sp.sqrt,
		'boxcox': lambda values: boxcox(values)[0],
		'log': sp.log,
		'log10': sp.log10,
	}

	pl.figure(figsize=(12,6))
	pl.subplot(121)
	p_value = shapiro(y)[1]
	pl.hist(y,bins=30,color=color_t[1],label="Original")
	pl.title(phenotype_name.replace("_"," ") + ", Shapiro: %.2e"%(p_value))
	leg = pl.legend(fancybox=True)
	leg.get_frame().set_alpha(0.2)
	remove_border()

	pl.subplot(122)

	if transform=="all":
		transformations = sp.array(['boxcox','sqrt','log','log10'])
		if has_zeros:
			# No candidate is evaluated; 'sqrt' is forced below anyway
			p_vals = [0.0]*len(transformations)
		else:
			p_vals = [shapiro(transform_funcs[str(t)](y))[1] for t in transformations]
		transform = transformations[sp.argmax(sp.array(p_vals))]

	if has_zeros:
		print("IMPORTANT: y contains 0 -> transformation changed to SQRT")
		transform = "sqrt"

	# Unrecognised names leave y untransformed
	if transform in transform_funcs:
		y = transform_funcs[transform](y)
	p_value_t = shapiro(y)[1]
	pl.hist(y,bins=30,color=color_t[4],label=transform)
	pl.title(phenotype_name.replace("_"," ") + ", Shapiro: %.2e"%(p_value_t))
	leg = pl.legend(fancybox=True)
	leg.get_frame().set_alpha(0.2)
	remove_border()
	pl.subplots_adjust(left=0.03,bottom=0.05,right=0.99,top=0.94,wspace=0.07,hspace=0.34)
	pl.savefig(os.path.join(outdir,phenotype_name + ".pdf"))
	# Release the figure; otherwise one figure per call stays open
	pl.close()
	if(p_value>p_value_t):
		return "original"
	else:
		return transform
	# NOTE(review): the statements below look like a fragment of the script's
	# command-line driver whose enclosing header is not part of this excerpt;
	# at this indentation they follow the return statements above -- confirm
	# against the original file.
	# An optional fourth command-line entry overrides the transformation name
	if len(sys.argv)==4:
		transform = sys.argv[3]
	
	# sys.argv[1] is passed to read_data, sys.argv[2] is the output directory
	[y,phenotype_names,sample_ids,fid] = read_data(sys.argv[1])
	output_dir = sys.argv[2]

	# Plot every phenotype column, then apply whichever transformation
	# plotHistogram reported to the non-NaN entries of that column and
	# prefix the phenotype name accordingly ("original" leaves both as is)
	for i,phenotype in enumerate(phenotype_names):
		selected_transform = plotHistogram(y=y[:,i],phenotype_name=phenotype,transform=transform,outdir=output_dir)
		ind = sp.where(~sp.isnan(y[:,i]))[0]
		if selected_transform=='sqrt':
			phenotype_names[i] = "sqrt_" + phenotype
			y[ind,i] = sp.sqrt(y[ind,i])
		elif selected_transform=="boxcox":
			phenotype_names[i] = "boxcox_" + phenotype 
			tmp = y[ind,i]
			[y[ind,i],b_lambda] = boxcox(tmp)
		elif selected_transform=="log":
			phenotype_names[i] = "log_" + phenotype
			y[ind,i] = sp.log(y[ind,i])
		elif selected_transform=="log10":
			phenotype_names[i] = "log10_" + phenotype
			y[ind,i] = sp.log10(y[ind,i])
	
	# Write the header line: "FID IID" followed by the phenotype names
	f = open(os.path.join(output_dir, "transformed_phenotypes.txt"),'w')
	f.write("FID IID ")
	string = ""
	for phenotype in phenotype_names:
		string += phenotype + " "
	# [:-1] drops the trailing space before the newline
	f.write(string[:-1] + "\n")
	# Each sample row starts with its FID and sample id
	# NOTE(review): the rest of the row and the closing of f are not visible here
	for i in range(fid.shape[0]):
		f.write(fid[i] + " " + sample_ids[i] + " ")
def plotHistogram(y=None, phenotype_name=None, transform='sqrt', outdir=None):
    """Draw before/after histograms of a phenotype and save them as a PDF.

    NaN entries of ``y`` are removed. The left panel shows the original
    values, the right panel the values after ``transform`` ('sqrt',
    'boxcox', 'log' or 'log10'; any other name leaves them unchanged).
    With ``transform == "all"`` the candidate with the largest Shapiro
    p-value is chosen. If ``y`` contains a zero, 'sqrt' is always used.
    Both titles show the Shapiro p-value, and the figure is written to
    ``<outdir>/<phenotype_name>.pdf``.

    Returns ``"original"`` if the untransformed data has the strictly
    larger Shapiro p-value, else the name of the applied transformation.
    """

    def _apply(values, name):
        # Names outside the four known ones return the input untouched.
        if name == 'sqrt':
            return sp.sqrt(values)
        if name == "boxcox":
            return boxcox(values)[0]
        if name == "log":
            return sp.log(values)
        if name == "log10":
            return sp.log10(values)
        return values

    def _finish_panel():
        # Shared legend/border styling for both panels.
        legend = pl.legend(fancybox=True)
        legend.get_frame().set_alpha(0.2)
        remove_border()

    y = y[sp.where(~sp.isnan(y))]

    # Left panel: untransformed data
    pl.figure(figsize=(12, 6))
    pl.subplot(121)
    p_value = shapiro(y)[1]
    pl.hist(y, bins=30, color=color_t[1], label="Original")
    pl.title(phenotype_name.replace("_", " ") + ", Shapiro: %.2e" % (p_value))
    _finish_panel()

    # Right panel: transformed data
    pl.subplot(122)

    if transform == "all":
        candidates = sp.array(['boxcox', 'sqrt', 'log', 'log10'])
        scores = []
        for name in candidates:
            if sp.where(y == 0)[0].shape[0] == 0:
                scores.append(shapiro(_apply(y, name))[1])
            else:
                scores.append(0.0)
        transform = candidates[sp.argmax(sp.array(scores))]

    if sp.where(y == 0)[0].shape[0] > 0:
        print("IMPORTANT: y contains 0 -> transformation changed to SQRT")
        transform = "sqrt"

    y = _apply(y, transform)
    p_value_t = shapiro(y)[1]
    pl.hist(y, bins=30, color=color_t[4], label=transform)
    pl.title(
        phenotype_name.replace("_", " ") + ", Shapiro: %.2e" % (p_value_t))
    _finish_panel()
    pl.subplots_adjust(left=0.03, bottom=0.05, right=0.99, top=0.94,
                       wspace=0.07, hspace=0.34)
    pl.savefig(os.path.join(outdir, phenotype_name + ".pdf"))
    return "original" if p_value > p_value_t else transform
    # NOTE(review): the statements below look like a fragment of the script's
    # command-line driver whose enclosing header is not part of this excerpt;
    # at this indentation they follow the return statements above -- confirm
    # against the original file.
    # sys.argv[1] is passed to read_data, sys.argv[2] is the output directory
    [y, phenotype_names, sample_ids, fid] = read_data(sys.argv[1])
    output_dir = sys.argv[2]

    # Plot every phenotype column, then apply whichever transformation
    # plotHistogram reported to the non-NaN entries of that column and
    # prefix the phenotype name accordingly ("original" leaves both as is)
    for i, phenotype in enumerate(phenotype_names):
        selected_transform = plotHistogram(y=y[:, i],
                                           phenotype_name=phenotype,
                                           transform=transform,
                                           outdir=output_dir)
        ind = sp.where(~sp.isnan(y[:, i]))[0]
        if selected_transform == 'sqrt':
            phenotype_names[i] = "sqrt_" + phenotype
            y[ind, i] = sp.sqrt(y[ind, i])
        elif selected_transform == "boxcox":
            phenotype_names[i] = "boxcox_" + phenotype
            tmp = y[ind, i]
            [y[ind, i], b_lambda] = boxcox(tmp)
        elif selected_transform == "log":
            phenotype_names[i] = "log_" + phenotype
            y[ind, i] = sp.log(y[ind, i])
        elif selected_transform == "log10":
            phenotype_names[i] = "log10_" + phenotype
            y[ind, i] = sp.log10(y[ind, i])

    # Write the header line: "FID IID" followed by the phenotype names
    f = open(os.path.join(output_dir, "transformed_phenotypes.txt"), 'w')
    f.write("FID IID ")
    string = ""
    for phenotype in phenotype_names:
        string += phenotype + " "
    # [:-1] drops the trailing space before the newline
    f.write(string[:-1] + "\n")
    # Each sample row starts with its FID and sample id
    # NOTE(review): the rest of the row and the closing of f are not visible here
    for i in range(fid.shape[0]):
        f.write(fid[i] + " " + sample_ids[i] + " ")
Beispiel #10
0
	def train(data, dist_choices, FORCE_APPART = False, bc_transform = True):
		"""
		Given a vector of data and list of distribution types the trainer
		will find the best fit for the mixture distribution.

		Parameters:
		data -- observations to fit; Box-Cox transformed first when
			bc_transform is true.
		dist_choices -- sequence of component names; each must be 'norm',
			'uniform' or 'skewnorm'.
		FORCE_APPART -- with exactly two components, seed the centres at
			the maximum and minimum of the data instead of running kmeans.
		bc_transform -- whether to Box-Cox transform data before fitting;
			also stored on the result as use_bc.

		Returns an m_modal instance whose dists and weights are unpacked
		from the optimised parameter vector (weights normalised to sum to 1).

		Raises KeyError for a name in dist_choices that is not recognised.
		"""
		param_vec = N.array([])
		bounds = []

		if bc_transform:
			t_data = boxcox(data)[0]
		else:
			t_data = data

		#make initial guesses based on kmeans
		num_clust = len(dist_choices)
		init_weight = float(1)/float(num_clust)
		if num_clust == 1:
			centroids = [N.mean(t_data)]
		elif num_clust == 2 and FORCE_APPART:
			# seed the two components at the extremes of the data
			centroids = N.array([N.max(t_data), N.min(t_data)])
		else:
			centroids = kmeans(t_data, num_clust)[0]
		centroids.sort()

		min_val = N.min(t_data)
		max_val = N.max(t_data)

		#create an "empirical" pdf
		bin_divisor = get_bin_divisor(len(t_data))

		# NOTE(review): the `new` and `normed` keywords belong to older NumPy
		# releases -- confirm against the NumPy version this file targets.
		n, bins = N.histogram(t_data, new=True,
				      bins = len(t_data)/bin_divisor, normed = True)
		if num_clust > 1:
			max_width = (bins[1]-bins[0])*bin_divisor
		else:
			max_width = None

		#create param_vec and bounds vector
		for this_dist, cent in izip(dist_choices, centroids):
			if this_dist == 'norm':
				# a centroid may arrive wrapped in a sequence; use its first
				# entry (this replaces the former bare try/except fallback)
				center = cent if N.isscalar(cent) else cent[0]
				init = [center, 1, init_weight]
				bounds += [(min_val, max_val), (0, max_width), (0,1)]
			elif this_dist == 'uniform':
				init = [min_val, max_val, init_weight]
				bounds += [(min_val, max_val), (min_val, max_val), (0,1)]
			elif this_dist == 'skewnorm':
				center = cent if N.isscalar(cent) else cent[0]
				init = [center, 1, 1, init_weight]
				bounds += [(min_val, max_val), (0, max_width), (0, max_width),
							(0,1)]
			else:
				raise KeyError('Unknown distribution %s' % this_dist)
			param_vec = N.concatenate((param_vec, N.array(init)))

		#do the actual training
		# score is evaluated against the upper bin edges and the histogram values
		param_val = fmin_tnc(score, param_vec,
				     args = (dist_choices, bins[1:], n),
				     approx_grad = True, bounds = bounds,
				     messages = 0)[0]

		#make the trained distribution
		t_dist = m_modal()
		t_dist.use_bc = bc_transform
		t_dist.dists, t_w = unpack(param_val, dist_choices)

		#save normalized weights
		t_dist.weights = N.array(t_w)/N.array(t_w).sum()

		return t_dist