Exemple #1
0
def qcrun(opt):
	"""
	Run the expression-level quality control pipeline and write an HTML report.

	Reads the following attributes from ``opt``:
		sample_info : path of the sample information file
		matrixdata  : path of the expression matrix file
		log2tr      : 1 to parse the matrix with a log2 transform, 0 otherwise
		addbg       : value passed as ``addtolog`` when log2tr == 1
		noise       : value passed as ``cutoff`` to the matrix parser
		normalize   : if true, ``mutilstats.normalize`` is applied to the parsed data

	Side effects (all written to the current working directory):
		- plots produced by the ``statplot`` helpers
		- corr_matrix.xls : sample-by-sample correlation matrix
		- exprs_samples_qc.html : the report referencing the plots above

	Returns:
		0 on success, 1 if the sample information or the matrix could not be
		parsed (an error message is written to stderr in that case).
	"""
	fsample = opt.sample_info
	fmatrix = opt.matrixdata

	sampleinfo = mutilstats.SampleInfo()
	ret = sampleinfo.parse_sampleinfo(fsample)
	if ret != 0:
		sys.stderr.write("[ERROR] Sample information failed to parse, please check the sampleinfo file\n")
		return 1

	## process
	data = mutilstats.MatrixAnno()
	if opt.log2tr == 1:
		ret = data.parse_matrix_anno(fmatrix,addtolog=opt.addbg,log2tr=1,cutoff=opt.noise)
	else:
		assert opt.log2tr == 0
		ret = data.parse_matrix_anno(fmatrix,cutoff=opt.noise)
	# Check the parse status BEFORE normalising: previously the status was
	# overwritten by the normalize() return value, so a failed parse could be
	# masked and normalisation ran on unparsed data.
	if ret != 0:
		sys.stderr.write("[ERROR] Data parse failed, please check the matrix file\n")
		return 1
	if opt.normalize:
		ret = mutilstats.normalize(data.data)
		# Same status check and message as before for the normalisation step.
		if ret != 0:
			sys.stderr.write("[ERROR] Data parse failed, please check the matrix file\n")
			return 1

	#1 CDF
	statplot.exprs_density(data.data,sampleinfo.classcolors,sampleinfo.classlabels,"exprs_CDF","expression level","cumulative distribution","cdf")
	#2 PDF
	statplot.exprs_density(data.data,sampleinfo.classcolors,sampleinfo.classlabels,"exprs_pdf","expression level","probability density distribution","pdf")
	#3 boxplot
	statplot.plot_boxplot(data.data,"exprs_boxplot","","expression level",sampleinfo.samplenames,colors=sampleinfo.classcolors,ylim=0)
	#4 RLE
	statplot.exprs_RLE(data.data,"mean","RLE_plot",sampleinfo.samplenames,colors=sampleinfo.classcolors)

	#5 corr
	corr_matrix = statplot.exprs_corrarray(data.data,sampleinfo.samplenames,"corrarray")
	## out corr_matrix; the context manager closes the file even if a write fails
	with open("corr_matrix.xls","w") as foutcorr:
		foutcorr.write("## correlation coefficient matrix for samples' expression level\n")
		foutcorr.write("\t".join(["#correlation"]+sampleinfo.samplenames)+"\n")
		for i,samplename in enumerate(sampleinfo.samplenames):
			row = [fmtout(value) for value in corr_matrix[i,:].tolist()]
			foutcorr.write("\t".join([samplename,]+row)+"\n")

	#6 cluster
	statplot.hcluster(data.data,sampleinfo.samplenames,"hcluster")

	#7 MDS (computed below, next to the report section that describes it)

	html_main = mhtml.simple_main(title="样本表达质量控制结果",css="../CSS")
	html_main.add_head("样本表达质量控制结果")
	html_main.add_enter()
	html_main.add_head("1. 样本信息列表",2)
	html_main.add_line()
	html_main.add_enter()
	tmptable,tmpnote = mhtml.xls2table("%s"%fsample)
	html_main.add_content(tmptable)
	html_main.add_precontent(tmpnote)
	html_main.add_enter()
	html_main.add_head("2. 样本表达质量控制结果,数据可靠性分析",2)
	html_main.add_line()
	html_main.add_enter()
	html_main.add_head("a. 样本表达水平概率密度分布",3)
	html_main.add_enter()
	# Phrase inserted into the descriptions when the data were log2 transformed.
	if opt.log2tr == 1:
		strlog = "采用log<sub>2</sub>变换,并"
	else:
		strlog = ""
	html_main.add_content("""对各样本表达水平,%s计算概率密度。查看各样本及各组间表达水平的分布情况。概率密度估计采用Kernel density estimation, implementation in python with scipy(http://www.scipy.org/)"""%strlog)
	html_main.add_content("""<img src="./exprs_pdf.png" width="50%" /><a href="./exprs_pdf.svg">SVG矢量图版本</a>""")
	html_main.add_enter()
	html_main.add_head("b. 样本表达水平累积概率密度分布",3)
	html_main.add_enter()
	html_main.add_content("""对各样本表达水平,%s计算累积概率密度。查看各样本及各组间表达水平的分布情况。"""%strlog)
	html_main.add_content("""<img src="./exprs_CDF.png" width="50%" /><a href="./exprs_CDF.svg">SVG矢量图版本</a>""")
	html_main.add_enter()
	html_main.add_head("c. 样本表达水平箱式图",3)
	html_main.add_content("""对各样本表达水平,%s绘制箱式图。查看各样本及各组间表达水平的分布情况。"""%strlog)
	html_main.add_content("""<img src="./exprs_boxplot.png" width="50%" /><a href="./exprs_boxplot.svg">SVG矢量图版本</a>""")
	html_main.add_enter()

	html_main.add_head("d. 样本间表达相关性分析",3)
	html_main.add_enter()
	html_main.add_content("""计算样本两两间的fisher 相关系数, 将相关系数矩阵按实验分组形式,绘制成热图。样本处理组间,大部分表达具有相关性,主要是因为维持生命基本活动的大部分基因均不差异表达,只有少部分为差异表达(当处理条件十分剧烈时,实验组和处理组间可能并不满足此假设,但组内样本应满足此假设)。因此,各样本间,表达水平相关性应较高。若图中存在特异性的样本,或实验条件不统一且未做校正时,该特殊样本与其他样本的表达相关性会非常低。""")
	html_main.add_content("""<img src="./corrarray.png" width="50%" /><a href="./corrarray.svg">SVG矢量图版本</a>""")
	tmptable,tmpnote = mhtml.xls2table("corr_matrix.xls")
	html_main.add_content(tmptable)
	html_main.add_enter()
	html_main.add_head("e. 样本相对表达水平比较",3)
	html_main.add_enter()
	html_main.add_content("""在同一组实验中,即使是相互比较的对照组与实验组之间,大部分基因的表达量还是应该保持一致的。当使用相对对数表达水平(Relative Log Expression(RLE))的箱线图来控制不同组之间的实验质量时,箱线图应该在垂直中央相类的位置(通常接近0)。如果有一个样本的表现和其它的平行组都很不同,那说明它可能出现了质量问题。""")
	html_main.add_content("""<img src="./RLE_plot.png" height="450" width="550" /><a href="./RLE_plot.svg">SVG矢量图版本</a>""")
	html_main.add_enter()
	html_main.add_head("f. 样本聚类结果",3)
	html_main.add_enter()
	html_main.add_content("""基于表达水平数据的样本聚类,计算样本间欧式距离,采用离差平方和法(wald法)进行层次聚类,验证聚类结果是否同实验设计基本一致。若聚类结果明显不一致,则样本间存在着明显的其他未知的因素,而不仅仅是实验处理效应。""")
	html_main.add_content("""<img src="./hcluster_hcluster.png" width="50%" /><a href="./hcluster_hcluster.svg">SVG矢量图版本</a>""")
	html_main.add_enter()

	html_main.add_head("g. 样本多维尺度分析",3)
	html_main.add_content("多维尺度分析(Multi Dimensional Scaling, MDS)是一种将多维空间的研究对象简化到低维空间进行定位、分析和归类,同时又保留对象间原始关系的数据分析方法。此处我们采用样本间欧式距离反映样本间的差异,选择前3个本征值最大的维度,绘制样本在前三个维度上的分布,若实验处理因素为表达差异的主要因素,则一般而言,样本组内差异应小于组间差异。")

	snnum = len(sampleinfo.sns)

	mdsout = mds.mds_ps(data.data,10)
	xlabel = "Number of dimensions"
	ylabel = "Variation percentage"
	statplot.plotline(np.arange(0,len(mdsout.p)+1),np.asarray([[0,]+mdsout.p.tolist(),]),"Variation_percentage",xlabel,ylabel,['r^-'],xlimmax=10+1,ylimmax=102)

	# The score plot is drawn in 2 dimensions for exactly two samples and in 3
	# dimensions otherwise; with fewer than two samples no score plot is drawn.
	# NOTE(review): the report below still links MDS_samples_distribution.* in
	# that case - confirm whether the link should be omitted.
	if snnum >= 2:
		ndim = 2 if snnum == 2 else 3
		statplot.plot_Xscore(mdsout.v,sampleinfo.classnums,sampleinfo.uniqclassnum,sampleinfo.uniqcolor,sampleinfo.uniqmarker,sampleinfo.uniqclasslabel,"MDS_samples_distribution","1st dimension","2nd dimension","3rd dimension",dim=ndim)
		if ndim == 2:
			html_main.add_content("前n个维度累积解释变异的百分比图(见下图)。其中,前两个维度,累积解释变异的百分比: %.2f%%, %.2f%%"%(mdsout.p[0],mdsout.p[1]))
		else:
			html_main.add_content("前n个维度累积解释变异的百分比图(见下图)。其中,前三个维度,累积解释变异的百分比: %.2f%%, %.2f%%, %.2f%%"%(mdsout.p[0],mdsout.p[1],mdsout.p[2]))
	html_main.add_content("""<img src="./Variation_percentage.png" width="50%" /><a href="./Variation_percentage.svg">SVG矢量图版本</a>""")
	html_main.add_enter()
	html_main.add_content("""样本在前三个维度中的空间分布图""")
	html_main.add_content("""<img src="./MDS_samples_distribution.png" width="50%" /><a href="./MDS_samples_distribution.svg">SVG矢量图版本</a>""")
	html_main.add_enter()

	with open("exprs_samples_qc.html","w") as f:
		f.write(str(html_main))
	return 0
Exemple #2
0
def mds_ps(X_raw,nvs_output=10):
	"""
	Multi-dimensional scaling of the rows of X_raw.

	The input is copied, centred and normalised with the mutilstats helpers,
	the pairwise Euclidean distance between rows is computed, the distance
	matrix is double-centred and eigen-decomposed, and the leading components
	(largest eigenvalues first) are returned.

	NOTE (from the original author): the reliability of this method for
	population structure analysis still has to be checked.

	Input:
		X_raw: 2-D numpy array/matrix with dtype float32 or float64; rows are
			treated as samples. It is not modified (a copy is used).
		nvs_output: number of leading dimensions to keep; capped at the
			number of rows, with a warning on stderr.
	Output:
		an ``mdsoutput`` object with attributes
		p : cumulative percentage of the eigenvalue sum for the kept dimensions
		w : the kept eigenvalues, in descending order
		v : matrix whose columns are the corresponding eigenvectors

	Exits the process with status 1 if X_raw has another dtype.
	"""
	X_SNPs = X_raw.copy()
	if X_SNPs.dtype != np.float64 and X_SNPs.dtype != np.float32:
		sys.stderr.write("""The format of X_SNPs matrix should be numpy.float32 or numpy.float64, please check it.
		If the memory is sufficient, we suggest you use the numpy.float64\n""")
		# sys.exit instead of the interactive-only ``exit`` helper from ``site``
		sys.exit(1)
	nx,px = X_SNPs.shape
	X_SNPs = np.asmatrix(X_SNPs)
	if nvs_output>nx:
		sys.stderr.write('too many nvs_output, it must be smaller than number of samples, we have changed auto\n')
	nvs_output = min(nx,nvs_output)
	# presumably both helpers work in place on X_SNPs (return values are not used)
	mutilstats.centring(X_SNPs)
	mutilstats.normalize(X_SNPs)
	# Column i of dist holds the Euclidean distance of every row to row i.
	dist = np.asmatrix(np.zeros((nx,nx)))
	for i in range(nx):
		temp = X_SNPs - X_SNPs[i,:]
		temp = np.power(temp,2)
		dist[:,i] = np.power(np.sum(temp,axis=1),0.5)
	# Double centring: -1/2 * (I - 1/n) * dist * (I - 1/n)
	# NOTE(review): classical MDS is usually described with squared distances
	# here; the unsquared distances are kept as in the original - confirm intent.
	I = np.asmatrix(np.eye(nx))
	I_n = np.asmatrix(np.ones((nx,nx)))
	dist = -1*(I-(1.0/nx)*I_n)*dist*(I-(1.0/nx)*I_n)/2
	del I_n
	del I
	# NOTE(review): dist looks symmetric, so np.linalg.eigh may be applicable;
	# eig is kept so eigenvector signs and ordering stay unchanged.
	w,v=np.linalg.eig(dist)
	del dist
	# Sort components by decreasing eigenvalue.
	idx = np.argsort(w)[::-1]
	w = w[idx]
	v = v[:,idx]
	percent = np.cumsum(w)/np.sum(w) * 100
	mds_output = mdsoutput()
	mds_output.p = percent[0:nvs_output]
	mds_output.w = w[0:nvs_output]
	mds_output.v = v[:,0:nvs_output]
	return mds_output
def _count_substitutions(annofn):
    """Tally SNP substitution classes and indels in one annotation file.

    The file is read as tab-separated text; the reference allele is taken
    from the 4th column and the alternative allele from the 5th. A first
    line starting with "Chr" and any line starting with "#" are skipped;
    reading stops at a line starting with "Note:".

    Returns:
        None if the file contains no lines at all, otherwise a tuple
        ``(snp_count, indel_count, class_counts)`` where ``class_counts``
        is a list of six counts ordered C->A, C->T, C->G, T->A, T->C, T->G
        (each pooled with its complementary substitution).
    """
    # Complementary substitutions share one class, e.g. G->T counts as C->A.
    class_of = {
        ("C", "A"): 0, ("G", "T"): 0,
        ("C", "T"): 1, ("G", "A"): 1,
        ("C", "G"): 2, ("G", "C"): 2,
        ("T", "A"): 3, ("A", "T"): 3,
        ("T", "C"): 4, ("A", "G"): 4,
        ("T", "G"): 5, ("A", "C"): 5,
    }
    snp_count = 0
    indel_count = 0
    class_counts = [0] * 6
    saw_line = False
    with open(annofn, "r") as variant_function:
        for lineno, line in enumerate(variant_function):
            saw_line = True
            if lineno == 0 and line[0:3] == "Chr":
                continue  # column header
            if line.startswith("#"):
                continue
            if line.startswith("Note:"):
                break
            # Strip the line ending so a 5-column row does not carry "\n"
            # inside the alt allele (which would look like an indel).
            arr = line.rstrip("\r\n").split("\t")
            if len(arr) < 5 or arr[3] == arr[4]:
                # Malformed row or ref == alt: report it and skip it instead
                # of counting with undefined/stale alleles.
                print(arr)
                continue
            ref, alt = arr[3], arr[4]
            if ref == "-" or alt == "-" or len(ref) != len(alt):
                indel_count += 1
                continue
            snp_count += 1
            # Same-length changes outside the table (e.g. multi-base) count
            # as SNPs but belong to no substitution class.
            idx = class_of.get((ref, alt))
            if idx is not None:
                class_counts[idx] += 1
    if not saw_line:
        return None
    return snp_count, indel_count, class_counts


def MutSubPattern(annofns, outdir="./", target_region=None):
    """Summarise and plot the SNP substitution pattern of several samples.

    Args:
        annofns: paths of tab-separated annotation files, one per sample.
            The sample name is the file name up to the first "." with any
            "_vs_..." suffix removed.
        outdir: directory that receives ``Mutation_pattern.xls``.
        target_region: accepted for backward compatibility; not used.

    Side effects:
        Writes ``Mutation_pattern.xls`` (SNP count plus one row per
        substitution class, one column per sample) into ``outdir`` and calls
        ``bar_dict``, ``stackv_bar_plot`` and, for more than one sample,
        ``statplot.cluster_heatmap`` to draw the figures. Empty input files
        are skipped with a warning on stderr.

    Returns:
        0
    """
    import sys

    labels = [
        "C->A/G->T", "C->T/G->A", "C->G/G->C", "T->A/A->T", "T->C/A->G",
        "T->G/A->C"
    ]
    sample_arr = []
    snp_c_arr = []
    class_rows = [[] for _ in labels]  # one per-sample list per class
    for annofn in annofns:
        samplename = annofn.split(os.sep)[-1].split(".")[0].split("_vs_")[0]
        counts = _count_substitutions(annofn)
        if counts is None:
            sys.stderr.write("[WARN] %s is empty, sample %s skipped\n" %
                             (annofn, samplename))
            continue
        # The indel count is tallied but currently not reported.
        snp_count, _indel_count, class_counts = counts
        sample_arr.append(samplename)
        snp_c_arr.append(snp_count)
        for row, value in zip(class_rows, class_counts):
            row.append(value)

    # os.path.join avoids the doubled separator of outdir + "/" + name.
    with open(os.path.join(outdir, "Mutation_pattern.xls"), "w") as mut_stat_xls:
        mut_stat_xls.write("#Variant\t" + "\t".join(sample_arr) + "\n")
        mut_stat_xls.write("SNP_count\t" + "\t".join(map(str, snp_c_arr)) +
                           "\n")
        for label, row in zip(labels, class_rows):
            mut_stat_xls.write(label + "\t" + "\t".join(map(str, row)) + "\n")

    # Total count of every substitution class over all samples.
    totals = [sum(row) for row in class_rows]
    bar_dict(dict(zip(labels, totals)),
             "total_snp_substitution",
             "Substitution",
             "Counts",
             fmt="%d")
    # classes x samples
    plot_data = np.asmatrix(np.float64(np.asarray(class_rows)))
    stackv_bar_plot(plot_data,
                    sample_arr,
                    "SNP_substitution_pattern",
                    "",
                    "Percentage",
                    width=0.5,
                    legends=labels,
                    scale=1,
                    orientation="horizontal",
                    rotation=0)
    # Per-sample fractions, transposed to samples x classes.
    # NOTE(review): a sample without any classified SNP divides by zero here.
    class_counts_arr = np.asarray(plot_data)
    plot_2new = (class_counts_arr / np.sum(class_counts_arr, axis=0)).T
    centring(plot_2new)
    normalize(plot_2new)
    if len(sample_arr) > 1:
        statplot.cluster_heatmap(
            np.asmatrix(plot_2new),
            sample_arr,
            labels,
            fig_prefix="SNP_substitution_Mutation_Spectrum",
            colornorm=1,
            nosample=False,
            nogene=True,
            plotxlabel=1,
            plotylabel=1,
            cbarlabel="Normalized Frequency",
            trees=3)
    return 0