Esempio n. 1
0
def getKinshipMatrix():
    """
    Print the lower triangle of the kinship matrix of the 250K SNP data.

    The hard-coded CSV file is parsed with dataParsers.parseCSVData, the SNPs
    of every parsed data object are pooled, and calcKinship() is run on the
    pooled list.  One line per accession is then written to stdout:

        <ecotype id>, <info[0]>, <info[1]>:<K[i][0]>, ..., <K[i][i]>

    where info is the entry of the ecotype-id lookup for that accession.

    Nothing is returned.  The names sys, phenotypeData and calcKinship are
    not imported here and must be available at module level.
    """
    import dataParsers
    import snpsdata

    csv_path = "/home/cmb-01/bvilhjal/Projects/data/250K_f13_012609.csv"
    chunks = dataParsers.parseCSVData(csv_path, format=1, deliminator=",")

    pooled_snps = []
    sys.stdout.write("Converting format")
    for chunk in chunks:
        # One dot per parsed data object as a progress indicator.
        sys.stdout.write(".")
        sys.stdout.flush()
        pooled_snps.extend(chunk.getSnpsData(missingVal="NA").snps)
    print("")

    print("Calculating kinship")
    K = calcKinship(pooled_snps)
    id_to_info = phenotypeData._getEcotypeIdToStockParentDict_()

    # 'chunk' is still bound to the last item of the loop above; its
    # accession list defines the row order of the printed matrix.
    ecotype_ids = [int(acc) for acc in chunk.accessions]

    for row_idx, ecotype in enumerate(ecotype_ids):
        info = id_to_info[ecotype]
        prefix = str(ecotype) + ", " + str(info[0]) + ", " + str(info[1]) + ":"
        # Columns 0..row_idx: the lower triangle including the diagonal.
        cells = [str(K[row_idx][col]) for col in range(row_idx + 1)]
        print(prefix + ", ".join(cells))
Esempio n. 2
0
def getKinshipMatrix():
    """
    Compute the kinship matrix of the 250K SNP data and print it row by row.

    Steps, in order:
      1. parse the hard-coded CSV file via dataParsers.parseCSVData;
      2. concatenate the SNPs of all parsed data objects (a dot is written
         to stdout for each one);
      3. call calcKinship() on the concatenated SNP list;
      4. for accession number i print its ecotype id, the first two fields
         of its ecotype-lookup entry, and the kinship values K[i][0..i].

    Returns None.  sys, phenotypeData and calcKinship are expected to be
    defined at module level; they are not imported inside this function.
    """
    source = "/home/cmb-01/bvilhjal/Projects/data/250K_f13_012609.csv"
    import dataParsers, snpsdata
    parsed = dataParsers.parseCSVData(source, format=1, deliminator=",")

    collected = []
    sys.stdout.write("Converting format")
    for part in parsed:
        sys.stdout.write(".")
        sys.stdout.flush()
        collected += part.getSnpsData(missingVal="NA").snps
    print("")

    print("Calculating kinship")
    K = calcKinship(collected)
    lookup = phenotypeData._getEcotypeIdToStockParentDict_()

    # Uses the accessions of the last parsed data object ('part' keeps its
    # value after the loop above has finished).
    ids = [int(a) for a in part.accessions]

    for pos, ecotype_id in enumerate(ids):
        stock = lookup[ecotype_id]
        label = ", ".join([str(ecotype_id), str(stock[0]), str(stock[1])])
        # Row 'pos' contributes columns 0..pos (lower triangle + diagonal).
        values = [str(K[pos][0])]
        for col in range(1, pos + 1):
            values.append(str(K[pos][col]))
        print(label + ":" + ", ".join(values))
def plot_250k_Tree(chr=None, startPos=None, endPos=None):
    """
    Build accession labels for the 250K data set.

    Parses the hard-coded CSV file, pools the SNPs of all parsed data objects
    and subsamples them with sampleSNPs(snps, 100000, False).  Then, for each
    accession of the first data object, the first field of its ecotype-lookup
    entry is decoded from ISO-8859-1 and collected as a label; if that fails
    the error and the raw field are printed and the accession itself is used.

    The chr, startPos and endPos arguments are not used by the code below,
    and nothing is returned.
    NOTE(review): the sampled SNPs and the labels are never used afterwards,
    so this function looks truncated -- confirm against the full source.

    dataParsers and sampleSNPs must be available at module level.
    """
    import scipy as sp
    import scipy.cluster.hierarchy as hc
    import Emma
    import pylab
    import phenotypeData

    ecotype_info = phenotypeData._getEcotypeIdToStockParentDict_()
    parsed = dataParsers.parseCSVData(
        "/Network/Data/250k/dataFreeze_011209/250K_192_043009.csv")

    pooled = []
    for part in parsed:
        pooled.extend(part.getSnpsData().snps)
    pooled = sampleSNPs(pooled, 100000, False)

    labels = []
    for acc in parsed[0].accessions:
        try:
            label = unicode(ecotype_info[int(acc)][0], 'iso-8859-1')
        except Exception as err_s:
            # Fall back to the raw accession after reporting the problem.
            print(err_s)
            print(ecotype_info[int(acc)][0])
            label = acc
        labels.append(label)
Esempio n. 4
0
def _plotKW_():
    """
    Analyze how population structure affects KW.

    Loads SNP and phenotype data from hard-coded paths, coordinates them for
    phenotype index 1, and plots -log10 p-values for SNPs [200:1400] of
    snpsds[3] from four analyses:

      * "Emma (local)"     - runEmma with kinship from that SNP window;
      * "Emma (global)"    - runEmma with kinship from a random subset of
                             all SNPs (each kept with probability 0.1);
      * "KW (full data)"   - _run_kw_ on the window;
      * "KW (merged data)" - get_KW_pvals with kinshipThreshold=0.95.

    The accession groups returned by get_KW_pvals are printed, and the
    figure is shown with pylab.show().  Returns None.

    Relies on module-level names: dataParsers, phenotypeData, snpsdata,
    random, gc, math, pylab, calcKinship, runEmma, _run_kw_, get_KW_pvals.
    """
    filterProb = 0.1
    p_i = 1
    res_dir = "/Users/bjarni/tmp/"  # not referenced below
    runId = "_full_quick_"  # not referenced below

    def neg_log10(values):
        # Element-wise -log10, evaluated in input order.
        return [-math.log10(value) for value in values]

    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"
    print("Loading phenotype data")
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter="\t")
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)

    # Convert every entry of snpsds in place and pool all SNPs.
    totalSNPs = []
    for idx, raw in enumerate(snpsds):
        converted = raw.getSnpsData()
        snpsds[idx] = converted
        totalSNPs.extend(converted.snps)

    # For memory, keep each SNP only with probability filterProb.
    totalSNPs = [snp for snp in totalSNPs if random.random() < filterProb]

    gc.collect()  # Calling garbage collector, in an attempt to clean up memory..

    snpsd = snpsds[3]

    # Emma with a kinship matrix computed from the plotted window only.
    kinship = calcKinship(snpsd.snps[200:1400])
    emma_out = runEmma(phed, p_i, kinship, snpsd.snps[200:1400])
    scores = neg_log10(emma_out["ps"])
    pylab.plot(snpsd.positions[200:1400], scores, "c.", label="Emma (local)")

    # Emma with a kinship matrix computed from the random SNP subset.
    kinship = calcKinship(totalSNPs)
    emma_out = runEmma(phed, p_i, kinship, snpsd.snps[200:1400])
    scores = neg_log10(emma_out["ps"])
    pylab.plot(snpsd.positions[200:1400], scores, "g.", label="Emma (global)")

    # KW on the window.
    phenVals = phed.getPhenVals(p_i)
    scores = neg_log10(_run_kw_(snpsd.snps[200:1400], phenVals))
    pylab.plot(snpsd.positions[200:1400], scores, "r.", label="KW (full data)")

    # KW via get_KW_pvals, which also returns positions and accession groups.
    kw_pvals, new_positions, acc_groups = get_KW_pvals(
        snpsd.snps[200:1400],
        snpsd.positions[200:1400],
        phed,
        p_i,
        kinshipThreshold=0.95,
        method="KW",
    )
    ecot_map = phenotypeData._getEcotypeIdToStockParentDict_()

    for i in range(len(acc_groups)):
        acc_list = [ecot_map[int(snpsd.accessions[a_i])][0]
                    for a_i in acc_groups[i]]
        print(" ".join(["group", str(i), ":", str(acc_list)]))

    scores = neg_log10(kw_pvals)
    pylab.plot(new_positions, scores, "b.", label="KW (merged data)")

    pylab.legend(numpoints=2, handlelen=0.005)

    pylab.show()
def _plotKW_():
    """
    Analyze how population structure affects KW.

    Workflow:
      1. Parse the hard-coded SNP CSV and phenotype TSV files and coordinate
         them for phenotype index p_i = 1.
      2. Replace every entry of snpsds by its getSnpsData() result and pool
         all SNPs; keep each pooled SNP with probability filterProb = 0.1.
      3. For SNPs [200:1400] of snpsds[3], plot -log10 p-values from:
         runEmma with window-based kinship ("Emma (local)"), runEmma with
         kinship from the random subset ("Emma (global)"), _run_kw_
         ("KW (full data)") and get_KW_pvals with kinshipThreshold=0.95
         ("KW (merged data)").
      4. Print the accession groups reported by get_KW_pvals and show the
         figure.

    Returns None.  dataParsers, phenotypeData, snpsdata, random, gc, math,
    pylab, calcKinship, runEmma, _run_kw_ and get_KW_pvals are taken from
    module scope.
    """
    filterProb = 0.1
    p_i = 1
    res_dir = "/Users/bjarni/tmp/"  # unused in this function
    runId = "_full_quick_"  # unused in this function

    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile,
                                      format=1,
                                      deliminator=",")
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"
    print("Loading phenotype data")
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)

    totalSNPs = []
    for i in range(len(snpsds)):
        snpsds[i] = snpsds[i].getSnpsData()
        totalSNPs.extend(snpsds[i].snps)

    # For memory, remove random SNPs: one random draw per SNP, in order.
    kept = []
    for snp in totalSNPs:
        if random.random() >= filterProb:
            continue
        kept.append(snp)
    totalSNPs = kept

    # Calling garbage collector, in an attempt to clean up memory..
    gc.collect()

    snpsd = snpsds[3]

    # --- Emma, kinship estimated from the plotted SNP window ---
    k = calcKinship(snpsd.snps[200:1400])
    res = runEmma(phed, p_i, k, snpsd.snps[200:1400])
    log_pvals = [-math.log10(p) for p in res["ps"]]
    pylab.plot(snpsd.positions[200:1400],
               log_pvals,
               "c.",
               label="Emma (local)")

    # --- Emma, kinship estimated from the random SNP subset ---
    k = calcKinship(totalSNPs)
    res = runEmma(phed, p_i, k, snpsd.snps[200:1400])
    log_pvals = [-math.log10(p) for p in res["ps"]]
    pylab.plot(snpsd.positions[200:1400],
               log_pvals,
               "g.",
               label="Emma (global)")

    # --- KW on the same window ---
    phenVals = phed.getPhenVals(p_i)
    pvals = _run_kw_(snpsd.snps[200:1400], phenVals)
    log_pvals = [-math.log10(p) for p in pvals]
    pylab.plot(snpsd.positions[200:1400],
               log_pvals,
               "r.",
               label="KW (full data)")

    # --- KW through get_KW_pvals (returns p-values, positions, groups) ---
    pvals, new_positions, acc_groups = get_KW_pvals(
        snpsd.snps[200:1400],
        snpsd.positions[200:1400],
        phed,
        p_i,
        kinshipThreshold=0.95,
        method="KW")
    ecot_map = phenotypeData._getEcotypeIdToStockParentDict_()

    for group_no, members in enumerate(acc_groups):
        names = []
        for member in members:
            ecotype = snpsd.accessions[member]
            names.append(ecot_map[int(ecotype)][0])
        print("%s %s %s %s" % ("group", group_no, ":", names))

    log_pvals = [-math.log10(p) for p in pvals]
    pylab.plot(new_positions, log_pvals, "b.", label="KW (merged data)")

    pylab.legend(numpoints=2, handlelen=0.005)

    pylab.show()