Esempi in Python per plotResult

Linguaggio di programmazione: Python

Spazio dei nomi/nome del pacchetto: plotResults

Metodo/funzione: plotResult

Esempi su hotexamples.com: 7

plotResult in Python: 7 esempi trovati. Questi sono i migliori esempi reali in Python di plotResults.plotResult, estratti da progetti open source. Puoi valutarli per aiutarci a migliorare la qualità dei nostri esempi.

Esempio n. 1

Mostra file

	def drawGWPlot(phenotypeID):
		"""
		Draws all the GWA plots for 6 methods.
		"""
		import plotResults		
		results = self.results_map[phenotypeID]
		for m_i in range(0,len(results)): #For all methods 
			result = results[m_i]
			print "\nPlotting result",result.name,":"
			if m_i==0:
				plotResults.plotResult(result,ylab="KW: -log(p-value)")
			elif m_i==1:
				plotResults.plotResult(result,ylab="Emma: -log(p-value)")
			elif m_i==2:
				plotResults.plotResult(result,type="score",ylab="Margarita: ARG score")
			elif m_i==3:
				plotResults.plotResult(result,type="score",ylab="RF: Importance score")
			elif m_i==4:
				plotResults.plotResult(result,type="score",ylab="Simple composite rank score")

Esempio n. 2

Mostra file

File: Emma.py Progetto: bopopescu/gwasmodules

def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)

    long_options_list = [
        "rFile=",
        "chr=",
        "delim=",
        "missingval=",
        "BoundaryStart=",
        "removeOutliers=",
        "addConstant=",
        "logTransform",
        "BoundaryEnd=",
        "phenotypeFileType=",
        "help",
        "parallel=",
        "parallelAll",
        "LRT",
        "minMAF=",
        "kinshipDatafile=",
        "phenotypeRanks",
        "onlyMissing",
        "onlyOriginal96",
        "onlyOriginal192",
        "onlyBelowLatidue=",
        "complement",
        "negate",
        "srInput=",
        "sr",
        "srOutput=",
        "srPar=",
        "srSkipFirstRun",
        "testRobustness",
        "permutationFilter=",
        "useLinearRegress",
        "regressionCofactors=",
        "FriLerAsCofactor",
        "FriColAsCofactor",
        "memReq=",
        "walltimeReq=",
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:h",
                                   long_options_list)

    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    phenotypeRanks = False
    removeOutliers = None
    addConstant = -1
    phenotypeFileType = 1
    rFile = None
    delim = ","
    missingVal = "NA"
    help = 0
    minMAF = 0.0
    boundaries = [-1, -1]
    chr = None
    parallel = None
    logTransform = False
    negate = False
    parallelAll = False
    lrt = False
    kinshipDatafile = None
    onlyMissing = False
    onlyOriginal96 = False
    onlyOriginal192 = False
    onlyBelowLatidue = None
    complement = False

    sr = False
    srOutput = False
    srInput = False
    srSkipFirstRun = False
    srTopQuantile = 0.95
    srWindowSize = 30000

    testRobustness = False
    permutationFilter = 0.002

    useLinearRegress = False
    regressionCofactors = None
    FriLerAsCofactor = False
    FriColAsCofactor = False

    memReq = "5g"
    walltimeReq = "150:00:00"

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-o", "--rFile"):
            rFile = arg
        elif opt in ("--phenotypeFileType"):
            phenotypeFileType = int(arg)
        elif opt in ("--BoundaryStart"):
            boundaries[0] = int(arg)
        elif opt in ("--BoundaryEnd"):
            boundaries[1] = int(arg)
        elif opt in ("--addConstant"):
            addConstant = float(arg)
        elif opt in ("--parallel"):
            parallel = arg
        elif opt in ("--minMAF"):
            minMAF = float(arg)
        elif opt in ("--parallelAll"):
            parallelAll = True
        elif opt in ("--onlyMissing"):
            onlyMissing = True
        elif opt in ("--onlyOriginal96"):
            onlyOriginal96 = True
        elif opt in ("--onlyOriginal192"):
            onlyOriginal192 = True
        elif opt in ("--onlyBelowLatidue"):
            onlyBelowLatidue = float(arg)
        elif opt in ("--complement"):
            complement = True
        elif opt in ("--logTransform"):
            logTransform = True
        elif opt in ("--negate"):
            negate = True
        elif opt in ("--removeOutliers"):
            removeOutliers = float(arg)
        elif opt in ("--LRT"):
            lrt = True
        elif opt in ("-c", "--chr"):
            chr = int(arg)
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("--kinshipDatafile"):
            kinshipDatafile = arg
        elif opt in ("--phenotypeRanks"):
            phenotypeRanks = True
        elif opt in ("--sr"):
            sr = True
        elif opt in ("--srSkipFirstRun"):
            srSkipFirstRun = True
        elif opt in ("--srInput"):
            srInput = arg
        elif opt in ("--srOutput"):
            srOutput = arg
        elif opt in ("--srPar"):
            vals = arg.split(",")
            srTopQuantile = float(vals[0])
            srWindowSize = int(vals[1])
        elif opt in ("--testRobustness"):
            testRobustness = True
        elif opt in ("--permutationFilter"):
            permutationFilter = float(arg)
        elif opt in ("--FriLerAsCofactor"):
            FriLerAsCofactor = True
        elif opt in ("--FriColAsCofactor"):
            FriColAsCofactor = True
        elif opt in ("--useLinearRegress"):
            useLinearRegress = True
        elif opt in ("--regressionCofactors"):
            regressionCofactors = arg
        elif opt in ("--memReq"):
            memReq = arg
        elif opt in ("--walltimeReq"):
            walltimeReq = arg
        else:
            if help == 0:
                print "Unkown option!!\n"
                print __doc__
            sys.exit(2)

    if len(args) < 3 and not parallel:
        if help == 0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)

    print "Emma is being set up with the following parameters:"
    print "output:", rFile
    print "phenotypeRanks:", phenotypeRanks
    print "phenotypeFileType:", phenotypeFileType
    print "parallel:", parallel
    print "parallelAll:", parallelAll
    print "minMAF:", minMAF
    print "LRT:", lrt
    print "delim:", delim
    print "missingval:", missingVal
    print "kinshipDatafile:", kinshipDatafile
    print "chr:", chr
    print "boundaries:", boundaries
    print "onlyMissing:", onlyMissing
    print "onlyOriginal96:", onlyOriginal96
    print "onlyOriginal192:", onlyOriginal192
    print "onlyBelowLatidue:", onlyBelowLatidue
    print "complement:", complement
    print "negate:", negate
    print "logTransform:", logTransform
    print "addConstant:", addConstant
    print "removeOutliers:", removeOutliers
    print "sr:", sr
    print "srSkipFirstRun:", srSkipFirstRun
    print "srInput:", srInput
    print "srOutput:", srOutput
    print "srTopQuantile:", srTopQuantile
    print "srWindowSize:", srWindowSize
    print "testRobustness:", testRobustness
    print "permutationFilter:", permutationFilter
    print "useLinearRegress:", useLinearRegress
    print "regressionCofactors:", regressionCofactors
    print "FriLerAsCofactor:", FriLerAsCofactor
    print "FriColAsCofactor:", FriColAsCofactor
    print "walltimeReq:", walltimeReq
    print "memReq:", memReq

    def runParallel(phenotypeIndex, phed):
        #Cluster specific parameters
        print phenotypeIndex
        phenName = phed.getPhenotypeName(phenotypeIndex)
        outFileName = resultDir + "Emma_" + parallel + "_" + phenName

        shstr = "#!/bin/csh\n"
        shstr += "#PBS -l walltime=" + walltimeReq + "\n"
        shstr += "#PBS -l mem=" + memReq + "\n"
        shstr += "#PBS -q cmb\n"

        shstr += "#PBS -N E" + phenName + "_" + parallel + "\n"
        shstr += "set phenotypeName=" + parallel + "\n"
        shstr += "set phenotype=" + str(phenotypeIndex) + "\n"
        if useLinearRegress:
            outFileName = resultDir + "LR_" + parallel + "_" + phenName
        shstr += "(python " + emmadir + "Emma.py -o " + outFileName + " "
        if useLinearRegress:
            shstr += " --useLinearRegress "

        if regressionCofactors:
            shstr += " --regressionCofactors=" + str(regressionCofactors) + " "
        if FriLerAsCofactor:
            shstr += " --FriLerAsCofactor "
        if FriColAsCofactor:
            shstr += " --FriColAsCofactor "
        if onlyOriginal96:
            shstr += " --onlyOriginal96 "
        elif onlyOriginal192:
            shstr += " --onlyOriginal192 "
        if onlyBelowLatidue:
            shstr += " --onlyBelowLatidue=" + str(onlyBelowLatidue) + " "
        if logTransform:
            shstr += " --logTransform "
        if negate:
            shstr += " --negate "
        if removeOutliers:
            shstr += " --removeOutliers=" + str(removeOutliers) + " "
        if phenotypeRanks:
            shstr += " --phenotypeRanks "
        if testRobustness:
            shstr += " --testRobustness "

        shstr += " --permutationFilter=" + str(permutationFilter) + " "

        if sr:
            shstr += " --sr "
            if not srOutput:
                output = resultDir + "Emma_" + parallel + "_" + phenName + ".sr.pvals"
            shstr += " --srOutput=" + str(output) + " "
            if srSkipFirstRun:
                if not srInput:
                    output = resultDir + "Emma_" + parallel + "_" + phenName + ".pvals"
                shstr += " --srInput=" + str(output) + " "
                shstr += " --srSkipFirstRun "
            shstr += " --srPar=" + str(srTopQuantile) + "," + str(
                srWindowSize) + " "

        if kinshipDatafile:
            shstr += " --kinshipDatafile=" + str(kinshipDatafile) + " "
        shstr += " --addConstant=" + str(addConstant) + " "
        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(
            phenotypeIndex) + " "
        shstr += "> " + outFileName + "_job" + ".out) >& " + outFileName + "_job" + ".err\n"

        f = open(parallel + ".sh", 'w')
        f.write(shstr)
        f.close()

        #Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]
    if parallel:  #Running on the cluster..
        phed = phenotypeData.readPhenotypeFile(
            phenotypeDataFile, delimiter='\t')  #Get Phenotype data
        if parallelAll:
            for phenotypeIndex in phed.phenIds:
                if onlyMissing:
                    phenName = phed.getPhenotypeName(phenotypeIndex)
                    pvalFile = resultDir + "Emma_" + parallel + "_" + phenName + ".pvals"
                    res = None
                    try:
                        res = os.stat(pvalFile)

                    except Exception:
                        print "File", pvalFile, "does not exist."
                    if res and res.st_size > 0:
                        print "File", pvalFile, "already exists, and is non-empty."
                        if sr:
                            srInput = resultDir + "Emma_" + parallel + "_" + phenName + ".sr.pvals"
                            srRes = None
                            try:
                                srRes = os.stat(srInput)
                            except Exception:
                                print "File", srInput, "does not exist."
                            if srRes and srRes.st_size > 0:
                                print "File", srInput, "already exists, and is non-empty."
                            else:
                                runParallel(phenotypeIndex, phed)

                    else:
                        print "Setting up the run."
                        runParallel(phenotypeIndex, phed)

                else:
                    runParallel(phenotypeIndex, phed)
        else:
            phenotypeIndex = int(args[2])
            runParallel(phenotypeIndex, phed)
        return
    else:
        phenotypeIndex = int(args[2])

    print "phenotypeIndex:", phenotypeIndex
    print "\nStarting program now!\n"

    snpsds = dataParsers.parseCSVData(snpsDataFile,
                                      format=1,
                                      deliminator=delim,
                                      missingVal=missingVal)

    #Load phenotype file
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile,
                                           delimiter='\t')  #Get Phenotype data
    numAcc = len(snpsds[0].accessions)

    #Removing outliers
    if removeOutliers:
        print "Remoing outliers"
        phed.naOutliers(phenotypeIndex, removeOutliers)

    #If onlyOriginal96, then remove all other phenotypes..
    if onlyOriginal96:
        print "Filtering for the first 96 accessions"
        original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
        original_96_ecotypes = map(str, original_96_ecotypes)
        keepEcotypes = []
        if complement:
            for acc in phed.accessions:
                if not acc in original_96_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_96_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)

    if onlyOriginal192:
        print "Filtering for the first 192 accessions"
        original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
        original_192_ecotypes = map(str, original_192_ecotypes)
        keepEcotypes = []
        if complement:
            for acc in phed.accessions:
                if not acc in original_192_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_192_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)

    if onlyBelowLatidue:
        print "Filtering for the accessions which orginate below latitude", onlyBelowLatidue
        eiDict = phenotypeData._getEcotypeIdInfoDict_()
        print eiDict
        keepEcotypes = []
        for acc in phed.accessions:
            acc = int(acc)
            if eiDict.has_key(acc) and eiDict[acc][
                    2] and eiDict[acc][2] < onlyBelowLatidue:
                keepEcotypes.append(str(acc))
            elif eiDict.has_key(acc) and eiDict[acc][2] == None:
                keepEcotypes.append(str(acc))

        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    sys.stdout.write("Finished prefiltering phenotype accessions.\n")
    sys.stdout.flush()

    phenotype = phed.getPhenIndex(phenotypeIndex)

    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    #Checking which accessions to keep and which to remove .
    for i in range(0, len(snpsds[0].accessions)):
        acc1 = snpsds[0].accessions[i]
        for j in range(0, len(phed.accessions)):
            acc2 = phed.accessions[j]
            if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break

    print "\nFiltering accessions in genotype data:"
    #Filter accessions which do not have the phenotype value (from the genotype data).
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print ""
    print numAcc - len(
        accIndicesToKeep
    ), "accessions removed from genotype data, leaving", len(
        accIndicesToKeep), "accessions in all."

    print "\nNow filtering accessions in phenotype data:"
    phed.removeAccessions(
        phenAccIndicesToKeep
    )  #Removing accessions that don't have genotypes or phenotype values

    print "Verifying number of accessions: len(phed.accessions)==len(snpsds[0].accessions) is", len(
        phed.accessions) == len(snpsds[0].accessions)
    if len(phed.accessions) != len(snpsds[0].accessions):
        raise Exception

    #Filtering monomorphic
    print "Filtering monomorphic SNPs"
    for snpsd in snpsds:
        print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

    #Remove minor allele frequencies
    if minMAF != 0:
        sys.stdout.write("Filterting SNPs with MAF<" + str(minMAF) + ".")
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.filterMinMAF(minMAF)

    #Removing SNPs which are outside of boundaries.
    if chr:
        print "\nRemoving SNPs which are outside of boundaries."
        snpsds[chr - 1].filterRegion(boundaries[0], boundaries[1])
        snpsds = [snpsds[chr - 1]]

    #Ordering accessions in genotype data to fit phenotype data.
    print "Ordering genotype data accessions."
    accessionMapping = []
    i = 0
    for acc in phed.accessions:
        if acc in snpsds[0].accessions:
            accessionMapping.append((snpsds[0].accessions.index(acc), i))
            i += 1

    #print zip(accessionMapping,snpsds[0].accessions)
    print "len(snpsds[0].snps)", len(snpsds[0].snps)

    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.orderAccessions(accessionMapping)
    print "\nGenotype data has been ordered."

    #Converting format to 01
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))
    print ""

    print "Checking kinshipfile:", kinshipDatafile

    if kinshipDatafile:  #Is there a special kinship file?
        kinshipSnpsds = dataParsers.parseCSVData(kinshipDatafile,
                                                 format=1,
                                                 deliminator=delim,
                                                 missingVal=missingVal)

        accIndicesToKeep = []
        #Checking which accessions to keep and which to remove (genotype data).
        sys.stdout.write(
            "Removing accessions which do not have a phenotype value for " +
            phed.phenotypeNames[phenotype] + ".")
        sys.stdout.flush()
        for i in range(0, len(kinshipSnpsds[0].accessions)):
            acc1 = kinshipSnpsds[0].accessions[i]
            for j in range(0, len(phed.accessions)):
                acc2 = phed.accessions[j]
                if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                    accIndicesToKeep.append(i)
                    break
        print accIndicesToKeep

        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.removeAccessionIndices(accIndicesToKeep)
        print ""
        print numAcc - len(
            accIndicesToKeep
        ), "accessions removed from kinship genotype data, leaving", len(
            accIndicesToKeep), "accessions in all."

        print "Ordering kinship data accessions."
        accessionMapping = []
        i = 0
        for acc in snpsds[0].accessions:
            if acc in kinshipSnpsds[0].accessions:
                accessionMapping.append(
                    (kinshipSnpsds[0].accessions.index(acc), i))
                i += 1

        print zip(accessionMapping, snpsds[0].accessions)
        print "len(snpsds[0].snps)", len(snpsds[0].snps)

        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.orderAccessions(accessionMapping)
        print "Kinship genotype data has been ordered."

        newKinshipSnpsds = []
        sys.stdout.write("Converting data format")
        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            newKinshipSnpsds.append(snpsd.getSnpsData(
                missingVal=missingVal))  #This data might have NAs
        print ""
        kinshipSnpsds = newKinshipSnpsds

    else:
        kinshipSnpsds = newSnpsds

    print "Found kinship data."

    #Ordering accessions according to the order of accessions in the genotype file
    #	accessionMapping = []
    #	i = 0
    #	for acc in snpsds[0].accessions:
    #		if acc in phed.accessions:
    #			accessionMapping.append((phed.accessions.index(acc),i))
    #			i += 1
    #	phed.orderAccessions(accessionMapping)

    #Negating phenotypic values
    if negate:
        phed.negateValues(phenotypeIndex)

    if logTransform and not phed.isBinary(
            phenotypeIndex) and phed.getMinValue(phenotypeIndex) <= 0:
        addConstant = 0

    #Adding a constant.
    if addConstant != -1:
        if addConstant == 0:
            addConstant = math.sqrt(phed.getVariance(phenotypeIndex)) / 10
            addConstant = addConstant - phed.getMinValue(phenotypeIndex)

        print "Adding a constant to phenotype:", addConstant
        phed.addConstant(phenotypeIndex, addConstant)

    #Log-transforming
    if logTransform:
        print "Log transforming phenotype"
        phed.logTransform(phenotypeIndex)
    #Converting phenotypes to Ranks
    elif phenotypeRanks:
        phed.transformToRanks(phenotypeIndex)

    if not chr:
        snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
        kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,
                                                  [1, 2, 3, 4, 5])
    else:
        snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [chr])
        kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds, [chr])

    phenotypeName = phed.getPhenotypeName(phenotypeIndex)

    sys.stdout.flush()

    if testRobustness:
        print "Starting a robustness test"
        allSNPs = []
        for snpsd in snpsDataset.snpsDataList:
            allSNPs += snpsd.snps
        phenVals = phed.getPhenVals(phenotypeIndex)
        _robustness_test_(allSNPs, phenVals, rFile, filter=permutationFilter)
        sys.exit(0)

    if useLinearRegress:
        phenVals = phed.getPhenVals(phenotypeIndex)
        d0 = {}
        d0["phen"] = phenVals
        dh = {}
        dh["phen"] = phenVals
        import rpy, gc
        if regressionCofactors:  #Adds ler and col as cofactors
            import pickle
            f = open(regressionCofactors, "r")
            co_factors = pickle.load(f)
            f.close()
            #inserting co factors into model
            for factor in co_factors:
                d[factor] = co_factors[factor]
        import analyzeHaplotype as ah
        (ler_factor, col_factor) = ah.getLerAndColAccessions(newSnpsds, True)
        if FriColAsCofactor:
            d0["col"] = col_factor
            dh["col"] = col_factor
        if FriLerAsCofactor:
            d0["ler"] = ler_factor
            dh["ler"] = ler_factor
        chr_pos_pvals = []
        stats = []
        sys.stdout.write("Applying the linear model")
        sys.stdout.flush()
        for i in range(0, len(newSnpsds)):  #[3]:#
            snpsd = newSnpsds[i]
            sys.stdout.write("|")
            sys.stdout.flush()
            gc.collect(
            )  #Calling garbage collector, in an attempt to clean up memory..
            for j in range(0, len(snpsd.snps)):
                if j % 5000 == 0:
                    sys.stdout.write(".")
                    sys.stdout.flush()
                #if snpsd.positions[j]>1700000:
                #	break
                snp = snpsd.snps[j]
                d0["snp"] = snp
                try:
                    rpy.set_default_mode(rpy.NO_CONVERSION)
                    aov0 = rpy.r.aov(r("phen ~ ."), data=d0)
                    aovh = rpy.r.aov(r("phen ~ ."), data=dh)
                    rpy.set_default_mode(rpy.BASIC_CONVERSION)
                    s0 = rpy.r.summary(aov0)
                    sh = rpy.r.summary(aovh)
                    #print s0,sh
                    rss_0 = s0['Sum Sq'][-1]
                    if type(sh['Sum Sq']) != float:
                        rss_h = sh['Sum Sq'][-1]

                    else:
                        rss_h = sh['Sum Sq']
                    f = (rss_h - rss_0) / (rss_0 /
                                           (len(phenVals) - len(d0) + 1))
                    pval = rpy.r.pf(f, 1, len(phenVals), lower_tail=False)
                except Exception, err_str:
                    print "Calculating p-value failed"  #,err_str
                    pval = 1.0
                #print "dh:",dh
                #print "d0:",d0
                #print "rss_h,rss_0:",rss_h,rss_0
                #print "f,p:",f,pval
                chr_pos_pvals.append([i + 1, snpsd.positions[j], pval])
                mafc = min(snp.count(snp[0]), len(snp) - snp.count(snp[0]))
                maf = mafc / float(len(snp))
                stats.append([maf, mafc])
        sys.stdout.write("\n")
        #Write out to a result file
        sys.stdout.write("Writing results to file\n")
        sys.stdout.flush()
        pvalFile = rFile + ".pvals"
        f = open(pvalFile, "w")
        f.write("Chromosome,position,p-value,marf,maf\n")
        for i in range(0, len(chr_pos_pvals)):
            chr_pos_pval = chr_pos_pvals[i]
            stat = stats[i]
            f.write(
                str(chr_pos_pval[0]) + "," + str(chr_pos_pval[1]) + "," +
                str(chr_pos_pval[2]) + "," + str(stat[0]) + "," +
                str(stat[1]) + "\n")
        f.close()

        #Plot results
        print "Generating a GW plot."
        phenotypeName = phed.getPhenotypeName(phenotypeIndex)
        res = gwaResults.Result(pvalFile,
                                name="LM_" + phenotypeName,
                                phenotypeID=phenotypeIndex)
        res.negLogTransform()
        pngFile = pvalFile + ".png"
        plotResults.plotResult(res,
                               pngFile=pngFile,
                               percentile=90,
                               type="pvals",
                               ylab="$-$log$_{10}(p)$",
                               plotBonferroni=True,
                               usePylab=False)

Esempio n. 3

Mostra file

File: Emma.py Progetto: bopopescu/gwasmodules

                                   rFile,
                                   chr=chr,
                                   delim=delim,
                                   missingVal=missingVal,
                                   boundaries=boundaries,
                                   lrt=lrt)
        res = gwaResults.Result(pvalFile,
                                name="EMMA_" + phenotypeName,
                                phenotypeID=phenotypeIndex)
        res.filterMAF()
        res.negLogTransform()
        pngFile = pvalFile + ".png"
        plotResults.plotResult(res,
                               pngFile=pngFile,
                               percentile=90,
                               type="pvals",
                               ylab="$-$log$_{10}(p)$",
                               plotBonferroni=True,
                               usePylab=False)
        srInput = pvalFile

    if sr:
        _secondRun_(srOutput, srInput, srTopQuantile, srWindowSize, newSnpsds,
                    phed, phenotypeIndex, kinshipSnpsDataset)
        print "Generating second run GW plot."
        res = gwaResults.Result(srInput,
                                name="KW_" + phenotypeName,
                                phenotypeID=phenotypeIndex)
        res.filterMAF()
        res.negLogTransform()
        srRes = gwaResults.Result(srOutput,

Esempio n. 4

Mostra file

def _run_():
	if len(sys.argv)==1:
		print __doc__
		sys.exit(2)
	
	long_options_list=["outputFile=", "delim=", "missingval=", "phenotypeFileType=", 
					"help", "parallel=", "parallelAll", "addToDB", 
					"callMethodID=", "comment=", "onlyOriginal192","onlyOriginal96", "subSample=" , 
					"subSampleLikePhenotype=", "subsampleTest=", "complement", "onlyBelowLatidue=", 
					"onlyAboveLatidue=", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun",
					"permTest=", "savePermutations", "permutationFilter=", "testRobustness",
					"memReq=","walltimeReq=",]
	try:
		opts, args=getopt.getopt(sys.argv[1:], "o:c:d:m:h", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	
	phenotypeFileType=1
	outputFile=None
	delim=","
	missingVal="NA"
	help=0
	parallel=None
	parallelAll=False
	addToDB=False
	callMethodID=None
	comment=""
	subSample=None
	onlyOriginal96=False
	onlyOriginal192 = False
	subSampleLikePhenotype = None
	subsampleTest = False
	numSubSamples = None
	complement = False
	onlyBelowLatidue = None
	onlyAboveLatidue = None

	sr = False
	srOutput = False
	srInput = False
	srSkipFirstRun = False
	srTopQuantile = 0.95
	srWindowSize = 30000
	
	permTest = None
	savePermutations = False
	permutationFilter = 1.0
	
	testRobustness = False

	memReq = "5g"
	walltimeReq = "100:00:00"

	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help=1
			print __doc__
		elif opt in ("-o", "--outputFile"):
			outputFile=arg
		elif opt in ("--phenotypeFileType"):
			phenotypeFileType=int(arg)
		elif opt in ("--parallel"):
			parallel=arg
		elif opt in ("--parallelAll"):
			parallelAll=True
		elif opt in ("--addToDB"):
			addToDB=True
  		elif opt in ("--onlyOriginal96"):
			onlyOriginal96=True
  		elif opt in ("--onlyOriginal192"):
			onlyOriginal192=True
		elif opt in ("--complement"):
			complement=True
		elif opt in ("--subSample"):
			subSample=int(arg)
		elif opt in ("--subsampleTest"):
			subsampleTest = True
			l = arg.split(",")
			subSample=int(l[0])
			numSubSamples=int(l[1])
		elif opt in ("--onlyBelowLatidue"):
			onlyBelowLatidue=float(arg)
		elif opt in ("--onlyAboveLatidue"):
			onlyAboveLatidue=float(arg)
		elif opt in ("--subSampleLikePhenotype"):
			subSampleLikePhenotype=int(arg)
		elif opt in ("--callMethodID"):
			callMethodID=int(arg)
		elif opt in ("--comment"):
			comment=arg
		elif opt in ("-d", "--delim"):
			delim=arg
		elif opt in ("-m", "--missingval"):
			missingVal=arg
		elif opt in ("--sr"):
			sr = True
		elif opt in ("--testRobustness"):
			testRobustness = True
		elif opt in ("--permTest"):
			permTest = int(arg)
		elif opt in ("--savePermutations"):
			savePermutations = True
		elif opt in ("--permutationFilter"):
			permutationFilter = float(arg)
		elif opt in ("--srSkipFirstRun"):
			srSkipFirstRun = True
		elif opt in ("--srInput"):
			srInput = arg
		elif opt in ("--srOutput"):
			srOutput = arg
		elif opt in ("--srPar"):
			vals = arg.split(",")
			srTopQuantile = float(vals[0]) 
			srWindowSize = int(vals[1]) 
		elif opt in ("--memReq"):
			memReq=arg
		elif opt in ("--walltimeReq"):
			walltimeReq=arg
		else:
			if help==0:
				print "Unkown option!!\n"
				print __doc__
			sys.exit(2)

	if len(args)<3 and not parallel:
		if help==0:
			print "Arguments are missing!!\n"
			print __doc__
		sys.exit(2)

	snpsDataFile=args[0]
	phenotypeDataFile=args[1]

	print "Kruskal-Wallis is being set up with the following parameters:"
	print "phenotypeDataFile:",phenotypeDataFile
	print "snpsDataFile:",snpsDataFile
	print "parallel:",parallel
	print "parallelAll:",parallelAll
	print "onlyOriginal96:",onlyOriginal96
	print "onlyOriginal192:",onlyOriginal192
	print "onlyBelowLatidue:",onlyBelowLatidue
	print "onlyAboveLatidue:",onlyAboveLatidue
	print "complement:",complement
	print "subSampleLikePhenotype:",subSampleLikePhenotype
	print "subsampleTest:",subsampleTest
	print "numSubSamples:",numSubSamples
	print "subSample:",subSample
	print "sr:",sr
	print "srSkipFirstRun:",srSkipFirstRun
	print "srInput:",srInput
	print "srOutput:",srOutput
	print "srTopQuantile:",srTopQuantile
	print "srWindowSize:",srWindowSize
	print "permTest:",permTest
	print "savePermutations:",savePermutations
	print "permutationFilter:",permutationFilter
	print "testRobustness:",testRobustness
	print "walltimeReq:",walltimeReq
	print "memReq:",memReq

	def runParallel(phenotypeIndex,id=""):
		#Cluster specific parameters
		phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
		phenName=phed.getPhenotypeName(phenotypeIndex)
		print phenName
		outputFile=resultDir+"KW_"+parallel+"_"+phenName+id

		shstr = "#!/bin/csh\n"
		shstr += "#PBS -l walltime="+walltimeReq+"\n"
		shstr += "#PBS -l mem="+memReq+"\n"
		shstr +="#PBS -q cmb\n"
		
		shstr+="#PBS -N K"+phenName+"_"+parallel+"\n"
		shstr+="set phenotypeName="+parallel+"\n"
		shstr+="set phenotype="+str(phenotypeIndex)+"\n"
		shstr+="(python "+scriptDir+"KW.py -o "+outputFile+" "
		if subSample:
			shstr+=" --subSample="+str(subSample)+" "			
		elif onlyOriginal96:
			shstr+=" --onlyOriginal96 "			
		elif onlyOriginal192:
			shstr+=" --onlyOriginal192 "
		if onlyBelowLatidue:
			shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" "
		elif onlyAboveLatidue:
			shstr+=" --onlyAboveLatidue="+str(onlyAboveLatidue)+" "
		if complement: 			
			shstr+=" --complement "
		if permTest:
			shstr+=" --permTest="+str(permTest)+" "
			if savePermutations:
				shstr+=" --savePermutations "
		
		shstr+=" --permutationFilter="+str(permutationFilter)+" "
		if testRobustness:
			shstr+=" --testRobustness "
			
		if sr:
			shstr += " --sr "			
			if not srOutput:
				output = resultDir+"KW_"+parallel+"_"+phenName+".sr.pvals"				
			shstr += " --srOutput="+str(output)+" "
			if srSkipFirstRun:
				if not srInput:
					output = resultDir+"KW_"+parallel+"_"+phenName+".pvals"
				shstr += " --srInput="+str(output)+" "
				shstr += " --srSkipFirstRun "				
			shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" "


		shstr+=snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
		shstr+="> "+outputFile+"_job"+".out) >& "+outputFile+"_job"+".err\n"

		f=open(parallel+".sh", 'w')
		f.write(shstr)
		f.close()

		#Execute qsub script
		os.system("qsub "+parallel+".sh ")

	if parallel:  #Running on the cluster..
		if parallelAll:
			phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
			for phenotypeIndex in phed.phenIds:
				runParallel(phenotypeIndex)
		elif subsampleTest:
			phenotypeIndex=int(args[2])
			for i in range(0,numSubSamples):
				runParallel(phenotypeIndex,id="_r"+str(subSample)+"_"+str(i))
		else:
			phenotypeIndex=int(args[2])
			runParallel(phenotypeIndex)
		return
	else:
		phenotypeIndex=int(args[2])


	print "phenotypeIndex:",phenotypeIndex
	print "output:",outputFile
	print "\nStarting program now!\n"


	#Load phenotype file
	phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
	
	#If onlyOriginal96, then remove all other phenotypes..
	if onlyOriginal96: 
		print "Filtering for the first 96 accessions"
		original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
		original_96_ecotypes = map(str,original_96_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_96_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_96_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	if onlyOriginal192: 
		print "Filtering for the first 192 accessions"
		original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
		original_192_ecotypes = map(str,original_192_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_192_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_192_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
	
	if onlyBelowLatidue:
		print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)

	elif onlyAboveLatidue:
		print "Filtering for the accessions which orginate above latitude",onlyAboveLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]>onlyAboveLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)

	
	if subSampleLikePhenotype:
		p_name = phed.getPhenotypeName(subSampleLikePhenotype)
		print "Picking sample as in",p_name
		ecotypes = phed.getNonNAEcotypes(subSampleLikePhenotype)
		print ecotypes
		phed.filterAccessions(ecotypes)
		print "len(phed.accessions)", len(phed.accessions)


	if subSample: 
		sample_ecotypes = []
		ecotypes = phed.getNonNAEcotypes(phenotypeIndex)
		sample_ecotypes = random.sample(ecotypes,subSample)			
		phed.filterAccessions(sample_ecotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	sys.stdout.write("Finished prefiltering phenotype accessions.\n")
	sys.stdout.flush()
	
	
	
	#Load genotype file
	snpsds=dataParsers.parseCSVData(snpsDataFile, format = 1, deliminator = delim, missingVal = missingVal)


	#Checking overlap between phenotype and genotype accessions. 
	phenotype=phed.getPhenIndex(phenotypeIndex)
	accIndicesToKeep=[]			
	phenAccIndicesToKeep=[]
	numAcc=len(snpsds[0].accessions)
	sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
	sys.stdout.flush()
	for i in range(0, len(snpsds[0].accessions)):
		acc1=snpsds[0].accessions[i]
		for j in range(0, len(phed.accessions)):
			acc2=phed.accessions[j]
			if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
				accIndicesToKeep.append(i)
				phenAccIndicesToKeep.append(j)
				break	


	#Filter accessions which do not have the phenotype value.
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.removeAccessionIndices(accIndicesToKeep)
	print ""
	print numAcc-len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."
		
	print "Filtering phenotype data."
	phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values
	
	#Ordering accessions according to the order of accessions in the genotype file
	accessionMapping=[]
	i=0
	for acc in snpsds[0].accessions:
		if acc in phed.accessions:
			accessionMapping.append((phed.accessions.index(acc), i))
			i+=1
	phed.orderAccessions(accessionMapping)

		#Filtering monomorphic
	print "Filtering monomorphic SNPs"
	for snpsd in snpsds:
		print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

	#Converting format to 01
	newSnpsds=[]
	sys.stdout.write("Converting data format")
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		newSnpsds.append(snpsd.getSnpsData())
	print ""
	
	#Double check genotype file:
	problems = 0
	for i in range(0,len(newSnpsds)):
		snpsd = newSnpsds[i]
		for j in range(0,len(snpsd.snps)):
			snp = snpsd.snps[j]
			sc = snp.count(0)
			if sc==0 or sc==len(snp):
				print "Problem in file found at chr,pos",(i+1),",",snpsd.positions[i]
				problems += 1
	if problems >0:
		print "Genotype file appears to have potential problems"
	else:
		print "Genotype file appears to be good"

	if permTest:
		print "Starting a permutation test"
		allSNPs = []
		for snpsd in newSnpsds:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		test_type = "KW"
		if phed.isBinary(phenotypeIndex):
			test_type = "Fisher"
			permTest = 100	
		_perm_test_(allSNPs,phenVals,permTest,outputFile, test_type=test_type,savePermutations=savePermutations, filter=permutationFilter)
		sys.exit(0)
	
	if testRobustness:
		print "Starting a robustness test"
		allSNPs = []
		for snpsd in newSnpsds:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		test_type = "KW"
		if phed.isBinary(phenotypeIndex):
			test_type = "Fisher"
		_robustness_test_(allSNPs,phenVals,outputFile, test_type=test_type, filter=permutationFilter)
		sys.exit(0)
		

	sys.stdout.flush()
	print "sr:",sr, ", srSkipFirstRun:",srSkipFirstRun
	if (not sr) or (sr and not srSkipFirstRun):
		#Writing files
		#phed and phenotype
		sd=snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
		phenotypeName=phed.getPhenotypeName(phenotypeIndex)
		
		if phed.isBinary(phenotypeIndex):
			pvals = run_fet(sd.getSnps(),phed.getPhenVals(phenotypeIndex))	
		else:
			snps = sd.getSnps()
			phen_vals = phed.getPhenVals(phenotypeIndex)
			try:
				kw_res = util.kruskal_wallis(snps,phen_vals)
				pvals = kw_res['ps']
			except:
				print snps
				print phen_vals
				print len(snps),len(snps[0]),len(phen_vals)
				raise Exception
							
		res = gwaResults.Result(scores = pvals,name="KW_"+phenotypeName, snpsds=newSnpsds, load_snps=False)
		pvalFile=outputFile+".pvals"
		res.writeToFile(pvalFile)

		print "Generating a GW plot."
		res.negLogTransform()
		pngFile = pvalFile+".png"
		plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False)	
		srInput = pvalFile
		
	else:
		print "Skipping first stage analysis."
		sys.stdout.flush()

	if sr:
		_secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,binary=binary)
		print "Generating second run GW plot."
		res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.negLogTransform()
		srRes = gwaResults.Result(srOutput,name="KW_SR_"+phenotypeName, phenotypeID=phenotypeIndex)
		srRes.negLogTransform()
		srPngFile = pvalFile+".sr.png"
		plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)

Esempio n. 5

Mostra file

def _testQQplot_(includeEmmaInBinary=False,usePvalueFiles=True):
	resdir = "/Users/bjarni/tmp/"
	#resdir = "/Network/Data/250k/tmp-bvilhjal/phenotype_analyzis/"
	#resdir = "/Network/Data/250k/tmp-bvilhjal/qq_plots/"
	phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_012509.tsv"
	print "Loading phenotype data"
	phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter = '\t')
	phed2 = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter = '\t')
	phenotypeIndices = phenotypeData.categories_2_phenotypes[4]#+phenotypeData.categories_2_phenotypes[2]+phenotypeData.categories_2_phenotypes[3]+phenotypeData.categories_2_phenotypes[4]
	#(results_map, resultTypes_map) = _loadData_(phed, phenotypeIndices)
	q_pvalues = None
	stat_dict = {}
	for p_i in phenotypeIndices:
		(results_map, resultTypes_map) = _loadData_(phed, [p_i])
		#try:
		phenName = phed.getPhenotypeName(p_i)
		phenNamePrint = " ".join(phenName.split("_")[1:])
		print "\nWorking on phenotype",phenName
		if usePvalueFiles:
			q_pvalues = _getPermPvalues_(phenName)
			print len(q_pvalues),"permuted pvalues found"

		valCount = phed.countValues(p_i)
		print valCount,"values found."
		if (not phed.isBinary(p_i)) or includeEmmaInBinary:
			histogramFile = resdir + phenName +"_hist.pdf"
			histogramFile_png = resdir + phenName +"_hist.png"
			drawHistogram(phed, p_i, title = phenNamePrint, pngFile = histogramFile_png)
			if phed.logTransform(p_i):
				histogramFile = resdir + phenName + "_hist_logTransformed.pdf"
				histogramFile_png = resdir + phenName + "_hist_logTransformed.png"
				drawHistogram(phed, p_i, title = phenNamePrint, pngFile = histogramFile_png)
			elif not phed.isBinary(p_i):
				print "adding scaled const."
				phed.addSDscaledConstant(p_i)
				if phed.logTransform(p_i):
					histogramFile = resdir + phenName + "_hist_logTransformed_const.pdf"
					histogramFile_png = resdir + phenName + "_hist_logTransformed_const.png"
					drawHistogram(phed, p_i, title = phenNamePrint, pngFile = histogramFile_png)

#				phed2.naOutliers(p_i,10)
#				histogramFile = resdir + phenName + "_hist_noOutliers.pdf"
#				histogramFile_png = resdir + phenName + "_hist_noOutliers.png"
#				drawHistogram(phed2, p_i, title = phenName, pdfFile = histogramFile, pngFile = histogramFile_png)
#				if phed2.logTransform(p_i):
#					histogramFile = resdir + phenName + "_hist_logTransformed_noOutliers.pdf"
#					histogramFile_png = resdir + phenName + "_hist_logTransformed_noOutliers.png"
#					drawHistogram(phed2, p_i, title = phenName, pdfFile = histogramFile, pngFile = histogramFile_png)
		results = results_map[p_i]
		resultTypes = resultTypes_map[p_i]
		qqplotFile = resdir + phenName + "_qqplot.pdf"
		qqplotFile_png = resdir + phenName + "_qqplot.png"
		s_dict={}
		(As,Ms)=drawQQPlot(results, 1000, phenName = phenNamePrint, resultTypes = resultTypes, pngFile=qqplotFile_png, perm_pvalues = q_pvalues)
		s_dict["A"]=As
		s_dict["M"]=Ms
		
		qqplotFile = resdir + phenName + "_qqplot_log.pdf"
		qqplotFile_png = resdir + phenName + "_qqplot_log.png"
		(ds,areas,slopes) = drawLogQQPlot(results, 1000,5, phenName = phenNamePrint, resultTypes = resultTypes, pngFile=qqplotFile_png, perm_pvalues = q_pvalues)
		s_dict["A2"]=areas
		s_dict["D"]=ds
		s_dict["S"]=slopes
		stat_dict[p_i] = s_dict
		for i in range(0,len(results)):
			result = results[i]
			result.negLogTransform()
			pngFile = resdir + phenName + "_gwplot_" +resultTypes[i]+".png"
			plotResults.plotResult(result,pngFile=pngFile,percentile=90,type="pvals", plotBonferroni=True)	
		#except Exception:
		#	print "\nPhenotype index", p_i, "failed."
		del results_map
	       	gc.collect()  #Calling garbage collector, in an attempt to clean up memory..
		
	print stat_dict
	stat_file_name = resdir + "confounding_stat_4.txt"
	f = open(stat_file_name,"w")
	methods = ["KW","Emma"]
	f.write("phenotype_name, method_name, is_binary, D, A, B, M, S\n")
	for p_i in phenotypeIndices:
		if stat_dict.has_key(p_i):
			s_dict = stat_dict[p_i]
			phenName = phed.getPhenotypeName(p_i)
			phenName = " ".join(phenName.split("_")[1:])
			for i in range(0,len(methods)):
				st = phenName+", "+methods[i]+", "+str(phed.isBinary(p_i))+", "+str(s_dict["D"][i])+", "+str(s_dict["A"][i])+", "+str(s_dict["A2"][i])+", "+str(s_dict["M"][i])+", "+str(s_dict["S"][i])+"\n"
				f.write(st)
	f.close()

Esempio n. 6

Mostra file

def _run_():
	if len(sys.argv) == 1:
		print __doc__
		sys.exit(2)
	
	long_options_list = ["rFile=","chr=", "delim=", "missingval=", "withArrayId=", "BoundaryStart=", "removeOutliers=", "addConstant=",
						"logTransform", "BoundaryEnd=", "phenotypeFileType=", "help", "parallel=", "parallelAll", "LRT", "minMAF=", 
						"kinshipDatafile=", "phenotypeRanks", "onlyMissing","onlyOriginal96", "onlyOriginal192", "onlyBelowLatidue=", 
						"complement", "negate", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun", "testRobustness",
						"permutationFilter="]
	try:
		opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:a:h", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	phenotypeRanks = False
	removeOutliers = None
	addConstant = -1
	phenotypeFileType = 1
	rFile = None
	delim = ","
	missingVal = "NA"
	help = 0
	minMAF=0.0
	withArrayIds = 1
	boundaries = [-1,-1]
	chr=None
	parallel = None
	logTransform = False
	negate = False
	parallelAll = False
	lrt = False
	kinshipDatafile = None 
	onlyMissing = False
	onlyOriginal96 = False
	onlyOriginal192 = False
	onlyBelowLatidue = None
	complement = False

	sr = False
	srOutput = False
	srInput = False
	srSkipFirstRun = False
	srTopQuantile = 0.95
	srWindowSize = 30000
	
	testRobustness = False
	permutationFilter = 0.002

	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help = 1
			print __doc__
		elif opt in ("-a","--withArrayId"):
			withArrayIds = int(arg)
		elif opt in ("-o","--rFile"):
			rFile = arg
		elif opt in ("--phenotypeFileType"):
			phenotypeFileType = int(arg)
		elif opt in ("--BoundaryStart"):
			boundaries[0] = int(arg)
		elif opt in ("--BoundaryEnd"):
			boundaries[1] = int(arg)
		elif opt in ("--addConstant"):
			addConstant = float(arg)
		elif opt in ("--parallel"):
			parallel = arg
		elif opt in ("--minMAF"):
			minMAF = float(arg)
		elif opt in ("--parallelAll"):
			parallelAll = True
		elif opt in ("--onlyMissing"):
			onlyMissing = True
		elif opt in ("--onlyOriginal96"):
			onlyOriginal96 = True
		elif opt in ("--onlyOriginal192"):
			onlyOriginal192 = True
		elif opt in ("--onlyBelowLatidue"):
			onlyBelowLatidue = float(arg)
		elif opt in ("--complement"):
			complement = True
		elif opt in ("--logTransform"):
			logTransform = True
		elif opt in ("--negate"):
			negate = True
		elif opt in ("--removeOutliers"):
			removeOutliers = float(arg)
		elif opt in ("--LRT"):
			lrt = True
		elif opt in ("-c","--chr"):
			chr = int(arg)
		elif opt in ("-d","--delim"):
			delim = arg
		elif opt in ("-m","--missingval"):
			missingVal = arg
		elif opt in ("--kinshipDatafile"):
			kinshipDatafile = arg
		elif opt in ("--phenotypeRanks"):
			phenotypeRanks = True
		elif opt in ("--sr"):
			sr = True
		elif opt in ("--srSkipFirstRun"):
			srSkipFirstRun = True
		elif opt in ("--srInput"):
			srInput = arg
		elif opt in ("--srOutput"):
			srOutput = arg
		elif opt in ("--srPar"):
			vals = arg.split(",")
			srTopQuantile = float(vals[0]) 
			srWindowSize = int(vals[1]) 
		elif opt in ("--testRobustness"):
			testRobustness = True
		elif opt in ("--permutationFilter"):
			permutationFilter = float(arg)
		else:
			if help==0:
				print "Unkown option!!\n"
				print __doc__
			sys.exit(2)

	if len(args)<3 and not parallel:
		if help==0:
			print "Arguments are missing!!\n"
			print __doc__
		sys.exit(2)

	print "Emma is being set up with the following parameters:"
	print "output:",rFile
	print "phenotypeRanks:",phenotypeRanks
	print "withArrayId:",withArrayIds
	print "phenotypeFileType:",phenotypeFileType
	print "parallel:",parallel
	print "parallelAll:",parallelAll
	print "minMAF:",minMAF
	print "LRT:",lrt
	print "delim:",delim
	print "missingval:",missingVal
	print "kinshipDatafile:",kinshipDatafile
	print "chr:",chr
	print "boundaries:",boundaries
	print "onlyMissing:",onlyMissing
	print "onlyOriginal96:",onlyOriginal96
	print "onlyOriginal192:",onlyOriginal192
	print "onlyBelowLatidue:",onlyBelowLatidue
	print "complement:",complement
	print "negate:",negate
	print "logTransform:",logTransform
	print "addConstant:",addConstant
	print "removeOutliers:",removeOutliers
	print "sr:",sr
	print "srSkipFirstRun:",srSkipFirstRun
	print "srInput:",srInput
	print "srOutput:",srOutput
	print "srTopQuantile:",srTopQuantile
	print "srWindowSize:",srWindowSize
	print "testRobustness:",testRobustness
	print "permutationFilter:",permutationFilter


	def runParallel(phenotypeIndex,phed):
		#Cluster specific parameters
		print phenotypeIndex
		phenName = phed.getPhenotypeName(phenotypeIndex)
		outFileName = resultDir+"Emma_"+parallel+"_"+phenName

		shstr = """#!/bin/csh
#PBS -l walltime=100:00:00
#PBS -l mem=8g 
#PBS -q cmb
"""

		shstr += "#PBS -N E"+phenName+"_"+parallel+"\n"
		shstr += "set phenotypeName="+parallel+"\n"
		shstr += "set phenotype="+str(phenotypeIndex)+"\n"
		shstr += "(python "+emmadir+"Emma.py -o "+outFileName+" "
		if onlyOriginal96:
			shstr+=" --onlyOriginal96 "			
		elif onlyOriginal192:
			shstr+=" --onlyOriginal192 "
		if onlyBelowLatidue:
			shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" "
		if logTransform:
			shstr += " --logTransform "
		if negate:
			shstr += " --negate "
		if removeOutliers:
			shstr += " --removeOutliers="+str(removeOutliers)+" "
		if phenotypeRanks:
			shstr += " --phenotypeRanks "
		if testRobustness:
			shstr+=" --testRobustness "

		shstr+=" --permutationFilter="+str(permutationFilter)+" "

		if sr:
			shstr += " --sr "			
			if not srOutput:
				output = resultDir+"Emma_"+parallel+"_"+phenName+".sr.pvals"				
			shstr += " --srOutput="+str(output)+" "
			if srSkipFirstRun:
				if not srInput:
					output = resultDir+"Emma_"+parallel+"_"+phenName+".pvals"
				shstr += " --srInput="+str(output)+" "
				shstr += " --srSkipFirstRun "				
			shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" "
			
		shstr += " -a "+str(withArrayIds)+" "			
		if kinshipDatafile:
			shstr += " --kinshipDatafile="+str(kinshipDatafile)+" "			
		shstr += " --addConstant="+str(addConstant)+" "			
		shstr += snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
		shstr += "> "+outFileName+"_job"+".out) >& "+outFileName+"_job"+".err\n"

		f = open(parallel+".sh",'w')
		f.write(shstr)
		f.close()

		#Execute qsub script
		os.system("qsub "+parallel+".sh ")

	snpsDataFile = args[0]
	phenotypeDataFile = args[1]
	if parallel:  #Running on the cluster..
		phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data 
		if parallelAll:
			for phenotypeIndex in phed.phenIds:
				if onlyMissing:
					phenName = phed.getPhenotypeName(phenotypeIndex)
					pvalFile = resultDir+"Emma_"+parallel+"_"+phenName+".pvals"
					res = None
					try:
						res = os.stat(pvalFile)

					except Exception:
						print "File",pvalFile,"does not exist."
					if res and res.st_size>0:
						print "File",pvalFile,"already exists, and is non-empty."
						if sr:
							srInput = resultDir+"Emma_"+parallel+"_"+phenName+".sr.pvals"
							srRes = None
							try:
								srRes = os.stat(srInput)
							except Exception:
								print "File",srInput,"does not exist."
							if srRes and srRes.st_size>0:
								print "File",srInput,"already exists, and is non-empty."
							else:
								runParallel(phenotypeIndex,phed)
							
					else:
						print "Setting up the run."
						runParallel(phenotypeIndex,phed)
											
				else:
					runParallel(phenotypeIndex,phed)
		else:
			phenotypeIndex = int(args[2])
			runParallel(phenotypeIndex,phed)
		return
	else:
		phenotypeIndex = int(args[2])


	print "phenotypeIndex:",phenotypeIndex
	print "\nStarting program now!\n"



	snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds)

	#Load phenotype file
	phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data 
	numAcc = len(snpsds[0].accessions)

	#Removing outliers
	if removeOutliers:
		print "Remoing outliers"
		phed.naOutliers(phenotypeIndex,removeOutliers)
	
	#If onlyOriginal96, then remove all other phenotypes..
	if onlyOriginal96: 
		print "Filtering for the first 96 accessions"
		original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
		original_96_ecotypes = map(str,original_96_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_96_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_96_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	if onlyOriginal192: 
		print "Filtering for the first 192 accessions"
		original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
		original_192_ecotypes = map(str,original_192_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_192_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_192_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
	
	if onlyBelowLatidue:
		print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
	sys.stdout.write("Finished prefiltering phenotype accessions.\n")
	sys.stdout.flush()

	phenotype = phed.getPhenIndex(phenotypeIndex)

	accIndicesToKeep = []			
	phenAccIndicesToKeep = []
	#Checking which accessions to keep and which to remove .
	for i in range(0,len(snpsds[0].accessions)):
		acc1 = snpsds[0].accessions[i]
		for j in range(0,len(phed.accessions)):
			acc2 = phed.accessions[j]
			if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
				accIndicesToKeep.append(i)
				phenAccIndicesToKeep.append(j)
				break	

	print "\nFiltering accessions in genotype data:"
	#Filter accessions which do not have the phenotype value (from the genotype data).
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.removeAccessionIndices(accIndicesToKeep)
	print ""
	print numAcc-len(accIndicesToKeep),"accessions removed from genotype data, leaving",len(accIndicesToKeep),"accessions in all."
		

	print "\nNow filtering accessions in phenotype data:"
	phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values

	print "Verifying number of accessions: len(phed.accessions)==len(snpsds[0].accessions) is",len(phed.accessions)==len(snpsds[0].accessions)
	if len(phed.accessions)!=len(snpsds[0].accessions):
		raise Exception

	#Filtering monomorphic
	print "Filtering monomorphic SNPs"
	for snpsd in snpsds:
		print "Removed", str(snpsd.filterMonoMorphicSnps()),"Snps"

	#Remove minor allele frequencies
	if minMAF!=0:
		sys.stdout.write("Filterting SNPs with MAF<"+str(minMAF)+".")
		for snpsd in snpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			snpsd.filterMinMAF(minMAF)

	#Removing SNPs which are outside of boundaries.
	if chr:
		print "\nRemoving SNPs which are outside of boundaries."
		snpsds[chr-1].filterRegion(boundaries[0],boundaries[1])
		snpsds = [snpsds[chr-1]]
	
	#Ordering accessions in genotype data to fit phenotype data.
	print "Ordering genotype data accessions."
	accessionMapping = []
	i = 0
	for acc in phed.accessions:
		if acc in snpsds[0].accessions:
			accessionMapping.append((snpsds[0].accessions.index(acc),i))
			i += 1

	#print zip(accessionMapping,snpsds[0].accessions)
	print "len(snpsds[0].snps)",len(snpsds[0].snps)

	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.orderAccessions(accessionMapping)
	print "\nGenotype data has been ordered."
		
	#Converting format to 01
	newSnpsds = []
	sys.stdout.write("Converting data format")
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		newSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))
	print ""


	
	print "Checking kinshipfile:",kinshipDatafile
	
	if kinshipDatafile:  #Is there a special kinship file?
		kinshipSnpsds = dataParsers.parseCSVData(kinshipDatafile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds)

		accIndicesToKeep = []			
		#Checking which accessions to keep and which to remove (genotype data).
		sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
		sys.stdout.flush()
		for i in range(0,len(kinshipSnpsds[0].accessions)):
			acc1 = kinshipSnpsds[0].accessions[i]
			for j in range(0,len(phed.accessions)):
				acc2 = phed.accessions[j]
				if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
					accIndicesToKeep.append(i)
					break	
		print accIndicesToKeep
	
		for snpsd in kinshipSnpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			snpsd.removeAccessionIndices(accIndicesToKeep)
		print ""
		print numAcc-len(accIndicesToKeep),"accessions removed from kinship genotype data, leaving",len(accIndicesToKeep),"accessions in all."
	
		print "Ordering kinship data accessions."
		accessionMapping = []
		i = 0
		for acc in snpsds[0].accessions:
			if acc in kinshipSnpsds[0].accessions:
				accessionMapping.append((kinshipSnpsds[0].accessions.index(acc),i))
				i += 1

		print zip(accessionMapping,snpsds[0].accessions)
		print "len(snpsds[0].snps)",len(snpsds[0].snps)
		
		for snpsd in kinshipSnpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			snpsd.orderAccessions(accessionMapping)
		print "Kinship genotype data has been ordered."

		newKinshipSnpsds = []
		sys.stdout.write("Converting data format")
		for snpsd in kinshipSnpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			newKinshipSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))  #This data might have NAs
		print ""
		kinshipSnpsds = newKinshipSnpsds

	else:
		kinshipSnpsds = newSnpsds
		

	print "Found kinship data."

	#Ordering accessions according to the order of accessions in the genotype file
#	accessionMapping = []
#	i = 0
#	for acc in snpsds[0].accessions:
#		if acc in phed.accessions:
#			accessionMapping.append((phed.accessions.index(acc),i))
#			i += 1
#	phed.orderAccessions(accessionMapping)

	
	#Negating phenotypic values
	if negate: 
		phed.negateValues(phenotypeIndex)

	#Adding a constant.
	if addConstant!=-1:
		if addConstant==0:
			addConstant = math.sqrt(phed.getVariance(phenotypeIndex))/10
			addConstant = addConstant - phed.getMinValue(phenotypeIndex)
			
		print "Adding a constant to phenotype:",addConstant
		phed.addConstant(phenotypeIndex,addConstant)
	
		
	
	#Log-transforming
	if logTransform:
		print "Log transforming phenotype"
		phed.logTransform(phenotypeIndex)
	#Converting phenotypes to Ranks
	elif phenotypeRanks:
		phed.transformToRanks(phenotypeIndex)
	
	if not chr:
		snpsDataset = snpsdata.SNPsDataSet(newSnpsds,[1,2,3,4,5])
		kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,[1,2,3,4,5])
	else:
		snpsDataset = snpsdata.SNPsDataSet(newSnpsds,[chr])
		kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,[chr])
		
	
	phenotypeName = phed.getPhenotypeName(phenotypeIndex)

	sys.stdout.flush()
	
	if testRobustness:
		print "Starting a robustness test"
		allSNPs = []
		for snpsd in snpsDataset.snpsDataList:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		_robustness_test_(allSNPs,phenVals,rFile,filter=permutationFilter)
		sys.exit(0)

	if (not sr) or (sr and not srSkipFirstRun):
		sys.stdout.write("Running Primary Emma.\n")
		sys.stdout.flush()
		pvalFile = _runEmmaScript_(snpsDataset, kinshipSnpsDataset, phed, phenotypeIndex, rFile, chr=chr, delim=delim, missingVal=missingVal, boundaries=boundaries, lrt=lrt)
		res = gwaResults.Result(pvalFile,name="EMMA_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.filterMAF()
		res.negLogTransform()
		pngFile = pvalFile+".png"
		plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False)	
		srInput = pvalFile

	if sr:
		_secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,kinshipSnpsDataset)
		print "Generating second run GW plot."
		res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.filterMAF()
		res.negLogTransform()
		srRes = gwaResults.Result(srOutput,name="EMMA_SR_"+phenotypeName, phenotypeID=phenotypeIndex)
		srRes.filterMAF()
		srRes.negLogTransform()
		srPngFile = pvalFile+".sr.png"
		plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)

Esempio n. 7

Mostra file

def _run_():
	if len(sys.argv)==1:
		print __doc__
		sys.exit(2)
	
	long_options_list=["outputFile=", "delim=", "missingval=", "withArrayId=", "phenotypeFileType=", 
					"help", "parallel=", "parallelAll", "addToDB", 
					"callMethodID=", "comment=", "onlyOriginal192","onlyOriginal96", "subSample=" , 
					"subSampleLikePhenotype=", "subsampleTest=", "complement", "onlyBelowLatidue=", 
					"onlyAboveLatidue=", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun",
					"permTest=", "savePermutations", "permutationFilter=", "testRobustness"]
	try:
		opts, args=getopt.getopt(sys.argv[1:], "o:c:d:m:a:h", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	
		phenotypeFileType=1
		outputFile=None
	delim=","
	missingVal="NA"
	help=0
	withArrayIds=1
	parallel=None
	parallelAll=False
	addToDB=False
	callMethodID=None
	comment=""
	subSample=None
	onlyOriginal96=False
	onlyOriginal192 = False
	subSampleLikePhenotype = None
	subsampleTest = False
	numSubSamples = None
	complement = False
	onlyBelowLatidue = None
	onlyAboveLatidue = None

	sr = False
	srOutput = False
	srInput = False
	srSkipFirstRun = False
	srTopQuantile = 0.95
	srWindowSize = 30000
	
	permTest = None
	savePermutations = False
	permutationFilter = 1.0
	
	testRobustness = False

	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help=1
			print __doc__
		elif opt in ("-a", "--withArrayId"):
			withArrayIds=int(arg)
		elif opt in ("-o", "--outputFile"):
			outputFile=arg
		elif opt in ("--phenotypeFileType"):
			phenotypeFileType=int(arg)
		elif opt in ("--parallel"):
			parallel=arg
		elif opt in ("--parallelAll"):
			parallelAll=True
		elif opt in ("--addToDB"):
			addToDB=True
  		elif opt in ("--onlyOriginal96"):
			onlyOriginal96=True
  		elif opt in ("--onlyOriginal192"):
			onlyOriginal192=True
		elif opt in ("--complement"):
			complement=True
		elif opt in ("--subSample"):
			subSample=int(arg)
		elif opt in ("--subsampleTest"):
			subsampleTest = True
			l = arg.split(",")
			subSample=int(l[0])
			numSubSamples=int(l[1])
		elif opt in ("--onlyBelowLatidue"):
			onlyBelowLatidue=float(arg)
		elif opt in ("--onlyAboveLatidue"):
			onlyAboveLatidue=float(arg)
		elif opt in ("--subSampleLikePhenotype"):
			subSampleLikePhenotype=int(arg)
		elif opt in ("--callMethodID"):
			callMethodID=int(arg)
		elif opt in ("--comment"):
			comment=arg
		elif opt in ("-d", "--delim"):
			delim=arg
		elif opt in ("-m", "--missingval"):
			missingVal=arg
		elif opt in ("--sr"):
			sr = True
		elif opt in ("--testRobustness"):
			testRobustness = True
		elif opt in ("--permTest"):
			permTest = int(arg)
		elif opt in ("--savePermutations"):
			savePermutations = True
		elif opt in ("--permutationFilter"):
			permutationFilter = float(arg)
		elif opt in ("--srSkipFirstRun"):
			srSkipFirstRun = True
		elif opt in ("--srInput"):
			srInput = arg
		elif opt in ("--srOutput"):
			srOutput = arg
		elif opt in ("--srPar"):
			vals = arg.split(",")
			srTopQuantile = float(vals[0]) 
			srWindowSize = int(vals[1]) 
		else:
			if help==0:
				print "Unkown option!!\n"
				print __doc__
			sys.exit(2)

	if len(args)<3 and not parallel:
		if help==0:
			print "Arguments are missing!!\n"
			print __doc__
		sys.exit(2)

	snpsDataFile=args[0]
	phenotypeDataFile=args[1]

	print "Kruskal-Wallis is being set up with the following parameters:"
	print "phenotypeDataFile:",phenotypeDataFile
	print "snpsDataFile:",snpsDataFile
	print "parallel:",parallel
	print "parallelAll:",parallelAll
	print "onlyOriginal96:",onlyOriginal96
	print "onlyOriginal192:",onlyOriginal192
	print "onlyBelowLatidue:",onlyBelowLatidue
	print "onlyAboveLatidue:",onlyAboveLatidue
	print "subSampleLikePhenotype:",subSampleLikePhenotype
	print "subsampleTest:",subsampleTest
	print "numSubSamples:",numSubSamples
	print "subSample:",subSample
	print "sr:",sr
	print "srSkipFirstRun:",srSkipFirstRun
	print "srInput:",srInput
	print "srOutput:",srOutput
	print "srTopQuantile:",srTopQuantile
	print "srWindowSize:",srWindowSize
	print "permTest:",permTest
	print "savePermutations:",savePermutations
	print "permutationFilter:",permutationFilter
	print "testRobustness:",testRobustness
	

	def runParallel(phenotypeIndex,id=""):
		#Cluster specific parameters
		phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
		phenName=phed.getPhenotypeName(phenotypeIndex)
		phenName=phenName.replace("/", "_div_")
		phenName=phenName.replace("*", "_star_")
		outputFile=resultDir+"KW_"+parallel+"_"+phenName+id

		shstr="""#!/bin/csh
#PBS -l walltime=100:00:00
#PBS -l mem=4g 
#PBS -q cmb
"""
		
		shstr+="#PBS -N K"+phenName+"_"+parallel+"\n"
		shstr+="set phenotypeName="+parallel+"\n"
		shstr+="set phenotype="+str(phenotypeIndex)+"\n"
		shstr+="(python "+scriptDir+"KW.py -o "+outputFile+" "
		shstr+=" -a "+str(withArrayIds)+" "			
		if subSample:
			shstr+=" --subSample="+str(subSample)+" "			
		elif onlyOriginal96:
			shstr+=" --onlyOriginal96 "			
		elif onlyOriginal192:
			shstr+=" --onlyOriginal192 "
		if onlyBelowLatidue:
			shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" "
		elif onlyAboveLatidue:
			shstr+=" --onlyAboveLatidue="+str(onlyAboveLatidue)+" "
		if complement: 			
			shstr+=" --complement "
		if permTest:
			shstr+=" --permTest="+str(permTest)+" "
			if savePermutations:
				shstr+=" --savePermutations "
		
		shstr+=" --permutationFilter="+str(permutationFilter)+" "
		if testRobustness:
			shstr+=" --testRobustness "
			
		if sr:
			shstr += " --sr "			
			if not srOutput:
				output = resultDir+"KW_"+parallel+"_"+phenName+".sr.pvals"				
			shstr += " --srOutput="+str(output)+" "
			if srSkipFirstRun:
				if not srInput:
					output = resultDir+"KW_"+parallel+"_"+phenName+".pvals"
				shstr += " --srInput="+str(output)+" "
				shstr += " --srSkipFirstRun "				
			shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" "


		shstr+=snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
		shstr+="> "+outputFile+"_job"+".out) >& "+outputFile+"_job"+".err\n"

		f=open(parallel+".sh", 'w')
		f.write(shstr)
		f.close()

		#Execute qsub script
		os.system("qsub "+parallel+".sh ")

	if parallel:  #Running on the cluster..
		if parallelAll:
			phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
			for phenotypeIndex in phed.phenIds:
				runParallel(phenotypeIndex)
		elif subsampleTest:
			phenotypeIndex=int(args[2])
			for i in range(0,numSubSamples):
				runParallel(phenotypeIndex,id="_r"+str(subSample)+"_"+str(i))
		else:
			phenotypeIndex=int(args[2])
			runParallel(phenotypeIndex)
		return
	else:
		phenotypeIndex=int(args[2])


	print "phenotypeIndex:",phenotypeIndex
	print "output:",outputFile
	print "\nStarting program now!\n"


	#Load phenotype file
	phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
	
	#If onlyOriginal96, then remove all other phenotypes..
	if onlyOriginal96: 
		print "Filtering for the first 96 accessions"
		original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
		original_96_ecotypes = map(str,original_96_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_96_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_96_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	if onlyOriginal192: 
		print "Filtering for the first 192 accessions"
		original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
		original_192_ecotypes = map(str,original_192_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_192_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_192_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
	
	if onlyBelowLatidue:
		print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)

	elif onlyAboveLatidue:
		print "Filtering for the accessions which orginate above latitude",onlyAboveLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]>onlyAboveLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)

	
	if subSampleLikePhenotype:
		p_name = phed.getPhenotypeName(subSampleLikePhenotype)
		print "Picking sample as in",p_name
		ecotypes = phed.getNonNAEcotypes(subSampleLikePhenotype)
		print ecotypes
		phed.filterAccessions(ecotypes)
		print "len(phed.accessions)", len(phed.accessions)


	if subSample: 
		sample_ecotypes = []
		ecotypes = phed.getNonNAEcotypes(phenotypeIndex)
		sample_ecotypes = random.sample(ecotypes,subSample)			
		phed.filterAccessions(sample_ecotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	sys.stdout.write("Finished prefiltering phenotype accessions.\n")
	sys.stdout.flush()
	
	
	
	#Load genotype file
	snpsds=dataParsers.parseCSVData(snpsDataFile, format = 1, deliminator = delim, missingVal = missingVal, withArrayIds = withArrayIds)


	#Checking overlap between phenotype and genotype accessions. 
	phenotype=phed.getPhenIndex(phenotypeIndex)
	accIndicesToKeep=[]			
	phenAccIndicesToKeep=[]
	numAcc=len(snpsds[0].accessions)
	sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
	sys.stdout.flush()
	for i in range(0, len(snpsds[0].accessions)):
		acc1=snpsds[0].accessions[i]
		for j in range(0, len(phed.accessions)):
			acc2=phed.accessions[j]
			if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
				accIndicesToKeep.append(i)
				phenAccIndicesToKeep.append(j)
				break	


	#Filter accessions which do not have the phenotype value.
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.removeAccessionIndices(accIndicesToKeep)
	print ""
	print numAcc-len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."
		
	print "Filtering phenotype data."
	phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values
	
	#Ordering accessions according to the order of accessions in the genotype file
	accessionMapping=[]
	i=0
	for acc in snpsds[0].accessions:
		if acc in phed.accessions:
			accessionMapping.append((phed.accessions.index(acc), i))
			i+=1
	phed.orderAccessions(accessionMapping)

		#Filtering monomorphic
	print "Filtering monomorphic SNPs"
	for snpsd in snpsds:
		print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

	#Converting format to 01
	newSnpsds=[]
	sys.stdout.write("Converting data format")
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		newSnpsds.append(snpsd.getSnpsData())
	print ""
	
	#Double check genotype file:
	problems = 0
	for i in range(0,len(newSnpsds)):
		snpsd = newSnpsds[i]
		for j in range(0,len(snpsd.snps)):
			snp = snpsd.snps[j]
			sc = snp.count(0)
			if sc==0 or sc==len(snp):
				print "Problem in file found at chr,pos",(i+1),",",snpsd.positions[i]
				problems += 1
	if problems >0:
		print "Genotype file appears to have potential problems"
	else:
		print "Genotype file appears to be good"

	if permTest:
		print "Starting a permutation test"
		allSNPs = []
		for snpsd in newSnpsds:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		test_type = "KW"
		if phed.isBinary(phenotypeIndex):
			test_type = "Fisher"
			permTest = 100	
		_perm_test_(allSNPs,phenVals,permTest,outputFile, test_type=test_type,savePermutations=savePermutations, filter=permutationFilter)
		sys.exit(0)
	
	if testRobustness:
		print "Starting a robustness test"
		allSNPs = []
		for snpsd in newSnpsds:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		test_type = "KW"
		if phed.isBinary(phenotypeIndex):
			test_type = "Fisher"
		_robustness_test_(allSNPs,phenVals,outputFile, test_type=test_type, filter=permutationFilter)
		sys.exit(0)
		

	sys.stdout.flush()
	print "sr:",sr, ", srSkipFirstRun:",srSkipFirstRun
	if (not sr) or (sr and not srSkipFirstRun):
		#Writing files
		if env.user=="bjarni":
			tempfile.tempdir='/tmp'
		(fId, phenotypeTempFile)=tempfile.mkstemp()
		os.close(fId)
		(fId, genotypeTempFile)=tempfile.mkstemp()
		os.close(fId)
		
		phed.writeToFile(phenotypeTempFile, [phenotype])	
		sys.stdout.write("Phenotype file written\n")
		sys.stdout.flush()
		snpsDataset=snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
		decoder={1:1, 0:0,-1:'NA'}	
		snpsDataset.writeToFile(genotypeTempFile, deliminator = delim, missingVal = missingVal, withArrayIds = 0, decoder = decoder)
		sys.stdout.write("Genotype file written\n")
		sys.stdout.flush()
	
		phenotypeName=phed.getPhenotypeName(phenotypeIndex)
	
		rDataFile=outputFile+".rData"
		pvalFile=outputFile+".pvals"
		#Is the phenotype binary?
		binary=phed.isBinary(phenotypeIndex)
		rstr=_generateRScript_(genotypeTempFile, phenotypeTempFile, rDataFile, pvalFile, name = phenotypeName, binary = binary)
		rFileName=outputFile+".r"
		f=open(rFileName, 'w')
		f.write(rstr)
		f.close()
		outRfile=rFileName+".out"
		errRfile=rFileName+".err"
		print "Running R file:"
		cmdStr="(R --vanilla < "+rFileName+" > "+outRfile+") >& "+errRfile
		sys.stdout.write(cmdStr+"\n")
		sys.stdout.flush()	
		gc.collect() 
		os.system(cmdStr)
		#print "Emma output saved in R format in", rDataFile
		print "Generating a GW plot."
		res = gwaResults.Result(pvalFile,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.negLogTransform()
		pngFile = pvalFile+".png"
		plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False)	
		srInput = pvalFile
		
	else:
		print "Skipping first stage analysis."
		sys.stdout.flush()

	if sr:
		_secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,binary=binary)
		print "Generating second run GW plot."
		res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.negLogTransform()
		srRes = gwaResults.Result(srOutput,name="KW_SR_"+phenotypeName, phenotypeID=phenotypeIndex)
		srRes.negLogTransform()
		srPngFile = pvalFile+".sr.png"
		plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)