Example #1
0
def _get192Ecotypes_():
    """Collect the ecotype ids that carry phenotype values and are genotyped.

    Reads the hard-coded raw phenotype file, gathers every accession that has
    a value for any phenotype in categories 1-4, and returns those accessions
    (as ints) that are present in ``phenotypeData._getEcotypeIdInfoDict_()``.

    Phenotypes 5, 6 and 7 are handled separately: their accessions are only
    collected *after* the phenotype data has been filtered down to the list
    returned by ``phenotypeData._getFirst192Ecotypes_()``.

    Progress and the resulting table are printed to stdout.  The ``print(...)``
    calls use the single-argument form so the output is identical under
    Python 2 while the block also parses under Python 3.

    Returns:
        list of int: ecotype ids, in the sorted order of their string form.
    """
    phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_012509.tsv"
    print("Loading phenotype data")
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    categories = phenotypeData.categories_2_phenotypes
    phenotypeIndices = categories[1] + categories[2] + categories[3] + categories[4]

    # These phenotype ids only contribute accessions from the first-192 set.
    restricted_phenotypes = [5, 6, 7]

    total_accessions = set()
    for p_i in phenotypeIndices:
        if p_i not in restricted_phenotypes:
            total_accessions.update(phed.getAccessionsWithValues(p_i))

    ecotypes_192 = [str(e) for e in phenotypeData._getFirst192Ecotypes_()]
    print("len(ecotypes_192): %d" % len(ecotypes_192))
    # From here on phed only contains the first-192 accessions.
    phed.filterAccessions(ecotypes_192)

    for p_i in restricted_phenotypes:
        total_accessions.update(phed.getAccessionsWithValues(p_i))

    print(len(total_accessions))
    total_accessions = sorted(total_accessions)
    print(total_accessions)

    ecotype_info_dict = phenotypeData._getEcotypeIdInfoDict_()
    ets = []
    for et in total_accessions:
        et = int(et)
        if et in ecotype_info_dict:
            info = ecotype_info_dict[et]
            print(str(et) + ", " + str(info[0]) + ", " + str(info[1]))
            ets.append(et)
        else:
            print("%s is missing in genotype data." % et)
    # Number of accessions that were found in the ecotype info table.
    print(len(ets))
    return ets
Example #2
0
def _run_():
    """Parse the command line, then submit cluster jobs or run a scan locally.

    Options are read from ``sys.argv`` with ``getopt``.  The positional
    arguments are the SNP data file, the phenotype data file and a phenotype
    index (the index may be omitted when ``--parallel`` is combined with
    ``--parallelAll``).  The module docstring is printed as the usage text.

    Cluster mode (``--parallel=NAME``): a csh/PBS script ``NAME.sh`` is
    written and passed to ``qsub`` for the requested phenotype, or with
    ``--parallelAll`` for every id in ``phed.phenIds``.  ``--onlyMissing``
    skips phenotypes whose result files already exist and are non-empty.
    The function then returns.

    Local mode: the SNP and phenotype data are loaded, accessions are
    filtered and aligned, the phenotype is optionally transformed, and then
    either a robustness test (``--testRobustness``, exits with status 0) or a
    per-SNP linear-model scan (``--useLinearRegress``) is run.  The scan
    writes ``<rFile>.pvals`` and ``<rFile>.pvals.png``.

    Relies on module-level names defined outside this function: ``sys``,
    ``os``, ``getopt``, ``traceback``, ``math``, ``resultDir``, ``emmadir``,
    ``phenotypeData``, ``dataParsers``, ``snpsdata``, ``gwaResults``,
    ``plotResults`` and ``_robustness_test_``.

    Raises:
        SystemExit: status 2 on usage errors, status 0 after the robustness
            test.
        Exception: if genotype and phenotype data disagree on the number of
            accessions after filtering.

    All ``print(...)`` calls use the single-argument form, so the output is
    unchanged under Python 2.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = [
        "rFile=",
        "chr=",
        "delim=",
        "missingval=",
        "BoundaryStart=",
        "removeOutliers=",
        "addConstant=",
        "logTransform",
        "BoundaryEnd=",
        "phenotypeFileType=",
        "help",
        "parallel=",
        "parallelAll",
        "LRT",
        "minMAF=",
        "kinshipDatafile=",
        "phenotypeRanks",
        "onlyMissing",
        "onlyOriginal96",
        "onlyOriginal192",
        "onlyBelowLatidue=",
        "complement",
        "negate",
        "srInput=",
        "sr",
        "srOutput=",
        "srPar=",
        "srSkipFirstRun",
        "testRobustness",
        "permutationFilter=",
        "useLinearRegress",
        "regressionCofactors=",
        "FriLerAsCofactor",
        "FriColAsCofactor",
        "memReq=",
        "walltimeReq=",
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:h",
                                   long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    # ---- Option defaults -------------------------------------------------
    phenotypeRanks = False
    removeOutliers = None
    addConstant = -1  # -1 means "do not add a constant"
    phenotypeFileType = 1
    rFile = None
    delim = ","
    missingVal = "NA"
    helpShown = False
    minMAF = 0.0
    boundaries = [-1, -1]
    chromosome = None
    parallel = None
    logTransform = False
    negate = False
    parallelAll = False
    lrt = False
    kinshipDatafile = None
    onlyMissing = False
    onlyOriginal96 = False
    onlyOriginal192 = False
    onlyBelowLatidue = None
    complement = False

    sr = False
    srOutput = False
    srInput = False
    srSkipFirstRun = False
    srTopQuantile = 0.95
    srWindowSize = 30000

    testRobustness = False
    permutationFilter = 0.002

    useLinearRegress = False
    regressionCofactors = None
    FriLerAsCofactor = False
    FriColAsCofactor = False

    memReq = "5g"
    walltimeReq = "150:00:00"

    # ---- Option parsing --------------------------------------------------
    # Single long options are compared with ``==``.  The previous
    # ``opt in ("--minMAF")`` form tested against a plain string, i.e. a
    # substring check, so "-m" was handled as --minMAF and "-c" as
    # --complement.
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            helpShown = True
            print(__doc__)
        elif opt in ("-o", "--rFile"):
            rFile = arg
        elif opt == "--phenotypeFileType":
            phenotypeFileType = int(arg)
        elif opt == "--BoundaryStart":
            boundaries[0] = int(arg)
        elif opt == "--BoundaryEnd":
            boundaries[1] = int(arg)
        elif opt == "--addConstant":
            addConstant = float(arg)
        elif opt == "--parallel":
            parallel = arg
        elif opt == "--minMAF":
            minMAF = float(arg)
        elif opt == "--parallelAll":
            parallelAll = True
        elif opt == "--onlyMissing":
            onlyMissing = True
        elif opt == "--onlyOriginal96":
            onlyOriginal96 = True
        elif opt == "--onlyOriginal192":
            onlyOriginal192 = True
        elif opt == "--onlyBelowLatidue":
            onlyBelowLatidue = float(arg)
        elif opt == "--complement":
            complement = True
        elif opt == "--logTransform":
            logTransform = True
        elif opt == "--negate":
            negate = True
        elif opt == "--removeOutliers":
            removeOutliers = float(arg)
        elif opt == "--LRT":
            lrt = True
        elif opt in ("-c", "--chr"):
            chromosome = int(arg)
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt == "--kinshipDatafile":
            kinshipDatafile = arg
        elif opt == "--phenotypeRanks":
            phenotypeRanks = True
        elif opt == "--sr":
            sr = True
        elif opt == "--srSkipFirstRun":
            srSkipFirstRun = True
        elif opt == "--srInput":
            srInput = arg
        elif opt == "--srOutput":
            srOutput = arg
        elif opt == "--srPar":
            vals = arg.split(",")
            srTopQuantile = float(vals[0])
            srWindowSize = int(vals[1])
        elif opt == "--testRobustness":
            testRobustness = True
        elif opt == "--permutationFilter":
            permutationFilter = float(arg)
        elif opt == "--FriLerAsCofactor":
            FriLerAsCofactor = True
        elif opt == "--FriColAsCofactor":
            FriColAsCofactor = True
        elif opt == "--useLinearRegress":
            useLinearRegress = True
        elif opt == "--regressionCofactors":
            regressionCofactors = arg
        elif opt == "--memReq":
            memReq = arg
        elif opt == "--walltimeReq":
            walltimeReq = arg
        else:
            if not helpShown:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    # Two file arguments are always needed; the phenotype index is needed
    # unless every phenotype is submitted (--parallel with --parallelAll).
    # Previously --parallel skipped this check and failed with IndexError.
    requiredArgs = 3
    if parallel and parallelAll:
        requiredArgs = 2
    if len(args) < requiredArgs:
        if not helpShown:
            print("Arguments are missing!!\n")
            print(__doc__)
        sys.exit(2)

    print("Emma is being set up with the following parameters:")
    settings = [
        ("output", rFile),
        ("phenotypeRanks", phenotypeRanks),
        ("phenotypeFileType", phenotypeFileType),
        ("parallel", parallel),
        ("parallelAll", parallelAll),
        ("minMAF", minMAF),
        ("LRT", lrt),
        ("delim", delim),
        ("missingval", missingVal),
        ("kinshipDatafile", kinshipDatafile),
        ("chr", chromosome),
        ("boundaries", boundaries),
        ("onlyMissing", onlyMissing),
        ("onlyOriginal96", onlyOriginal96),
        ("onlyOriginal192", onlyOriginal192),
        ("onlyBelowLatidue", onlyBelowLatidue),
        ("complement", complement),
        ("negate", negate),
        ("logTransform", logTransform),
        ("addConstant", addConstant),
        ("removeOutliers", removeOutliers),
        ("sr", sr),
        ("srSkipFirstRun", srSkipFirstRun),
        ("srInput", srInput),
        ("srOutput", srOutput),
        ("srTopQuantile", srTopQuantile),
        ("srWindowSize", srWindowSize),
        ("testRobustness", testRobustness),
        ("permutationFilter", permutationFilter),
        ("useLinearRegress", useLinearRegress),
        ("regressionCofactors", regressionCofactors),
        ("FriLerAsCofactor", FriLerAsCofactor),
        ("FriColAsCofactor", FriColAsCofactor),
        ("walltimeReq", walltimeReq),
        ("memReq", memReq),
    ]
    for label, value in settings:
        print("%s: %s" % (label, value))

    def runParallel(phenotypeIndex, phed):
        """Write the PBS script for one phenotype and submit it with qsub."""
        print(phenotypeIndex)
        phenName = phed.getPhenotypeName(phenotypeIndex)
        emmaPrefix = resultDir + "Emma_" + parallel + "_" + phenName
        if useLinearRegress:
            outFileName = resultDir + "LR_" + parallel + "_" + phenName
        else:
            outFileName = emmaPrefix

        # Cluster specific parameters
        script = [
            "#!/bin/csh\n",
            "#PBS -l walltime=" + walltimeReq + "\n",
            "#PBS -l mem=" + memReq + "\n",
            "#PBS -q cmb\n",
            "#PBS -N E" + phenName + "_" + parallel + "\n",
            "set phenotypeName=" + parallel + "\n",
            "set phenotype=" + str(phenotypeIndex) + "\n",
            "(python " + emmadir + "Emma.py -o " + outFileName + " ",
        ]
        if useLinearRegress:
            script.append(" --useLinearRegress ")
        if regressionCofactors:
            script.append(" --regressionCofactors=" + str(regressionCofactors) + " ")
        if FriLerAsCofactor:
            script.append(" --FriLerAsCofactor ")
        if FriColAsCofactor:
            script.append(" --FriColAsCofactor ")
        if onlyOriginal96:
            script.append(" --onlyOriginal96 ")
        elif onlyOriginal192:
            script.append(" --onlyOriginal192 ")
        if onlyBelowLatidue:
            script.append(" --onlyBelowLatidue=" + str(onlyBelowLatidue) + " ")
        if logTransform:
            script.append(" --logTransform ")
        if negate:
            script.append(" --negate ")
        if removeOutliers:
            script.append(" --removeOutliers=" + str(removeOutliers) + " ")
        if phenotypeRanks:
            script.append(" --phenotypeRanks ")
        if testRobustness:
            script.append(" --testRobustness ")

        script.append(" --permutationFilter=" + str(permutationFilter) + " ")

        if sr:
            script.append(" --sr ")
            # Use the user supplied paths when given; previously a supplied
            # --srOutput left the path variable unbound and --srInput was
            # never forwarded.
            if srOutput:
                srOutPath = srOutput
            else:
                srOutPath = emmaPrefix + ".sr.pvals"
            script.append(" --srOutput=" + str(srOutPath) + " ")
            if srSkipFirstRun:
                if srInput:
                    srInPath = srInput
                else:
                    srInPath = emmaPrefix + ".pvals"
                script.append(" --srInput=" + str(srInPath) + " ")
                script.append(" --srSkipFirstRun ")
            script.append(" --srPar=" + str(srTopQuantile) + "," +
                          str(srWindowSize) + " ")

        if kinshipDatafile:
            script.append(" --kinshipDatafile=" + str(kinshipDatafile) + " ")
        script.append(" --addConstant=" + str(addConstant) + " ")
        script.append(snpsDataFile + " " + phenotypeDataFile + " " +
                      str(phenotypeIndex) + " ")
        script.append("> " + outFileName + "_job" + ".out) >& " +
                      outFileName + "_job" + ".err\n")

        scriptFile = open(parallel + ".sh", 'w')
        try:
            scriptFile.write("".join(script))
        finally:
            scriptFile.close()

        # Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    def hasResults(path):
        """Report on `path`; return True only if it exists and is non-empty."""
        try:
            size = os.stat(path).st_size
        except OSError:
            print("File %s does not exist." % path)
            return False
        if size > 0:
            print("File %s already exists, and is non-empty." % path)
            return True
        return False

    def matchAccessions(genotypeAccessions, phed, phenotype):
        """Pair genotype accessions with phenotype rows that have a value.

        Returns (genotypeIndices, phenotypeIndices), two parallel lists.  For
        every genotype accession the first phenotype row with the same
        accession and a value other than 'NA' is used.
        """
        wanted = set(genotypeAccessions)
        firstValid = {}
        for j, acc in enumerate(phed.accessions):
            if (acc in wanted and acc not in firstValid
                    and phed.phenotypeValues[j][phenotype] != 'NA'):
                firstValid[acc] = j
        genotypeIndices = []
        phenotypeIndices = []
        for i, acc in enumerate(genotypeAccessions):
            if acc in firstValid:
                genotypeIndices.append(i)
                phenotypeIndices.append(firstValid[acc])
        return genotypeIndices, phenotypeIndices

    def keepOnlyEcotypes(phed, ecotypes):
        """Filter phed to `ecotypes` (or to everything else if --complement)."""
        ecotypes = [str(e) for e in ecotypes]
        if complement:
            excluded = set(ecotypes)
            keepEcotypes = [acc for acc in phed.accessions
                            if acc not in excluded]
        else:
            keepEcotypes = ecotypes
        phed.filterAccessions(keepEcotypes)
        print("len(phed.accessions) %d" % len(phed.accessions))

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]

    # ---- Cluster mode ----------------------------------------------------
    if parallel:  # Running on the cluster..
        phed = phenotypeData.readPhenotypeFile(
            phenotypeDataFile, delimiter='\t')  # Get Phenotype data
        if not parallelAll:
            runParallel(int(args[2]), phed)
            return
        for phenotypeIndex in phed.phenIds:
            if not onlyMissing:
                runParallel(phenotypeIndex, phed)
                continue
            phenName = phed.getPhenotypeName(phenotypeIndex)
            resultPrefix = resultDir + "Emma_" + parallel + "_" + phenName
            if not hasResults(resultPrefix + ".pvals"):
                print("Setting up the run.")
                runParallel(phenotypeIndex, phed)
            elif sr:
                # The first run is done; resubmit only if the second-run
                # result is missing.  A separate path is used here so the
                # --srInput option read by runParallel is not overwritten.
                if not hasResults(resultPrefix + ".sr.pvals"):
                    runParallel(phenotypeIndex, phed)
        return

    # ---- Local mode ------------------------------------------------------
    phenotypeIndex = int(args[2])

    print("phenotypeIndex: %s" % (phenotypeIndex,))
    print("\nStarting program now!\n")

    snpsds = dataParsers.parseCSVData(snpsDataFile,
                                      format=1,
                                      deliminator=delim,
                                      missingVal=missingVal)

    # Load phenotype file
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile,
                                           delimiter='\t')
    numAcc = len(snpsds[0].accessions)

    # Removing outliers
    if removeOutliers:
        print("Remoing outliers")
        phed.naOutliers(phenotypeIndex, removeOutliers)

    # If onlyOriginal96, then remove all other accessions.
    if onlyOriginal96:
        print("Filtering for the first 96 accessions")
        keepOnlyEcotypes(phed, phenotypeData._getFirst96Ecotypes_())

    if onlyOriginal192:
        print("Filtering for the first 192 accessions")
        keepOnlyEcotypes(phed, phenotypeData._getFirst192Ecotypes_())

    if onlyBelowLatidue:
        print("Filtering for the accessions which orginate below latitude %s"
              % (onlyBelowLatidue,))
        eiDict = phenotypeData._getEcotypeIdInfoDict_()
        print(eiDict)
        keepEcotypes = []
        for acc in phed.accessions:
            acc = int(acc)
            if acc not in eiDict:
                continue
            # Field 2 is compared against the latitude threshold; accessions
            # where it is None are kept.
            latitude = eiDict[acc][2]
            if latitude is None or (latitude and latitude < onlyBelowLatidue):
                keepEcotypes.append(str(acc))

        phed.filterAccessions(keepEcotypes)
        print("len(phed.accessions) %d" % len(phed.accessions))
    sys.stdout.write("Finished prefiltering phenotype accessions.\n")
    sys.stdout.flush()

    phenotype = phed.getPhenIndex(phenotypeIndex)

    # Checking which accessions to keep and which to remove.
    accIndicesToKeep, phenAccIndicesToKeep = matchAccessions(
        snpsds[0].accessions, phed, phenotype)

    print("\nFiltering accessions in genotype data:")
    # Filter accessions which do not have the phenotype value (from the
    # genotype data).
    # NOTE(review): removeAccessionIndices is given the indices to *keep*;
    # presumably it drops everything else - confirm against snpsdata.
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print("")
    print("%s accessions removed from genotype data, leaving %s accessions in all."
          % (numAcc - len(accIndicesToKeep), len(accIndicesToKeep)))

    print("\nNow filtering accessions in phenotype data:")
    # Removing accessions that don't have genotypes or phenotype values
    phed.removeAccessions(phenAccIndicesToKeep)

    sameCount = len(phed.accessions) == len(snpsds[0].accessions)
    print("Verifying number of accessions: len(phed.accessions)==len(snpsds[0].accessions) is %s"
          % (sameCount,))
    if not sameCount:
        raise Exception(
            "Genotype and phenotype data have different numbers of accessions.")

    # Filtering monomorphic
    print("Filtering monomorphic SNPs")
    for snpsd in snpsds:
        print("Removed " + str(snpsd.filterMonoMorphicSnps()) + " Snps")

    # Remove minor allele frequencies
    if minMAF != 0:
        sys.stdout.write("Filterting SNPs with MAF<" + str(minMAF) + ".")
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.filterMinMAF(minMAF)

    # Removing SNPs which are outside of boundaries.
    if chromosome:
        print("\nRemoving SNPs which are outside of boundaries.")
        snpsds[chromosome - 1].filterRegion(boundaries[0], boundaries[1])
        snpsds = [snpsds[chromosome - 1]]

    # Ordering accessions in genotype data to fit phenotype data.
    print("Ordering genotype data accessions.")
    accessionMapping = []
    i = 0
    genotypeAccessions = snpsds[0].accessions
    for acc in phed.accessions:
        if acc in genotypeAccessions:
            accessionMapping.append((genotypeAccessions.index(acc), i))
            i += 1

    print("len(snpsds[0].snps) %d" % len(snpsds[0].snps))

    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.orderAccessions(accessionMapping)
    print("\nGenotype data has been ordered.")

    # Converting format to 01
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))
    print("")

    print("Checking kinshipfile: %s" % (kinshipDatafile,))

    if kinshipDatafile:  # Is there a special kinship file?
        kinshipSnpsds = dataParsers.parseCSVData(kinshipDatafile,
                                                 format=1,
                                                 deliminator=delim,
                                                 missingVal=missingVal)
        # Counted before filtering so the "removed" message below refers to
        # the kinship data (it used the genotype count before).
        numKinshipAcc = len(kinshipSnpsds[0].accessions)

        # Checking which accessions to keep and which to remove.
        sys.stdout.write(
            "Removing accessions which do not have a phenotype value for " +
            phed.phenotypeNames[phenotype] + ".")
        sys.stdout.flush()
        accIndicesToKeep = matchAccessions(
            kinshipSnpsds[0].accessions, phed, phenotype)[0]
        print(accIndicesToKeep)

        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.removeAccessionIndices(accIndicesToKeep)
        print("")
        print("%s accessions removed from kinship genotype data, leaving %s accessions in all."
              % (numKinshipAcc - len(accIndicesToKeep), len(accIndicesToKeep)))

        print("Ordering kinship data accessions.")
        accessionMapping = []
        i = 0
        kinshipAccessions = kinshipSnpsds[0].accessions
        for acc in snpsds[0].accessions:
            if acc in kinshipAccessions:
                accessionMapping.append((kinshipAccessions.index(acc), i))
                i += 1

        print(list(zip(accessionMapping, snpsds[0].accessions)))
        print("len(snpsds[0].snps) %d" % len(snpsds[0].snps))

        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.orderAccessions(accessionMapping)
        print("Kinship genotype data has been ordered.")

        newKinshipSnpsds = []
        sys.stdout.write("Converting data format")
        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            # This data might have NAs
            newKinshipSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))
        print("")
        kinshipSnpsds = newKinshipSnpsds
    else:
        kinshipSnpsds = newSnpsds

    print("Found kinship data.")

    # Negating phenotypic values
    if negate:
        phed.negateValues(phenotypeIndex)

    # A log transform needs positive values: force the automatic constant
    # (addConstant == 0) when the phenotype has non-positive values.
    if logTransform and not phed.isBinary(
            phenotypeIndex) and phed.getMinValue(phenotypeIndex) <= 0:
        addConstant = 0

    # Adding a constant.
    if addConstant != -1:
        if addConstant == 0:
            # Shift so that the minimum becomes a tenth of the std. deviation.
            addConstant = math.sqrt(phed.getVariance(phenotypeIndex)) / 10
            addConstant = addConstant - phed.getMinValue(phenotypeIndex)

        print("Adding a constant to phenotype: %s" % (addConstant,))
        phed.addConstant(phenotypeIndex, addConstant)

    # Log-transforming
    if logTransform:
        print("Log transforming phenotype")
        phed.logTransform(phenotypeIndex)
    # Converting phenotypes to Ranks
    elif phenotypeRanks:
        phed.transformToRanks(phenotypeIndex)

    if not chromosome:
        snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
        kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,
                                                  [1, 2, 3, 4, 5])
    else:
        snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [chromosome])
        kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds, [chromosome])

    phenotypeName = phed.getPhenotypeName(phenotypeIndex)

    sys.stdout.flush()

    if testRobustness:
        print("Starting a robustness test")
        allSNPs = []
        for snpsd in snpsDataset.snpsDataList:
            allSNPs += snpsd.snps
        phenVals = phed.getPhenVals(phenotypeIndex)
        _robustness_test_(allSNPs, phenVals, rFile, filter=permutationFilter)
        sys.exit(0)

    if useLinearRegress:
        import gc
        import rpy
        phenVals = phed.getPhenVals(phenotypeIndex)
        # d0: response + cofactors + (per SNP) the SNP itself.
        # dh: response + cofactors only.
        d0 = {"phen": phenVals}
        dh = {"phen": phenVals}
        if regressionCofactors:
            import pickle
            # NOTE(review): pickle.load can execute arbitrary code; only
            # pass cofactor files from a trusted source.
            cofactorFile = open(regressionCofactors, "rb")
            try:
                co_factors = pickle.load(cofactorFile)
            finally:
                cofactorFile.close()
            # Inserting cofactors into both models (this used to assign to
            # an undefined name and raised NameError).
            for factor in co_factors:
                d0[factor] = co_factors[factor]
                dh[factor] = co_factors[factor]
        import analyzeHaplotype as ah
        (ler_factor, col_factor) = ah.getLerAndColAccessions(newSnpsds, True)
        if FriColAsCofactor:
            d0["col"] = col_factor
            dh["col"] = col_factor
        if FriLerAsCofactor:
            d0["ler"] = ler_factor
            dh["ler"] = ler_factor

        # Rows of [chromosome, position, p-value, maf, minor allele count].
        results = []
        sys.stdout.write("Applying the linear model")
        sys.stdout.flush()
        for i, snpsd in enumerate(newSnpsds):
            # With --chr only one data set is left, so label its rows with
            # the selected chromosome rather than with its list position.
            if chromosome:
                chromosomeLabel = chromosome
            else:
                chromosomeLabel = i + 1
            sys.stdout.write("|")
            sys.stdout.flush()
            # Calling garbage collector, in an attempt to clean up memory..
            gc.collect()
            for j, snp in enumerate(snpsd.snps):
                if j % 5000 == 0:
                    sys.stdout.write(".")
                    sys.stdout.flush()
                d0["snp"] = snp
                try:
                    rpy.set_default_mode(rpy.NO_CONVERSION)
                    # ``rpy.r`` is used explicitly; a bare ``r`` is not
                    # defined in this function and its NameError would be
                    # swallowed by the except clause below.
                    aov0 = rpy.r.aov(rpy.r("phen ~ ."), data=d0)
                    aovh = rpy.r.aov(rpy.r("phen ~ ."), data=dh)
                    rpy.set_default_mode(rpy.BASIC_CONVERSION)
                    s0 = rpy.r.summary(aov0)
                    sh = rpy.r.summary(aovh)
                    rss_0 = s0['Sum Sq'][-1]
                    # 'Sum Sq' is either a sequence (last entry is used) or
                    # a single float.
                    sumSqH = sh['Sum Sq']
                    if isinstance(sumSqH, float):
                        rss_h = sumSqH
                    else:
                        rss_h = sumSqH[-1]
                    fStat = (rss_h - rss_0) / (
                        rss_0 / (len(phenVals) - len(d0) + 1))
                    pval = rpy.r.pf(fStat, 1, len(phenVals), lower_tail=False)
                except Exception:
                    # Best effort: a failing SNP is reported with p = 1.
                    print("Calculating p-value failed")
                    pval = 1.0
                mafc = min(snp.count(snp[0]), len(snp) - snp.count(snp[0]))
                maf = mafc / float(len(snp))
                results.append(
                    [chromosomeLabel, snpsd.positions[j], pval, maf, mafc])
        sys.stdout.write("\n")

        # Write out to a result file
        sys.stdout.write("Writing results to file\n")
        sys.stdout.flush()
        pvalFile = rFile + ".pvals"
        outFile = open(pvalFile, "w")
        try:
            outFile.write("Chromosome,position,p-value,marf,maf\n")
            for row in results:
                outFile.write(",".join([str(value) for value in row]) + "\n")
        finally:
            outFile.close()

        # Plot results
        print("Generating a GW plot.")
        res = gwaResults.Result(pvalFile,
                                name="LM_" + phenotypeName,
                                phenotypeID=phenotypeIndex)
        res.negLogTransform()
        pngFile = pvalFile + ".png"
        plotResults.plotResult(res,
                               pngFile=pngFile,
                               percentile=90,
                               type="pvals",
                               ylab="$-$log$_{10}(p)$",
                               plotBonferroni=True,
                               usePylab=False)
Example #3
0
def _run_():
	if len(sys.argv)==1:
		print __doc__
		sys.exit(2)
	
	long_options_list=["outputFile=", "delim=", "missingval=", "phenotypeFileType=", 
					"help", "parallel=", "parallelAll", "addToDB", 
					"callMethodID=", "comment=", "onlyOriginal192","onlyOriginal96", "subSample=" , 
					"subSampleLikePhenotype=", "subsampleTest=", "complement", "onlyBelowLatidue=", 
					"onlyAboveLatidue=", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun",
					"permTest=", "savePermutations", "permutationFilter=", "testRobustness",
					"memReq=","walltimeReq=",]
	try:
		opts, args=getopt.getopt(sys.argv[1:], "o:c:d:m:h", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	
	phenotypeFileType=1
	outputFile=None
	delim=","
	missingVal="NA"
	help=0
	parallel=None
	parallelAll=False
	addToDB=False
	callMethodID=None
	comment=""
	subSample=None
	onlyOriginal96=False
	onlyOriginal192 = False
	subSampleLikePhenotype = None
	subsampleTest = False
	numSubSamples = None
	complement = False
	onlyBelowLatidue = None
	onlyAboveLatidue = None

	sr = False
	srOutput = False
	srInput = False
	srSkipFirstRun = False
	srTopQuantile = 0.95
	srWindowSize = 30000
	
	permTest = None
	savePermutations = False
	permutationFilter = 1.0
	
	testRobustness = False

	memReq = "5g"
	walltimeReq = "100:00:00"

	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help=1
			print __doc__
		elif opt in ("-o", "--outputFile"):
			outputFile=arg
		elif opt in ("--phenotypeFileType"):
			phenotypeFileType=int(arg)
		elif opt in ("--parallel"):
			parallel=arg
		elif opt in ("--parallelAll"):
			parallelAll=True
		elif opt in ("--addToDB"):
			addToDB=True
  		elif opt in ("--onlyOriginal96"):
			onlyOriginal96=True
  		elif opt in ("--onlyOriginal192"):
			onlyOriginal192=True
		elif opt in ("--complement"):
			complement=True
		elif opt in ("--subSample"):
			subSample=int(arg)
		elif opt in ("--subsampleTest"):
			subsampleTest = True
			l = arg.split(",")
			subSample=int(l[0])
			numSubSamples=int(l[1])
		elif opt in ("--onlyBelowLatidue"):
			onlyBelowLatidue=float(arg)
		elif opt in ("--onlyAboveLatidue"):
			onlyAboveLatidue=float(arg)
		elif opt in ("--subSampleLikePhenotype"):
			subSampleLikePhenotype=int(arg)
		elif opt in ("--callMethodID"):
			callMethodID=int(arg)
		elif opt in ("--comment"):
			comment=arg
		elif opt in ("-d", "--delim"):
			delim=arg
		elif opt in ("-m", "--missingval"):
			missingVal=arg
		elif opt in ("--sr"):
			sr = True
		elif opt in ("--testRobustness"):
			testRobustness = True
		elif opt in ("--permTest"):
			permTest = int(arg)
		elif opt in ("--savePermutations"):
			savePermutations = True
		elif opt in ("--permutationFilter"):
			permutationFilter = float(arg)
		elif opt in ("--srSkipFirstRun"):
			srSkipFirstRun = True
		elif opt in ("--srInput"):
			srInput = arg
		elif opt in ("--srOutput"):
			srOutput = arg
		elif opt in ("--srPar"):
			vals = arg.split(",")
			srTopQuantile = float(vals[0]) 
			srWindowSize = int(vals[1]) 
		elif opt in ("--memReq"):
			memReq=arg
		elif opt in ("--walltimeReq"):
			walltimeReq=arg
		else:
			if help==0:
				print "Unkown option!!\n"
				print __doc__
			sys.exit(2)

	if len(args)<3 and not parallel:
		if help==0:
			print "Arguments are missing!!\n"
			print __doc__
		sys.exit(2)

	snpsDataFile=args[0]
	phenotypeDataFile=args[1]

	print "Kruskal-Wallis is being set up with the following parameters:"
	print "phenotypeDataFile:",phenotypeDataFile
	print "snpsDataFile:",snpsDataFile
	print "parallel:",parallel
	print "parallelAll:",parallelAll
	print "onlyOriginal96:",onlyOriginal96
	print "onlyOriginal192:",onlyOriginal192
	print "onlyBelowLatidue:",onlyBelowLatidue
	print "onlyAboveLatidue:",onlyAboveLatidue
	print "complement:",complement
	print "subSampleLikePhenotype:",subSampleLikePhenotype
	print "subsampleTest:",subsampleTest
	print "numSubSamples:",numSubSamples
	print "subSample:",subSample
	print "sr:",sr
	print "srSkipFirstRun:",srSkipFirstRun
	print "srInput:",srInput
	print "srOutput:",srOutput
	print "srTopQuantile:",srTopQuantile
	print "srWindowSize:",srWindowSize
	print "permTest:",permTest
	print "savePermutations:",savePermutations
	print "permutationFilter:",permutationFilter
	print "testRobustness:",testRobustness
	print "walltimeReq:",walltimeReq
	print "memReq:",memReq

	def runParallel(phenotypeIndex,id=""):
		#Cluster specific parameters
		phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
		phenName=phed.getPhenotypeName(phenotypeIndex)
		print phenName
		outputFile=resultDir+"KW_"+parallel+"_"+phenName+id

		shstr = "#!/bin/csh\n"
		shstr += "#PBS -l walltime="+walltimeReq+"\n"
		shstr += "#PBS -l mem="+memReq+"\n"
		shstr +="#PBS -q cmb\n"
		
		shstr+="#PBS -N K"+phenName+"_"+parallel+"\n"
		shstr+="set phenotypeName="+parallel+"\n"
		shstr+="set phenotype="+str(phenotypeIndex)+"\n"
		shstr+="(python "+scriptDir+"KW.py -o "+outputFile+" "
		if subSample:
			shstr+=" --subSample="+str(subSample)+" "			
		elif onlyOriginal96:
			shstr+=" --onlyOriginal96 "			
		elif onlyOriginal192:
			shstr+=" --onlyOriginal192 "
		if onlyBelowLatidue:
			shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" "
		elif onlyAboveLatidue:
			shstr+=" --onlyAboveLatidue="+str(onlyAboveLatidue)+" "
		if complement: 			
			shstr+=" --complement "
		if permTest:
			shstr+=" --permTest="+str(permTest)+" "
			if savePermutations:
				shstr+=" --savePermutations "
		
		shstr+=" --permutationFilter="+str(permutationFilter)+" "
		if testRobustness:
			shstr+=" --testRobustness "
			
		if sr:
			shstr += " --sr "			
			if not srOutput:
				output = resultDir+"KW_"+parallel+"_"+phenName+".sr.pvals"				
			shstr += " --srOutput="+str(output)+" "
			if srSkipFirstRun:
				if not srInput:
					output = resultDir+"KW_"+parallel+"_"+phenName+".pvals"
				shstr += " --srInput="+str(output)+" "
				shstr += " --srSkipFirstRun "				
			shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" "


		shstr+=snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
		shstr+="> "+outputFile+"_job"+".out) >& "+outputFile+"_job"+".err\n"

		f=open(parallel+".sh", 'w')
		f.write(shstr)
		f.close()

		#Execute qsub script
		os.system("qsub "+parallel+".sh ")

	if parallel:  #Running on the cluster..
		if parallelAll:
			phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
			for phenotypeIndex in phed.phenIds:
				runParallel(phenotypeIndex)
		elif subsampleTest:
			phenotypeIndex=int(args[2])
			for i in range(0,numSubSamples):
				runParallel(phenotypeIndex,id="_r"+str(subSample)+"_"+str(i))
		else:
			phenotypeIndex=int(args[2])
			runParallel(phenotypeIndex)
		return
	else:
		phenotypeIndex=int(args[2])


	print "phenotypeIndex:",phenotypeIndex
	print "output:",outputFile
	print "\nStarting program now!\n"


	#Load phenotype file
	phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
	
	#If onlyOriginal96, then remove all other phenotypes..
	if onlyOriginal96: 
		print "Filtering for the first 96 accessions"
		original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
		original_96_ecotypes = map(str,original_96_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_96_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_96_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	if onlyOriginal192: 
		print "Filtering for the first 192 accessions"
		original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
		original_192_ecotypes = map(str,original_192_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_192_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_192_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
	
	if onlyBelowLatidue:
		print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)

	elif onlyAboveLatidue:
		print "Filtering for the accessions which orginate above latitude",onlyAboveLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]>onlyAboveLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)

	
	if subSampleLikePhenotype:
		p_name = phed.getPhenotypeName(subSampleLikePhenotype)
		print "Picking sample as in",p_name
		ecotypes = phed.getNonNAEcotypes(subSampleLikePhenotype)
		print ecotypes
		phed.filterAccessions(ecotypes)
		print "len(phed.accessions)", len(phed.accessions)


	if subSample: 
		sample_ecotypes = []
		ecotypes = phed.getNonNAEcotypes(phenotypeIndex)
		sample_ecotypes = random.sample(ecotypes,subSample)			
		phed.filterAccessions(sample_ecotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	sys.stdout.write("Finished prefiltering phenotype accessions.\n")
	sys.stdout.flush()
	
	
	
	#Load genotype file
	snpsds=dataParsers.parseCSVData(snpsDataFile, format = 1, deliminator = delim, missingVal = missingVal)


	#Checking overlap between phenotype and genotype accessions. 
	phenotype=phed.getPhenIndex(phenotypeIndex)
	accIndicesToKeep=[]			
	phenAccIndicesToKeep=[]
	numAcc=len(snpsds[0].accessions)
	sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
	sys.stdout.flush()
	for i in range(0, len(snpsds[0].accessions)):
		acc1=snpsds[0].accessions[i]
		for j in range(0, len(phed.accessions)):
			acc2=phed.accessions[j]
			if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
				accIndicesToKeep.append(i)
				phenAccIndicesToKeep.append(j)
				break	


	#Filter accessions which do not have the phenotype value.
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.removeAccessionIndices(accIndicesToKeep)
	print ""
	print numAcc-len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."
		
	print "Filtering phenotype data."
	phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values
	
	#Ordering accessions according to the order of accessions in the genotype file
	accessionMapping=[]
	i=0
	for acc in snpsds[0].accessions:
		if acc in phed.accessions:
			accessionMapping.append((phed.accessions.index(acc), i))
			i+=1
	phed.orderAccessions(accessionMapping)

		#Filtering monomorphic
	print "Filtering monomorphic SNPs"
	for snpsd in snpsds:
		print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

	#Converting format to 01
	newSnpsds=[]
	sys.stdout.write("Converting data format")
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		newSnpsds.append(snpsd.getSnpsData())
	print ""
	
	#Double check genotype file:
	problems = 0
	for i in range(0,len(newSnpsds)):
		snpsd = newSnpsds[i]
		for j in range(0,len(snpsd.snps)):
			snp = snpsd.snps[j]
			sc = snp.count(0)
			if sc==0 or sc==len(snp):
				print "Problem in file found at chr,pos",(i+1),",",snpsd.positions[i]
				problems += 1
	if problems >0:
		print "Genotype file appears to have potential problems"
	else:
		print "Genotype file appears to be good"

	if permTest:
		print "Starting a permutation test"
		allSNPs = []
		for snpsd in newSnpsds:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		test_type = "KW"
		if phed.isBinary(phenotypeIndex):
			test_type = "Fisher"
			permTest = 100	
		_perm_test_(allSNPs,phenVals,permTest,outputFile, test_type=test_type,savePermutations=savePermutations, filter=permutationFilter)
		sys.exit(0)
	
	if testRobustness:
		print "Starting a robustness test"
		allSNPs = []
		for snpsd in newSnpsds:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		test_type = "KW"
		if phed.isBinary(phenotypeIndex):
			test_type = "Fisher"
		_robustness_test_(allSNPs,phenVals,outputFile, test_type=test_type, filter=permutationFilter)
		sys.exit(0)
		

	sys.stdout.flush()
	print "sr:",sr, ", srSkipFirstRun:",srSkipFirstRun
	if (not sr) or (sr and not srSkipFirstRun):
		#Writing files
		#phed and phenotype
		sd=snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
		phenotypeName=phed.getPhenotypeName(phenotypeIndex)
		
		if phed.isBinary(phenotypeIndex):
			pvals = run_fet(sd.getSnps(),phed.getPhenVals(phenotypeIndex))	
		else:
			snps = sd.getSnps()
			phen_vals = phed.getPhenVals(phenotypeIndex)
			try:
				kw_res = util.kruskal_wallis(snps,phen_vals)
				pvals = kw_res['ps']
			except:
				print snps
				print phen_vals
				print len(snps),len(snps[0]),len(phen_vals)
				raise Exception
							
		res = gwaResults.Result(scores = pvals,name="KW_"+phenotypeName, snpsds=newSnpsds, load_snps=False)
		pvalFile=outputFile+".pvals"
		res.writeToFile(pvalFile)

		print "Generating a GW plot."
		res.negLogTransform()
		pngFile = pvalFile+".png"
		plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False)	
		srInput = pvalFile
		
	else:
		print "Skipping first stage analysis."
		sys.stdout.flush()

	if sr:
		_secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,binary=binary)
		print "Generating second run GW plot."
		res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.negLogTransform()
		srRes = gwaResults.Result(srOutput,name="KW_SR_"+phenotypeName, phenotypeID=phenotypeIndex)
		srRes.negLogTransform()
		srPngFile = pvalFile+".sr.png"
		plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)	
Example #4
0
def _run_():
	if len(sys.argv) == 1:
		print __doc__
		sys.exit(2)
	
	long_options_list = ["rFile=","chr=", "delim=", "missingval=", "withArrayId=", "BoundaryStart=", "removeOutliers=", "addConstant=",
						"logTransform", "BoundaryEnd=", "phenotypeFileType=", "help", "parallel=", "parallelAll", "LRT", "minMAF=", 
						"kinshipDatafile=", "phenotypeRanks", "onlyMissing","onlyOriginal96", "onlyOriginal192", "onlyBelowLatidue=", 
						"complement", "negate", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun", "testRobustness",
						"permutationFilter="]
	try:
		opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:a:h", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	phenotypeRanks = False
	removeOutliers = None
	addConstant = -1
	phenotypeFileType = 1
	rFile = None
	delim = ","
	missingVal = "NA"
	help = 0
	minMAF=0.0
	withArrayIds = 1
	boundaries = [-1,-1]
	chr=None
	parallel = None
	logTransform = False
	negate = False
	parallelAll = False
	lrt = False
	kinshipDatafile = None 
	onlyMissing = False
	onlyOriginal96 = False
	onlyOriginal192 = False
	onlyBelowLatidue = None
	complement = False

	sr = False
	srOutput = False
	srInput = False
	srSkipFirstRun = False
	srTopQuantile = 0.95
	srWindowSize = 30000
	
	testRobustness = False
	permutationFilter = 0.002

	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help = 1
			print __doc__
		elif opt in ("-a","--withArrayId"):
			withArrayIds = int(arg)
		elif opt in ("-o","--rFile"):
			rFile = arg
		elif opt in ("--phenotypeFileType"):
			phenotypeFileType = int(arg)
		elif opt in ("--BoundaryStart"):
			boundaries[0] = int(arg)
		elif opt in ("--BoundaryEnd"):
			boundaries[1] = int(arg)
		elif opt in ("--addConstant"):
			addConstant = float(arg)
		elif opt in ("--parallel"):
			parallel = arg
		elif opt in ("--minMAF"):
			minMAF = float(arg)
		elif opt in ("--parallelAll"):
			parallelAll = True
		elif opt in ("--onlyMissing"):
			onlyMissing = True
		elif opt in ("--onlyOriginal96"):
			onlyOriginal96 = True
		elif opt in ("--onlyOriginal192"):
			onlyOriginal192 = True
		elif opt in ("--onlyBelowLatidue"):
			onlyBelowLatidue = float(arg)
		elif opt in ("--complement"):
			complement = True
		elif opt in ("--logTransform"):
			logTransform = True
		elif opt in ("--negate"):
			negate = True
		elif opt in ("--removeOutliers"):
			removeOutliers = float(arg)
		elif opt in ("--LRT"):
			lrt = True
		elif opt in ("-c","--chr"):
			chr = int(arg)
		elif opt in ("-d","--delim"):
			delim = arg
		elif opt in ("-m","--missingval"):
			missingVal = arg
		elif opt in ("--kinshipDatafile"):
			kinshipDatafile = arg
		elif opt in ("--phenotypeRanks"):
			phenotypeRanks = True
		elif opt in ("--sr"):
			sr = True
		elif opt in ("--srSkipFirstRun"):
			srSkipFirstRun = True
		elif opt in ("--srInput"):
			srInput = arg
		elif opt in ("--srOutput"):
			srOutput = arg
		elif opt in ("--srPar"):
			vals = arg.split(",")
			srTopQuantile = float(vals[0]) 
			srWindowSize = int(vals[1]) 
		elif opt in ("--testRobustness"):
			testRobustness = True
		elif opt in ("--permutationFilter"):
			permutationFilter = float(arg)
		else:
			if help==0:
				print "Unkown option!!\n"
				print __doc__
			sys.exit(2)

	if len(args)<3 and not parallel:
		if help==0:
			print "Arguments are missing!!\n"
			print __doc__
		sys.exit(2)

	print "Emma is being set up with the following parameters:"
	print "output:",rFile
	print "phenotypeRanks:",phenotypeRanks
	print "withArrayId:",withArrayIds
	print "phenotypeFileType:",phenotypeFileType
	print "parallel:",parallel
	print "parallelAll:",parallelAll
	print "minMAF:",minMAF
	print "LRT:",lrt
	print "delim:",delim
	print "missingval:",missingVal
	print "kinshipDatafile:",kinshipDatafile
	print "chr:",chr
	print "boundaries:",boundaries
	print "onlyMissing:",onlyMissing
	print "onlyOriginal96:",onlyOriginal96
	print "onlyOriginal192:",onlyOriginal192
	print "onlyBelowLatidue:",onlyBelowLatidue
	print "complement:",complement
	print "negate:",negate
	print "logTransform:",logTransform
	print "addConstant:",addConstant
	print "removeOutliers:",removeOutliers
	print "sr:",sr
	print "srSkipFirstRun:",srSkipFirstRun
	print "srInput:",srInput
	print "srOutput:",srOutput
	print "srTopQuantile:",srTopQuantile
	print "srWindowSize:",srWindowSize
	print "testRobustness:",testRobustness
	print "permutationFilter:",permutationFilter


	def runParallel(phenotypeIndex,phed):
		#Cluster specific parameters
		print phenotypeIndex
		phenName = phed.getPhenotypeName(phenotypeIndex)
		outFileName = resultDir+"Emma_"+parallel+"_"+phenName

		shstr = """#!/bin/csh
#PBS -l walltime=100:00:00
#PBS -l mem=8g 
#PBS -q cmb
"""

		shstr += "#PBS -N E"+phenName+"_"+parallel+"\n"
		shstr += "set phenotypeName="+parallel+"\n"
		shstr += "set phenotype="+str(phenotypeIndex)+"\n"
		shstr += "(python "+emmadir+"Emma.py -o "+outFileName+" "
		if onlyOriginal96:
			shstr+=" --onlyOriginal96 "			
		elif onlyOriginal192:
			shstr+=" --onlyOriginal192 "
		if onlyBelowLatidue:
			shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" "
		if logTransform:
			shstr += " --logTransform "
		if negate:
			shstr += " --negate "
		if removeOutliers:
			shstr += " --removeOutliers="+str(removeOutliers)+" "
		if phenotypeRanks:
			shstr += " --phenotypeRanks "
		if testRobustness:
			shstr+=" --testRobustness "

		shstr+=" --permutationFilter="+str(permutationFilter)+" "

		if sr:
			shstr += " --sr "			
			if not srOutput:
				output = resultDir+"Emma_"+parallel+"_"+phenName+".sr.pvals"				
			shstr += " --srOutput="+str(output)+" "
			if srSkipFirstRun:
				if not srInput:
					output = resultDir+"Emma_"+parallel+"_"+phenName+".pvals"
				shstr += " --srInput="+str(output)+" "
				shstr += " --srSkipFirstRun "				
			shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" "
			
		shstr += " -a "+str(withArrayIds)+" "			
		if kinshipDatafile:
			shstr += " --kinshipDatafile="+str(kinshipDatafile)+" "			
		shstr += " --addConstant="+str(addConstant)+" "			
		shstr += snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
		shstr += "> "+outFileName+"_job"+".out) >& "+outFileName+"_job"+".err\n"

		f = open(parallel+".sh",'w')
		f.write(shstr)
		f.close()

		#Execute qsub script
		os.system("qsub "+parallel+".sh ")

	snpsDataFile = args[0]
	phenotypeDataFile = args[1]
	if parallel:  #Running on the cluster..
		phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data 
		if parallelAll:
			for phenotypeIndex in phed.phenIds:
				if onlyMissing:
					phenName = phed.getPhenotypeName(phenotypeIndex)
					pvalFile = resultDir+"Emma_"+parallel+"_"+phenName+".pvals"
					res = None
					try:
						res = os.stat(pvalFile)

					except Exception:
						print "File",pvalFile,"does not exist."
					if res and res.st_size>0:
						print "File",pvalFile,"already exists, and is non-empty."
						if sr:
							srInput = resultDir+"Emma_"+parallel+"_"+phenName+".sr.pvals"
							srRes = None
							try:
								srRes = os.stat(srInput)
							except Exception:
								print "File",srInput,"does not exist."
							if srRes and srRes.st_size>0:
								print "File",srInput,"already exists, and is non-empty."
							else:
								runParallel(phenotypeIndex,phed)
							
					else:
						print "Setting up the run."
						runParallel(phenotypeIndex,phed)
											
				else:
					runParallel(phenotypeIndex,phed)
		else:
			phenotypeIndex = int(args[2])
			runParallel(phenotypeIndex,phed)
		return
	else:
		phenotypeIndex = int(args[2])


	print "phenotypeIndex:",phenotypeIndex
	print "\nStarting program now!\n"



	snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds)

	#Load phenotype file
	phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data 
	numAcc = len(snpsds[0].accessions)

	#Removing outliers
	if removeOutliers:
		print "Remoing outliers"
		phed.naOutliers(phenotypeIndex,removeOutliers)
	
	#If onlyOriginal96, then remove all other phenotypes..
	if onlyOriginal96: 
		print "Filtering for the first 96 accessions"
		original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
		original_96_ecotypes = map(str,original_96_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_96_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_96_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	if onlyOriginal192: 
		print "Filtering for the first 192 accessions"
		original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
		original_192_ecotypes = map(str,original_192_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_192_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_192_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
	
	if onlyBelowLatidue:
		print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
	sys.stdout.write("Finished prefiltering phenotype accessions.\n")
	sys.stdout.flush()

	phenotype = phed.getPhenIndex(phenotypeIndex)

	accIndicesToKeep = []			
	phenAccIndicesToKeep = []
	#Checking which accessions to keep and which to remove .
	for i in range(0,len(snpsds[0].accessions)):
		acc1 = snpsds[0].accessions[i]
		for j in range(0,len(phed.accessions)):
			acc2 = phed.accessions[j]
			if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
				accIndicesToKeep.append(i)
				phenAccIndicesToKeep.append(j)
				break	

	print "\nFiltering accessions in genotype data:"
	#Filter accessions which do not have the phenotype value (from the genotype data).
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.removeAccessionIndices(accIndicesToKeep)
	print ""
	print numAcc-len(accIndicesToKeep),"accessions removed from genotype data, leaving",len(accIndicesToKeep),"accessions in all."
		

	print "\nNow filtering accessions in phenotype data:"
	phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values

	print "Verifying number of accessions: len(phed.accessions)==len(snpsds[0].accessions) is",len(phed.accessions)==len(snpsds[0].accessions)
	if len(phed.accessions)!=len(snpsds[0].accessions):
		raise Exception

	#Filtering monomorphic
	print "Filtering monomorphic SNPs"
	for snpsd in snpsds:
		print "Removed", str(snpsd.filterMonoMorphicSnps()),"Snps"

	#Remove minor allele frequencies
	if minMAF!=0:
		sys.stdout.write("Filterting SNPs with MAF<"+str(minMAF)+".")
		for snpsd in snpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			snpsd.filterMinMAF(minMAF)

	#Removing SNPs which are outside of boundaries.
	if chr:
		print "\nRemoving SNPs which are outside of boundaries."
		snpsds[chr-1].filterRegion(boundaries[0],boundaries[1])
		snpsds = [snpsds[chr-1]]
	
	#Ordering accessions in genotype data to fit phenotype data.
	print "Ordering genotype data accessions."
	accessionMapping = []
	i = 0
	for acc in phed.accessions:
		if acc in snpsds[0].accessions:
			accessionMapping.append((snpsds[0].accessions.index(acc),i))
			i += 1

	#print zip(accessionMapping,snpsds[0].accessions)
	print "len(snpsds[0].snps)",len(snpsds[0].snps)

	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.orderAccessions(accessionMapping)
	print "\nGenotype data has been ordered."
		
	#Converting format to 01
	newSnpsds = []
	sys.stdout.write("Converting data format")
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		newSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))
	print ""


	
	print "Checking kinshipfile:",kinshipDatafile
	
	if kinshipDatafile:  #Is there a special kinship file?
		kinshipSnpsds = dataParsers.parseCSVData(kinshipDatafile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds)

		accIndicesToKeep = []			
		#Checking which accessions to keep and which to remove (genotype data).
		sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
		sys.stdout.flush()
		for i in range(0,len(kinshipSnpsds[0].accessions)):
			acc1 = kinshipSnpsds[0].accessions[i]
			for j in range(0,len(phed.accessions)):
				acc2 = phed.accessions[j]
				if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
					accIndicesToKeep.append(i)
					break	
		print accIndicesToKeep
	
		for snpsd in kinshipSnpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			snpsd.removeAccessionIndices(accIndicesToKeep)
		print ""
		print numAcc-len(accIndicesToKeep),"accessions removed from kinship genotype data, leaving",len(accIndicesToKeep),"accessions in all."
	
		print "Ordering kinship data accessions."
		accessionMapping = []
		i = 0
		for acc in snpsds[0].accessions:
			if acc in kinshipSnpsds[0].accessions:
				accessionMapping.append((kinshipSnpsds[0].accessions.index(acc),i))
				i += 1

		print zip(accessionMapping,snpsds[0].accessions)
		print "len(snpsds[0].snps)",len(snpsds[0].snps)
		
		for snpsd in kinshipSnpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			snpsd.orderAccessions(accessionMapping)
		print "Kinship genotype data has been ordered."

		newKinshipSnpsds = []
		sys.stdout.write("Converting data format")
		for snpsd in kinshipSnpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			newKinshipSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))  #This data might have NAs
		print ""
		kinshipSnpsds = newKinshipSnpsds

	else:
		kinshipSnpsds = newSnpsds
		

	print "Found kinship data."

	#Ordering accessions according to the order of accessions in the genotype file
#	accessionMapping = []
#	i = 0
#	for acc in snpsds[0].accessions:
#		if acc in phed.accessions:
#			accessionMapping.append((phed.accessions.index(acc),i))
#			i += 1
#	phed.orderAccessions(accessionMapping)

	
	#Negating phenotypic values
	if negate: 
		phed.negateValues(phenotypeIndex)

	#Adding a constant.
	if addConstant!=-1:
		if addConstant==0:
			addConstant = math.sqrt(phed.getVariance(phenotypeIndex))/10
			addConstant = addConstant - phed.getMinValue(phenotypeIndex)
			
		print "Adding a constant to phenotype:",addConstant
		phed.addConstant(phenotypeIndex,addConstant)
	
		
	
	#Log-transforming
	if logTransform:
		print "Log transforming phenotype"
		phed.logTransform(phenotypeIndex)
	#Converting phenotypes to Ranks
	elif phenotypeRanks:
		phed.transformToRanks(phenotypeIndex)
	
	if not chr:
		snpsDataset = snpsdata.SNPsDataSet(newSnpsds,[1,2,3,4,5])
		kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,[1,2,3,4,5])
	else:
		snpsDataset = snpsdata.SNPsDataSet(newSnpsds,[chr])
		kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,[chr])
		
	
	phenotypeName = phed.getPhenotypeName(phenotypeIndex)

	sys.stdout.flush()
	
	if testRobustness:
		print "Starting a robustness test"
		allSNPs = []
		for snpsd in snpsDataset.snpsDataList:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		_robustness_test_(allSNPs,phenVals,rFile,filter=permutationFilter)
		sys.exit(0)

	if (not sr) or (sr and not srSkipFirstRun):
		sys.stdout.write("Running Primary Emma.\n")
		sys.stdout.flush()
		pvalFile = _runEmmaScript_(snpsDataset, kinshipSnpsDataset, phed, phenotypeIndex, rFile, chr=chr, delim=delim, missingVal=missingVal, boundaries=boundaries, lrt=lrt)
		res = gwaResults.Result(pvalFile,name="EMMA_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.filterMAF()
		res.negLogTransform()
		pngFile = pvalFile+".png"
		plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False)	
		srInput = pvalFile

	if sr:
		_secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,kinshipSnpsDataset)
		print "Generating second run GW plot."
		res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.filterMAF()
		res.negLogTransform()
		srRes = gwaResults.Result(srOutput,name="EMMA_SR_"+phenotypeName, phenotypeID=phenotypeIndex)
		srRes.filterMAF()
		srRes.negLogTransform()
		srPngFile = pvalFile+".sr.png"
		plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)	
Example #5
0
def _run_():
    """Filter accessions (and arrays) out of a SNP data file and write the rest.

    Command line: ``[options] -o OUTPUT_FILE INPUT_FILE``.

    Every filter that is switched on appends to a common list of accessions
    (and, when the input file carries array ids, arrays) to remove:
    first 96 / first 192 ecotypes, Ler / Col accessions, error rate against a
    comparison file (``--maxError``), duplicated accessions
    (``--removeIdentical``), accessions absent from the comparison file
    (``--onlyCommon``), missing-data rate (``--maxMissing``) and explicitly
    listed ecotype / array ids.  The filtered data are then written to
    OUTPUT_FILE.

    ``-a`` / ``--withArrayId``: a value of 1 or 2 makes the input file be
    parsed with array ids, a value of 2 or 3 does the same for the
    comparison file.

    Exits with status 2 on any usage error (no arguments, unknown option,
    missing output or input file).
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = [
        "maxError=", "comparisonFile=", "maxMissing=", "removeEcotypeId=",
        "removeArrayId=", "first96", "removeIdentical", "onlyCommon", "delim=",
        "missingval=", "withArrayId=", "debug", "report", "help",
        "heterozygous2NA", "first192", "removeLer", "removeCol"
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:bh",
                                   long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    output_fname = None
    delim = ","
    missingVal = "NA"
    comparisonFile = None
    maxMissing = 1.0
    maxError = 1.0
    removeEcotypes = None
    removeArray = None
    removeIdentical = False
    onlyCommon = False
    helpShown = False
    withArrayIds = 1
    first96 = False
    first192 = False
    heterozygous2NA = False
    removeLer = False
    removeCol = False

    # Options are compared by equality / tuple membership.  The previous
    # ``opt in ("--name")`` form was a substring test against a plain string,
    # which made "-m" match "--maxError" instead of "--missingval".
    for opt, arg in opts:
        if opt == "-o":
            output_fname = arg
        elif opt in ("-h", "--help"):
            helpShown = True
            print(__doc__)
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt == "--comparisonFile":
            comparisonFile = arg
        elif opt == "--maxError":
            maxError = float(arg)
        elif opt == "--maxMissing":
            maxMissing = float(arg)
        elif opt == "--heterozygous2NA":
            heterozygous2NA = True
        elif opt == "--removeEcotypeId":
            removeEcotypes = [int(e) for e in arg.split(",")]
        elif opt == "--removeArrayId":
            removeArray = int(arg)
        elif opt == "--removeIdentical":
            removeIdentical = True
        elif opt == "--onlyCommon":
            onlyCommon = True
        elif opt == "--first96":
            first96 = True
        elif opt == "--first192":
            first192 = True
        elif opt == "--removeLer":
            removeLer = True
        elif opt == "--removeCol":
            removeCol = True
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-b", "--debug", "-r", "--report"):
            # Still accepted, but nothing in this function reads these flags.
            pass
        else:
            if not helpShown:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    if not output_fname:
        if not helpShown:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    # The input file used to be read from args[0] before any validation,
    # which raised IndexError when only options (e.g. --help) were given.
    if not args:
        if not helpShown:
            print("Input file missing!!\n")
            print(__doc__)
        sys.exit(2)
    inputFile = args[0]

    waid1 = withArrayIds == 1 or withArrayIds == 2  # input file has array ids
    waid2 = withArrayIds == 2 or withArrayIds == 3  # comparison file has them

    import dataParsers
    snpsds = dataParsers.parseCSVData(inputFile,
                                      format=1,
                                      deliminator=delim,
                                      missingVal=missingVal,
                                      withArrayIds=waid1)

    accessionsToRemove = []
    arraysToRemove = None  # stays None until a filter collects array ids

    if first96:
        # NOTE(review): database credentials are hard-coded here; they should
        # come from configuration.  Left unchanged to preserve behaviour.
        d = dataParsers.getEcotypeToAccessionDictionary(defaultValue='-1',
                                                        user="******",
                                                        passwd="bamboo123")
        ecotd = dataParsers.getEcotypeToNameDictionary(defaultValue='-1',
                                                       user="******",
                                                       passwd="bamboo123")
        print("Dictionaries loaded")
        names = []
        first96Names = []
        for i in range(0, len(snpsds[0].accessions)):
            ecotype = snpsds[0].accessions[i]
            arrayID = snpsds[0].arrayIds[i]
            names.append((arrayID, ecotd[ecotype], ecotype))
            accessionNr = int(d[ecotype][0])
            # NOTE(review): the upper bound is 97 although the option is
            # called "first96" -- confirm against the accession numbering.
            if accessionNr > 97 or accessionNr < 0:
                accessionsToRemove.append(ecotype)
            else:
                first96Names.append(
                    (arrayID, d[ecotype][1], d[ecotype][0], ecotype))

        first96Names.sort()
        print("First 96 accessions, len: %s :" % len(first96Names))
        for name in first96Names:
            print(name)
        names.sort()
        print("All accessions:")
        for name in names:
            print(name)
    elif first192:
        import phenotypeData
        ecotypes_192 = [str(e) for e in phenotypeData._getFirst192Ecotypes_()]
        print("%s %s" % (ecotypes_192, snpsds[0].accessions))
        wanted = set(ecotypes_192)
        for acc in snpsds[0].accessions:
            if acc not in wanted:
                accessionsToRemove.append(acc)
        print('found %s "192" ecotypes... removing %s ecotypes.' %
              (len(ecotypes_192), len(accessionsToRemove)))

    if removeLer or removeCol:
        import analyzeHaplotype as ah
        lerAndCol = ah.getLerAndColAccessions(snpsds)
        if removeLer:
            accessionsToRemove += lerAndCol[0]
        if removeCol:
            accessionsToRemove += lerAndCol[1]

    # Retrieve comparison list of accessions.  (Error rates for accessions)
    # Entries are (errorRate, ecotype, array) when the input file has array
    # ids and (errorRate, ecotype) otherwise.
    accErrAndID = []
    if comparisonFile and (removeIdentical or maxError < 1.0):
        sys.stderr.write("Loading comparison file:")
        snpsds2 = dataParsers.parseCSVData(comparisonFile,
                                           format=1,
                                           deliminator=delim,
                                           missingVal=missingVal,
                                           withArrayIds=waid2)
        res = []
        sys.stderr.write("Comparing accessions.")
        for i in range(0, len(snpsds)):
            res.append(snpsds[i].compareWith(snpsds2[i],
                                             withArrayIds=withArrayIds,
                                             verbose=False,
                                             heterozygous2NA=heterozygous2NA))
            sys.stderr.write(".")
        sys.stderr.write("\n")

        # Combine the per-dataset error rates (field 3) into one rate per
        # accession, weighting by the per-accession counts in field 6.
        numCompared = len(res[0][2])
        totalAccessionCounts = [0] * numCompared
        accErrorRate = [0] * numCompared
        for r in res:
            for j in range(0, len(r[2])):
                totalAccessionCounts[j] += r[6][j]
                accErrorRate[j] += r[3][j] * float(r[6][j])
        for i in range(0, numCompared):
            accErrorRate[i] = accErrorRate[i] / float(totalAccessionCounts[i])

        # Accession (field 2) and array (field 5) ids are taken from the
        # result of the last compared dataset.
        last = res[-1]
        for i in range(0, len(last[2])):
            if waid1:
                accErrAndID.append((accErrorRate[i], last[2][i], last[5][i]))
            else:
                accErrAndID.append((accErrorRate[i], last[2][i]))
        accErrAndID.sort(reverse=True)

    # Figure out which accessions are too erroneous.  The tuple width is
    # decided by waid1 above, so the same flag is used here (a plain
    # truthiness test on withArrayIds mis-unpacked the tuples for value 3).
    if maxError < 1.0 and comparisonFile:
        if waid1 and arraysToRemove is None:
            arraysToRemove = []
        for entry in accErrAndID:
            if entry[0] > maxError:
                accessionsToRemove.append(entry[1])
                if waid1:
                    arraysToRemove.append(entry[2])

    if removeIdentical and comparisonFile and waid1:
        print("Locating identical accessions")
        accErrAndID.sort()  # lowest error first: that replicate is kept
        if arraysToRemove is None:
            arraysToRemove = []
        occurrences = {}
        for accession in snpsds[0].accessions:
            occurrences[accession] = occurrences.get(accession, 0) + 1
        alreadyKept = set()
        for (error, ecotype, array) in accErrAndID:
            if occurrences.get(ecotype, 0) > 1:
                if ecotype in alreadyKept:
                    accessionsToRemove.append(ecotype)
                    arraysToRemove.append(array)
                else:
                    alreadyKept.add(ecotype)

    if onlyCommon and comparisonFile:
        print("Locating accessions which are not shared")
        snpsds2 = dataParsers.parseCSVData(comparisonFile,
                                           format=1,
                                           deliminator=delim,
                                           missingVal=missingVal,
                                           withArrayIds=waid2)
        if arraysToRemove is None:
            arraysToRemove = []
        sharedAccessions = set(snpsds2[0].accessions)
        for i in range(0, len(snpsds[0].accessions)):
            acc = snpsds[0].accessions[i]
            if acc not in sharedAccessions:
                accessionsToRemove.append(acc)
                if waid1:
                    arraysToRemove.append(snpsds[0].arrayIds[i])

    if maxMissing < 1.0:
        numInputAccessions = len(snpsds[0].accessions)
        missingCounts = [0] * numInputAccessions
        numSnps = 0
        for snpsd in snpsds:
            mc = snpsd.accessionsMissingCounts()
            numSnps += len(snpsd.positions)
            for i in range(0, numInputAccessions):
                missingCounts[i] += mc[i]

        # Do not reset arraysToRemove here: arrays collected by the filters
        # above must survive (the list used to be emptied at this point).
        if waid1 and arraysToRemove is None:
            arraysToRemove = []
        missingRates = []
        for i in range(0, numInputAccessions):
            entry = (missingCounts[i] / float(numSnps),
                     snpsds[0].accessions[i])
            if waid1:
                entry += (snpsds[0].arrayIds[i],)
            missingRates.append(entry)
        missingRates.sort(reverse=True)
        for entry in missingRates:
            if entry[0] > maxMissing:
                accessionsToRemove.append(entry[1])
                if waid1:
                    arraysToRemove.append(entry[2])

    if removeEcotypes:
        for removeEcotype in removeEcotypes:
            accessionsToRemove.append(str(removeEcotype))
        print("Removing %s accessions." % len(accessionsToRemove))
    if removeArray is not None:  # an array id of 0 is a valid request
        if arraysToRemove is None:
            arraysToRemove = []
        arraysToRemove.append(str(removeArray))
        print("Removing %s  arrays." % len(arraysToRemove))

    numAccessions = len(snpsds[0].accessions)
    sys.stderr.write("Removing accessions.")
    for snpsd in snpsds:
        snpsd.removeAccessions(accessionsToRemove, arrayIds=arraysToRemove)
        sys.stderr.write(".")
    print("\n%s accessions out of %s were removed." %
          (numAccessions - len(snpsds[0].accessions), numAccessions))

    import snpsdata
    snpsdata.writeRawSnpsDatasToFile(output_fname,
                                     snpsds,
                                     chromosomes=[1, 2, 3, 4, 5],
                                     deliminator=delim,
                                     missingVal=missingVal,
                                     withArrayIds=waid1)
Example #6
0
def _run_():
	"""Set up and run a Kruskal-Wallis scan for one phenotype, or submit it to the cluster.

	Command line: ``[options] -o OUTPUT_FILE SNPS_FILE PHENOTYPE_FILE [PHENOTYPE_INDEX]``.

	With ``--parallel=NAME`` a csh/PBS job script is written to ``NAME.sh`` and
	submitted with ``qsub`` (one job per phenotype with ``--parallelAll``, one
	job per sub-sample with ``--subsampleTest=SIZE,COUNT``); the function then
	returns.  Otherwise the phenotype accessions are pre-filtered as requested,
	matched against the genotype file, and one of the following is run:
	a permutation test (``--permTest``), a robustness test
	(``--testRobustness``), or the first-stage scan through a generated R
	script, optionally followed by a second run (``--sr``).

	Exits with status 2 on usage errors; the permutation and robustness tests
	end with ``sys.exit(0)``.
	"""
	if len(sys.argv)==1:
		print(__doc__)
		sys.exit(2)

	long_options_list=["outputFile=", "delim=", "missingval=", "withArrayId=", "phenotypeFileType=",
					"help", "parallel=", "parallelAll", "addToDB",
					"callMethodID=", "comment=", "onlyOriginal192","onlyOriginal96", "subSample=" ,
					"subSampleLikePhenotype=", "subsampleTest=", "complement", "onlyBelowLatidue=",
					"onlyAboveLatidue=", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun",
					"permTest=", "savePermutations", "permutationFilter=", "testRobustness"]
	try:
		opts, args=getopt.getopt(sys.argv[1:], "o:c:d:m:a:h", long_options_list)
	except getopt.GetoptError:
		traceback.print_exc()
		print(sys.exc_info())
		print(__doc__)
		sys.exit(2)

	# These two defaults used to be indented into the except-block above
	# (after sys.exit), so outputFile was undefined unless -o was given.
	phenotypeFileType=1
	outputFile=None
	delim=","
	missingVal="NA"
	helpShown=False
	withArrayIds=1
	parallel=None
	parallelAll=False
	addToDB=False
	callMethodID=None
	comment=""
	subSample=None
	onlyOriginal96=False
	onlyOriginal192=False
	subSampleLikePhenotype=None
	subsampleTest=False
	numSubSamples=None
	complement=False
	onlyBelowLatidue=None
	onlyAboveLatidue=None

	sr=False
	srOutput=False
	srInput=False
	srSkipFirstRun=False
	srTopQuantile=0.95
	srWindowSize=30000

	permTest=None
	savePermutations=False
	permutationFilter=1.0

	testRobustness=False

	# Options are compared by equality / tuple membership rather than by a
	# substring test against a plain string.  phenotypeFileType, addToDB,
	# callMethodID and comment are parsed but not used further down.
	for opt, arg in opts:
		if opt in ("-h", "--help"):
			helpShown=True
			print(__doc__)
		elif opt in ("-a", "--withArrayId"):
			withArrayIds=int(arg)
		elif opt in ("-o", "--outputFile"):
			outputFile=arg
		elif opt=="--phenotypeFileType":
			phenotypeFileType=int(arg)
		elif opt=="--parallel":
			parallel=arg
		elif opt=="--parallelAll":
			parallelAll=True
		elif opt=="--addToDB":
			addToDB=True
		elif opt=="--onlyOriginal96":
			onlyOriginal96=True
		elif opt=="--onlyOriginal192":
			onlyOriginal192=True
		elif opt in ("-c", "--complement"):
			# NOTE(review): "-c ARG" only ever reached this branch because
			# "-c" is a substring of "--complement"; its argument is ignored.
			# Kept for backward compatibility -- confirm what -c should mean.
			complement=True
		elif opt=="--subSample":
			subSample=int(arg)
		elif opt=="--subsampleTest":
			subsampleTest=True
			l=arg.split(",")
			subSample=int(l[0])
			numSubSamples=int(l[1])
		elif opt=="--onlyBelowLatidue":
			onlyBelowLatidue=float(arg)
		elif opt=="--onlyAboveLatidue":
			onlyAboveLatidue=float(arg)
		elif opt=="--subSampleLikePhenotype":
			subSampleLikePhenotype=int(arg)
		elif opt=="--callMethodID":
			callMethodID=int(arg)
		elif opt=="--comment":
			comment=arg
		elif opt in ("-d", "--delim"):
			delim=arg
		elif opt in ("-m", "--missingval"):
			missingVal=arg
		elif opt=="--sr":
			sr=True
		elif opt=="--testRobustness":
			testRobustness=True
		elif opt=="--permTest":
			permTest=int(arg)
		elif opt=="--savePermutations":
			savePermutations=True
		elif opt=="--permutationFilter":
			permutationFilter=float(arg)
		elif opt=="--srSkipFirstRun":
			srSkipFirstRun=True
		elif opt=="--srInput":
			srInput=arg
		elif opt=="--srOutput":
			srOutput=arg
		elif opt=="--srPar":
			vals=arg.split(",")
			srTopQuantile=float(vals[0])
			srWindowSize=int(vals[1])
		else:
			if not helpShown:
				print("Unkown option!!\n")
				print(__doc__)
			sys.exit(2)

	# Only "--parallel ... --parallelAll" can do without a phenotype index;
	# every other mode reads args[2] (this used to raise IndexError).
	if parallel and parallelAll:
		numRequiredArgs=2
	else:
		numRequiredArgs=3
	if len(args)<numRequiredArgs:
		if not helpShown:
			print("Arguments are missing!!\n")
			print(__doc__)
		sys.exit(2)

	snpsDataFile=args[0]
	phenotypeDataFile=args[1]

	print("Kruskal-Wallis is being set up with the following parameters:")
	for label, value in [
			("phenotypeDataFile", phenotypeDataFile),
			("snpsDataFile", snpsDataFile),
			("parallel", parallel),
			("parallelAll", parallelAll),
			("onlyOriginal96", onlyOriginal96),
			("onlyOriginal192", onlyOriginal192),
			("onlyBelowLatidue", onlyBelowLatidue),
			("onlyAboveLatidue", onlyAboveLatidue),
			("subSampleLikePhenotype", subSampleLikePhenotype),
			("subsampleTest", subsampleTest),
			("numSubSamples", numSubSamples),
			("subSample", subSample),
			("sr", sr),
			("srSkipFirstRun", srSkipFirstRun),
			("srInput", srInput),
			("srOutput", srOutput),
			("srTopQuantile", srTopQuantile),
			("srWindowSize", srWindowSize),
			("permTest", permTest),
			("savePermutations", savePermutations),
			("permutationFilter", permutationFilter),
			("testRobustness", testRobustness)]:
		print("%s: %s" % (label, value))


	def runParallel(phenotypeIndex,id=""):
		"""Write the PBS job script for one phenotype to <parallel>.sh and qsub it."""
		#Cluster specific parameters
		phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data
		phenName=phed.getPhenotypeName(phenotypeIndex)
		phenName=phenName.replace("/", "_div_")
		phenName=phenName.replace("*", "_star_")
		resultPrefix=resultDir+"KW_"+parallel+"_"+phenName
		jobOutput=resultPrefix+id

		script=[
			"#!/bin/csh\n",
			"#PBS -l walltime=100:00:00\n",
			"#PBS -l mem=4g \n",
			"#PBS -q cmb\n",
			"#PBS -N K"+phenName+"_"+parallel+"\n",
			"set phenotypeName="+parallel+"\n",
			"set phenotype="+str(phenotypeIndex)+"\n",
			"(python "+scriptDir+"KW.py -o "+jobOutput+" ",
			" -a "+str(withArrayIds)+" ",
		]
		if subSample:
			script.append(" --subSample="+str(subSample)+" ")
		elif onlyOriginal96:
			script.append(" --onlyOriginal96 ")
		elif onlyOriginal192:
			script.append(" --onlyOriginal192 ")
		if onlyBelowLatidue is not None:
			script.append(" --onlyBelowLatidue="+str(onlyBelowLatidue)+" ")
		elif onlyAboveLatidue is not None:
			script.append(" --onlyAboveLatidue="+str(onlyAboveLatidue)+" ")
		if complement:
			script.append(" --complement ")
		if permTest:
			script.append(" --permTest="+str(permTest)+" ")
			if savePermutations:
				script.append(" --savePermutations ")

		script.append(" --permutationFilter="+str(permutationFilter)+" ")
		if testRobustness:
			script.append(" --testRobustness ")

		if sr:
			script.append(" --sr ")
			# A user-supplied --srOutput / --srInput is forwarded as given;
			# only a missing one falls back to the derived file name.
			# (Previously a supplied --srOutput left the variable unbound and
			# a supplied --srInput was replaced by the srOutput value.)
			jobSrOutput=srOutput or (resultPrefix+".sr.pvals")
			script.append(" --srOutput="+str(jobSrOutput)+" ")
			if srSkipFirstRun:
				jobSrInput=srInput or (resultPrefix+".pvals")
				script.append(" --srInput="+str(jobSrInput)+" ")
				script.append(" --srSkipFirstRun ")
			script.append(" --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" ")

		script.append(snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" ")
		script.append("> "+jobOutput+"_job"+".out) >& "+jobOutput+"_job"+".err\n")

		f=open(parallel+".sh", 'w')
		try:
			f.write("".join(script))
		finally:
			f.close()

		#Execute qsub script
		os.system("qsub "+parallel+".sh ")

	if parallel:  #Running on the cluster..
		if parallelAll:
			phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data
			for phenotypeIndex in phed.phenIds:
				runParallel(phenotypeIndex)
		elif subsampleTest:
			phenotypeIndex=int(args[2])
			for i in range(0,numSubSamples):
				runParallel(phenotypeIndex,id="_r"+str(subSample)+"_"+str(i))
		else:
			phenotypeIndex=int(args[2])
			runParallel(phenotypeIndex)
		return

	phenotypeIndex=int(args[2])

	print("phenotypeIndex: %s" % phenotypeIndex)
	print("output: %s" % outputFile)
	print("\nStarting program now!\n")


	#Load phenotype file
	phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data

	def reportNumAccessions():
		print("len(phed.accessions) %s" % len(phed.accessions))

	def keepOriginalEcotypes(originalEcotypes):
		"""Keep only the given ecotypes, or all the others when --complement is set."""
		originalEcotypes=[str(e) for e in originalEcotypes]
		if complement:
			excluded=set(originalEcotypes)
			keepEcotypes=[acc for acc in phed.accessions if acc not in excluded]
		else:
			keepEcotypes=originalEcotypes
		phed.filterAccessions(keepEcotypes)
		reportNumAccessions()

	def filterByLatitude(isWanted):
		"""Keep accessions whose latitude satisfies isWanted, plus those whose latitude is None."""
		eiDict=phenotypeData._getEcotypeIdInfoDict_()
		print(eiDict)
		keepEcotypes=[]
		for acc in phed.accessions:
			acc=int(acc)
			if acc not in eiDict:
				continue
			latitude=eiDict[acc][2]
			# As before, a falsy latitude that is not None is dropped.
			if latitude is None or (latitude and isWanted(latitude)):
				keepEcotypes.append(str(acc))
		phed.filterAccessions(keepEcotypes)
		reportNumAccessions()

	#If onlyOriginal96, then remove all other phenotypes..
	if onlyOriginal96:
		print("Filtering for the first 96 accessions")
		keepOriginalEcotypes(phenotypeData._getFirst96Ecotypes_())

	if onlyOriginal192:
		print("Filtering for the first 192 accessions")
		keepOriginalEcotypes(phenotypeData._getFirst192Ecotypes_())

	# "is not None" so that a threshold of 0.0 is honoured as well.
	if onlyBelowLatidue is not None:
		print("Filtering for the accessions which orginate below latitude %s" % (onlyBelowLatidue,))
		filterByLatitude(lambda latitude: latitude<onlyBelowLatidue)
	elif onlyAboveLatidue is not None:
		print("Filtering for the accessions which orginate above latitude %s" % (onlyAboveLatidue,))
		filterByLatitude(lambda latitude: latitude>onlyAboveLatidue)

	if subSampleLikePhenotype is not None:
		p_name=phed.getPhenotypeName(subSampleLikePhenotype)
		print("Picking sample as in %s" % (p_name,))
		ecotypes=phed.getNonNAEcotypes(subSampleLikePhenotype)
		print(ecotypes)
		phed.filterAccessions(ecotypes)
		reportNumAccessions()

	if subSample:
		ecotypes=phed.getNonNAEcotypes(phenotypeIndex)
		sample_ecotypes=random.sample(ecotypes,subSample)
		phed.filterAccessions(sample_ecotypes)
		reportNumAccessions()

	sys.stdout.write("Finished prefiltering phenotype accessions.\n")
	sys.stdout.flush()

	#Load genotype file
	snpsds=dataParsers.parseCSVData(snpsDataFile, format = 1, deliminator = delim, missingVal = missingVal, withArrayIds = withArrayIds)

	#Checking overlap between phenotype and genotype accessions.
	phenotype=phed.getPhenIndex(phenotypeIndex)
	accIndicesToKeep=[]
	phenAccIndicesToKeep=[]
	numAcc=len(snpsds[0].accessions)
	sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
	sys.stdout.flush()
	# Phenotype rows per accession, in file order, so each genotype accession
	# is paired with its first row that has a value (one pass, no nested scan).
	phenRowsByAccession={}
	for j, acc in enumerate(phed.accessions):
		phenRowsByAccession.setdefault(acc, []).append(j)
	for i, acc in enumerate(snpsds[0].accessions):
		for j in phenRowsByAccession.get(acc, ()):
			if phed.phenotypeValues[j][phenotype]!='NA':
				accIndicesToKeep.append(i)
				phenAccIndicesToKeep.append(j)
				break

	#Filter accessions which do not have the phenotype value.
	# NOTE(review): both calls below are named "remove..." but are handed the
	# indices to keep -- confirm against snpsdata / phenotypeData.
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.removeAccessionIndices(accIndicesToKeep)
	print("")
	print("%s accessions removed, leaving %s accessions in all." % (numAcc-len(accIndicesToKeep), len(accIndicesToKeep)))

	print("Filtering phenotype data.")
	phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values

	#Ordering accessions according to the order of accessions in the genotype file
	firstPhenRow={}
	for j, acc in enumerate(phed.accessions):
		if acc not in firstPhenRow:
			firstPhenRow[acc]=j
	accessionMapping=[]
	i=0
	for acc in snpsds[0].accessions:
		if acc in firstPhenRow:
			accessionMapping.append((firstPhenRow[acc], i))
			i+=1
	phed.orderAccessions(accessionMapping)

	#Filtering monomorphic
	print("Filtering monomorphic SNPs")
	for snpsd in snpsds:
		print("Removed %s Snps" % (str(snpsd.filterMonoMorphicSnps()),))

	#Converting format to 01
	newSnpsds=[]
	sys.stdout.write("Converting data format")
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		newSnpsds.append(snpsd.getSnpsData())
	print("")

	#Double check genotype file:
	problems=0
	for i, snpsd in enumerate(newSnpsds):
		for j, snp in enumerate(snpsd.snps):
			sc=snp.count(0)
			if sc==0 or sc==len(snp):
				# The position is indexed by the SNP (j); it used to be
				# indexed by the dataset counter (i).
				print("Problem in file found at chr,pos %s , %s" % (i+1, snpsd.positions[j]))
				problems+=1
	if problems>0:
		print("Genotype file appears to have potential problems")
	else:
		print("Genotype file appears to be good")

	def collectTestInput():
		"""Return (all SNPs, phenotype values, test type) for the permutation / robustness tests."""
		allSNPs=[]
		for snpsd in newSnpsds:
			allSNPs+=snpsd.snps
		phenVals=phed.getPhenVals(phenotypeIndex)
		if phed.isBinary(phenotypeIndex):
			return allSNPs, phenVals, "Fisher"
		return allSNPs, phenVals, "KW"

	if permTest:
		print("Starting a permutation test")
		allSNPs, phenVals, test_type=collectTestInput()
		if test_type=="Fisher":
			permTest=100  # binary phenotypes always use 100 permutations
		_perm_test_(allSNPs,phenVals,permTest,outputFile, test_type=test_type,savePermutations=savePermutations, filter=permutationFilter)
		sys.exit(0)

	if testRobustness:
		print("Starting a robustness test")
		allSNPs, phenVals, test_type=collectTestInput()
		_robustness_test_(allSNPs,phenVals,outputFile, test_type=test_type, filter=permutationFilter)
		sys.exit(0)


	sys.stdout.flush()
	print("sr: %s , srSkipFirstRun: %s" % (sr, srSkipFirstRun))

	# Needed by both stages; these used to be set inside the first stage only,
	# so "--sr --srSkipFirstRun" failed with a NameError.
	phenotypeName=phed.getPhenotypeName(phenotypeIndex)
	binary=phed.isBinary(phenotypeIndex)

	runFirstStage=not (sr and srSkipFirstRun)
	if not outputFile and (runFirstStage or not (srInput and srOutput)):
		if not helpShown:
			print("Output file missing!!\n")
			print(__doc__)
		sys.exit(2)
	# Missing second-run file names default to the names runParallel derives.
	if sr and not srOutput:
		srOutput=outputFile+".sr.pvals"
	if not runFirstStage and not srInput:
		srInput=outputFile+".pvals"

	if runFirstStage:
		#Writing files
		if env.user=="bjarni":
			tempfile.tempdir='/tmp'
		# The two temporary files are not removed by this function.
		(fId, phenotypeTempFile)=tempfile.mkstemp()
		os.close(fId)
		(fId, genotypeTempFile)=tempfile.mkstemp()
		os.close(fId)

		phed.writeToFile(phenotypeTempFile, [phenotype])
		sys.stdout.write("Phenotype file written\n")
		sys.stdout.flush()
		snpsDataset=snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
		decoder={1:1, 0:0,-1:'NA'}
		snpsDataset.writeToFile(genotypeTempFile, deliminator = delim, missingVal = missingVal, withArrayIds = 0, decoder = decoder)
		sys.stdout.write("Genotype file written\n")
		sys.stdout.flush()

		rDataFile=outputFile+".rData"
		pvalFile=outputFile+".pvals"
		rstr=_generateRScript_(genotypeTempFile, phenotypeTempFile, rDataFile, pvalFile, name = phenotypeName, binary = binary)
		rFileName=outputFile+".r"
		f=open(rFileName, 'w')
		try:
			f.write(rstr)
		finally:
			f.close()
		outRfile=rFileName+".out"
		errRfile=rFileName+".err"
		print("Running R file:")
		cmdStr="(R --vanilla < "+rFileName+" > "+outRfile+") >& "+errRfile
		sys.stdout.write(cmdStr+"\n")
		sys.stdout.flush()
		gc.collect()
		os.system(cmdStr)
		print("Generating a GW plot.")
		res=gwaResults.Result(pvalFile,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.negLogTransform()
		pngFile=pvalFile+".png"
		plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False)
		# The second run always starts from the p-values just produced.
		srInput=pvalFile
	else:
		print("Skipping first stage analysis.")
		sys.stdout.flush()

	if sr:
		_secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,binary=binary)
		print("Generating second run GW plot.")
		res=gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.negLogTransform()
		srRes=gwaResults.Result(srOutput,name="KW_SR_"+phenotypeName, phenotypeID=phenotypeIndex)
		srRes.negLogTransform()
		# srInput equals the first-stage p-value file whenever that stage ran,
		# so this is the same name as before and also works when it is skipped.
		srPngFile=srInput+".sr.png"
		plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)