Ejemplo n.º 1
0
def getPhenotypes(host="papaya.usc.edu", user=None, passwd=None, onlyBinary=False, onlyQuantitative=False, onlyCategorical=False, onlyReplicates=False, includeSD=False, rawPhenotypes=False, onlyPublishable=False):
	"""
	Open a connection to the "at" MySQL database in preparation for reading phenotypes.

	Only the beginning of this function is visible in this excerpt.  The visible
	part prints the onlyPublishable flag, loads an ecotype-to-accession
	dictionary through dataParsers (default value '100') and connects to the
	database "at" on `host` with `user`/`passwd`.  If the connection fails the
	MySQL error is printed and the process exits with status 1.

	How the only*/includeSD/rawPhenotypes flags are used, and what the function
	returns, is not shown here.
	"""
	print "onlyPublishable:",onlyPublishable
	import dataParsers
	# Ecotype -> accession lookup; entries missing from the DB presumably get '100' -- confirm in dataParsers.
	e2a = dataParsers.getEcotypeToAccessionDictionary(host,user=user,passwd=passwd,defaultValue='100')

	import MySQLdb
	print "Connecting to db, host="+host
	try:
		conn = MySQLdb.connect (host = host, user = user, passwd = passwd, db = "at")
	except MySQLdb.Error, e:
		# Report the error's first two args (formatted as number and message) and abort.
		print "Error %d: %s" % (e.args[0], e.args[1])
		# NOTE(review): sys is not imported in the visible lines; presumably imported at module level.
		sys.exit (1)
Ejemplo n.º 2
0
def _run_():
	"""
	Command-line entry point: read the 2010 SNP data from the database and write it to a file.

	Options (read from sys.argv):
	  -o FILE             output file (required)
	  -z, --hostname      database host (default 'papaya.usc.edu')
	  -u, --user          database user
	  -p, --passwd        database password
	  -v, --version       passed to get2010DataFromDb as dataVersion (default "3")
	  -d, --delim         output delimiter (default ",")
	  -m, --missingval    string written for missing values (default "NA")
	  -a, --accname       hand the writer an ecotype decoder built from the
	                      ecotype-to-accession dictionary
	  --only96            passed to get2010DataFromDb as only96accessions=True
	  -b/--debug, -r/--report   accepted for compatibility, currently unused
	  -h, --help          print the module docstring

	Exits with status 2 when called without arguments, when the options cannot
	be parsed, or when no output file was given.
	"""
	if len(sys.argv) == 1:
		print(__doc__)
		sys.exit(2)

	long_options_list = ["hostname=", "user=", "passwd=", "version=", "delim=", "missingval=", "accname", "debug", "report", "help", "only96"]
	try:
		opts, args = getopt.getopt(sys.argv[1:], "z:u:p:o:v:d:m:abrh", long_options_list)
	except getopt.GetoptError:
		# Only option-parsing errors are expected here; anything else should surface.
		traceback.print_exc()
		print(sys.exc_info())
		print(__doc__)
		sys.exit(2)

	hostname = 'papaya.usc.edu'
	user = None
	passwd = None
	output_fname = None
	version = "3"
	delim = ","
	missingVal = "NA"
	useAccessionName = False
	helpRequested = False
	only96 = False

	for opt, arg in opts:
		if opt in ("-h", "--help"):
			helpRequested = True
			print(__doc__)
		elif opt in ("-z", "--hostname"):
			hostname = arg
		elif opt in ("-u", "--user"):
			user = arg
		elif opt in ("-p", "--passwd"):
			passwd = arg
		elif opt == "-o":
			output_fname = arg
		elif opt in ("-v", "--version"):
			version = arg
		elif opt in ("-d", "--delim"):
			delim = arg
		elif opt in ("-m", "--missingval"):
			missingVal = arg
		elif opt in ("-a", "--accname"):
			useAccessionName = True
		elif opt in ("-b", "--debug", "-r", "--report"):
			# Still accepted so existing command lines keep working; nothing reads them.
			pass
		elif opt == "--only96":
			# Exact comparison: `opt in ("--only96")` would be a substring test.
			only96 = True

	if not output_fname:
		# The usage text was already printed if help was requested.
		if not helpRequested:
			print("Output file missing!!\n")
			print(__doc__)
		sys.exit(2)

	import dataParsers
	snpsds = dataParsers.get2010DataFromDb(host=hostname, chromosomes=[1,2,3,4,5], dataVersion=version, only96accessions=only96, user=user, passwd=passwd)

	accDecoder = None
	if useAccessionName:
		# Use the host and credentials supplied on the command line rather than
		# credentials embedded in the source.
		# NOTE(review): assumes the first positional parameter is the host -- confirm in dataParsers.
		tmpDecoder = dataParsers.getEcotypeToAccessionDictionary(hostname, user=user, passwd=passwd)
		accDecoder = {}
		for acc in tmpDecoder:
			# Second field of each entry; presumably the accession name -- confirm in dataParsers.
			accDecoder[acc] = tmpDecoder[acc][1]

	import snpsdata
	snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds, chromosomes=[1,2,3,4,5], deliminator=delim, missingVal=missingVal, accDecoder=accDecoder)
Ejemplo n.º 3
0
def _run_():
	"""
	Command-line entry point: read the 2010 SNP data from the database and write it to a file.

	Options (read from sys.argv):
	  -o FILE             output file (required)
	  -z, --hostname      database host (default 'papaya.usc.edu')
	  -u, --user          database user
	  -p, --passwd        database password
	  -v, --version       passed to get2010DataFromDb as dataVersion (default "3")
	  -d, --delim         output delimiter (default ",")
	  -m, --missingval    string written for missing values (default "NA")
	  -a, --accname       hand the writer an ecotype decoder built from the
	                      ecotype-to-accession dictionary
	  --only96            passed to get2010DataFromDb as only96accessions=True
	  -b/--debug, -r/--report   accepted for compatibility, currently unused
	  -h, --help          print the module docstring

	Exits with status 2 when called without arguments, when the options cannot
	be parsed, or when no output file was given.
	"""
	if len(sys.argv) == 1:
		print(__doc__)
		sys.exit(2)

	long_options_list = ["hostname=", "user=", "passwd=", "version=", "delim=", "missingval=", "accname", "debug", "report", "help", "only96"]
	try:
		opts, args = getopt.getopt(sys.argv[1:], "z:u:p:o:v:d:m:abrh", long_options_list)
	except getopt.GetoptError:
		# Only option-parsing errors are expected here; anything else should surface.
		traceback.print_exc()
		print(sys.exc_info())
		print(__doc__)
		sys.exit(2)

	hostname = 'papaya.usc.edu'
	user = None
	passwd = None
	output_fname = None
	version = "3"
	delim = ","
	missingVal = "NA"
	useAccessionName = False
	helpRequested = False
	only96 = False

	for opt, arg in opts:
		if opt in ("-h", "--help"):
			helpRequested = True
			print(__doc__)
		elif opt in ("-z", "--hostname"):
			hostname = arg
		elif opt in ("-u", "--user"):
			user = arg
		elif opt in ("-p", "--passwd"):
			passwd = arg
		elif opt == "-o":
			output_fname = arg
		elif opt in ("-v", "--version"):
			version = arg
		elif opt in ("-d", "--delim"):
			delim = arg
		elif opt in ("-m", "--missingval"):
			missingVal = arg
		elif opt in ("-a", "--accname"):
			useAccessionName = True
		elif opt in ("-b", "--debug", "-r", "--report"):
			# Still accepted so existing command lines keep working; nothing reads them.
			pass
		elif opt == "--only96":
			# Exact comparison: `opt in ("--only96")` would be a substring test.
			only96 = True

	if not output_fname:
		# The usage text was already printed if help was requested.
		if not helpRequested:
			print("Output file missing!!\n")
			print(__doc__)
		sys.exit(2)

	import dataParsers
	snpsds = dataParsers.get2010DataFromDb(host=hostname, chromosomes=[1,2,3,4,5], dataVersion=version, only96accessions=only96, user=user, passwd=passwd)

	accDecoder = None
	if useAccessionName:
		# Use the host and credentials supplied on the command line rather than
		# credentials embedded in the source.
		# NOTE(review): assumes the first positional parameter is the host -- confirm in dataParsers.
		tmpDecoder = dataParsers.getEcotypeToAccessionDictionary(hostname, user=user, passwd=passwd)
		accDecoder = {}
		for acc in tmpDecoder:
			# Second field of each entry; presumably the accession name -- confirm in dataParsers.
			accDecoder[acc] = tmpDecoder[acc][1]

	import snpsdata
	snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds, chromosomes=[1,2,3,4,5], deliminator=delim, missingVal=missingVal, accDecoder=accDecoder)
Ejemplo n.º 4
0
def _run_():
	"""
	Command-line entry point: drop selected accessions from a SNP CSV file and write the rest.

	Usage: <options> INPUT_FILE   (options must precede the input file)

	Options (read from sys.argv):
	  -o FILE               output file (required)
	  -d, --delim           field delimiter (default ",")
	  -m, --missingval      missing-value string (default "NA")
	  -a, --withArrayId N   which files carry array ids, as used below:
	                        1 = input file only (default), 2 = input and
	                        comparison file, 3 = comparison file only,
	                        anything else = neither
	  --comparisonFile F    second SNP file used by --maxError,
	                        --removeIdentical and --onlyCommon
	  --maxError X          drop accessions whose error rate against the
	                        comparison file exceeds X
	  --maxMissing X        drop accessions whose missing rate exceeds X
	  --removeIdentical     for accessions occurring more than once, keep only
	                        the occurrence with the lowest error rate
	  --onlyCommon          drop accessions absent from the comparison file
	  --removeEcotypeId L   comma separated ecotype ids to drop
	  --removeArrayId N     array id to drop
	  --first96             drop accessions the accession dictionary does not
	                        place in the first 96 (needs database access)
	  --heterozygous2NA     forwarded to compareWith
	  -b/--debug, --report  accepted for compatibility, currently unused
	  -h, --help            print the module docstring

	Exits with status 2 when called without arguments, when the options cannot
	be parsed, or when the input or output file is missing.
	"""
	if len(sys.argv) == 1:
		print(__doc__)
		sys.exit(2)

	long_options_list = ["maxError=", "comparisonFile=", "maxMissing=", "removeEcotypeId=", "removeArrayId=", "first96",
		"removeIdentical", "onlyCommon", "delim=", "missingval=", "withArrayId=", "debug", "report",
		"help", "heterozygous2NA"]
	try:
		opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:bh", long_options_list)
	except getopt.GetoptError:
		# Only option-parsing errors are expected here; anything else should surface.
		traceback.print_exc()
		print(sys.exc_info())
		print(__doc__)
		sys.exit(2)

	output_fname = None
	delim = ","
	missingVal = "NA"
	comparisonFile = None
	maxMissing = 1.0
	maxError = 1.0
	removeEcotypes = None
	removeArray = None
	removeIdentical = False
	onlyCommon = False
	helpRequested = False
	withArrayIds = 1
	first96 = False
	heterozygous2NA = False

	# Options are compared exactly.  The former `opt in ("--maxError")` form was a
	# substring test on a string, so "-m" was mistaken for "--maxError".
	for opt, arg in opts:
		if opt == "-o":
			output_fname = arg
		elif opt in ("-h", "--help"):
			helpRequested = True
			print(__doc__)
		elif opt in ("-a", "--withArrayId"):
			withArrayIds = int(arg)
		elif opt == "--comparisonFile":
			comparisonFile = arg
		elif opt == "--maxError":
			maxError = float(arg)
		elif opt == "--maxMissing":
			maxMissing = float(arg)
		elif opt == "--heterozygous2NA":
			heterozygous2NA = True
		elif opt == "--removeEcotypeId":
			removeEcotypes = [int(ecotypeId) for ecotypeId in arg.split(",")]
		elif opt == "--removeArrayId":
			removeArray = int(arg)
		elif opt == "--removeIdentical":
			removeIdentical = True
		elif opt == "--onlyCommon":
			onlyCommon = True
		elif opt == "--first96":
			first96 = True
		elif opt in ("-d", "--delim"):
			delim = arg
		elif opt in ("-m", "--missingval"):
			missingVal = arg
		elif opt in ("-b", "--debug", "-r", "--report"):
			# Still accepted so existing command lines keep working; nothing reads them.
			pass
		else:
			if not helpRequested:
				print("Unkown option!!\n")
				print(__doc__)
			sys.exit(2)

	if not args:
		# Previously an IndexError on args[0].
		if not helpRequested:
			print("Input file missing!!\n")
			print(__doc__)
		sys.exit(2)
	inputFile = args[0]

	if not output_fname:
		if not helpRequested:
			print("Output file missing!!\n")
			print(__doc__)
		sys.exit(2)

	waid1 = withArrayIds == 1 or withArrayIds == 2  # input file has array ids
	waid2 = withArrayIds == 2 or withArrayIds == 3  # comparison file has array ids

	import dataParsers
	snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1)

	accessionsToRemove = []
	arraysToRemove = None

	if first96:
		# NOTE(review): database credentials are hard-coded here (the user name is
		# masked in this copy of the source).  They are left untouched because this
		# script has no credential options; move them to configuration.
		d = dataParsers.getEcotypeToAccessionDictionary(defaultValue='-1',user="******",passwd="bamboo123")
		ecotd = dataParsers.getEcotypeToNameDictionary(defaultValue='-1',user="******",passwd="bamboo123")
		print("Dictionaries loaded")
		names = []
		first96Names = []
		# NOTE(review): reads snpsds[0].arrayIds even when the input was parsed
		# without array ids -- confirm that parseCSVData still provides it then.
		for i, ecotype in enumerate(snpsds[0].accessions):
			arrayID = snpsds[0].arrayIds[i]
			names.append((arrayID, ecotd[ecotype], ecotype))
			accessionNumber = int(d[ecotype][0])
			if accessionNumber > 97 or accessionNumber < 0:
				accessionsToRemove.append(ecotype)
			else:
				first96Names.append((arrayID, d[ecotype][1], d[ecotype][0], ecotype))

		first96Names.sort()
		print("First 96 accessions, len: %s :" % (len(first96Names),))
		for name in first96Names:
			print(name)
		names.sort()
		print("All accessions:")
		for name in names:
			print(name)

	#Retrieve comparison list of accessions.  (Error rates for accessions)
	accErrAndID = []
	if (removeIdentical or maxError < 1.0) and comparisonFile:
		sys.stderr.write("Loading comparison file:")
		snpsds2 = dataParsers.parseCSVData(comparisonFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid2)
		res = []
		sys.stderr.write("Comparing accessions.")
		for i in range(len(snpsds)):
			res.append(snpsds[i].compareWith(snpsds2[i], withArrayIds=withArrayIds, verbose=False, heterozygous2NA=heterozygous2NA))
			sys.stderr.write(".")
		sys.stderr.write("\n")

		# As used here, each result carries accessions in [2], per-accession error
		# rates in [3], array ids in [5] and per-accession counts in [6].
		totalAccessionCounts = [0]*len(res[0][2])
		accErrorRate = [0]*len(res[0][2])
		for r in res:
			for j in range(len(r[2])):
				totalAccessionCounts[j] += r[6][j]
				accErrorRate[j] += r[3][j]*float(r[6][j])

		for i in range(len(accErrorRate)):
			# An accession with a zero count keeps its accumulated value instead of
			# raising ZeroDivisionError.
			if totalAccessionCounts[i]:
				accErrorRate[i] = accErrorRate[i]/float(totalAccessionCounts[i])

		lastRes = res[-1]  # ids are taken from the last comparison, as before
		if waid1:
			for i in range(len(lastRes[2])):
				accErrAndID.append((accErrorRate[i], lastRes[2][i], lastRes[5][i]))
		else:
			for i in range(len(lastRes[2])):
				accErrAndID.append((accErrorRate[i], lastRes[2][i]))
		accErrAndID.sort()
		accErrAndID.reverse()

	#Figure out which accessions are too erroraneous
	if maxError < 1.0 and comparisonFile:
		# Branch on waid1, the condition that decided the tuple width above.
		# Testing plain withArrayIds broke tuple unpacking for withArrayIds == 3.
		if waid1:
			if arraysToRemove is None:
				arraysToRemove = []
			for (error, ecotype, array) in accErrAndID:
				if error > maxError:
					accessionsToRemove.append(ecotype)
					arraysToRemove.append(array)
		else:
			for (error, ecotype) in accErrAndID:
				if error > maxError:
					accessionsToRemove.append(ecotype)

	# Duplicates can only be told apart by array id, so this needs waid1.
	if removeIdentical and comparisonFile and waid1:
		print("Locating identical accessions")
		accErrAndID.sort()  # ascending: the lowest error rate comes first and is kept
		if not arraysToRemove:
			arraysToRemove = []
		for accession in set(snpsds[0].accessions):
			if snpsds[0].accessions.count(accession) > 1:
				found = 0
				for (error, ecotype, array) in accErrAndID:
					if ecotype == accession:
						if found > 0:
							accessionsToRemove.append(ecotype)
							arraysToRemove.append(array)
						found += 1

	if onlyCommon and comparisonFile:
		print("Locating accessions which are not shared")
		snpsds2 = dataParsers.parseCSVData(comparisonFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid2)
		if not arraysToRemove:
			arraysToRemove = []
		sharedAccessions = set(snpsds2[0].accessions)  # built once for O(1) lookups
		for i, acc in enumerate(snpsds[0].accessions):
			if acc not in sharedAccessions:
				accessionsToRemove.append(acc)
				if waid1:
					arraysToRemove.append(snpsds[0].arrayIds[i])

	if maxMissing < 1.0:
		numAccessionsInFile = len(snpsds[0].accessions)
		missingCounts = [0]*numAccessionsInFile
		numSnps = 0
		for snpsd in snpsds:
			mc = snpsd.accessionsMissingCounts()
			numSnps += len(snpsd.positions)
			for i in range(numAccessionsInFile):
				missingCounts[i] += mc[i]

		missingRates = []
		if waid1:
			# Append to the arrays queued by earlier filters; resetting the list
			# here used to discard them.
			if arraysToRemove is None:
				arraysToRemove = []
			for i in range(numAccessionsInFile):
				missingRates.append((missingCounts[i]/float(numSnps), snpsds[0].accessions[i], snpsds[0].arrayIds[i]))
			missingRates.sort()
			missingRates.reverse()
			for (mrate, ecotype, array) in missingRates:
				if mrate > maxMissing:
					accessionsToRemove.append(ecotype)
					arraysToRemove.append(array)
		else:
			for i in range(numAccessionsInFile):
				missingRates.append((missingCounts[i]/float(numSnps), snpsds[0].accessions[i]))
			missingRates.sort()
			missingRates.reverse()
			for (mrate, ecotype) in missingRates:
				if mrate > maxMissing:
					accessionsToRemove.append(ecotype)

	if removeEcotypes:
		for removeEcotype in removeEcotypes:
			accessionsToRemove.append(str(int(removeEcotype)))
		print("Removing %s accessions." % (len(accessionsToRemove),))
	# `is not None` so that an array id of 0 is honoured as well.
	if removeArray is not None:
		if not arraysToRemove:
			arraysToRemove = []
		arraysToRemove.append(str(removeArray))
		print("Removing %s  arrays." % (len(arraysToRemove),))

	numAccessions = len(snpsds[0].accessions)
	sys.stderr.write("Removing accessions.")
	for snpsd in snpsds:
		snpsd.removeAccessions(accessionsToRemove, arrayIds=arraysToRemove)
		sys.stderr.write(".")
	print("\n%s accessions out of %s were removed." % (numAccessions-len(snpsds[0].accessions), numAccessions))

	import snpsdata
	snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds, chromosomes=[1,2,3,4,5], deliminator=delim, missingVal=missingVal, withArrayIds=waid1)
Ejemplo n.º 5
0
def _run_():
    """
    Command-line entry point: drop selected accessions from a SNP CSV file and write the rest.

    Usage: <options> INPUT_FILE   (options must precede the input file)

    Options (read from sys.argv):
      -o FILE               output file (required)
      -d, --delim           field delimiter (default ",")
      -m, --missingval      missing-value string (default "NA")
      -a, --withArrayId N   which files carry array ids, as used below:
                            1 = input file only (default), 2 = input and
                            comparison file, 3 = comparison file only,
                            anything else = neither
      --comparisonFile F    second SNP file used by --maxError,
                            --removeIdentical and --onlyCommon
      --maxError X          drop accessions whose error rate against the
                            comparison file exceeds X
      --maxMissing X        drop accessions whose missing rate exceeds X
      --removeIdentical     for accessions occurring more than once, keep only
                            the occurrence with the lowest error rate
      --onlyCommon          drop accessions absent from the comparison file
      --removeEcotypeId L   comma separated ecotype ids to drop
      --removeArrayId N     array id to drop
      --first96             drop accessions the accession dictionary does not
                            place in the first 96 (needs database access)
      --first192            drop accessions not returned by
                            phenotypeData._getFirst192Ecotypes_()
                            (ignored when --first96 is given)
      --removeLer           drop the first list returned by
                            analyzeHaplotype.getLerAndColAccessions
      --removeCol           drop the second list returned by
                            analyzeHaplotype.getLerAndColAccessions
      --heterozygous2NA     forwarded to compareWith
      -b/--debug, --report  accepted for compatibility, currently unused
      -h, --help            print the module docstring

    Exits with status 2 when called without arguments, when the options cannot
    be parsed, or when the input or output file is missing.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = [
        "maxError=", "comparisonFile=", "maxMissing=", "removeEcotypeId=",
        "removeArrayId=", "first96", "removeIdentical", "onlyCommon", "delim=",
        "missingval=", "withArrayId=", "debug", "report", "help",
        "heterozygous2NA", "first192", "removeLer", "removeCol"
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:bh",
                                   long_options_list)
    except getopt.GetoptError:
        # Only option-parsing errors are expected here; anything else should surface.
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    output_fname = None
    delim = ","
    missingVal = "NA"
    comparisonFile = None
    maxMissing = 1.0
    maxError = 1.0
    removeEcotypes = None
    removeArray = None
    removeIdentical = False
    onlyCommon = False
    helpRequested = False
    withArrayIds = 1
    first96 = False
    first192 = False
    heterozygous2NA = False
    removeLer = False
    removeCol = False

    # Options are compared exactly.  The former `opt in ("--maxError")` form was a
    # substring test on a string, so "-m" was mistaken for "--maxError".
    for opt, arg in opts:
        if opt == "-o":
            output_fname = arg
        elif opt in ("-h", "--help"):
            helpRequested = True
            print(__doc__)
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt == "--comparisonFile":
            comparisonFile = arg
        elif opt == "--maxError":
            maxError = float(arg)
        elif opt == "--maxMissing":
            maxMissing = float(arg)
        elif opt == "--heterozygous2NA":
            heterozygous2NA = True
        elif opt == "--removeEcotypeId":
            removeEcotypes = [int(ecotypeId) for ecotypeId in arg.split(",")]
        elif opt == "--removeArrayId":
            removeArray = int(arg)
        elif opt == "--removeIdentical":
            removeIdentical = True
        elif opt == "--onlyCommon":
            onlyCommon = True
        elif opt == "--first96":
            first96 = True
        elif opt == "--first192":
            first192 = True
        elif opt == "--removeLer":
            removeLer = True
        elif opt == "--removeCol":
            removeCol = True
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-b", "--debug", "-r", "--report"):
            # Still accepted so existing command lines keep working; nothing reads them.
            pass
        else:
            if not helpRequested:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    if not args:
        # Previously an IndexError on args[0].
        if not helpRequested:
            print("Input file missing!!\n")
            print(__doc__)
        sys.exit(2)
    inputFile = args[0]

    if not output_fname:
        if not helpRequested:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    waid1 = withArrayIds == 1 or withArrayIds == 2  # input file has array ids
    waid2 = withArrayIds == 2 or withArrayIds == 3  # comparison file has array ids

    import dataParsers
    snpsds = dataParsers.parseCSVData(inputFile,
                                      format=1,
                                      deliminator=delim,
                                      missingVal=missingVal,
                                      withArrayIds=waid1)

    accessionsToRemove = []
    arraysToRemove = None

    if first96:
        # NOTE(review): database credentials are hard-coded here (the user name is
        # masked in this copy of the source).  They are left untouched because this
        # script has no credential options; move them to configuration.
        d = dataParsers.getEcotypeToAccessionDictionary(defaultValue='-1',
                                                        user="******",
                                                        passwd="bamboo123")
        ecotd = dataParsers.getEcotypeToNameDictionary(defaultValue='-1',
                                                       user="******",
                                                       passwd="bamboo123")
        print("Dictionaries loaded")
        names = []
        first96Names = []
        # NOTE(review): reads snpsds[0].arrayIds even when the input was parsed
        # without array ids -- confirm that parseCSVData still provides it then.
        for i, ecotype in enumerate(snpsds[0].accessions):
            arrayID = snpsds[0].arrayIds[i]
            names.append((arrayID, ecotd[ecotype], ecotype))
            accessionNumber = int(d[ecotype][0])
            if accessionNumber > 97 or accessionNumber < 0:
                accessionsToRemove.append(ecotype)
            else:
                first96Names.append(
                    (arrayID, d[ecotype][1], d[ecotype][0], ecotype))

        first96Names.sort()
        print("First 96 accessions, len: %s :" % (len(first96Names),))
        for name in first96Names:
            print(name)
        names.sort()
        print("All accessions:")
        for name in names:
            print(name)
    elif first192:
        import phenotypeData
        ecotypes_192 = [str(e) for e in phenotypeData._getFirst192Ecotypes_()]
        print("%s %s" % (ecotypes_192, snpsds[0].accessions))
        keepEcotypes = set(ecotypes_192)  # built once for O(1) lookups
        for acc in snpsds[0].accessions:
            if acc not in keepEcotypes:
                accessionsToRemove.append(acc)
        print('found %s "192" ecotypes... removing %s ecotypes.' %
              (len(ecotypes_192), len(accessionsToRemove)))

    if removeLer:
        import analyzeHaplotype as ah
        accessionsToRemove += ah.getLerAndColAccessions(snpsds)[0]
    if removeCol:
        import analyzeHaplotype as ah
        accessionsToRemove += ah.getLerAndColAccessions(snpsds)[1]

    #Retrieve comparison list of accessions.  (Error rates for accessions)
    accErrAndID = []
    if (removeIdentical or maxError < 1.0) and comparisonFile:
        sys.stderr.write("Loading comparison file:")
        snpsds2 = dataParsers.parseCSVData(comparisonFile,
                                           format=1,
                                           deliminator=delim,
                                           missingVal=missingVal,
                                           withArrayIds=waid2)
        res = []
        sys.stderr.write("Comparing accessions.")
        for i in range(len(snpsds)):
            res.append(snpsds[i].compareWith(snpsds2[i],
                                             withArrayIds=withArrayIds,
                                             verbose=False,
                                             heterozygous2NA=heterozygous2NA))
            sys.stderr.write(".")
        sys.stderr.write("\n")

        # As used here, each result carries accessions in [2], per-accession error
        # rates in [3], array ids in [5] and per-accession counts in [6].
        totalAccessionCounts = [0] * len(res[0][2])
        accErrorRate = [0] * len(res[0][2])
        for r in res:
            for j in range(len(r[2])):
                totalAccessionCounts[j] += r[6][j]
                accErrorRate[j] += r[3][j] * float(r[6][j])

        for i in range(len(accErrorRate)):
            # An accession with a zero count keeps its accumulated value instead of
            # raising ZeroDivisionError.
            if totalAccessionCounts[i]:
                accErrorRate[i] = accErrorRate[i] / float(
                    totalAccessionCounts[i])

        lastRes = res[-1]  # ids are taken from the last comparison, as before
        if waid1:
            for i in range(len(lastRes[2])):
                accErrAndID.append(
                    (accErrorRate[i], lastRes[2][i], lastRes[5][i]))
        else:
            for i in range(len(lastRes[2])):
                accErrAndID.append((accErrorRate[i], lastRes[2][i]))
        accErrAndID.sort()
        accErrAndID.reverse()

    #Figure out which accessions are too erroraneous
    if maxError < 1.0 and comparisonFile:
        # Branch on waid1, the condition that decided the tuple width above.
        # Testing plain withArrayIds broke tuple unpacking for withArrayIds == 3.
        if waid1:
            if arraysToRemove is None:
                arraysToRemove = []
            for (error, ecotype, array) in accErrAndID:
                if error > maxError:
                    accessionsToRemove.append(ecotype)
                    arraysToRemove.append(array)
        else:
            for (error, ecotype) in accErrAndID:
                if error > maxError:
                    accessionsToRemove.append(ecotype)

    # Duplicates can only be told apart by array id, so this needs waid1.
    if removeIdentical and comparisonFile and waid1:
        print("Locating identical accessions")
        accErrAndID.sort()  # ascending: the lowest error rate comes first and is kept
        if not arraysToRemove:
            arraysToRemove = []
        for accession in set(snpsds[0].accessions):
            if snpsds[0].accessions.count(accession) > 1:
                found = 0
                for (error, ecotype, array) in accErrAndID:
                    if ecotype == accession:
                        if found > 0:
                            accessionsToRemove.append(ecotype)
                            arraysToRemove.append(array)
                        found += 1

    if onlyCommon and comparisonFile:
        print("Locating accessions which are not shared")
        snpsds2 = dataParsers.parseCSVData(comparisonFile,
                                           format=1,
                                           deliminator=delim,
                                           missingVal=missingVal,
                                           withArrayIds=waid2)
        if not arraysToRemove:
            arraysToRemove = []
        sharedAccessions = set(snpsds2[0].accessions)  # built once for O(1) lookups
        for i, acc in enumerate(snpsds[0].accessions):
            if acc not in sharedAccessions:
                accessionsToRemove.append(acc)
                if waid1:
                    arraysToRemove.append(snpsds[0].arrayIds[i])

    if maxMissing < 1.0:
        numAccessionsInFile = len(snpsds[0].accessions)
        missingCounts = [0] * numAccessionsInFile
        numSnps = 0
        for snpsd in snpsds:
            mc = snpsd.accessionsMissingCounts()
            numSnps += len(snpsd.positions)
            for i in range(numAccessionsInFile):
                missingCounts[i] += mc[i]

        missingRates = []
        if waid1:
            # Append to the arrays queued by earlier filters; resetting the list
            # here used to discard them.
            if arraysToRemove is None:
                arraysToRemove = []
            for i in range(numAccessionsInFile):
                missingRates.append(
                    (missingCounts[i] / float(numSnps),
                     snpsds[0].accessions[i], snpsds[0].arrayIds[i]))
            missingRates.sort()
            missingRates.reverse()
            for (mrate, ecotype, array) in missingRates:
                if mrate > maxMissing:
                    accessionsToRemove.append(ecotype)
                    arraysToRemove.append(array)
        else:
            for i in range(numAccessionsInFile):
                missingRates.append((missingCounts[i] / float(numSnps),
                                     snpsds[0].accessions[i]))
            missingRates.sort()
            missingRates.reverse()
            for (mrate, ecotype) in missingRates:
                if mrate > maxMissing:
                    accessionsToRemove.append(ecotype)

    if removeEcotypes:
        for removeEcotype in removeEcotypes:
            accessionsToRemove.append(str(int(removeEcotype)))
        print("Removing %s accessions." % (len(accessionsToRemove),))
    # `is not None` so that an array id of 0 is honoured as well.
    if removeArray is not None:
        if not arraysToRemove:
            arraysToRemove = []
        arraysToRemove.append(str(removeArray))
        print("Removing %s  arrays." % (len(arraysToRemove),))

    numAccessions = len(snpsds[0].accessions)
    sys.stderr.write("Removing accessions.")
    for snpsd in snpsds:
        snpsd.removeAccessions(accessionsToRemove, arrayIds=arraysToRemove)
        sys.stderr.write(".")
    print("\n%s accessions out of %s were removed." %
          (numAccessions - len(snpsds[0].accessions), numAccessions))

    import snpsdata
    snpsdata.writeRawSnpsDatasToFile(output_fname,
                                     snpsds,
                                     chromosomes=[1, 2, 3, 4, 5],
                                     deliminator=delim,
                                     missingVal=missingVal,
                                     withArrayIds=waid1)