def run(self):
    """
    2008-5-12
        Read the Strain X SNP matrix from self.inputFname and hand it to
        snpsdata.writeRawSnpsDatasToFile(), which writes self.outputFname.

        When self.snp_id_type is 1, every SNP column id (header columns from
        index 2 onwards) is looked up through
        db.get_chr_pos_given_db_id2chr_pos(); columns whose lookup returns
        None are dropped from both the header and the data matrix.

        self.array_id_2nd_column decides whether category_list is passed on
        to SNPData and whether the writer is asked for array ids.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # database connection and etc
    db = self.db_250k
    db.session.begin()

    input_delimiter = figureOutDelimiter(self.inputFname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(
        self.inputFname, delimiter=input_delimiter, matrix_data_type=int)

    if self.snp_id_type == 1:
        # 2011-2-27 translate the db_id into chr_pos because the new
        # StrainXSNP dataset uses db_id to identify SNPs.
        # but if col-id is already chr_pos, it's fine.
        translated_header = header[:2]
        kept_column_indices = []
        for column_index, snp_id in enumerate(header[2:]):
            chr_pos = db.get_chr_pos_given_db_id2chr_pos(snp_id)
            if chr_pos is None:
                continue
            # column_index is relative to the data matrix, i.e. the header
            # index minus its two leading non-SNP columns.
            kept_column_indices.append(column_index)
            translated_header.append(chr_pos)
        # to remove no-db_id columns from data matrix
        data_matrix = numpy.array(data_matrix)[:, kept_column_indices]
        header = translated_header

    snp_data_kwargs = dict(header=header, strain_acc_list=strain_acc_list,
                           data_matrix=data_matrix)
    if self.array_id_2nd_column:
        # category_list is ignored otherwise
        snp_data_kwargs['category_list'] = category_list
    snpData = SNPData(**snp_data_kwargs)

    rawSnpsData_ls = SNPData2RawSnpsData_ls(snpData, need_transposeSNPData=1,
                                            report=self.report)
    chromosome_ls = [raw_snps_data.chromosome for raw_snps_data in rawSnpsData_ls]
    snpsdata.writeRawSnpsDatasToFile(self.outputFname, rawSnpsData_ls,
                                     chromosomes=chromosome_ls, deliminator=',',
                                     withArrayIds=self.array_id_2nd_column)
def run(self):
    """
    2008-5-12
        Read the Strain X SNP matrix from self.input_fname and hand it to
        snpsdata.writeRawSnpsDatasToFile(), which writes self.output_fname.

        self.array_id_2nd_column decides whether category_list is passed on
        to SNPData and whether the writer is asked for array ids.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    input_delimiter = figureOutDelimiter(self.input_fname, report=self.report)
    header, strain_acc_list, category_list, data_matrix = read_data(
        self.input_fname, delimiter=input_delimiter)

    snp_data_kwargs = dict(header=header, strain_acc_list=strain_acc_list,
                           data_matrix=data_matrix)
    if self.array_id_2nd_column:
        # category_list is ignored otherwise
        snp_data_kwargs['category_list'] = category_list
    snpData = SNPData(**snp_data_kwargs)

    rawSnpsData_ls = SNPData2RawSnpsData_ls(snpData, need_transposeSNPData=1,
                                            report=self.report)
    chromosome_ls = [raw_snps_data.chromosome for raw_snps_data in rawSnpsData_ls]
    snpsdata.writeRawSnpsDatasToFile(self.output_fname, rawSnpsData_ls,
                                     chromosomes=chromosome_ls, deliminator=',',
                                     withArrayIds=self.array_id_2nd_column)
def _run_():
    """
    Command-line entry point: fetch data with
    dataParsers.get2010DataFromDb() and write it with
    snpsdata.writeRawSnpsDatasToFile().

    Options (parsed from sys.argv with getopt):
        -o FILE                  output file (required)
        -z/--hostname HOST       database host (default 'papaya.usc.edu')
        -u/--user, -p/--passwd   database credentials
        -v/--version VER         passed on as dataVersion (default "3")
        -d/--delim STR           output delimiter (default ",")
        -m/--missingval STR      missing-value marker (default "NA")
        -a/--accname             build an accession decoder for the writer
        --only96                 passed on as only96accessions=True
        -b/--debug, -r/--report  accepted but not used by this function
        -h/--help                print the module docstring

    Exits with status 2 when called without arguments, when option parsing
    fails, or when no output file was given.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    # "user=" and "passwd=" must be two separate entries, otherwise the
    # --user/--passwd branches below can never be reached.
    long_options_list = ["hostname=", "user=", "passwd=", "version=", "delim=",
                         "missingval=", "accname", "debug", "report", "help",
                         "only96"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "z:u:p:o:v:d:m:abrh", long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    hostname = 'papaya.usc.edu'
    user = None
    passwd = None
    output_fname = None
    version = "3"
    delim = ","
    missingVal = "NA"
    useAccessionName = False
    help_shown = False
    only96 = False

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help_shown = True
            print(__doc__)
        elif opt in ("-z", "--hostname"):
            hostname = arg
        elif opt in ("-u", "--user"):
            user = arg
        elif opt in ("-p", "--passwd"):
            passwd = arg
        elif opt in ("-o",):
            output_fname = arg
        elif opt in ("-v", "--version"):
            version = arg
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-a", "--accname"):
            useAccessionName = True
        elif opt in ("--only96",):
            # one-element tuple: a bare string would turn this into a
            # substring test
            only96 = True
        # -b/--debug and -r/--report are parsed but have no effect here.

    if not output_fname:
        if not help_shown:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    import dataParsers
    chromosomes = [1, 2, 3, 4, 5]
    snpsds = dataParsers.get2010DataFromDb(host=hostname, chromosomes=chromosomes,
                                           dataVersion=version, only96accessions=only96,
                                           user=user, passwd=passwd)

    accDecoder = None
    if useAccessionName:
        # NOTE(review): credentials are hard-coded in this call rather than
        # taken from -u/-p -- confirm whether that is intended.
        tmpDecoder = dataParsers.getEcotypeToAccessionDictionary(user="******",passwd="bamboo123")
        # keep only element [1] of every dictionary entry
        accDecoder = dict((acc, tmpDecoder[acc][1]) for acc in tmpDecoder)

    import snpsdata
    snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds, chromosomes=chromosomes,
                                     deliminator=delim, missingVal=missingVal,
                                     accDecoder=accDecoder)
def _parse_bool_option(value):
    """
    Interpret a command-line option value as a boolean.

    An empty value and the spellings "0", "false", "f", "no", "n" and "off"
    (case-insensitive, surrounding whitespace ignored) give False; every
    other value gives True. Plain bool(value) would treat "0" as True.
    """
    return str(value).strip().lower() not in ("", "0", "false", "f", "no", "n", "off")


def _run_():
    """
    Command-line entry point: fetch data with
    dataParsers.get250KDataFromDb() and write it with
    snpsdata.writeRawSnpsDatasToFile().

    Options (parsed from sys.argv with getopt):
        -o FILE                  output file (required)
        -z/--hostname HOST       database host (default 'papaya.usc.edu')
        -u/--user, -p/--passwd   database credentials
        -t/--method INT          passed on as methodId (default 1)
        -d/--delim STR           output delimiter (default ",")
        -m/--missingval STR      missing-value marker (default "NA")
        -a/--withArrayId VALUE   boolean flag, see _parse_bool_option()
        --callProbFile FILE      also request call probabilities
                                 (callProb=True) and pass FILE to the writer
        --newBatch               passed on as newBatch=True
        -h/--help                print the module docstring

    Exits with status 2 when called without arguments, when option parsing
    fails, or when no output file was given.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    # "user=" and "passwd=" must be two separate entries, otherwise the
    # --user/--passwd branches below can never be reached.
    long_options_list = ["newBatch", "hostname=", "user=", "passwd=", "method=",
                         "delim=", "missingval=", "withArrayId=", "callProbFile=",
                         "help"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "z:u:p:o:t:d:m:a:h", long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    hostname = 'papaya.usc.edu'
    user = None
    passwd = None
    output_fname = None
    method = 1
    delim = ","
    missingVal = "NA"
    help_shown = False
    withArrayId = False
    callProbFile = None
    newBatch = False

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help_shown = True
            print(__doc__)
        elif opt in ("-z", "--hostname"):
            hostname = arg
        elif opt in ("-u", "--user"):
            user = arg
        elif opt in ("-p", "--passwd"):
            passwd = arg
        elif opt in ("-o",):
            output_fname = arg
        elif opt in ("-t", "--method"):
            method = int(arg)
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-a", "--withArrayId"):
            withArrayId = _parse_bool_option(arg)
        elif opt in ("--callProbFile",):
            callProbFile = arg
        elif opt in ("--newBatch",):
            newBatch = True

    if not output_fname:
        if not help_shown:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    import dataParsers
    import snpsdata

    chromosomes = [1, 2, 3, 4, 5]
    fetch_kwargs = dict(host=hostname, chromosomes=chromosomes, methodId=method,
                        user=user, passwd=passwd, withArrayIds=withArrayId,
                        newBatch=newBatch)
    write_kwargs = dict(chromosomes=chromosomes, deliminator=delim,
                        missingVal=missingVal, withArrayIds=withArrayId)
    if callProbFile:
        # Only the call-probability mode adds these two arguments.
        fetch_kwargs['callProb'] = True
        write_kwargs['callProbFile'] = callProbFile

    snpsds = dataParsers.get250KDataFromDb(**fetch_kwargs)
    snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds, **write_kwargs)
def _run_():
    """
    Command-line entry point: fetch data with
    dataParsers.getPerlgenDataFromDb() and write it with
    snpsdata.writeRawSnpsDatasToFile().

    Options (parsed from sys.argv with getopt):
        -o FILE                  output file (required)
        -z/--hostname HOST       database host (default 'papaya.usc.edu')
        -u/--user, -p/--passwd   database credentials
        -d/--delim STR           output delimiter (default ", ")
        -m/--missingval STR      missing-value marker (default "NA")
        -a/--accname             pass dataParsers.ecotypeId2Name to the
                                 writer as accDecoder
        -b/--debug, -r/--report  accepted but not used by this function
        -h/--help                print the module docstring

    Exits with status 2 when called without arguments, when option parsing
    fails, or when no output file was given.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    # "user=" and "passwd=" must be two separate entries, otherwise the
    # --user/--passwd branches below can never be reached.
    long_options_list = ["hostname=", "user=", "passwd=", "delim=", "missingval=",
                         "accname", "debug", "report", "help"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "z:u:p:o:d:m:abrh", long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    hostname = 'papaya.usc.edu'
    user = None
    passwd = None
    output_fname = None
    delim = ", "
    missingVal = "NA"
    useAccessionName = False
    help_shown = False

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help_shown = True
            print(__doc__)
        elif opt in ("-z", "--hostname"):
            hostname = arg
        elif opt in ("-u", "--user"):
            user = arg
        elif opt in ("-p", "--passwd"):
            passwd = arg
        elif opt in ("-o",):
            output_fname = arg
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-a", "--accname"):
            useAccessionName = True
        # -b/--debug and -r/--report are parsed but have no effect here.

    if not output_fname:
        if not help_shown:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    import dataParsers
    chromosomes = [1, 2, 3, 4, 5]
    snpsds = dataParsers.getPerlgenDataFromDb(host=hostname, chromosomes=chromosomes,
                                              user=user, passwd=passwd)

    accDecoder = None
    if useAccessionName:
        accDecoder = dataParsers.ecotypeId2Name

    import snpsdata
    snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds, chromosomes=chromosomes,
                                     deliminator=delim, missingVal=missingVal,
                                     accDecoder=accDecoder)
def _merge_matching_chromosomes(snpsds1, chromosomes1, snpsds2, chromosomes2, merge):
    """
    Call merge(snpsds1[i], snpsds2[j]) for every index pair (i, j) for which
    chromosomes1[i] == chromosomes2[j], in row-major order.
    """
    for i, chr1 in enumerate(chromosomes1):
        for j, chr2 in enumerate(chromosomes2):
            if chr1 == chr2:
                merge(snpsds1[i], snpsds2[j])


def _run_():
    """
    Command-line entry point: merge two SNP data files and write the result.

    Exactly two positional arguments (the two input files) are required;
    otherwise an Exception is raised.

    Options (parsed from sys.argv with getopt):
        -o FILE                  output file (required)
        -p/--priority INT        passed on to the merge methods (default 1)
        -u/--union INT           1-3 selects mergeDataUnion with that unionType
        -i/--intersection INT    1-3 selects mergeDataIntersection with that
                                 intersectionType
        -d/--delim STR           delimiter (default ",")
        -m/--missingval STR      missing-value marker (default "NA")
        -a/--withArrayId INT     1 or 2: first file has array ids;
                                 2: second file has array ids as well
        -b/--debug, -r/--report  accepted but not used by this function
        -h/--help                print the module docstring

    With neither --union nor --intersection, mergeData is used. Any other
    combination of the two options is rejected with exit status 2.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = ["priority=", "delim=", "missingval=", "union=",
                         "intersection=", "debug", "report", "help", "withArrayId="]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:p:d:m:u:i:a:brh", long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    if len(args) != 2:
        raise Exception("Number of arguments isn't correct.")
    inputFile1, inputFile2 = args

    priority = 1
    union = 0
    intersection = 0
    output_fname = None
    delim = ","
    missingVal = "NA"
    withArrayIds = 0
    chromosomes = [1, 2, 3, 4, 5]
    # Must be initialised before the loop: the error paths below read it even
    # when -h was never given.
    help_shown = False

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help_shown = True
            print(__doc__)
        elif opt in ("-p", "--priority"):
            priority = int(arg)
        elif opt in ("-u", "--union"):
            union = int(arg)
        elif opt in ("-i", "--intersection"):
            intersection = int(arg)
        elif opt in ("-o",):
            output_fname = arg
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt in ("-b", "--debug", "-r", "--report"):
            pass  # accepted, no effect
        else:
            if not help_shown:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    if not output_fname:
        if not help_shown:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    waid1 = withArrayIds == 1 or withArrayIds == 2
    waid2 = withArrayIds == 2

    import dataParsers
    (snpsds1, chromosomes1) = dataParsers.parseCSVData(
        inputFile1, format=1, deliminator=delim, missingVal=missingVal,
        withArrayIds=waid1, returnChromosomes=True)
    (snpsds2, chromosomes2) = dataParsers.parseCSVData(
        inputFile2, format=1, deliminator=delim, missingVal=missingVal,
        withArrayIds=waid2, returnChromosomes=True)

    if len(snpsds1) != len(snpsds2):
        print("Warning: Unequal number of chromosomes.")

    import snpsdata
    if union == 0 and intersection == 0:
        _merge_matching_chromosomes(
            snpsds1, chromosomes1, snpsds2, chromosomes2,
            lambda target, other: target.mergeData(other, priority=priority))
        chromosomes = chromosomes1
        snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds1, chromosomes=chromosomes,
                                         deliminator=delim, missingVal=missingVal,
                                         withArrayIds=waid1)
    elif 0 < union < 4 and intersection == 0:
        _merge_matching_chromosomes(
            snpsds1, chromosomes1, snpsds2, chromosomes2,
            lambda target, other: target.mergeDataUnion(other, priority=priority,
                                                        unionType=union))
        if union == 1 or union == 3:
            chromosomes = sorted(set(chromosomes1).union(set(chromosomes2)))
        elif union == 2:
            chromosomes = chromosomes1
        # NOTE(review): withArrayIds is not passed to the writer in the union
        # and intersection modes -- confirm whether that is intended.
        snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds1, chromosomes=chromosomes,
                                         deliminator=delim, missingVal=missingVal)
    elif 0 < intersection < 4 and union == 0:
        # NOTE(review): data sets are paired by list index here, not by
        # chromosome as in the two modes above -- confirm.
        for i in range(len(snpsds1)):
            snpsds1[i].mergeDataIntersection(snpsds2[i], priority=priority,
                                             intersectionType=intersection)
        if intersection == 1 or intersection == 3:
            chromosomes = sorted(set(chromosomes1).intersection(set(chromosomes2)))
        elif intersection == 2:
            chromosomes = chromosomes1
        snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds1, chromosomes=chromosomes,
                                         deliminator=delim, missingVal=missingVal)
    else:
        if not help_shown:
            print("The union or intersection options used are wrong!!\n")
            print(__doc__)
        sys.exit(2)
def _run_():
    """
    Command-line entry point: fetch data with
    dataParsers.getPerlgenDataFromDb() and write it with
    snpsdata.writeRawSnpsDatasToFile().

    Options (parsed from sys.argv with getopt):
        -o FILE                  output file (required)
        -z/--hostname HOST       database host (default 'papaya.usc.edu')
        -u/--user, -p/--passwd   database credentials
        -d/--delim STR           output delimiter (default ", ")
        -m/--missingval STR      missing-value marker (default "NA")
        -a/--accname             pass dataParsers.ecotypeId2Name to the
                                 writer as accDecoder
        -b/--debug, -r/--report  accepted but not used by this function
        -h/--help                print the module docstring

    Exits with status 2 when called without arguments, when option parsing
    fails, or when no output file was given.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    # "user=" and "passwd=" must be two separate entries, otherwise the
    # --user/--passwd branches below can never be reached.
    long_options_list = [
        "hostname=", "user=", "passwd=", "delim=", "missingval=", "accname",
        "debug", "report", "help"
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "z:u:p:o:d:m:abrh", long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    hostname = 'papaya.usc.edu'
    user = None
    passwd = None
    output_fname = None
    delim = ", "
    missingVal = "NA"
    useAccessionName = False
    help_shown = False

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help_shown = True
            print(__doc__)
        elif opt in ("-z", "--hostname"):
            hostname = arg
        elif opt in ("-u", "--user"):
            user = arg
        elif opt in ("-p", "--passwd"):
            passwd = arg
        elif opt in ("-o", ):
            output_fname = arg
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-a", "--accname"):
            useAccessionName = True
        # -b/--debug and -r/--report are parsed but have no effect here.

    if not output_fname:
        if not help_shown:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    import dataParsers
    chromosomes = [1, 2, 3, 4, 5]
    snpsds = dataParsers.getPerlgenDataFromDb(host=hostname,
                                              chromosomes=chromosomes,
                                              user=user,
                                              passwd=passwd)

    accDecoder = None
    if useAccessionName:
        accDecoder = dataParsers.ecotypeId2Name

    import snpsdata
    snpsdata.writeRawSnpsDatasToFile(output_fname,
                                     snpsds,
                                     chromosomes=chromosomes,
                                     deliminator=delim,
                                     missingVal=missingVal,
                                     accDecoder=accDecoder)
def _mask_low_probability_calls(genotype_fname, prob_fname, output_fname, delim,
                                min_call_prob, header_lines):
    """
    Copy genotype_fname to output_fname line by line, replacing fields whose
    call probability is not above min_call_prob by 'NA'.

    The first header_lines lines are copied unchanged (and the same number
    of lines is skipped in prob_fname). Every following line of both files
    is split on delim; field k of the genotype line is kept when float field
    k of the probability line is greater than min_call_prob. Processing
    stops at the first line that is empty in either file. The files are read
    one line at a time to avoid memory problems.

    Returns (kept_count, total_count) as floats: the number of fields kept
    and the number of genotype fields seen.
    """
    handles = []
    try:
        genotype_file = open(genotype_fname, "r")
        handles.append(genotype_file)
        prob_file = open(prob_fname, "r")
        handles.append(prob_file)
        out_file = open(output_fname, "w")
        handles.append(out_file)

        for _ in range(header_lines):
            out_file.write(genotype_file.readline())
            prob_file.readline()

        line_number = 0
        total_count = 0.0
        kept_count = 0.0
        while True:
            line_number += 1
            gline = genotype_file.readline()
            pline = prob_file.readline()
            if not (gline and pline):
                print("%s %s %s" % (line_number, gline, pline))
                break
            fields = gline.strip().split(delim)
            probs = [float(p) for p in pline.strip().split(delim)]
            total_count += len(fields)
            masked = []
            for field, prob in zip(fields, probs):
                if prob > min_call_prob:
                    masked.append(field)
                    kept_count += 1.0
                else:
                    masked.append('NA')
            out_file.write(delim.join(masked) + "\n")
            if line_number % 10000 == 0:
                print(line_number)  # progress indicator
        print(line_number)
    finally:
        # Close whatever was opened, also when a line fails to parse.
        for handle in handles:
            handle.close()
    return kept_count, total_count


def _run_():
    """
    Command-line entry point: filter the SNPs of a CSV data file and write
    the result.

    The first positional argument is the input file.

    Options (parsed from sys.argv with getopt):
        -o FILE                  output file (required)
        -d/--delim STR           delimiter (default ",")
        -m/--missingval STR      missing-value marker (default "NA")
        -a/--withArrayId INT     1 or 2: input has array ids;
                                 2: comparison file has array ids as well
        --monomorphic            call filterMonoMorphicSnps() on every data set
        --onlyBinary             call onlyBinarySnps() on every data set
        --output01Format         as --onlyBinary, and write getSnpsData() output
        --maxMissing FLOAT       in [0, 1): call filterMissingSnps()
        --comparisonFile FILE, --maxError FLOAT (< 1)
                                 call filterBadSnps() against FILE
        --minMAF FLOAT           call filterMinMAF()
        --filterRegion CHR,START,END
                                 keep only that chromosome, filtered to region
        --callProbFile FILE, --minCallProb FLOAT
                                 streaming mode, see below
        -b/--debug, -r/--report  accepted but not used by this function
        -h/--help                print the module docstring

    When both --callProbFile and --minCallProb are given, the input is not
    parsed into data sets; it is streamed through
    _mask_low_probability_calls() instead, and the data-set filters above
    cannot be applied (a warning is written to stderr if any was requested).
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = ["maxError=", "comparisonFile=", "maxMissing=", "monomorphic",
                         "onlyBinary", "delim=", "missingval=", "withArrayId=",
                         "callProbFile=", "minMAF=", "minCallProb=", "debug", "report",
                         "help", "output01Format", "filterRegion="]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:brh", long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    if not args:
        # args[0] would otherwise fail with an IndexError
        print("Input file missing!!\n")
        print(__doc__)
        sys.exit(2)
    inputFile = args[0]

    output_fname = None
    delim = ","
    missingVal = "NA"
    comparisonFile = None
    maxMissing = 1.0
    maxError = 1.0
    monomorphic = False
    help_shown = False
    withArrayIds = 0
    minCallProb = None
    minMAF = None
    callProbFile = None
    onlyBinary = False
    output01Format = False
    filterRegion = False
    startPos = None
    endPos = None
    chromosome = None
    chromosomes = [1, 2, 3, 4, 5]

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help_shown = True
            print(__doc__)
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt in ("-o",):
            output_fname = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("--comparisonFile",):
            comparisonFile = arg
        elif opt in ("--maxError",):
            maxError = float(arg)
        elif opt in ("--maxMissing",):
            maxMissing = float(arg)
        elif opt in ("--minCallProb",):
            minCallProb = float(arg)
        elif opt in ("--minMAF",):
            minMAF = float(arg)
        elif opt in ("--callProbFile",):
            callProbFile = arg
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-b", "--debug", "-r", "--report"):
            pass  # accepted, no effect
        elif opt in ("--monomorphic",):
            monomorphic = True
        elif opt in ("--onlyBinary",):
            onlyBinary = True
        elif opt in ("--output01Format",):
            output01Format = True
        elif opt in ("--filterRegion",):
            filterRegion = True
            region = [int(part) for part in arg.split(",")]
            chromosome = region[0]
            startPos = region[1]
            endPos = region[2]
        else:
            if not help_shown:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    if not output_fname:
        if not help_shown:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    waid1 = withArrayIds == 1 or withArrayIds == 2
    waid2 = withArrayIds == 2
    missing_filter = 0.0 <= maxMissing < 1.0
    error_filter = bool(comparisonFile) and maxError < 1.0

    if callProbFile and minCallProb:
        # Streaming mode: no data sets are loaded, so none of the data-set
        # filters can run.
        if (monomorphic or onlyBinary or output01Format or missing_filter
                or error_filter or minMAF or filterRegion):
            sys.stderr.write("Warning: SNP filters are ignored when "
                             "--callProbFile and --minCallProb are used.\n")
        print("Converting base calls with call prob. lower than %s to NAs" % minCallProb)
        # With withArrayIds == 2 the files carry one extra header line.
        header_lines = 1
        if withArrayIds == 2:
            header_lines = 2
        kept_count, total_count = _mask_low_probability_calls(
            inputFile, callProbFile, output_fname, delim, minCallProb, header_lines)
        # NOTE(review): the numerator counts the calls that were KEPT, not
        # the ones turned into NA -- confirm which one this is meant to report.
        fraction = 0.0
        if total_count:
            fraction = kept_count / total_count
        print("Fraction converted = %s" % fraction)
        return

    snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim,
                                      missingVal=missingVal, withArrayIds=waid1)

    # Filtering monomorphic
    if monomorphic:
        print("Filtering monomorphic SNPs")
        for snpsd in snpsds:
            print("Removed %s Snps" % str(snpsd.filterMonoMorphicSnps()))

    if onlyBinary or output01Format:
        print("Filtering non-binary SNPs")
        for snpsd in snpsds:
            print("Removed %s Snps" % str(snpsd.onlyBinarySnps()))

    # Filtering missing values
    if missing_filter:
        print("Filtering SNPs with missing values")
        numAccessions = len(snpsds[0].accessions)
        max_missing_count = int(maxMissing * numAccessions)
        for snpsd in snpsds:
            print("Removed %s Snps" % str(snpsd.filterMissingSnps(max_missing_count)))

    # Filtering bad SNPs
    if error_filter:
        print("Filtering erroneous SNPs, with maxError= %s" % maxError)
        snpsds2 = dataParsers.parseCSVData(comparisonFile, format=1, deliminator=delim,
                                           missingVal=missingVal, withArrayIds=waid2)
        for i, snpsd in enumerate(snpsds):
            snpsd.filterBadSnps(snpsds2[i], maxError)

    if minMAF:
        print("Removing SNPs withe MAF < %s" % minMAF)
        for snpsd in snpsds:
            print("Removed %s Snps" % str(snpsd.filterMinMAF(minMAF)))

    # Output specific region..
    if filterRegion:
        chromosomes = [chromosome]
        snpsd = snpsds[chromosome - 1]  # data sets are indexed by chromosome number - 1
        snpsd.filterRegion(startPos, endPos)
        snpsds = [snpsd]

    if output01Format:
        output_data = [snpsd.getSnpsData(missingVal=missingVal) for snpsd in snpsds]
    else:
        output_data = snpsds
    snpsdata.writeRawSnpsDatasToFile(output_fname, output_data, chromosomes=chromosomes,
                                     deliminator=delim, missingVal=missingVal,
                                     withArrayIds=waid1)
def _run_():
    """
    Command-line entry point: remove accessions (and arrays) from a CSV SNP
    data file and write the result.

    The first positional argument is the input file.

    Options (parsed from sys.argv with getopt):
        -o FILE                  output file (required)
        -d/--delim STR           delimiter (default ",")
        -m/--missingval STR      missing-value marker (default "NA")
        -a/--withArrayId INT     1 or 2: input has array ids;
                                 2 or 3: comparison file has array ids
                                 (default 1)
        --first96                drop accessions whose accession-dictionary
                                 number is above 97 or negative
        --comparisonFile FILE    data set used by the three options below
        --maxError FLOAT         (< 1) drop accessions whose error rate
                                 against the comparison file is higher
        --removeIdentical        for accessions present more than once, keep
                                 the entry with the lowest error rate
        --onlyCommon             drop accessions missing from the comparison
                                 file
        --heterozygous2NA        passed on to compareWith()
        --maxMissing FLOAT       (< 1) drop accessions with a higher missing
                                 rate
        --removeEcotypeId IDS    comma-separated ecotype ids to drop
        --removeArrayId INT      array id to drop
        -b/--debug, -r/--report  accepted but not used by this function
        -h/--help                print the module docstring

    Exits with status 2 when called without arguments, when option parsing
    fails, or when the input or output file is missing.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = ["maxError=", "comparisonFile=", "maxMissing=",
                         "removeEcotypeId=", "removeArrayId=", "first96",
                         "removeIdentical", "onlyCommon", "delim=", "missingval=",
                         "withArrayId=", "debug", "report", "help", "heterozygous2NA"]
    try:
        # "r" is part of the short options so that -r is accepted like
        # --report.
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:brh", long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    if not args:
        # args[0] would otherwise fail with an IndexError
        print("Input file missing!!\n")
        print(__doc__)
        sys.exit(2)
    inputFile = args[0]

    output_fname = None
    delim = ","
    missingVal = "NA"
    comparisonFile = None
    maxMissing = 1.0
    maxError = 1.0
    removeEcotypes = None
    removeArray = None
    removeIdentical = False
    onlyCommon = False
    help_shown = False
    withArrayIds = 1
    first96 = False
    heterozygous2NA = False

    for opt, arg in opts:
        if opt in ("-o",):
            output_fname = arg
        elif opt in ("-h", "--help"):
            help_shown = True
            print(__doc__)
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt in ("--comparisonFile",):
            comparisonFile = arg
        elif opt in ("--maxError",):
            maxError = float(arg)
        elif opt in ("--maxMissing",):
            maxMissing = float(arg)
        elif opt in ("--heterozygous2NA",):
            heterozygous2NA = True
        elif opt in ("--removeEcotypeId",):
            removeEcotypes = [int(ecotype_id) for ecotype_id in arg.split(",")]
        elif opt in ("--removeArrayId",):
            removeArray = int(arg)
        elif opt in ("--removeIdentical",):
            removeIdentical = True
        elif opt in ("--onlyCommon",):
            onlyCommon = True
        elif opt in ("--first96",):
            first96 = True
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-b", "--debug", "-r", "--report"):
            pass  # accepted, no effect
        else:
            if not help_shown:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    if not output_fname:
        if not help_shown:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    waid1 = withArrayIds == 1 or withArrayIds == 2
    waid2 = withArrayIds == 2 or withArrayIds == 3
    # Array ids of the INPUT data are available exactly when it was parsed
    # with them; every array-id decision below uses this single flag so that
    # tuple shapes and unpacking always agree.
    has_array_ids = waid1

    import dataParsers
    snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim,
                                      missingVal=missingVal, withArrayIds=waid1)

    accessionsToRemove = []
    arraysToRemove = None

    if first96:
        # NOTE(review): credentials are hard-coded in these two calls.
        d = dataParsers.getEcotypeToAccessionDictionary(defaultValue='-1',user="******",passwd="bamboo123")
        ecotd = dataParsers.getEcotypeToNameDictionary(defaultValue='-1',user="******",passwd="bamboo123")
        print("Dictionaries loaded")
        names = []
        first96Names = []
        for i, ecotype in enumerate(snpsds[0].accessions):
            arrayID = snpsds[0].arrayIds[i]
            names.append((arrayID, ecotd[ecotype], ecotype))
            accession_number = int(d[ecotype][0])
            if accession_number > 97 or accession_number < 0:
                accessionsToRemove.append(ecotype)
            else:
                first96Names.append((arrayID, d[ecotype][1], d[ecotype][0], ecotype))
        first96Names.sort()
        print("First 96 accessions, len: %s :" % len(first96Names))
        for name in first96Names:
            print(name)
        names.sort()
        print("All accessions:")
        for name in names:
            print(name)

    # Retrieve comparison list of accessions. (Error rates for accessions)
    if (removeIdentical or maxError < 1.0) and comparisonFile:
        sys.stderr.write("Loading comparison file:")
        snpsds2 = dataParsers.parseCSVData(comparisonFile, format=1, deliminator=delim,
                                           missingVal=missingVal, withArrayIds=waid2)
        res = []
        sys.stderr.write("Comparing accessions.")
        for i in range(len(snpsds)):
            res.append(snpsds[i].compareWith(snpsds2[i], withArrayIds=withArrayIds,
                                             verbose=False,
                                             heterozygous2NA=heterozygous2NA))
            sys.stderr.write(".")
        sys.stderr.write("\n")

        # Combine the per-data-set results: r[3] is weighted by r[6] and
        # divided by the summed r[6], per position j of r[2].
        # presumably r[2] = accessions, r[3] = error rates, r[5] = array ids,
        # r[6] = comparison counts -- TODO confirm against compareWith()
        totalAccessionCounts = [0] * len(res[0][2])
        accErrorRate = [0] * len(res[0][2])
        for r in res:
            for j in range(len(r[2])):
                totalAccessionCounts[j] += r[6][j]
                accErrorRate[j] += r[3][j] * float(r[6][j])
        for i in range(len(accErrorRate)):
            accErrorRate[i] = accErrorRate[i] / float(totalAccessionCounts[i])

        # r is the result of the last data set from here on.
        accErrAndID = []
        if has_array_ids:
            for i in range(len(r[2])):
                accErrAndID.append((accErrorRate[i], r[2][i], r[5][i]))
        else:
            for i in range(len(r[2])):
                accErrAndID.append((accErrorRate[i], r[2][i]))
        accErrAndID.sort()
        accErrAndID.reverse()

    # Figure out which accessions are too erroneous
    if maxError < 1.0 and comparisonFile:
        if has_array_ids:
            if arraysToRemove is None:
                arraysToRemove = []
            for (error, ecotype, array) in accErrAndID:
                if error > maxError:
                    accessionsToRemove.append(ecotype)
                    arraysToRemove.append(array)
        else:
            for (error, ecotype) in accErrAndID:
                if error > maxError:
                    accessionsToRemove.append(ecotype)

    if removeIdentical and comparisonFile and has_array_ids:
        print("Locating identical accessions")
        accErrAndID.sort()  # ascending: the lowest error rate comes first
        if arraysToRemove is None:
            arraysToRemove = []
        for accession in set(snpsds[0].accessions):
            if snpsds[0].accessions.count(accession) > 1:
                found = 0
                for (error, ecotype, array) in accErrAndID:
                    if ecotype == accession:
                        # the first (best) entry is kept, the others go
                        if found > 0:
                            accessionsToRemove.append(ecotype)
                            arraysToRemove.append(array)
                        found += 1

    if onlyCommon and comparisonFile:
        print("Locating accessions which are not shared")
        snpsds2 = dataParsers.parseCSVData(comparisonFile, format=1, deliminator=delim,
                                           missingVal=missingVal, withArrayIds=waid2)
        if arraysToRemove is None:
            arraysToRemove = []
        comparison_accessions = snpsds2[0].accessions
        for i, acc in enumerate(snpsds[0].accessions):
            if acc not in comparison_accessions:
                accessionsToRemove.append(acc)
                if has_array_ids:
                    arraysToRemove.append(snpsds[0].arrayIds[i])

    if maxMissing < 1.0:
        num_accessions = len(snpsds[0].accessions)
        missingCounts = [0] * num_accessions
        numSnps = 0
        for snpsd in snpsds:
            mc = snpsd.accessionsMissingCounts()
            numSnps += len(snpsd.positions)
            for i in range(num_accessions):
                missingCounts[i] += mc[i]

        missingRates = []
        if has_array_ids:
            # Keep the arrays collected by the earlier steps; only create the
            # list when nothing has been collected so far.
            if arraysToRemove is None:
                arraysToRemove = []
            for i in range(num_accessions):
                missingRates.append((missingCounts[i] / float(numSnps),
                                     snpsds[0].accessions[i], snpsds[0].arrayIds[i]))
            missingRates.sort()
            missingRates.reverse()
            for (mrate, ecotype, array) in missingRates:
                if mrate > maxMissing:
                    accessionsToRemove.append(ecotype)
                    arraysToRemove.append(array)
        else:
            for i in range(num_accessions):
                missingRates.append((missingCounts[i] / float(numSnps),
                                     snpsds[0].accessions[i]))
            missingRates.sort()
            missingRates.reverse()
            for (mrate, ecotype) in missingRates:
                if mrate > maxMissing:
                    accessionsToRemove.append(ecotype)

    if removeEcotypes:
        for removeEcotype in removeEcotypes:
            accessionsToRemove.append(str(int(removeEcotype)))
        print("Removing %s accessions." % len(accessionsToRemove))

    if removeArray:
        if arraysToRemove is None:
            arraysToRemove = []
        arraysToRemove.append(str(removeArray))
        print("Removing %s  arrays." % len(arraysToRemove))

    numAccessions = len(snpsds[0].accessions)
    sys.stderr.write("Removing accessions.")
    for snpsd in snpsds:
        snpsd.removeAccessions(accessionsToRemove, arrayIds=arraysToRemove)
        sys.stderr.write(".")
    print("\n%s accessions out of %s were removed."
          % (numAccessions - len(snpsds[0].accessions), numAccessions))

    import snpsdata
    snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds, chromosomes=[1, 2, 3, 4, 5],
                                     deliminator=delim, missingVal=missingVal,
                                     withArrayIds=waid1)
def _run_():
    """
    Command-line entry point: remove accessions (and arrays) from a CSV SNP
    data file and write the result.

    The first positional argument is the input file.

    Options (parsed from sys.argv with getopt):
        -o FILE                  output file (required)
        -d/--delim STR           delimiter (default ",")
        -m/--missingval STR      missing-value marker (default "NA")
        -a/--withArrayId INT     1 or 2: input has array ids;
                                 2 or 3: comparison file has array ids
                                 (default 1)
        --first96                drop accessions whose accession-dictionary
                                 number is above 97 or negative
        --first192               (only without --first96) drop accessions not
                                 returned by
                                 phenotypeData._getFirst192Ecotypes_()
        --removeLer, --removeCol drop the accessions in element [0] / [1] of
                                 analyzeHaplotype.getLerAndColAccessions()
        --comparisonFile FILE    data set used by the three options below
        --maxError FLOAT         (< 1) drop accessions whose error rate
                                 against the comparison file is higher
        --removeIdentical        for accessions present more than once, keep
                                 the entry with the lowest error rate
        --onlyCommon             drop accessions missing from the comparison
                                 file
        --heterozygous2NA        passed on to compareWith()
        --maxMissing FLOAT       (< 1) drop accessions with a higher missing
                                 rate
        --removeEcotypeId IDS    comma-separated ecotype ids to drop
        --removeArrayId INT      array id to drop
        -b/--debug, -r/--report  accepted but not used by this function
        -h/--help                print the module docstring

    Exits with status 2 when called without arguments, when option parsing
    fails, or when the input or output file is missing.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = [
        "maxError=", "comparisonFile=", "maxMissing=", "removeEcotypeId=",
        "removeArrayId=", "first96", "removeIdentical", "onlyCommon", "delim=",
        "missingval=", "withArrayId=", "debug", "report", "help",
        "heterozygous2NA", "first192", "removeLer", "removeCol"
    ]
    try:
        # "r" is part of the short options so that -r is accepted like
        # --report.
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:brh", long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    if not args:
        # args[0] would otherwise fail with an IndexError
        print("Input file missing!!\n")
        print(__doc__)
        sys.exit(2)
    inputFile = args[0]

    output_fname = None
    delim = ","
    missingVal = "NA"
    comparisonFile = None
    maxMissing = 1.0
    maxError = 1.0
    removeEcotypes = None
    removeArray = None
    removeIdentical = False
    onlyCommon = False
    help_shown = False
    withArrayIds = 1
    first96 = False
    first192 = False
    heterozygous2NA = False
    removeLer = False
    removeCol = False

    for opt, arg in opts:
        if opt in ("-o", ):
            output_fname = arg
        elif opt in ("-h", "--help"):
            help_shown = True
            print(__doc__)
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt in ("--comparisonFile", ):
            comparisonFile = arg
        elif opt in ("--maxError", ):
            maxError = float(arg)
        elif opt in ("--maxMissing", ):
            maxMissing = float(arg)
        elif opt in ("--heterozygous2NA", ):
            heterozygous2NA = True
        elif opt in ("--removeEcotypeId", ):
            removeEcotypes = [int(ecotype_id) for ecotype_id in arg.split(",")]
        elif opt in ("--removeArrayId", ):
            removeArray = int(arg)
        elif opt in ("--removeIdentical", ):
            removeIdentical = True
        elif opt in ("--onlyCommon", ):
            onlyCommon = True
        elif opt in ("--first96", ):
            first96 = True
        elif opt in ("--first192", ):
            first192 = True
        elif opt in ("--removeLer", ):
            removeLer = True
        elif opt in ("--removeCol", ):
            removeCol = True
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-b", "--debug", "-r", "--report"):
            pass  # accepted, no effect
        else:
            if not help_shown:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    if not output_fname:
        if not help_shown:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    waid1 = withArrayIds == 1 or withArrayIds == 2
    waid2 = withArrayIds == 2 or withArrayIds == 3
    # Array ids of the INPUT data are available exactly when it was parsed
    # with them; every array-id decision below uses this single flag so that
    # tuple shapes and unpacking always agree.
    has_array_ids = waid1

    import dataParsers
    snpsds = dataParsers.parseCSVData(inputFile,
                                      format=1,
                                      deliminator=delim,
                                      missingVal=missingVal,
                                      withArrayIds=waid1)

    accessionsToRemove = []
    arraysToRemove = None

    if first96:
        # NOTE(review): credentials are hard-coded in these two calls.
        d = dataParsers.getEcotypeToAccessionDictionary(defaultValue='-1', user="******", passwd="bamboo123")
        ecotd = dataParsers.getEcotypeToNameDictionary(defaultValue='-1', user="******", passwd="bamboo123")
        print("Dictionaries loaded")
        names = []
        first96Names = []
        for i, ecotype in enumerate(snpsds[0].accessions):
            arrayID = snpsds[0].arrayIds[i]
            names.append((arrayID, ecotd[ecotype], ecotype))
            accession_number = int(d[ecotype][0])
            if accession_number > 97 or accession_number < 0:
                accessionsToRemove.append(ecotype)
            else:
                first96Names.append(
                    (arrayID, d[ecotype][1], d[ecotype][0], ecotype))
        first96Names.sort()
        print("First 96 accessions, len: %s :" % len(first96Names))
        for name in first96Names:
            print(name)
        names.sort()
        print("All accessions:")
        for name in names:
            print(name)
    elif first192:
        import phenotypeData
        ecotypes_192 = [str(e) for e in phenotypeData._getFirst192Ecotypes_()]
        print("%s %s" % (ecotypes_192, snpsds[0].accessions))
        for acc in snpsds[0].accessions:
            if acc not in ecotypes_192:
                accessionsToRemove.append(acc)
        print('found %s "192" ecotypes... removing %s ecotypes.'
              % (len(ecotypes_192), len(accessionsToRemove)))

    if removeLer:
        import analyzeHaplotype as ah
        accessionsToRemove += ah.getLerAndColAccessions(snpsds)[0]
    if removeCol:
        import analyzeHaplotype as ah
        accessionsToRemove += ah.getLerAndColAccessions(snpsds)[1]

    # Retrieve comparison list of accessions. (Error rates for accessions)
    if (removeIdentical or maxError < 1.0) and comparisonFile:
        sys.stderr.write("Loading comparison file:")
        snpsds2 = dataParsers.parseCSVData(comparisonFile,
                                           format=1,
                                           deliminator=delim,
                                           missingVal=missingVal,
                                           withArrayIds=waid2)
        res = []
        sys.stderr.write("Comparing accessions.")
        for i in range(len(snpsds)):
            res.append(snpsds[i].compareWith(snpsds2[i],
                                             withArrayIds=withArrayIds,
                                             verbose=False,
                                             heterozygous2NA=heterozygous2NA))
            sys.stderr.write(".")
        sys.stderr.write("\n")

        # Combine the per-data-set results: r[3] is weighted by r[6] and
        # divided by the summed r[6], per position j of r[2].
        # presumably r[2] = accessions, r[3] = error rates, r[5] = array ids,
        # r[6] = comparison counts -- TODO confirm against compareWith()
        totalAccessionCounts = [0] * len(res[0][2])
        accErrorRate = [0] * len(res[0][2])
        for r in res:
            for j in range(len(r[2])):
                totalAccessionCounts[j] += r[6][j]
                accErrorRate[j] += r[3][j] * float(r[6][j])
        for i in range(len(accErrorRate)):
            accErrorRate[i] = accErrorRate[i] / float(totalAccessionCounts[i])

        # r is the result of the last data set from here on.
        accErrAndID = []
        if has_array_ids:
            for i in range(len(r[2])):
                accErrAndID.append((accErrorRate[i], r[2][i], r[5][i]))
        else:
            for i in range(len(r[2])):
                accErrAndID.append((accErrorRate[i], r[2][i]))
        accErrAndID.sort()
        accErrAndID.reverse()

    # Figure out which accessions are too erroneous
    if maxError < 1.0 and comparisonFile:
        if has_array_ids:
            if arraysToRemove is None:
                arraysToRemove = []
            for (error, ecotype, array) in accErrAndID:
                if error > maxError:
                    accessionsToRemove.append(ecotype)
                    arraysToRemove.append(array)
        else:
            for (error, ecotype) in accErrAndID:
                if error > maxError:
                    accessionsToRemove.append(ecotype)

    if removeIdentical and comparisonFile and has_array_ids:
        print("Locating identical accessions")
        accErrAndID.sort()  # ascending: the lowest error rate comes first
        if arraysToRemove is None:
            arraysToRemove = []
        for accession in set(snpsds[0].accessions):
            if snpsds[0].accessions.count(accession) > 1:
                found = 0
                for (error, ecotype, array) in accErrAndID:
                    if ecotype == accession:
                        # the first (best) entry is kept, the others go
                        if found > 0:
                            accessionsToRemove.append(ecotype)
                            arraysToRemove.append(array)
                        found += 1

    if onlyCommon and comparisonFile:
        print("Locating accessions which are not shared")
        snpsds2 = dataParsers.parseCSVData(comparisonFile,
                                           format=1,
                                           deliminator=delim,
                                           missingVal=missingVal,
                                           withArrayIds=waid2)
        if arraysToRemove is None:
            arraysToRemove = []
        comparison_accessions = snpsds2[0].accessions
        for i, acc in enumerate(snpsds[0].accessions):
            if acc not in comparison_accessions:
                accessionsToRemove.append(acc)
                if has_array_ids:
                    arraysToRemove.append(snpsds[0].arrayIds[i])

    if maxMissing < 1.0:
        num_accessions = len(snpsds[0].accessions)
        missingCounts = [0] * num_accessions
        numSnps = 0
        for snpsd in snpsds:
            mc = snpsd.accessionsMissingCounts()
            numSnps += len(snpsd.positions)
            for i in range(num_accessions):
                missingCounts[i] += mc[i]

        missingRates = []
        if has_array_ids:
            # Keep the arrays collected by the earlier steps; only create the
            # list when nothing has been collected so far.
            if arraysToRemove is None:
                arraysToRemove = []
            for i in range(num_accessions):
                missingRates.append(
                    (missingCounts[i] / float(numSnps),
                     snpsds[0].accessions[i], snpsds[0].arrayIds[i]))
            missingRates.sort()
            missingRates.reverse()
            for (mrate, ecotype, array) in missingRates:
                if mrate > maxMissing:
                    accessionsToRemove.append(ecotype)
                    arraysToRemove.append(array)
        else:
            for i in range(num_accessions):
                missingRates.append((missingCounts[i] / float(numSnps),
                                     snpsds[0].accessions[i]))
            missingRates.sort()
            missingRates.reverse()
            for (mrate, ecotype) in missingRates:
                if mrate > maxMissing:
                    accessionsToRemove.append(ecotype)

    if removeEcotypes:
        for removeEcotype in removeEcotypes:
            accessionsToRemove.append(str(int(removeEcotype)))
        print("Removing %s accessions." % len(accessionsToRemove))

    if removeArray:
        if arraysToRemove is None:
            arraysToRemove = []
        arraysToRemove.append(str(removeArray))
        print("Removing %s  arrays." % len(arraysToRemove))

    numAccessions = len(snpsds[0].accessions)
    sys.stderr.write("Removing accessions.")
    for snpsd in snpsds:
        snpsd.removeAccessions(accessionsToRemove, arrayIds=arraysToRemove)
        sys.stderr.write(".")
    print("\n%s accessions out of %s were removed."
          % (numAccessions - len(snpsds[0].accessions), numAccessions))

    import snpsdata
    snpsdata.writeRawSnpsDatasToFile(output_fname,
                                     snpsds,
                                     chromosomes=[1, 2, 3, 4, 5],
                                     deliminator=delim,
                                     missingVal=missingVal,
                                     withArrayIds=waid1)
def _parse_bool_option(value):
    """
    Interpret a command-line option value as a boolean.

    An empty value and the spellings "0", "false", "f", "no", "n" and "off"
    (case-insensitive, surrounding whitespace ignored) give False; every
    other value gives True. Plain bool(value) would treat "0" as True.
    """
    return str(value).strip().lower() not in ("", "0", "false", "f", "no", "n", "off")


def _run_():
    """
    Command-line entry point: fetch data with
    dataParsers.get250KDataFromDb() and write it with
    snpsdata.writeRawSnpsDatasToFile().

    Host, user and password default to values read from env.env and can be
    overridden on the command line.

    Options (parsed from sys.argv with getopt):
        -o FILE                  output file (required)
        -z/--hostname HOST       database host
        -u/--user, -p/--passwd   database credentials
        -t/--method INT          passed on as methodId (default 1)
        -d/--delim STR           output delimiter (default ",")
        -m/--missingval STR      missing-value marker (default "NA")
        -a/--withArrayId VALUE   boolean flag, see _parse_bool_option()
        --callProbFile FILE      also request call probabilities
                                 (callProb=True) and pass FILE to the writer
        --newBatch               passed on as newBatch=True
        -h/--help                print the module docstring

    Exits with status 2 when called without arguments, when option parsing
    fails, or when no output file was given.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    # "user=" and "passwd=" must be two separate entries, otherwise the
    # --user/--passwd branches below can never be reached.
    long_options_list = [
        "newBatch", "hostname=", "user=", "passwd=", "method=", "delim=",
        "missingval=", "withArrayId=", "callProbFile=", "help"
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "z:u:p:o:t:d:m:a:h", long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    hostname = env.env['default_lookup_db']  #'papaya.usc.edu'
    user = env.env['db_user']
    # NOTE(review): the password default is read from the 'db_user' key, the
    # same key as the user name -- looks like a copy/paste slip; confirm the
    # intended key in env.env.
    passwd = env.env['db_user']
    output_fname = None
    method = 1
    delim = ","
    missingVal = "NA"
    help_shown = False
    withArrayId = False
    callProbFile = None
    newBatch = False

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help_shown = True
            print(__doc__)
        elif opt in ("-z", "--hostname"):
            hostname = arg
        elif opt in ("-u", "--user"):
            user = arg
        elif opt in ("-p", "--passwd"):
            passwd = arg
        elif opt in ("-o", ):
            output_fname = arg
        elif opt in ("-t", "--method"):
            method = int(arg)
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-a", "--withArrayId"):
            withArrayId = _parse_bool_option(arg)
        elif opt in ("--callProbFile", ):
            callProbFile = arg
        elif opt in ("--newBatch", ):
            newBatch = True

    if not output_fname:
        if not help_shown:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    import dataParsers
    import snpsdata

    chromosomes = [1, 2, 3, 4, 5]
    fetch_kwargs = dict(host=hostname,
                        chromosomes=chromosomes,
                        methodId=method,
                        user=user,
                        passwd=passwd,
                        withArrayIds=withArrayId,
                        newBatch=newBatch)
    write_kwargs = dict(chromosomes=chromosomes,
                        deliminator=delim,
                        missingVal=missingVal,
                        withArrayIds=withArrayId)
    if callProbFile:
        # Only the call-probability mode adds these two arguments.
        fetch_kwargs['callProb'] = True
        write_kwargs['callProbFile'] = callProbFile

    snpsds = dataParsers.get250KDataFromDb(**fetch_kwargs)
    snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds, **write_kwargs)