def singlerun(filename, outputFile, binsize, chop, modelOverride=None):
    fig = pylab.figure()
    galaxies = common.loadData(filename, dataType="CF2")
    velocities = [galaxy.v for galaxy in galaxies]
    #get a list of all the galaxies' velocities. This will let us send it directly to the histogram
    bins_orig = genBins(binsize, chop)

    #Make a histogram using pylab histogram function.
    #NOTE: the bins here are in km/s (radial velocity), so the binsize label uses km/s as well.
    n, bins, patches = pylab.hist(velocities,
                                  bins_orig,
                                  histtype="stepfilled",
                                  label="Galaxy Distribution,\n binsize={:.2f} km/s".format(binsize))

    #Change visual properties of the histogram
    pylab.setp(patches, 'facecolor', 'g', 'alpha', 0.75)

    #Add axis labels
    pylab.ylabel("Galaxy count")
    pylab.xlabel("Radial Velocity, km/s")
    pylab.title("Distribution of Galaxy radial velocities")
    pylab.axis([0, chop, 0, 1000])

    with pdfback.PdfPages(outputFile + str(binsize)) as pdf:
        pdf.savefig(fig)
    pylab.show()
    pylab.close('all')
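# The genBins helper called above is not defined in this snippet. A minimal sketch of what it
# plausibly does, inferred only from how it is called here (genBins(binsize, chop) feeding
# pylab.hist): evenly spaced bin edges from 0 out to roughly `chop` in steps of `binsize`.
# The real helper may differ.
import numpy as np

def genBins(binsize, chop):
    # Bin edges 0, binsize, 2*binsize, ... up to and including the first edge >= chop.
    return np.arange(0, chop + binsize, binsize)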
def prepareAllData(startDates, endDates, normalize):
    print "Preparing data..."
    allCourseData = {}
    for courseId in set(startDates.keys()).intersection(START_DATES.keys()):  # For each course
        # Load data for this course
        print "Loading {}...".format(courseId)
        try:
            somePc, someSurvey, somePcd = loadData(courseId)
            T0, Tc = computeCourseDates(courseId, startDates)
            allCourseData[courseId] = []
            print "...done"
            # We need at least 3 weeks' worth of data to both train and test the model.
            # We use the first 2 weeks' data to train a model (labels are determined by week 2, and
            # features are extracted from week 1). But then to *evaluate* that model, we need
            # another (3rd) week.
            Tcutoffs = np.arange(T0 + 3*WEEK, Tc, WEEK)
            print courseId, Tcutoffs
            for Tcutoff in Tcutoffs:
                # The users that we train/test on must have entered the course by the end of the
                # *first* week of the last 3 weeks in the time range. Hence, we subtract 2 weeks.
                usernames = getRelevantUsers(somePc, Tcutoff - 2*WEEK)
                allData = extractFeaturesAndTargets(somePc, somePcd, someSurvey, usernames,
                                                    T0, Tcutoff, normalize)
                allCourseData[courseId].append(allData)
        except (IOError, ValueError):
            print "Skipping"
            continue
    print "...done"
    return allCourseData
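# Illustration of the weekly cutoff schedule built above, under the assumption (consistent with
# the arithmetic in prepareAllData) that T0 and Tc are numpy datetime64 values and WEEK is a
# numpy timedelta64 of 7 days. The dates below are made up purely for the demonstration.
import numpy as np

T0 = np.datetime64('2013-01-07')   # hypothetical course start
Tc = np.datetime64('2013-03-11')   # hypothetical course end
WEEK = np.timedelta64(7, 'D')

# The first cutoff sits 3 weeks after T0: weeks 1-2 supply features/labels, week 3 is held out.
Tcutoffs = np.arange(T0 + 3 * WEEK, Tc, WEEK)
print(Tcutoffs)   # one candidate cutoff per week until the course end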
def remoteLoadData(message):
    logger.info("=== " * 10)
    logger.info(message)
    return loadData(es, conn, message["index"], message["data"], message["doc_type"],
                    message["download"], message["cui"], False, message["user"],
                    message["outputformat"], OUTPUT_URL, OUTPUT_FOLDER)
def prepareAllData(startDates, endDates, demographicsOnly):
    print "Preparing data..."
    allCourseData = {}
    #for courseId in set(pcd.keys()).intersection(START_DATES.keys()):  # For each course
    for courseId in set(startDates.keys()).intersection(START_DATES.keys()):  # For each course
        # Load data for this course
        print "Loading {}...".format(courseId)
        try:
            somePc, someSurvey, somePcd = loadData(courseId)
        except (IOError, pandas.io.parsers.EmptyDataError):
            print "Skipping"
            continue
        # If no certifiers, then skip
        if (np.sum(somePc.certified) < MIN_EXAMPLES) or (np.sum(somePc.certified) >= len(somePc) - MIN_EXAMPLES):
            print "Skipping"
            continue
        T0, Tc = computeCourseDates(courseId, startDates)
        allCourseData[courseId] = []
        print "...done"
        Tcutoffs = np.arange(T0 + 1*WEEK, Tc + np.timedelta64(1, 'D'), WEEK)
        for Tcutoff in Tcutoffs:
            usernames = getRelevantUsers(somePc, Tcutoff)
            allData = splitAndGetNormalizedFeatures(somePc, somePcd, someSurvey, usernames,
                                                    T0, Tcutoff, demographicsOnly)
            allCourseData[courseId].append(allData)
    print "...done"
    return allCourseData
def main(*pArgs):
    l_Ctx = {}     # Environmental context

    # Establish the context
    cmn.getOptions(l_Ctx, pArgs[0])

    # Load the data as a pickle file from the input root directory
    cmn.loadData(l_Ctx)
    l_Data = l_Ctx["ServerData"]

    # Optionally, print the results
    if l_Ctx["PRINT_PICKLED_RESULTS"]:
        cmn.printFormattedData(l_Ctx, l_Data)

    # Generate disk stats listing
    generateDiskStatsListing(l_Ctx)

    return
def main(*pArgs):
    l_Ctx = {}     # Environmental context

    # Establish the context
    cmn.getOptions(l_Ctx, pArgs[0])

    # Load the data as a pickle file from the input root directory
    cmn.loadData(l_Ctx)
    l_Data = l_Ctx["ServerData"]

    # Optionally, print the results
    if l_Ctx["PRINT_PICKLED_RESULTS"]:
        cmn.printFormattedData(l_Ctx, l_Data)

    # Perform basic analysis
    performBasicAnalysis(l_Ctx)

    return
def main(*pArgs):
    l_Ctx = {}     # Environmental context

    # Establish the context
    cmn.getOptions(l_Ctx, pArgs[0])

    # Load the data as a pickle file from the input root directory
    cmn.loadData(l_Ctx)
    l_Data = l_Ctx["ElapsedTimeData"]

    # Optionally, print the results
    if l_Ctx["PRINT_PICKLED_RESULTS"]:
        cmn.printFormattedData(l_Ctx, l_Data)

    # Calculate/print the transfer rates
    calculateTransferRates(l_Ctx)

    return
def distanceOneBox(hugeFile, surveys, outFile):
    #Generate distance data for one sub-box - distance from each galaxy to each survey center.
    #These distances are not returned, instead they are only written to the disk.
    galaxies = np.array(common.loadData(hugeFile, dataType='millPos'))
    for boxoffset in EDGES:
        distances = space.distance.cdist(galaxies + boxoffset * 500, surveys)
        np.save(outFile + "_{}_{}_{}.npy".format(boxoffset[0], boxoffset[1], boxoffset[2]),
                distances)
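# EDGES is not defined in this snippet. From the way it is used here (and in distsurvey and
# surveyOneFile below), each boxoffset is multiplied by 500 and added to every galaxy position,
# which suggests integer offset vectors that periodically replicate a 500 Mpc/h box with its
# neighboring images. A hypothetical definition along those lines, for illustration only:
import itertools
import numpy as np

EDGES = np.array(list(itertools.product((-1, 0, 1), repeat=3)))   # 27 offsets, including (0, 0, 0)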
def getHash(filename, units):
    """Loads a CF2 file and uses it to rebuild the hash database.

    Returns strings only; the caller is expected to hash each one with
    hashlib.md5(string.encode('utf-8')).hexdigest()."""
    galaxies = common.loadData(filename, dataType='CF2')
    if units == 'Mpc/h':
        return myNpHash(np.array([(a.x, a.y, a.z, a.v, a.dv) for a in galaxies]))
    elif units == 'km/s':
        return myNpHash(np.array([a.getRedshiftXYZ() + (a.v, a.dv) for a in galaxies]))
    else:
        raise ValueError("Value of 'units' must be 'Mpc/h' or 'km/s'. "
                         "Other unit schemes do not exist at present")
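# Usage sketch for getHash, following its docstring: the function returns strings and the caller
# hashes them itself. The filename below is hypothetical.
import hashlib

strings = getHash("some_survey.cf2", units='Mpc/h')
digests = [hashlib.md5(s.encode('utf-8')).hexdigest() for s in strings]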
def main(*pArgs):
    l_Ctx = {}     # Environmental context

    # Establish the context
    cmn.getOptions(l_Ctx, pArgs[0])

    # Load the data as a pickle file from the input root directory
    cmn.loadData(l_Ctx)
    l_Data = l_Ctx["ServerData"]

    # Optionally, print the results
    if l_Ctx["PRINT_PICKLED_RESULTS"]:
        cmn.printFormattedData(l_Ctx, l_Data)

    # Perform all analysis
    performBasicAnalysis(l_Ctx)
    generateDiskStatsListing(l_Ctx)
    generateErrorsListing(l_Ctx)
    calculateTransferRates(l_Ctx)
    generateWorkQueueMgrDumps(l_Ctx)

    return
def prune(filename):
    with open(filename, 'r') as boxfile:
        data = boxfile.readlines()
    dataCopy = [line for line in data]  #DON'T MODIFY THE DATACOPY!
    isComment = True
    offset = -1
    while isComment:
        offset += 1
        #print(offset,len(data))
        isComment = (data[offset][0] == '#') or (data[offset][0] == 'x')
    #Note: after this script is done processing a millennium file, the offset method will no longer
    #work because there will be comments everywhere in the body of the csv file.
    positions = common.loadData(filename, 'millPos')
    kd = cKDTree(positions)
    pairs = kd.query_pairs(0.0001)
    for g1, g2 in pairs:
        #build a new galaxy, with averaged things, but summed mass
        #print(data[g1+offset])
        #print(data[g2+offset])
        gal1 = common.MillenniumGalaxy(dataCopy[g1 + offset])
        gal2 = common.MillenniumGalaxy(dataCopy[g2 + offset])
        totalMass = gal1.mvir + gal2.mvir
        #NOTE: these mass-weighted velocities are computed but never used below;
        #the average galaxy's velocity is a plain (unweighted) mean.
        weightedVelocity1 = np.array([gal1.velX, gal1.velY, gal1.velZ]) * gal1.mvir
        weightedVelocity2 = np.array([gal2.velX, gal2.velY, gal2.velZ]) * gal2.mvir
        averageGalaxy = common.MillenniumGalaxy([gal1.x, gal1.y, gal1.z,
                                                 (gal1.velX + gal2.velX) / 2,
                                                 (gal1.velY + gal2.velY) / 2,
                                                 (gal1.velZ + gal2.velZ) / 2,
                                                 totalMass,
                                                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
        data[g1 + offset] = '#REMOVED BECAUSE IT IS A DUPLICATE#' + dataCopy[g1 + offset]
        data[g2 + offset] = '#REMOVED BECAUSE IT IS A DUPLICATE#' + dataCopy[g2 + offset]
        data.append('#AVERAGE GALAXY FOLLOWS#\n')
        data.append(averageGalaxy.toString() + '\n')
        data.append('#The above galaxy was added as an average of two galaxies that were in the same location\n')
        data.append('#The only information it has associated with it is POSITION, VELOCITY, and VIRIAL MASS, because none of the other attributes were currently in use (and I didn\'t know how to use them) when this file was created.\n')
    with open(filename, 'w') as newFile:
        newFile.writelines(data)
    return len(pairs)
def main():
    data = common.loadData("../tully-fisher.csv", dataType="miscFloat")
    pairs = [(row[0], np.average(row[1:5])) for row in data]
    xs = [point[1] for point in pairs]
    ys = [point[0] for point in pairs]
    fig = plt.figure(figsize=(8, 6), dpi=90)
    ax = fig.add_subplot(111)
    ax.set_title("Tully-Fisher relationship")
    plt.xlabel("Galaxy absolute magnitude")
    plt.ylabel("Galaxy rotational velocity")
    ax.set_yscale("log", nonposy='clip')  # the log scale is on the y-axis, so the kwarg is nonposy
    plt.plot(xs, ys, '.')
    ax.axis([max(xs) + 1, min(xs) - 1, min(ys) - 20, max(ys) + 100])
    #print(min(xs),max(xs),min(ys),max(ys))
    plt.show()
def onstart(self):
    logging.info(START_LOAD_DATA % (self.fund, self.strategyName, self.period))
    datas = loadData(self.period, self.startDate)
    timestamp = 0
    for data in datas:
        group = data['symbol'].split('--')[0]
        if not self.check(group, data['symbol']):
            continue
        data['period'] = 60
        data['timestamp'] = data['timestamp'] * 1000
        self.get_deque().appendleft((DateType.ON_MKT_KLINE, group, Bar(data)))
        if timestamp and timestamp < data['timestamp']:
            self.get_deque().appendleft((DateType.ON_TIMESTAMP, group, {'timestamp': timestamp}))
        timestamp = data['timestamp']
    logging.info(LOAD_DATA_OK % (self.fund, self.strategyName, self.period))
    self.__loadingTag = True
    sendTradeMsg(INIT_MSG % (self.fund, self.strategyName, self.period, self.symbols()))
def filemanagement(rawInFile, infileindices):
    #This is soooo stupid.
    #If there are lots of files, set them up accordingly.
    inFileList = [rawInFile.format(x) for x in infileindices]
    d = []
    for f in inFileList:
        galaxies = common.loadData(f, dataType='CF2')
        #print(galaxiess[1][2])
        minid = np.array([(g.normx, g.normy, g.normz, g.redx, g.redy, g.redz, g.dv, g.d, g.v)
                          for g in galaxies])
        d.append(minid)
    #[np.save("tmp/galaxydata{}".format(i),data) for i,data in enumerate(d)]
    #d = ["tmp/galaxydata{}.npy".format(i) for i in range(100)]
    return d
def correlate_box(boxinfo, intervals):
    #load in the subbox
    xs, ys, zs = common.loadData(boxinfo[0])
    #grab its THEORETICAL minimum and maximum values:
    #Need to add detection of actual minimum and maximum values.
    #I might already have the logic in the box cutter function...
    rect_min = boxinfo[1][0]
    rect_max = boxinfo[1][1]
    #and its length
    num_galax = len(xs)
    #make sure we don't have a jagged array somehow
    #assert(num_galax == len(ys) == len(zs)) #Never had a problem with that
    actual_galaxies = np.array(list(zip(xs, ys, zs)))
    #Make a thread-safe random number generator
    rng = np.random.RandomState()
    #and use it to make a list of random galaxy positions
    random_list = np.array(list(zip(rng.uniform(rect_min[0], rect_max[0], num_galax),
                                    rng.uniform(rect_min[1], rect_max[1], num_galax),
                                    rng.uniform(rect_min[2], rect_max[2], num_galax))))
    #make kd trees
    actual_kd = space.cKDTree(actual_galaxies, 3)
    random_kd = space.cKDTree(random_list, 3)
    DDs = actual_kd.count_neighbors(actual_kd, intervals, cumulative=False)[1:] / 2
    DRs = actual_kd.count_neighbors(random_kd, intervals, cumulative=False)[1:] / 2
    RRs = random_kd.count_neighbors(random_kd, intervals, cumulative=False)[1:] / 2
    #RDs = random_kd.count_neighbors(actual_kd,intervals)
    #Turns out that RDs == DRs always. Just think about it.
    #print('.',end='',flush=True)
    return (DDs, DRs, RRs)
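# correlate_box returns raw pair counts (DD, DR, RR) per separation bin; the rest of the pipeline
# is not shown in this snippet. For reference, the two standard ways such counts are combined into
# a two-point correlation function are sketched below. Because the data and random catalogues here
# have the same size (num_galax), the usual normalization factors roughly cancel; the estimator
# actually used elsewhere in this project may differ.
import numpy as np

def correlation_estimators(DDs, DRs, RRs):
    DDs = np.asarray(DDs, dtype=float)
    DRs = np.asarray(DRs, dtype=float)
    RRs = np.asarray(RRs, dtype=float)
    xi_natural = DDs / RRs - 1.0                        # "natural" estimator
    xi_landy_szalay = (DDs - 2.0 * DRs + RRs) / RRs     # Landy-Szalay estimator
    return xi_natural, xi_landy_szalay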
def distsurvey(hugeFile, surveys, binsize, boxMaxDistance):
    galaxies = np.array(common.loadData(hugeFile, dataType='millPos'))
    masterhist = []
    firstrun = True
    for boxoffset in EDGES:
        distances = space.distance.cdist(galaxies + boxoffset * 500, surveys)
        numSurveys = distances.shape[1]
        assert numSurveys == len(surveys)
        bins = genBins(binsize, boxMaxDistance)
        histogram = []
        for i in range(numSurveys):
            thisSurveyDistances = distances[:, i]
            #This means take distances for all the galaxies, but only the ith survey.
            #That works since the first dimension of distances is the galaxies and the second
            #dimension is surveys.
            hist, edges = np.histogram(thisSurveyDistances, bins)
            histogram.append(hist)
        if firstrun:
            masterhist = histogram
        else:
            for i in range(len(masterhist)):
                masterhist[i] = histogram[i] + masterhist[i]
        firstrun = False
        print(".", end="", flush=True)
    return np.array(masterhist)
def surveyOneFile(hugeFile, surveys, selectionParams, histogram, boxMaxDistance):
    """
    Given the original data, distances, wanted numbers, and other parameters we actually generate
    the mock surveys. This is currently the biggest bottleneck of the program, but I'm not
    entirely sure why.

    ^ That comment might be outdated (It's not). Since I wrote it, I've rearranged the code a lot
    for improved efficiency, including using the GPU for some otherwise expensive calculations.
    """
    #Set up variables
    rng = np.random.RandomState()  #Make a process-safe random number generator
    first = True
    mastersurvey = []
    galax_pos = np.array(common.loadData(hugeFile, dataType='millPos'))
    for boxoffset in EDGES:
        # distances = np.load(distanceFile+"_{}_{}_{}.npy".format(boxoffset[0],
        #                                                         boxoffset[1],
        #                                                         boxoffset[2]))  #load the distance file !! 7.3% !!
        distances = space.distance.cdist(galax_pos + boxoffset * 500, surveys)
        surveyContent = [[] for i in range(distances.shape[1])]  #Make the skeleton structure for the end result
        galaxies = common.loadRawMillHybrid(hugeFile, boxoffset)  #Load the galaxies !! 13.9% !!
        #WARNING: ^ Function assumes 500x500x500 box
        binsize = selectionParams['info']['shell_thickness']  #Load the size of a bin
        bins = genBins(binsize, boxMaxDistance)
        numSurveys = distances.shape[1]

        #Do calculations
        wantDensity = compute_want_density(distances, binsize, **(selectionParams["constants"]))  #!! 20 % !!
        #wantDensity = selection_values / common.shellVolCenter(distances,binsize)  #!! 22 % !!
        tdistBin = (np.digitize(distances.flatten(), bins) - 1).reshape(distances.shape)
        #!!!!IMPORTANT!!!!!
        #tdistBin and distBin are the indexes of the bin each galaxy goes into.
        #Remember these are only the galaxies in the subbox we're looking at (surveyOneFile).
        distBin = np.transpose(tdistBin)  # !! 6 % !!
        histogram = np.concatenate((histogram, np.zeros((100, 1)) - 1), axis=1)
        #When a galaxy is outside chop, its "number of galaxies around here" is set to -1.
        #That makes the density negative, which makes the probability negative, so it is
        #impossible for that galaxy to be selected.
        something = [histogram[n][distBin[n]] for n in range(numSurveys)]
        #This is the number of galaxies in the same bin as the galaxy and in the same box as the galaxy.
        originalCount = np.transpose(np.array(something))  # 2.2%
        #volumes = calcVolumes(np.transpose(np.array(distBin))*binsize + (binsize/2),binsize)  # !! 23 % !!
        volumes = calcVolumes(tdistBin, binsize)  # !! 23 % !!
        originalDensity = originalCount / volumes
        #I'm currently under the impression that this for loop is the main bottleneck in this function.
        #It isn't a complicated task, so it might be worthwhile to consider alternate implementations.
        #Let's see...
        #This for loop uses information from distances, binsize, histogram
        # for i,galaxy in enumerate(distances):
        #     for j,surveyDist in enumerate(galaxy):
        #         distBin = int(distBinNotAnInt[i][j])
        #         originalDensity[i][j] = histogram[j][distBin] / common.shellVolCenter(distBin*binsize + (binsize/2),binsize)
        probability = wantDensity / originalDensity
        dice = rng.random_sample(probability.shape)
        toAdd = dice < probability
        # for i,galaxy in enumerate(toAdd):
        #     for j,addBool in enumerate(galaxy):
        #         if addBool:
        #             rawLine = galaxies[i][3]
        #             surveyContent[j].append(rawLine)
        arrGalaxies = np.array(galaxies, dtype="object")
        surveyContent = [arrGalaxies[toAdd[:, n]] for n in range(numSurveys)]
        print(".", end="", flush=True)
        if first:
            mastersurvey = surveyContent
            first = False
        else:
            for i in range(len(surveyContent)):
                mastersurvey[i] = np.concatenate((mastersurvey[i], surveyContent[i]))
    print("!")
    return mastersurvey
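# Toy illustration of the selection step above, separate from the survey code: each galaxy is kept
# with probability (wanted density) / (available density) in its distance bin, by comparing a
# uniform random draw against that ratio. The numbers below are made up.
import numpy as np

rng = np.random.RandomState(42)
want_density = np.array([0.02, 0.01, 0.005])   # hypothetical target densities per galaxy
have_density = np.array([0.04, 0.01, 0.001])   # hypothetical available densities per galaxy
probability = want_density / have_density      # 0.5, 1.0, 5.0
dice = rng.random_sample(probability.shape)
keep = dice < probability
# A ratio above 1 (an under-dense bin) means the galaxy is always kept; simple rejection sampling
# cannot add galaxies that are not there.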
                    default=[0.005],
                    help='Specify the learning rate.')
parser.add_argument('--lambda', type=float, nargs='+', default=[0], dest='reg',
                    help='Specify the regularization parameter.')
parser.add_argument('--epochs', type=int, nargs=1, default=[5000],
                    help='Training epochs.')
parser.add_argument('--batch_size', type=int, nargs=1, default=[500],
                    help='Batch size. For SGD only.')
parser.add_argument('--beta1', type=float, nargs='+', default=[0.9],
                    help='Specify the beta1 hyperparameter for Adam.')
parser.add_argument('--beta2', type=float, nargs='+', default=[0.999],
                    help='Specify the beta2 hyperparameter for Adam.')
parser.add_argument('--epsilon', type=float, nargs='+', default=[1e-8],
                    help='Specify the epsilon hyperparameter for Adam.')
args = parser.parse_args()

trainData, validData, testData, trainTarget, validTarget, testTarget = loadData()
X, y = preprocess_data(trainData, trainTarget)
N, d = X.shape
Xt, yt = preprocess_data(testData, testTarget)
Xv, yv = preprocess_data(validData, validTarget)

epochs = args.epochs[0]

# Output path
path = args.path[0]
create_dir(path)

batch_size = args.batch_size[0]
optimizer = args.optimizer[0]
loss_type = args.loss[0]
def main(*pArgs):
    l_Ctx = {}     # Environmental context
    l_TestOutput = []

    # Establish the context
    cmn.getOptions(l_Ctx, pArgs[0])

    # Load the data as a pickle file from the input root directory
    cmn.loadData(l_Ctx)
    l_Data = l_Ctx["ServerData"]
    l_StageInData = l_Ctx["StageInData"]["jobIds"]
    l_StageOutData = l_Ctx["StageOutData"]["jobIds"]

    # Print the results
    if l_Ctx["PRINT_PICKLED_RESULTS"]:
        cmn.printFormattedData(l_Ctx, l_Data)

    # Determine list of servers
    l_TestOutput.append("List of Servers%s" % (os.linesep))
    l_Servers = cmn.getServers(l_Data)
    l_TestOutput.append("%d server(s), %s%s" % (len(l_Servers), `l_Servers`, os.linesep))
    l_TestOutput.append("%s" % (os.linesep))

    # Determine list of jobids
    l_TestOutput.append("List of JobIds%s" % (os.linesep))
    l_JobIds = cmn.getJobIds(l_Data)
    l_TestOutput.append("%d jobid(s), %s%s" % (len(l_JobIds), `l_JobIds`, os.linesep))
    l_TestOutput.append("%s" % (os.linesep))

    # Determine jobids per server
    l_TestOutput.append("List of JobIds per Server%s" % (os.linesep))
    for l_Server in l_Servers:
        l_JobIds = cmn.getJobIdsForServer(l_Data, l_Server)
        l_TestOutput.append("For server %s, %d jobid(s), %s%s" % (l_Server, len(l_JobIds), `l_JobIds`, os.linesep))
    l_TestOutput.append("%s" % (os.linesep))

    # Determine servers per jobid
    l_TestOutput.append("List of Servers per JobId%s" % (os.linesep))
    l_JobIds = cmn.getJobIds(l_Data)
    for l_JobId in l_JobIds:
        l_Servers = cmn.getServersForJobid(l_Data, l_JobId)
        l_TestOutput.append("For jobid %d, %d server(s), %s%s" % (l_JobId, len(l_Servers), `l_Servers`, os.linesep))
    l_TestOutput.append("%s" % (os.linesep))

    # Determine handles per jobid, per server, per jobstepid, per connection
    l_TestOutput.append("List of Handles per JobId, per Server, per JobStepId, per Connection%s" % (os.linesep))
    for l_JobId in l_JobIds:
        l_Servers = cmn.getServersForJobid(l_Data, l_JobId)
        l_TestOutput.append("%sJobId %d, %d server(s)%s" % (2*" ", l_JobId, len(l_Servers), os.linesep))
        for l_Server in l_Servers:
            l_JobStepIds = cmn.getJobStepIdsForServerJobId(l_Data, l_Server, l_JobId)
            l_TestOutput.append("%sJobId %d, server %s, %d jobstep(s)%s" % (4*" ", l_JobId, l_Server, len(l_JobStepIds), os.linesep))
            for l_JobStepId in l_JobStepIds:
                l_ConnectionHandleData = cmn.getHandlesPerConnectionForJobIdJobStepId(l_Data, l_Server, l_JobId, l_JobStepId)
                l_ConnectionHandles = l_ConnectionHandleData.keys()
                l_ConnectionHandles.sort()
                l_TestOutput.append("%sJobStepId %d, %d connection(s)%s" % (6*" ", l_JobStepId, len(l_ConnectionHandles), os.linesep))
                for l_Connection in l_ConnectionHandles:
                    l_Handles = l_ConnectionHandleData[l_Connection]
                    l_Handles.sort()
                    l_TestOutput.append("%sConnection %s. %d handle(s), %s%s" % (8*" ", l_Connection, len(l_Handles), `l_Handles`, os.linesep))
    l_TestOutput.append("%s" % (os.linesep))

    # Print out stagein and stageout logs for the found jobids
    l_TestOutput.append("StageIn and StageOut Log Data for Found Jobs%s" % (os.linesep))
    for l_JobId in l_JobIds:
        l_TestOutput.append("%sJobId %d%s" % (2*" ", l_JobId, os.linesep))

        l_TestOutput.append("%sStageIn Log Data%s" % (4*" ", os.linesep))
        l_OutputGenerated = False
        if l_JobId in l_StageInData:
            for l_Line in l_StageInData[l_JobId]:
                l_TestOutput.append("%s%s%s" % (6*" ", l_Line, os.linesep))
                l_OutputGenerated = True
        if not l_OutputGenerated:
            l_TestOutput.append("%sNo stagein data found%s" % (6*" ", os.linesep))
        l_TestOutput.append("%s" % (os.linesep))

        l_TestOutput.append("%sStageOut Log Data%s" % (4*" ", os.linesep))
        l_OutputGenerated = False
        if l_JobId in l_StageOutData:
            for l_Line in l_StageOutData[l_JobId]:
                l_TestOutput.append("%s%s%s" % (6*" ", l_Line, os.linesep))
                l_OutputGenerated = True
        if not l_OutputGenerated:
            l_TestOutput.append("%sNo stageout data found%s" % (6*" ", os.linesep))
        l_TestOutput.append("%s" % (os.linesep))

    # Write out the basic results
    l_PathFileName = os.path.join(l_Ctx["ROOTDIR"], "Analysis", "BasicData.txt")
    cmn.writeOutput(l_Ctx, l_PathFileName, l_TestOutput)
    print "Basic results written to %s" % l_PathFileName
    print

    # Start detailed analysis per jobid/jobstepid, per server
    #
    # For each jobid...
    l_JobIds = cmn.getJobIds(l_Data)
    for l_JobId in l_JobIds:
        l_Output = {}
        l_Output[l_JobId] = {}
        l_NumberOfConnectionsForAllServers = 0
        l_NumberOfHandlesForAllServers = 0

        # For each server...
        l_Servers = cmn.getServersForJobid(l_Data, l_JobId)
        for l_Server in l_Servers:
            l_Output[l_JobId][l_Server] = {}
            l_NumberOfConnectionsForServer = 0
            l_NumberOfHandlesForServer = 0

            # For each jobstepid...
            l_JobStepIds = cmn.getJobStepIdsForServerJobId(l_Data, l_Server, l_JobId)
            for l_JobStepId in l_JobStepIds:
                l_Output[l_JobId][l_Server][l_JobStepId] = {}
                l_Output[l_JobId][l_Server][l_JobStepId]["handles"] = {}
                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["NumberOfConnections"] = 0
                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["SizeTransferred"] = 0
                l_Output[l_JobId][l_Server][l_JobStepId]["NotSuccessfulHandles"] = []
                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"] = {}

                # For each handle...
                l_Handles = cmn.getHandlesForServer(l_Data, l_Server)
                l_NumberOfHandles = 0
                l_HandleProcessingTimes = [None, None]
                l_NotSuccessfulHandles = []
                for l_Handle in l_Handles:
                    if l_Data[l_Server]["Handles"][l_Handle]["JobId"] == l_JobId and \
                       l_Data[l_Server]["Handles"][l_Handle]["JobStepId"] == l_JobStepId:
                        # JobId and JobStepId matches...
                        l_NumberOfHandles = l_NumberOfHandles + 1
                        l_NumberOfHandlesForServer = l_NumberOfHandlesForServer + 1
                        l_NumberOfHandlesForAllServers = l_NumberOfHandlesForAllServers + 1
                        if "Status" in l_Data[l_Server]["Handles"][l_Handle]:
                            if l_Data[l_Server]["Handles"][l_Handle]["Status"] != "BBFULLSUCCESS":
                                l_NotSuccessfulHandles.append((l_Handle, l_Data[l_Server]["Handles"][l_Handle]["Status"]))
                        else:
                            l_NotSuccessfulHandles.append((l_Handle, "NOT_COMPLETED"))
                        l_NumberOfConnections = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["NumberOfConnections"]
                        l_SizeTransferred = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["SizeTransferred"]

                        # For each connection...
                        l_Connections = l_Data[l_Server]["Handles"][l_Handle]["Connections"]
                        for l_Connection in l_Connections:
                            if l_Connection not in l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"]:
                                l_NumberOfConnections = l_NumberOfConnections + 1
                                l_NumberOfConnectionsForServer = l_NumberOfConnectionsForServer + 1
                                l_NumberOfConnectionsForAllServers = l_NumberOfConnectionsForAllServers + 1
                                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection] = {}
                                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"] = {}
                                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["NumberOfContribIds"] = 0
                                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["processingTimes (File Min/Max)"] = [None, None]
                                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["readTimes (File Min/Max)"] = [None, None]
                                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["writeTimes (File Min/Max)"] = [None, None]
                                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["syncTimes (File Min/Max)"] = [None, None]

                            # For each LVUuid...
                            l_LVUuids = l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"]
                            for l_LVUuid in l_LVUuids:
                                # For each ContribId...
                                l_ContribIds = l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"]
                                l_ProcessingTimes = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["processingTimes (File Min/Max)"]
                                l_NumberOfContribIds = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["NumberOfContribIds"]
                                l_NotSuccessfulContribIds = []
                                for l_ContribId in l_ContribIds:
                                    l_NumberOfContribIds = l_NumberOfContribIds + 1
                                    if "Status" in l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]:
                                        if l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Status"] != "BBFULLSUCCESS":
                                            l_NotSuccessfulContribIds.append((l_ContribId, l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Status"]))
                                    else:
                                        l_NotSuccessfulContribIds.append((l_ContribId, "NOT_COMPLETED"))
                                    if "SizeTransferred" in l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]:
                                        l_SizeTransferred += l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["SizeTransferred"]
                                    if "ProcessingTime" in l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]:
                                        l_ProcessingTime = (l_ContribId, l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["ProcessingTime"])
                                        if l_ProcessingTimes == [None, None]:
                                            l_ProcessingTimes = [l_ProcessingTime, l_ProcessingTime]
                                        else:
                                            if (l_ProcessingTime[1] < l_ProcessingTimes[0][1]):
                                                l_ProcessingTimes[0] = l_ProcessingTime
                                            if (l_ProcessingTime[1] > l_ProcessingTimes[1][1]):
                                                l_ProcessingTimes[1] = l_ProcessingTime
                                        if l_HandleProcessingTimes == [None, None]:
                                            l_HandleProcessingTimes = l_ProcessingTimes
                                        else:
                                            if (l_ProcessingTimes[0][1] < l_HandleProcessingTimes[0][1]):
                                                l_HandleProcessingTimes[0] = l_ProcessingTimes[0]
                                            if (l_ProcessingTimes[1][1] > l_HandleProcessingTimes[1][1]):
                                                l_HandleProcessingTimes[1] = l_ProcessingTimes[1]
                                    l_ReadTimes = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["readTimes (File Min/Max)"]
                                    l_WriteTimes = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["writeTimes (File Min/Max)"]
                                    l_SyncTimes = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["syncTimes (File Min/Max)"]
                                    l_TransferTypes = set()
                                    if "Files" in l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]:
                                        l_Files = l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Files"]
                                        # For each file...
                                        for l_File in l_Files:
                                            if "ReadCount" in l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Files"][l_File]:
                                                l_ReadTime = (l_ContribId, l_File, (l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Files"][l_File]["ReadCount"], l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Files"][l_File]["ReadTime"]))
                                                if l_ReadTimes == [None, None]:
                                                    l_ReadTimes = [l_ReadTime, l_ReadTime]
                                                else:
                                                    if (l_ReadTime[2][1] < l_ReadTimes[0][2][1]):
                                                        l_ReadTimes[0] = l_ReadTime
                                                    if (l_ReadTime[2][1] > l_ReadTimes[1][2][1]):
                                                        l_ReadTimes[1] = l_ReadTime
                                                l_WriteTime = (l_ContribId, l_File, (l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Files"][l_File]["WriteCount"], l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Files"][l_File]["WriteTime"]))
                                                if l_WriteTimes == [None, None]:
                                                    l_WriteTimes = [l_WriteTime, l_WriteTime]
                                                else:
                                                    if (l_WriteTime[2][1] < l_WriteTimes[0][2][1]):
                                                        l_WriteTimes[0] = l_WriteTime
                                                    if (l_WriteTime[2][1] > l_WriteTimes[1][2][1]):
                                                        l_WriteTimes[1] = l_WriteTime
                                                l_TransferTypes.add(l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Files"][l_File]["TransferType"])
                                                # NOTE: The output will associate the sync count/time with the source file, when in fact that data is for the corresponding target files.
                                                if l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Files"][l_File]["SyncCount"] != None:
                                                    l_SyncTime = (l_ContribId, l_File, (l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Files"][l_File]["SyncCount"], l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Files"][l_File]["SyncTime"]))
                                                    if l_SyncTimes == [None, None]:
                                                        l_SyncTimes = [l_SyncTime, l_SyncTime]
                                                    else:
                                                        if (l_SyncTime[2][1] < l_SyncTimes[0][2][1]):
                                                            l_SyncTimes[0] = l_SyncTime
                                                        if (l_SyncTime[2][1] > l_SyncTimes[1][2][1]):
                                                            l_SyncTimes[1] = l_SyncTime
                                        # End of files...
                                    l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["readTimes (File Min/Max)"] = l_ReadTimes
                                    l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["writeTimes (File Min/Max)"] = l_WriteTimes
                                    l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["syncTimes (File Min/Max)"] = l_SyncTimes
                                    l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["transferTypes"] = [l_TransferType for l_TransferType in l_TransferTypes]
                                # End of contribids...
                                if l_NotSuccessfulContribIds:
                                    if "NotSuccessfulContribIds" not in l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]:
                                        l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["NotSuccessfulContribIds"] = []
                                    l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["NotSuccessfulContribIds"] = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["NotSuccessfulContribIds"] + l_NotSuccessfulContribIds
                                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["NumberOfContribIds"] = l_NumberOfContribIds
                                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["processingTimes (File Min/Max)"] = l_ProcessingTimes
                        # End of connections...
                        l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["NumberOfConnections"] = l_NumberOfConnections
                        if l_SizeTransferred:
                            l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["SizeTransferred"] = l_SizeTransferred
                # End of handles...
                if l_NotSuccessfulHandles:
                    l_Output[l_JobId][l_Server][l_JobStepId]["NotSuccessfulHandles"] = l_Output[l_JobId][l_Server][l_JobStepId]["NotSuccessfulHandles"] + l_NotSuccessfulHandles
                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["NumberOfHandles"] = l_NumberOfHandles
                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["ProcessingTimes (ContribId Min/Max)"] = l_HandleProcessingTimes
            l_Output[l_JobId][l_Server]["NumberOfConnectionsForServer"] = l_NumberOfConnectionsForServer
            l_Output[l_JobId][l_Server]["NumberOfHandlesForServer"] = l_NumberOfHandlesForServer
        # End of servers...
        l_Output[l_JobId]["NumberOfConnectionsForAllServers"] = l_NumberOfConnectionsForAllServers
        l_Output[l_JobId]["NumberOfHandlesForAllServers"] = l_NumberOfHandlesForAllServers

        # Calculate the min/max processing times for all contribids, across all servers
        l_ServerProcessingTimes = [None, None]
        for l_Server in l_Output[l_JobId].keys():
            if type(l_Output[l_JobId][l_Server]) == dict:
                for l_JobStepId in l_Output[l_JobId][l_Server].keys():
                    if type(l_Output[l_JobId][l_Server][l_JobStepId]) == dict:
                        if l_ServerProcessingTimes == [None, None]:
                            l_ServerProcessingTimes = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["ProcessingTimes (ContribId Min/Max)"]
                        else:
                            if (l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["ProcessingTimes (ContribId Min/Max)"][0][1] < l_ServerProcessingTimes[0][1]):
                                l_ServerProcessingTimes[0] = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["ProcessingTimes (ContribId Min/Max)"][0]
                            if (l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["ProcessingTimes (ContribId Min/Max)"][1][1] > l_ServerProcessingTimes[1][1]):
                                l_ServerProcessingTimes[1] = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["ProcessingTimes (ContribId Min/Max)"][1]
        l_Output[l_JobId]["ProcessingTimes (All Servers ContribId Min/Max)"] = l_ServerProcessingTimes

        # Format data/add average size transferred per handle data
        for l_JobId in l_Output:
            for l_Server in l_Output[l_JobId]:
                if type(l_Output[l_JobId][l_Server]) == dict:
                    for l_JobStepId in l_Output[l_JobId][l_Server]:
                        # NOTE: Not every l_Server element is a server name.
                        #       We only want to process those with a "Handles" key.
                        try:
                            if "SizeTransferred" in l_Output[l_JobId][l_Server][l_JobStepId]["handles"]:
                                if l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["SizeTransferred"]:
                                    l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["Avg SizeTransferred/Handle"] = cmn.numericFormat(round(float(l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["SizeTransferred"])/float(l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["NumberOfHandles"]), 3))
                                    l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["SizeTransferred"] = cmn.numericFormat(l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["SizeTransferred"])
                        except Exception:
                            pass

        # Output the results
        # pprint.pprint(l_Output)
        cmn.ensure(os.path.join(l_Ctx["ROOTDIR"], "Analysis", `l_JobId`))
        l_PathFileName = os.path.join(l_Ctx["ROOTDIR"], "Analysis", `l_JobId`, "Details.txt")
        cmn.printFormattedFile(l_Ctx, l_PathFileName, l_Output)
        l_PathFileName = os.path.join(l_Ctx["ROOTDIR"], "Analysis", `l_JobId`, "Details.json")
        cmn.printFormattedFileAsJson(l_Ctx, l_PathFileName, l_Output)

    return
def main(args):
    """
    Compute the velocity correlations on one or many galaxy surveys.
    """
    np.seterr(divide='ignore', invalid='ignore')

    #Get setup information from the settings files
    settings = common.getdict(args.settings)
    if settings['num_files'] != 10000 and settings['use_npy']:
        print("Sorry! We can only handle 100x100 surveys. Try turning off the use_npy flag.")
        exit()
    numpoints = settings['numpoints']
    dr = settings['dr']
    min_r = settings['min_r']
    orig_outfile = settings['output_file_name']
    step_type = settings['step_type']
    infile = settings['input_file']
    unitslist = settings['binunits']
    maxd_master = settings['max_distance']
    numpy = settings['use_npy']
    use_tmp = settings['use_tmp']

    if settings['many_squared']:
        distance_args_master = list(zip(dr, min_r, numpoints))
        file_schemes = list(zip(infile, orig_outfile, settings['readable_name']))
        xintervals = [common.genBins(x[1], x[2], x[0], step_type) for x in distance_args_master]
        xs_master = [a[0] for a in xintervals]
        intervals_master = [a[1] for a in xintervals]
    else:
        #Everything is built around lists now, so we just build lists of length one!
        distance_args_master = [(dr, min_r, numpoints)]
        file_schemes = [(infile, orig_outfile, settings['readable_name'])]
        xs_master, intervals_master = common.genBins(min_r, numpoints, dr, step_type)
        xs_master = [xs_master]
        intervals_master = [intervals_master]

    if numpy:
        if args.override:
            print(args.override)
            indices = args.override.split(':')
            a = int(indices[0])
            b = int(indices[1])
            file_schemes = file_schemes[a:b]
            print(file_schemes)
    else:
        infileindices = [x + settings['offset'] for x in range(settings['num_files'])]

    for rawInFile, outfile, readName in file_schemes:
        for units in unitslist:
            if units == 'km/s':
                xs = [[x * 100 for x in y] for y in xs_master]
                intervals = [[x * 100 for x in y] for y in intervals_master]
                distance_args = [(x[0]*100, x[1]*100, x[2]) for x in distance_args_master]
                maxd = maxd_master * 100
            else:
                xs = xs_master
                intervals = intervals_master
                distance_args = distance_args_master
                maxd = maxd_master
            if settings['many'] and not numpy:
                d = filemanagement(rawInFile, infileindices)
                i = d
            elif not numpy:
                galaxies = common.loadData(rawInFile, dataType='CF2')
                d = [np.array([(g.normx, g.normy, g.normz, g.redx, g.redy, g.redz, g.dv, g.d, g.v)
                               for g in galaxies])]
                i = ['nothing']
            else:
                print("Loading numpy file...")
                data = np.load(rawInFile)
                print("Handling NaNs")
                nansremoved = [data[x][np.invert(np.isnan(data[x][:, 0]))] for x in range(100)]
                del data
                #for x in range(100):
                #    np.save('/tmp/c156r133-{}/vcorr-{}'.format(b,x),nansremoved[x])
                #df = ['/tmp/c156r133-{}/vcorr-{}.npy'.format(b,x//100) for x in range(10000)]
                d = [nansremoved[x//100] for x in range(10000)]
                i = [x % 100 for x in range(10000)]
                #print(d[542].shape)
            print("Opening Pool...")
            gc.collect()
            with Pool(processes=NUM_PROCESSES) as pool:
                print("Generating Histograms...")
                histogramData = list(pool.starmap(turboRun,
                                                  zip(d,
                                                      i,
                                                      itertools.repeat(numpy),
                                                      itertools.repeat(maxd),
                                                      itertools.repeat(units),
                                                      itertools.repeat(xs),
                                                      itertools.repeat(intervals),
                                                      itertools.repeat(use_tmp))))
            # Each turbo run returns a list of histograms [5-length histogram, 10-length histogram,
            # 20-length, etc.], so histogramData is a list of turbo runs, which means the data is a
            # jagged array:
            # data = [ [ [ ----- ], [ ---------- ], [ -------------------- ] ],
            #          [ [ ----- ], [ ---------- ], [ -------------------- ] ],
            #        ]
            for scheme_index in range(len(intervals)):
                hist_for_scheme = np.array([turbo_data[scheme_index] for turbo_data in histogramData])
                saveOutput(hist_for_scheme,
                           outfile.format('', distance_args[scheme_index][0], units.replace('/', '')))
            print(" Done!")
def singlerun(filename, outputFile, binsize, chop, modelOverride=None):
    fig = plt.figure()
    galaxies = common.loadData(filename, dataType="CF2")
    distances = [galaxy.d for galaxy in galaxies]
    #get a list of all the distances to galaxies. This will let us send it directly to the histogram
    bins_orig = genBins(binsize, chop)

    #Make a histogram using pylab histogram function.
    n, bins, patches = plt.hist(distances,
                                bins_orig,
                                histtype="stepfilled",
                                label="Galaxy Distribution,\n binsize={:.2f}Mpc".format(binsize))

    #Change visual properties of the histogram
    plt.setp(patches, 'facecolor', 'g', 'alpha', 0.75)

    robot = chi_sq_solver(bins, n, selection_function)
    if modelOverride is None:
        #If we don't have an existing model to use, we find a best fit and plot it.
        #Solve the chi squared optimization for the histogram and selection function.
        params = robot.result.x
        #Plot the best fit
        domain = np.arange(0, chop, 1)
        model = [selection_function(r, *(robot.result.x)) for r in domain]
        plt.plot(domain, model, 'k--', linewidth=1.5,
                 label="Model fit: $A = {:.3f}$\n$r_0 = {:.3f}$\n$n_1 = {:.3f}$\n$n_2={:.3f}$\n$\chi^2={chisq:.3f}$"
                 .format(*(robot.result.x), chisq=robot.result.fun))
        chisq = robot.result.fun
    else:
        #Plot the model given in the settings file instead of calculating a new one.
        mo = modelOverride["constants"]
        params = [mo['A'], mo['r_0'], mo['n_1'], mo['n_2']]
        chisq = robot.chi_sq(params)
        domain = np.arange(0, chop, 1)
        model = [selection_function(r, *params) for r in domain]
        plt.plot(domain, model, 'k--', linewidth=1.5,
                 label="Model fit: $A = {:.3f}$\n$r_0 = {:.3f}$\n$n_1 = {:.3f}$\n$n_2={:.3f}$\n$\chi^2={chisq:.3f}$"
                 .format(*params, chisq=chisq))

    #Add axis labels
    plt.ylabel("Galaxy count")
    plt.xlabel("Distance, Mpc/h")
    plt.title("Distribution of Galaxy Distance")
    plt.legend()
    plt.axis([0, chop, 0, 1300])

    fig2 = plt.figure()
    shellVolume = [common.shellVolCenter(robot.centerbins[i], binsize) for i in range(len(n))]
    plt.title("Galaxies per Cubic Mpc")
    plt.xlabel("Distance, Mpc/h")
    plt.ylabel("Density, galaxies/(Mpc/h)^3")
    density = [n[i] / shellVolume[i] for i in range(len(n))]
    plt.plot(robot.centerbins, density, 'o')

    #Save figures
    with pdfback.PdfPages(outputFile + str(binsize) + '.pdf') as pdf:
        pdf.savefig(fig)
        pdf.savefig(fig2)

    if modelOverride is None:
        #Write parameters to a file for later use.
        common.writedict(outputFile + str(binsize) + '_params.json',
                         {'constants': {'A': params[0],
                                        'r_0': params[1],
                                        'n_1': params[2],
                                        'n_2': params[3]},
                          'info': {'shell_thickness': binsize,
                                   'max_radius': chop,
                                   'chisq': chisq}})
    plt.close('all')
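# chi_sq_solver and selection_function are defined elsewhere in this project and are not shown in
# this snippet. As a rough, illustrative stand-in only, a chi-squared fit of binned counts against
# a four-parameter model could be set up as below; the real solver's interface and statistic may
# well differ.
import numpy as np
from scipy import optimize

def fit_histogram(bins, counts, model, initial_guess):
    centers = 0.5 * (bins[:-1] + bins[1:])
    def chi_sq(params):
        expected = np.array([model(r, *params) for r in centers])
        # Guard against division by zero in empty bins.
        return np.sum((counts - expected) ** 2 / np.where(expected > 0, expected, 1.0))
    return optimize.minimize(chi_sq, initial_guess, method='Nelder-Mead')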
def perturb(infile, outfile, err, num, ptype, est, mod, lots):
    """Infile and outfile are filenames. Outfile always has a {} in it, and infile should have
    one too if you're using the 'lots' option.

    err is the error, usually something like 0.2 for distance perturbations or 0.02 for modulus
    perturbations.

    ptype, est, and mod are strings that describe the combination of perturbation type, modulus
    and estimator to use.
        ptype should be 'distance', 'modulus', or 'relative'.
        est should be 'cz' or 'feldman', where cz = v + h*d and feldman is the unbiased log estimator.
        mod (the modulus) only applies to the 'modulus' and 'relative' ptypes, and has choices
        'ln', 'log', and 'textbook'.

    If lots is not 0, False, or None then it should be an integer specifying the number of
    infiles; infile should then have a {} in it for the index. Outfiles are numbered as
    infile_index*num + outfile_index, where infile_index goes from zero to lots and
    outfile_index goes from 0 to num.
    """
    if lots:
        infiles = [infile.format(x) for x in range(lots)]
    else:
        infiles = [infile]
    surveys = []
    for in_i, infile in enumerate(infiles):
        num_acks = 0
        second_order_acks = 0
        num_errs = 0
        hubble_constant = 100
        galaxies = common.loadData(infile, 'CF2')
        perturbed_vs = []
        delta_vs = []
        for galaxy in galaxies:
            #q_0 = -0.595
            #z = galaxy.cz/(3*10**8)
            #zmod = z*(1 + 0.5*(1-q_0)*z + (1/6)*(2-q_0-3*q_0**2)*z**2)
            if abs(galaxy.v) > galaxy.cz / 10:
                num_acks += 1
            if ptype == "distance":
                skewed_distance = np.random.normal(galaxy.d, abs(galaxy.d * err), num)
            elif ptype == "modulus":
                inmod = modulusify(galaxy.d, mod)
                pmod = np.random.normal(inmod, err, num)
                skewed_distance = unmodulusify(pmod, mod)
            elif ptype == "relative":
                inmod = modulusify(galaxy.d, mod)
                pmod = np.random.normal(inmod, np.abs(err * inmod), num)
                skewed_distance = unmodulusify(pmod, mod)
            if est == "cz":
                try:
                    velocities = galaxy.cz - hubble_constant * skewed_distance
                    dv = galaxy.d * err * hubble_constant
                except FloatingPointError:
                    #I don't think it's possible to have a FP error here... Could be wrong?
                    num_errs += 1
                    print("I was wrong")
                    continue
            elif est == "feldman":
                try:
                    velocities = galaxy.cz * np.log(galaxy.cz / (hubble_constant * skewed_distance))
                    dv = galaxy.cz * err
                    #calculate_error(distance_modulus,galaxy.d,frac_error,args)
                    for velocity in velocities:
                        if abs(velocity) > galaxy.cz / 10:
                            second_order_acks += 1
                except FloatingPointError:
                    num_errs += 1
                    continue
            perturbed_vs.append((velocities, dv, skewed_distance, galaxy))
        print("{} out of {} galaxies ({:.2f}) had true velocity NOT much less than redshift,"
              .format(num_acks, len(galaxies), num_acks / len(galaxies)))
        print("i.e. the condition on our estimator that v << cz was not satisfied.")
        print("This happened to the random data {} times out of {}.".format(second_order_acks,
                                                                             num * len(galaxies)))
        print("Also, {} FloatingPoint errors happened, even after taking out the close-by galaxies."
              .format(num_errs))
        print()
        survey = []
        for v, dv, d, galaxy in perturbed_vs:
            np1 = np.array((galaxy.normx, galaxy.normy, galaxy.normz,
                            galaxy.redx, galaxy.redy, galaxy.redz, dv))
            survey.append(np.concatenate((np1, d, v)))
        surveys.append(survey)
    maxlength = max([len(survey) for survey in surveys])
    surveylength = len(surveys[0][0])
    for survey in surveys:
        for x in range(len(survey), maxlength):
            filler = np.empty(surveylength)
            filler[:] = np.NAN
            survey.append(filler)
    surveysnp = np.array(surveys)
    print(surveysnp.shape)
    np.save(outfile, surveysnp)
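# modulusify and unmodulusify are defined elsewhere in this project; only their 'textbook'
# behavior can be stated with confidence, since the standard distance modulus for a distance d in
# Mpc is mu = 5*log10(d) + 25. A sketch of that one mode and its inverse follows; the 'ln' and
# 'log' modes presumably use the corresponding bare logarithms of the distance, but that is an
# assumption.
import numpy as np

def modulusify_textbook(d):
    return 5.0 * np.log10(d) + 25.0

def unmodulusify_textbook(mu):
    return 10.0 ** ((mu - 25.0) / 5.0)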
def main(args):
    """
    Compute the velocity correlations on one or many galaxy surveys.
    """
    print("Incomplete function - see comments")
    exit()

    #Get setup information from the settings file
    settings = common.getdict(args.settings)
    numpoints = settings["numpoints"]
    outfolder = settings["output_data_folder"]
    outfile = settings["output_file_name"]
    rawInFile = settings["input_file"]
    step_type = settings["step_type"]
    dr = settings["dr"]
    min_r = settings["min_r"]
    if settings["many"]:
        #If there are lots of files, set them up accordingly.
        inFileList = [rawInFile.format(x + settings['offset']) for x in range(settings["num_files"])]
    else:
        inFileList = [rawInFile]
    xs, intervals = common.genBins(min_r, numpoints, dr, step_type)
    for index, infile in enumerate(inFileList):
        #Load the survey
        galaxies = np.array(common.loadData(infile, dataType="millVel"))
        print(galaxies.shape)
        #Put just the galaxy positions into one array
        positions = galaxies[:, 0:3]  # [(x,y,z),...]
        velocities = galaxies[:, 3:6]
        kd = cKDTree(positions)
        pairs = kd.query_pairs(max(intervals))
        npPairs = np.array(list(pairs))
        g1pos = positions[npPairs[:, 0]]
        g2pos = positions[npPairs[:, 1]]
        g1vs = velocities[npPairs[:, 0]]
        g2vs = velocities[npPairs[:, 1]]
        distBetweenG1G2 = np.linalg.norm(g2pos - g1pos, axis=1)
        velocityCorrelation = inner1d(g1vs, g2vs) / 10**4
        c11 = g1vs[:, 0] * g2vs[:, 0]
        c12 = g1vs[:, 0] * g2vs[:, 1]
        c13 = g1vs[:, 0] * g2vs[:, 2]
        c21 = g1vs[:, 1] * g2vs[:, 0]
        c22 = g1vs[:, 1] * g2vs[:, 1]
        c23 = g1vs[:, 1] * g2vs[:, 2]
        c31 = g1vs[:, 2] * g2vs[:, 0]
        c32 = g1vs[:, 2] * g2vs[:, 1]
        c33 = g1vs[:, 2] * g2vs[:, 2]
        n, bins = np.histogram(distBetweenG1G2, bins=intervals)
        correlation11, bins = np.histogram(distBetweenG1G2, bins=intervals, weights=c11)
        correlation12, bins = np.histogram(distBetweenG1G2, bins=intervals, weights=c12)
        correlation13, bins = np.histogram(distBetweenG1G2, bins=intervals, weights=c13)
        correlation21, bins = np.histogram(distBetweenG1G2, bins=intervals, weights=c21)
        correlation22, bins = np.histogram(distBetweenG1G2, bins=intervals, weights=c22)
        correlation23, bins = np.histogram(distBetweenG1G2, bins=intervals, weights=c23)
        correlation31, bins = np.histogram(distBetweenG1G2, bins=intervals, weights=c31)
        correlation32, bins = np.histogram(distBetweenG1G2, bins=intervals, weights=c32)
        correlation33, bins = np.histogram(distBetweenG1G2, bins=intervals, weights=c33)
        a11 = correlation11 / n
        a12 = correlation12 / n
        a13 = correlation13 / n
        a21 = correlation21 / n
        a22 = correlation22 / n
        a23 = correlation23 / n
        a31 = correlation31 / n
        a32 = correlation32 / n
        a33 = correlation33 / n
        f, ((ax11, ax12, ax13),
            (ax21, ax22, ax23),
            (ax31, ax32, ax33)) = plt.subplots(3, 3, sharex='col', sharey='row', figsize=(11, 8.5))
        ax11.plot(xs, a11)
        ax12.plot(xs, a12)
        ax13.plot(xs, a13)
        ax21.plot(xs, a21)
        ax22.plot(xs, a22)
        ax23.plot(xs, a23)
        ax31.plot(xs, a31)
        ax32.plot(xs, a32)
        ax33.plot(xs, a33)
        #set x axis and y axis to be the same
        #go out until the correlation is zero
        f.suptitle('3-D velocity correlation')
        ax31.set_xlabel('Distance, Mpc/h')
        ax32.set_xlabel('Distance, Mpc/h')
        ax33.set_xlabel('Distance, Mpc/h')
        ax11.set_ylabel('correlation, $(km/s)^2$')
        ax21.set_ylabel('correlation, $(km/s)^2$')
        ax31.set_ylabel('correlation, $(km/s)^2$')
        with pdfback.PdfPages(outfolder + outfile.format(index)) as pdf:
            pdf.savefig(f)
        pylab.close('all')
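# inner1d, used above, comes from numpy.core.umath_tests, a private module that has been
# deprecated and removed in newer NumPy releases. An equivalent row-wise dot product can be
# written with einsum, should this function ever be revived:
import numpy as np

def inner1d(a, b):
    return np.einsum('ij,ij->i', a, b)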