def singlerun(filename, outputFile, binsize, chop, modelOverride=None):
    fig = pylab.figure()
    galaxies = common.loadData(filename, dataType="CF2")
    velocities = [galaxy.v for galaxy in galaxies]
    #get a list of all the galaxies' velocities. This will let us send it directly to the histogram

    bins_orig = genBins(binsize, chop)

    #Make a histogram using pylab histogram function.
    n, bins, patches = pylab.hist(
        velocities,
        bins_orig,
        histtype="stepfilled",
        label="Galaxy Distribution,\n binsize={:.2f}Mpc".format(binsize))

    #Change visual properties of the histogram
    pylab.setp(patches, 'facecolor', 'g', 'alpha', 0.75)

    #Add axis labels
    pylab.ylabel("Galaxy count")
    pylab.xlabel("Radial Velocity, km/s")
    pylab.title("Distribution of Galaxy radial velocities")
    pylab.axis([0, chop, 0, 1000])

    with pdfback.PdfPages(outputFile + str(binsize) + '.pdf') as pdf:
        pdf.savefig(fig)
    pylab.show()
    pylab.close('all')
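genBins is not shown in these snippets. A minimal stand-in consistent with how it is called here (two arguments, returning evenly spaced bin edges from 0 out to chop) might look like the sketch below; the real helper, and the four-argument common.genBins used in later examples, may well differ.

import numpy as np

def genBins(binsize, chop):
    # Hypothetical stand-in: bin edges 0, binsize, 2*binsize, ..., up to chop,
    # in the form expected by pylab.hist / np.histogram above.
    return np.arange(0, chop + binsize, binsize)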
def prepareAllData (startDates, endDates, normalize):
	print "Preparing data..."
	allCourseData = {}
	for courseId in set(startDates.keys()).intersection(START_DATES.keys()):  # For each course
		# Load data for this course
		print "Loading {}...".format(courseId)
		try:
			somePc, someSurvey, somePcd = loadData(courseId)
			T0, Tc = computeCourseDates(courseId, startDates)
			allCourseData[courseId] = []
			print "...done"

			# We need at least 3 weeks' worth of data to both train and test the model.
			# We use the first 2 weeks' data to train a model (labels are determined by week 2, and
			# features are extracted from week 1). But then to *evaluate* that model, we need
			# another (3rd) week.
			Tcutoffs = np.arange(T0 + 3*WEEK, Tc, WEEK)
			print courseId, Tcutoffs
			for Tcutoff in Tcutoffs:
				# The users that we train/test on must have entered the course by the end of the
				# *first* week of the last 3 weeks in the time range. Hence, we subtract 2 weeks.
				usernames = getRelevantUsers(somePc, Tcutoff - 2*WEEK)
				allData = extractFeaturesAndTargets(somePc, somePcd, someSurvey, usernames, T0, Tcutoff, normalize)
				allCourseData[courseId].append(allData)
		except (IOError, ValueError):
			print "Skipping"
			continue
	print "...done"
	return allCourseData
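For reference, a small worked example of the cutoff grid built above, assuming WEEK is a 7-day numpy timedelta (the constant itself is defined elsewhere) and using hypothetical course dates:

import numpy as np

WEEK = np.timedelta64(7, 'D')        # assumed definition of the WEEK constant
T0 = np.datetime64('2013-01-07')     # hypothetical course start
Tc = np.datetime64('2013-03-04')     # hypothetical course close
# First cutoff falls three weeks in; later cutoffs advance one week at a time.
Tcutoffs = np.arange(T0 + 3*WEEK, Tc, WEEK)
print(Tcutoffs)  # 2013-01-28, 2013-02-04, 2013-02-11, 2013-02-18, 2013-02-25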
Example #3
def remoteLoadData(message):
    logger.info("=== " * 10)
    logger.info(message)
    return loadData(es, conn, message["index"], message["data"],
                    message["doc_type"], message["download"], message["cui"],
                    False, message["user"], message["outputformat"],
                    OUTPUT_URL, OUTPUT_FOLDER)
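The keys read out of message above imply a payload shaped roughly as follows; this is only an illustration of the expected fields, with made-up values, not the actual wire format:

# Hypothetical message; every key below is one that remoteLoadData reads.
example_message = {
    "index": "patients",
    "data": "/tmp/upload.csv",
    "doc_type": "record",
    "download": False,
    "cui": None,
    "user": "analyst",
    "outputformat": "json",
}
# remoteLoadData(example_message) would pass these through to loadData along
# with the module-level es, conn, OUTPUT_URL and OUTPUT_FOLDER.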
def prepareAllData (startDates, endDates, demographicsOnly):
	print "Preparing data..."
	allCourseData = {}
	#for courseId in set(pcd.keys()).intersection(START_DATES.keys()):  # For each course
	for courseId in set(startDates.keys()).intersection(START_DATES.keys()):  # For each course
		# Load data for this course
		print "Loading {}...".format(courseId)
		try:
			somePc, someSurvey, somePcd = loadData(courseId)
		except (IOError, pandas.io.parsers.EmptyDataError):
			print "Skipping"
			continue
		# If no certifiers, then skip
		if (np.sum(somePc.certified) < MIN_EXAMPLES) or (np.sum(somePc.certified) >= len(somePc) - MIN_EXAMPLES):
			print "Skipping"
			continue

		T0, Tc = computeCourseDates(courseId, startDates)
		allCourseData[courseId] = []
		print "...done"

		Tcutoffs = np.arange(T0 + 1*WEEK, Tc+np.timedelta64(1, 'D'), WEEK)
		for Tcutoff in Tcutoffs:
			usernames = getRelevantUsers(somePc, Tcutoff)
			allData = splitAndGetNormalizedFeatures(somePc, somePcd, someSurvey, usernames, T0, Tcutoff, demographicsOnly)
			allCourseData[courseId].append(allData)
	print "...done"
	return allCourseData
def main(*pArgs):
    l_Ctx = {}  # Environmental context

    # Establish the context
    cmn.getOptions(l_Ctx, pArgs[0])

    # Load the data as a pickle file from the input root directory
    cmn.loadData(l_Ctx)
    l_Data = l_Ctx["ServerData"]

    # Optionally, print the results
    if l_Ctx["PRINT_PICKLED_RESULTS"]:
        cmn.printFormattedData(l_Ctx, l_Data)

    # Generate disk stats listing
    generateDiskStatsListing(l_Ctx)

    return
Example #6
def main(*pArgs):
    l_Ctx = {}  # Environmental context

    # Establish the context
    cmn.getOptions(l_Ctx, pArgs[0])

    # Load the data as a pickle file from the input root directory
    cmn.loadData(l_Ctx)
    l_Data = l_Ctx["ServerData"]

    # Optionally, print the results
    if l_Ctx["PRINT_PICKLED_RESULTS"]:
        cmn.printFormattedData(l_Ctx, l_Data)

    # Perform basic analysis
    performBasicAnalysis(l_Ctx)

    return
Example #7
def main(*pArgs):
    l_Ctx = {}  # Environmental context

    # Establish the context
    cmn.getOptions(l_Ctx, pArgs[0])

    # Load the data as a pickle file from the input root directory
    cmn.loadData(l_Ctx)
    l_Data = l_Ctx["ElapsedTimeData"]

    # Optionally, print the results
    if l_Ctx["PRINT_PICKLED_RESULTS"]:
        cmn.printFormattedData(l_Ctx, l_Data)

    # Calculate/print the transfer rates
    calculateTransferRates(l_Ctx)

    return
Example #8
def distanceOneBox(hugeFile, surveys, outFile):
    #Generate distance data for one sub-box - distance from each galaxy to each survey center
    #These distances are not returned, instead they are only written to the disk.
    galaxies = np.array(common.loadData(hugeFile, dataType='millPos'))
    for boxoffset in EDGES:
        distances = space.distance.cdist(galaxies + boxoffset * 500, surveys)
        np.save(
            outFile +
            "_{}_{}_{}.npy".format(boxoffset[0], boxoffset[1], boxoffset[2]),
            distances)
Example #9
def getHash(filename,units):
    """Loads up CF2 files and uses them to rebuild the hash database.
    Returns a list of strings; this function does not do the hashing itself. The caller should hash
    each string with hashlib.md5(string.encode('utf-8')).hexdigest()."""
    galaxies = common.loadData(filename, dataType = 'CF2')
    if units == 'Mpc/h':
        return myNpHash(np.array([(a.x,a.y,a.z,a.v,a.dv) for a in galaxies]))
    elif units == 'km/s':
        return myNpHash(np.array([a.getRedshiftXYZ() + (a.v,a.dv) for a in galaxies]))
    else:
        raise ValueError("Value of 'units' must be 'Mpc/h' or 'km/s'. Other unit schemes do not exist at present")
Example #10
def main(*pArgs):
    l_Ctx = {}  # Environmental context

    # Establish the context
    cmn.getOptions(l_Ctx, pArgs[0])

    # Load the data as a pickle file from the input root directory
    cmn.loadData(l_Ctx)
    l_Data = l_Ctx["ServerData"]

    # Optionally, print the results
    if l_Ctx["PRINT_PICKLED_RESULTS"]:
        cmn.printFormattedData(l_Ctx, l_Data)

    # Perform all analysis
    performBasicAnalysis(l_Ctx)
    generateDiskStatsListing(l_Ctx)
    generateErrorsListing(l_Ctx)
    calculateTransferRates(l_Ctx)
    generateWorkQueueMgrDumps(l_Ctx)

    return
def prune(filename):
    with open(filename, 'r') as boxfile:
        data = boxfile.readlines()
    dataCopy = [line for line in data]
    #DON'T MODIFY THE DATACOPY!

    isComment = True
    offset = -1
    while isComment:
        offset += 1
        #print(offset,len(data))
        isComment = (data[offset][0] == '#') or (data[offset][0] == 'x')
        #Note, after this script is done processing a millennium file, the offset method will no longer work
        #because there will be comments everywhere in the body of the csv file.
    positions = common.loadData(filename, 'millPos')
    kd = cKDTree(positions)
    pairs = kd.query_pairs(0.0001)
    for g1, g2 in pairs:
        #build a new galaxy, with averaged things, but summed mass
        #print(data[g1+offset])
        #print(data[g2+offset])
        gal1 = common.MillenniumGalaxy(dataCopy[g1 + offset])
        gal2 = common.MillenniumGalaxy(dataCopy[g2 + offset])
        totalMass = gal1.mvir + gal2.mvir
        weightedVelocity1 = np.array([gal1.velX, gal1.velY, gal1.velZ
                                      ]) * gal1.mvir
        weightedVelocity2 = np.array([gal2.velX, gal2.velY, gal2.velZ
                                      ]) * gal2.mvir
        averageGalaxy = common.MillenniumGalaxy([
            gal1.x, gal1.y, gal1.z, (gal1.velX + gal2.velX) / 2,
            (gal1.velY + gal2.velY) / 2, (gal1.velZ + gal2.velZ) / 2,
            totalMass, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        ])
        data[g1 +
             offset] = '#REMOVED BECAUSE IT IS A DUPLICATE#' + dataCopy[g1 +
                                                                        offset]
        data[g2 +
             offset] = '#REMOVED BECAUSE IT IS A DUPLICATE#' + dataCopy[g2 +
                                                                        offset]
        data.append('#AVERAGE GALAXY FOLLOWS#\n')
        data.append(averageGalaxy.toString() + '\n')
        data.append(
            '#The above galaxy was added as an average of two galaxies that were in the same location\n'
        )
        data.append(
            '#The only information it has associated with it is POSITION, VELOCITY, and VIRIAL MASS, because none of the other attributes were currently in use (and I didn\'t know how to use them) when this file was created.\n'
        )
    with open(filename, 'w') as newFile:
        newFile.writelines(data)
    return len(pairs)
Example #12
def main():
    data = common.loadData("../tully-fisher.csv", dataType="miscFloat")
    pairs = [(row[0], np.average(row[1:5])) for row in data]
    xs = [point[1] for point in pairs]
    ys = [point[0] for point in pairs]
    fig = plt.figure(figsize=(8, 6), dpi=90)
    ax = fig.add_subplot(111)
    ax.set_title("Tully-Fisher relationship")
    plt.xlabel("Galaxy absolute magnitude")
    plt.ylabel("Galaxy rotational velocity")

    ax.set_yscale("log", nonposx='clip')
    plt.plot(xs, ys, '.')

    ax.axis([max(xs) + 1, min(xs) - 1, min(ys) - 20, max(ys) + 100])
    #print(min(xs),max(xs),min(ys),max(ys))
    plt.show()
Example #13
 def onstart(self):
     logging.info(START_LOAD_DATA % (self.fund, self.strategyName, self.period))
     datas = loadData(self.period, self.startDate)
     timestamp = 0
     for data in datas:
         group = data['symbol'].split('--')[0]
         if not self.check(group, data['symbol']): 
             continue
         data['period'] = 60
         data['timestamp'] = data['timestamp']*1000 
         self.get_deque().appendleft((DateType.ON_MKT_KLINE, group, Bar(data)))
         if(timestamp and timestamp < data['timestamp']): 
             self.get_deque().appendleft((DateType.ON_TIMESTAMP, group, {'timestamp': timestamp}))
         timestamp = data['timestamp']
     logging.info(LOAD_DATA_OK % (self.fund, self.strategyName, self.period))
     self.__loadingTag = True
     sendTradeMsg(INIT_MSG % (self.fund, self.strategyName, self.period, self.symbols()))
Example #14
def filemanagement(rawInFile,infileindices):
    #If there are lots of files, build the full list of filenames and load each one.
    inFileList = [rawInFile.format(x) for x in infileindices]
    d = []
    for f in inFileList:
        galaxies = common.loadData(f,dataType='CF2')
        #print(galaxiess[1][2])
        minid = np.array([(g.normx,
                           g.normy,
                           g.normz,
                           g.redx,
                           g.redy,
                           g.redz,
                           g.dv,
                           g.d,
                           g.v) for g in galaxies])
        d.append(minid)
    #[np.save("tmp/galaxydata{}".format(i),data) for i,data in enumerate(d)]
    #d = ["tmp/galaxydata{}.npy".format(i) for i in range(100)]
    
    return d
def correlate_box(boxinfo, intervals):
    #load in the subbox
    xs, ys, zs = common.loadData(boxinfo[0])
    #grab its THEORETICAL minimum and maximum values:
    #Need to add detection of actual minimum and maximum values.
    #I might already have the logic in the box cutter function...
    rect_min = boxinfo[1][0]
    rect_max = boxinfo[1][1]
    #and its length
    num_galax = len(xs)
    #make sure we don't have a jagged array somehow
    #assert(num_galax == len(ys) == len(zs))
    #Never had a problem with that
    actual_galaxies = np.array(list(zip(xs, ys, zs)))
    #Make a thread-safe random number generator
    rng = np.random.RandomState()
    #and use it to make a list of random galaxy positions
    random_list = np.array(
        list(
            zip(rng.uniform(rect_min[0], rect_max[0], num_galax),
                rng.uniform(rect_min[1], rect_max[1], num_galax),
                rng.uniform(rect_min[2], rect_max[2], num_galax))))
    #make kd trees
    actual_kd = space.cKDTree(actual_galaxies, 3)
    random_kd = space.cKDTree(random_list, 3)
    DDs = actual_kd.count_neighbors(actual_kd, intervals,
                                    cumulative=False)[1:] / 2
    DRs = actual_kd.count_neighbors(random_kd, intervals,
                                    cumulative=False)[1:] / 2
    RRs = random_kd.count_neighbors(random_kd, intervals,
                                    cumulative=False)[1:] / 2
    #RDs = random_kd.count_neighbors(actual_kd,intervals)
    #Turns out that RDs == DRs always
    #Just think about it.
    #    print('.',end='',flush=True)
    return ((DDs, DRs, RRs))
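correlate_box returns only raw pair counts; combining them into a two-point correlation estimate is left to the caller. A minimal sketch using the Landy-Szalay estimator, one common choice and not necessarily the one used elsewhere in this code (the simple form below relies on correlate_box drawing equal numbers of data and random points and ignores finite-count normalization):

import numpy as np

def landy_szalay(DDs, DRs, RRs):
    # xi(r) ~ (DD - 2*DR + RR) / RR per separation bin.
    DDs, DRs, RRs = (np.asarray(a, dtype=float) for a in (DDs, DRs, RRs))
    return (DDs - 2.0 * DRs + RRs) / RRs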
Example #16
def distsurvey(hugeFile, surveys, binsize, boxMaxDistance):
    galaxies = np.array(common.loadData(hugeFile, dataType='millPos'))
    masterhist = []
    firstrun = True
    for boxoffset in EDGES:
        distances = space.distance.cdist(galaxies + boxoffset * 500, surveys)
        numSurveys = distances.shape[1]
        assert (numSurveys == len(surveys))
        bins = genBins(binsize, boxMaxDistance)
        histogram = []
        for i in range(numSurveys):
            #Take the distances from all galaxies to the ith survey: the first axis of
            #distances indexes galaxies and the second axis indexes surveys.
            thisSurveyDistances = distances[:, i]
            hist, edges = np.histogram(thisSurveyDistances, bins)
            histogram.append(hist)
        if firstrun:
            masterhist = histogram
        else:
            for i in range(len(masterhist)):
                masterhist[i] = histogram[i] + masterhist[i]
        firstrun = False
    print(".", end="", flush=True)
    return np.array(masterhist)
Example #17
def surveyOneFile(hugeFile, surveys, selectionParams, histogram,
                  boxMaxDistance):
    """
    Given the original data, distances, wanted numbers, and other parameters, actually generate the
    mock surveys. This is still the biggest bottleneck of the program, even after rearranging the
    code for improved efficiency, including using the GPU for some otherwise expensive calculations.
    """
    #Set up variables
    rng = np.random.RandomState()  #Make a process-safe random number generator
    first = True
    mastersurvey = []
    galax_pos = np.array(common.loadData(hugeFile, dataType='millPos'))
    for boxoffset in EDGES:
        # distances = np.load(distanceFile+"_{}_{}_{}.npy".format(boxoffset[0],
        #                                                         boxoffset[1],
        #                                                         boxoffset[2])) #load the distance file !! 7.3% !!
        distances = space.distance.cdist(galax_pos + boxoffset * 500, surveys)
        #Make the skeleton structure for the end result
        surveyContent = [[] for i in range(distances.shape[1])]
        #Load the galaxies !! 13.9% !!  WARNING: this function assumes a 500x500x500 box
        galaxies = common.loadRawMillHybrid(hugeFile, boxoffset)
        #Load the size of a bin
        binsize = selectionParams['info']['shell_thickness']
        bins = genBins(binsize, boxMaxDistance)
        numSurveys = distances.shape[1]

        #Do calculations
        wantDensity = compute_want_density(
            distances, binsize, **(selectionParams["constants"]))  #!! 20 % !!
        #wantDensity  =selection_values / common.shellVolCenter(distances,binsize)         #!! 22 % !!
        tdistBin = (np.digitize(distances.flatten(), bins) - 1).reshape(
            distances.shape)
        #!!!!IMPORTANT!!!!!
        #tdistBin and distBin are the indexes of the bin each galaxy goes into
        #Remember these are only the galaxies in the subbox we're looking at (surveyONEFILE)

        distBin = np.transpose(tdistBin)  # !! 6 % !!
        histogram = np.concatenate((histogram, np.zeros((100, 1)) - 1), axis=1)

        #When a galaxy is outside chop, its "number of galaxies around here" is set to -1
        #That makes the density negative
        #Which makes the probability negative so it is impossible for that galaxy to be selected.
        something = [histogram[n][distBin[n]] for n in range(numSurveys)]
        #This is the number of galaxies in the same bin as the galaxy and in the same box as the galaxy
        originalCount = np.transpose(np.array(something))  # 2.2%
        #volumes = calcVolumes(np.transpose(np.array(distBin))*binsize + (binsize/2),binsize)# !! 23 % !!
        volumes = calcVolumes(tdistBin, binsize)  # !! 23 % !!
        originalDensity = originalCount / volumes
        #I'm currently under the impression that this for loop is the main bottleneck in this function.
        #It isn't a complicated task, so it might be worthwhile to consider alternate implementations.
        #Let's see...
        #This for loop uses information from distances, binsize, histogram

        # for i,galaxy in enumerate(distances):
        #     for j,surveyDist in enumerate(galaxy):
        #         distBin = int(distBinNotAnInt[i][j])
        #         originalDensity[i][j] = histogram[j][distBin] / common.shellVolCenter(distBin*binsize + (binsize/2),binsize)
        probability = wantDensity / originalDensity
        dice = rng.random_sample(probability.shape)
        toAdd = dice < probability
        # for i,galaxy in enumerate(toAdd):
        #     for j,addBool in enumerate(galaxy):
        #         if addBool:
        #             rawLine = galaxies[i][3]
        #             surveyContent[j].append(rawLine)
        arrGalaxies = np.array(galaxies, dtype="object")
        surveyContent = [arrGalaxies[toAdd[:, n]] for n in range(numSurveys)]
        print(".", end="", flush=True)
        if first:
            mastersurvey = surveyContent
            first = False
        else:
            for i in range(len(surveyContent)):
                mastersurvey[i] = np.concatenate(
                    (mastersurvey[i], surveyContent[i]))
    print("!")
    return mastersurvey
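The selection step inside surveyOneFile (probability = wantDensity / originalDensity, then dice < probability) is a plain probabilistic thinning. In isolation the idea looks like this standalone illustration, not the production code path:

import numpy as np

def thin_by_density(want_density, original_density, rng=None):
    # Keep each galaxy with probability wanted/actual density; entries whose
    # density was forced negative (outside the survey radius) can never win,
    # because a uniform draw in [0, 1) is never below a negative probability.
    rng = rng if rng is not None else np.random.RandomState()
    probability = want_density / original_density
    return rng.random_sample(probability.shape) < probability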
Example #18
                        default=[0.005], help='Specify the learning rate.')
    parser.add_argument('--lambda', type=float, nargs='+',
                        default=[0], dest='reg', help='Specify the regularization parameter.')
    parser.add_argument('--epochs', type=int, nargs=1,
                        default=[5000], help='Training epoches.')
    parser.add_argument('--batch_size', type=int, nargs=1,
                        default=[500], help='Batch size. For SGD only.')
    parser.add_argument('--beta1', type=float, nargs='+',
                        default=[0.9], help='Specify the beta1 hyperparameter for Adam.')
    parser.add_argument('--beta2', type=float, nargs='+',
                        default=[0.999], help='Specify the beta2 hyperparameter for Adam.')
    parser.add_argument('--epsilon', type=float, nargs='+',
                        default=[1e-8], help='Specify the epsilon hyperparameter for Adam.')

    args = parser.parse_args()
    trainData, validData, testData, trainTarget, validTarget, testTarget = loadData()

    X, y = preprocess_data(trainData, trainTarget)
    N, d = X.shape
    Xt, yt = preprocess_data(testData, testTarget)
    Xv, yv = preprocess_data(validData, validTarget)

    epochs = args.epochs[0]

    # Output path
    path = args.path[0]
    create_dir(path)

    batch_size = args.batch_size[0]
    optimizer = args.optimizer[0]
    loss_type = args.loss[0]
Example #19
def main(*pArgs):
    l_Ctx = {}     # Environmental context
    l_TestOutput = []

    # Establish the context
    cmn.getOptions(l_Ctx, pArgs[0])

    # Load the data as a pickle file from the input root directory
    cmn.loadData(l_Ctx)
    l_Data = l_Ctx["ServerData"]
    l_StageInData = l_Ctx["StageInData"]["jobIds"]
    l_StageOutData = l_Ctx["StageOutData"]["jobIds"]

    # Print the results
    if l_Ctx["PRINT_PICKLED_RESULTS"]:
        cmn.printFormattedData(l_Ctx, l_Data)

    # Determine list of servers
    l_TestOutput.append("List of Servers%s" % (os.linesep))
    l_Servers = cmn.getServers(l_Data)
    l_TestOutput.append("%d server(s), %s%s" % (len(l_Servers), `l_Servers`, os.linesep))
    l_TestOutput.append("%s" % (os.linesep))

    # Determine list of jobids
    l_TestOutput.append("List of JobIds%s" % (os.linesep))
    l_JobIds = cmn.getJobIds(l_Data)
    l_TestOutput.append("%d jobid(s), %s%s" % (len(l_JobIds), `l_JobIds`, os.linesep))
    l_TestOutput.append("%s" % (os.linesep))

    # Determine jobids per server
    l_TestOutput.append("List of JobIds per Server%s" % (os.linesep))
    for l_Server in l_Servers:
        l_JobIds = cmn.getJobIdsForServer(l_Data, l_Server)
        l_TestOutput.append("For server %s, %d jobid(s), %s%s" % (l_Server, len(l_JobIds), `l_JobIds`, os.linesep))
    l_TestOutput.append("%s" % (os.linesep))

    # Determine servers per jobid
    l_TestOutput.append("List of Servers per JobId%s" % (os.linesep))
    l_JobIds = cmn.getJobIds(l_Data)
    for l_JobId in l_JobIds:
        l_Servers = cmn.getServersForJobid(l_Data, l_JobId)
        l_TestOutput.append("For jobid %d, %d server(s), %s%s" % (l_JobId, len(l_Servers), `l_Servers`, os.linesep))
    l_TestOutput.append("%s" % (os.linesep))

    # Determine handles per jobid, per server, per jobstepid, per connection
    l_TestOutput.append("List of Handles per JobId, per Server, per JobStepId, per Connection%s" % (os.linesep))
    for l_JobId in l_JobIds:
        l_Servers = cmn.getServersForJobid(l_Data, l_JobId)
        l_TestOutput.append("%sJobId %d, %d server(s)%s" % (2*" ", l_JobId, len(l_Servers), os.linesep))
        for l_Server in l_Servers:
            l_JobStepIds = cmn.getJobStepIdsForServerJobId(l_Data, l_Server, l_JobId)
            l_TestOutput.append("%sJobId %d, server %s, %d jobstep(s)%s" % (4*" ", l_JobId, l_Server, len(l_JobStepIds), os.linesep))
            for l_JobStepId in l_JobStepIds:
                l_ConnectionHandleData = cmn.getHandlesPerConnectionForJobIdJobStepId(l_Data, l_Server, l_JobId, l_JobStepId)
                l_ConnectionHandles = l_ConnectionHandleData.keys()
                l_ConnectionHandles.sort()
                l_TestOutput.append("%sJobStepId %d, %d connection(s)%s" % (6*" ", l_JobStepId, len(l_ConnectionHandles), os.linesep))
                for l_Connection in l_ConnectionHandles:
                    l_Handles = l_ConnectionHandleData[l_Connection]
                    l_Handles.sort()
                    l_TestOutput.append("%sConnection %s. %d handle(s), %s%s" % (8*" ", l_Connection, len(l_Handles), `l_Handles`, os.linesep))
            l_TestOutput.append("%s" % (os.linesep))

    # Print out stagein and stageout logs for the found jobids
    l_TestOutput.append("StageIn and StageOut Log Data for Found Jobs%s" % (os.linesep))
    for l_JobId in l_JobIds:
        l_TestOutput.append("%sJobId %d%s" % (2*" ", l_JobId, os.linesep))
        l_TestOutput.append("%sStageIn Log Data%s" % (4*" ", os.linesep))
        l_OutputGenerated = False
        if l_JobId in l_StageInData:
            for l_Line in l_StageInData[l_JobId]:
                l_TestOutput.append("%s%s%s" % (6*" ", l_Line, os.linesep))
                l_OutputGenerated = True
        if not l_OutputGenerated:
            l_TestOutput.append("%sNo stagein data found%s" % (6*" ", os.linesep))
        l_TestOutput.append("%s" % (os.linesep))

        l_TestOutput.append("%sStageOut Log Data%s" % (4*" ", os.linesep))
        l_OutputGenerated = False
        if l_JobId in l_StageOutData:
            for l_Line in l_StageOutData[l_JobId]:
                l_TestOutput.append("%s%s%s" % (6*" ", l_Line, os.linesep))
                l_OutputGenerated = True
        if not l_OutputGenerated:
            l_TestOutput.append("%sNo stageout data found%s" % (6*" ", os.linesep))
        l_TestOutput.append("%s" % (os.linesep))

    # Write out the basic results
    l_PathFileName = os.path.join(l_Ctx["ROOTDIR"], "Analysis", "BasicData.txt")
    cmn.writeOutput(l_Ctx, l_PathFileName, l_TestOutput)
    print "Basic results written to %s" % l_PathFileName
    print

    # Start detailed analysis per jobid/jobstepid, per server
    #
    # For each jobid...
    l_JobIds = cmn.getJobIds(l_Data)
    for l_JobId in l_JobIds:
        l_Output = {}
        l_Output[l_JobId] = {}
        l_NumberOfConnectionsForAllServers = 0
        l_NumberOfHandlesForAllServers = 0

        # For each server...
        l_Servers = cmn.getServersForJobid(l_Data, l_JobId)
        for l_Server in l_Servers:
            l_Output[l_JobId][l_Server] = {}
            l_NumberOfConnectionsForServer = 0
            l_NumberOfHandlesForServer = 0

            # For each jobstepid...
            l_JobStepIds = cmn.getJobStepIdsForServerJobId(l_Data, l_Server, l_JobId)
            for l_JobStepId in l_JobStepIds:
                l_Output[l_JobId][l_Server][l_JobStepId] = {}
                l_Output[l_JobId][l_Server][l_JobStepId]["handles"] = {}
                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["NumberOfConnections"] = 0
                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["SizeTransferred"] = 0
                l_Output[l_JobId][l_Server][l_JobStepId]["NotSuccessfulHandles"] = []
                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"] = {}

                # For each handle...
                l_Handles = cmn.getHandlesForServer(l_Data, l_Server)
                l_NumberOfHandles = 0
                l_HandleProcessingTimes = [None,None]
                l_NotSuccessfulHandles = []
                for l_Handle in l_Handles:
                    if l_Data[l_Server]["Handles"][l_Handle]["JobId"] == l_JobId and \
                       l_Data[l_Server]["Handles"][l_Handle]["JobStepId"] == l_JobStepId:

                        # JobId and JobStepId matches...
                        l_NumberOfHandles = l_NumberOfHandles + 1
                        l_NumberOfHandlesForServer = l_NumberOfHandlesForServer + 1
                        l_NumberOfHandlesForAllServers = l_NumberOfHandlesForAllServers + 1
                        if "Status" in l_Data[l_Server]["Handles"][l_Handle]:
                            if l_Data[l_Server]["Handles"][l_Handle]["Status"] != "BBFULLSUCCESS":
                                l_NotSuccessfulHandles.append((l_Handle, l_Data[l_Server]["Handles"][l_Handle]["Status"]))
                        else:
                            l_NotSuccessfulHandles.append((l_Handle, "NOT_COMPLETED"))
                        l_NumberOfConnections = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["NumberOfConnections"]
                        l_SizeTransferred = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["SizeTransferred"]

                        # For each connection...
                        l_Connections = l_Data[l_Server]["Handles"][l_Handle]["Connections"]
                        for l_Connection in l_Connections:
                            if l_Connection not in l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"]:
                                l_NumberOfConnections = l_NumberOfConnections + 1
                                l_NumberOfConnectionsForServer = l_NumberOfConnectionsForServer + 1
                                l_NumberOfConnectionsForAllServers = l_NumberOfConnectionsForAllServers + 1
                                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection] = {}
                                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"] = {}
                                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["NumberOfContribIds"] = 0
                                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["processingTimes (File Min/Max)"] = [None,None]
                                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["readTimes (File Min/Max)"] = [None,None]
                                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["writeTimes (File Min/Max)"] = [None,None]
                                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["syncTimes (File Min/Max)"] = [None,None]

                            # For each LVUuid...
                            l_LVUuids = l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"]
                            for l_LVUuid in l_LVUuids:

                                # For each ContribId...
                                l_ContribIds = l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"]
                                l_ProcessingTimes = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["processingTimes (File Min/Max)"]
                                l_NumberOfContribIds = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["NumberOfContribIds"]
                                l_NotSuccessfulContribIds = []
                                for l_ContribId in l_ContribIds:
                                    l_NumberOfContribIds = l_NumberOfContribIds + 1
                                    if "Status" in l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]:
                                        if l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Status"] != "BBFULLSUCCESS":
                                            l_NotSuccessfulContribIds.append((l_ContribId, l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Status"]))
                                    else:
                                        l_NotSuccessfulContribIds.append((l_ContribId,"NOT_COMPLETED"))
                                    if "SizeTransferred" in l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]:
                                        l_SizeTransferred += l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["SizeTransferred"]
                                    if "ProcessingTime" in l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]:
                                        l_ProcessingTime = (l_ContribId, l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["ProcessingTime"])
                                        if l_ProcessingTimes == [None,None]:
                                            l_ProcessingTimes = [l_ProcessingTime,l_ProcessingTime]
                                        else:
                                            if (l_ProcessingTime[1] < l_ProcessingTimes[0][1]):
                                                l_ProcessingTimes[0] = l_ProcessingTime
                                            if (l_ProcessingTime[1] > l_ProcessingTimes[1][1]):
                                                l_ProcessingTimes[1] = l_ProcessingTime

                                        if l_HandleProcessingTimes == [None,None]:
                                            l_HandleProcessingTimes = l_ProcessingTimes
                                        else:
                                            if (l_ProcessingTimes[0][1] < l_HandleProcessingTimes[0][1]):
                                                l_HandleProcessingTimes[0] = l_ProcessingTimes[0]
                                            if (l_ProcessingTimes[1][1] > l_HandleProcessingTimes[1][1]):
                                                l_HandleProcessingTimes[1] = l_ProcessingTimes[1]

                                    l_ReadTimes = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["readTimes (File Min/Max)"]
                                    l_WriteTimes = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["writeTimes (File Min/Max)"]
                                    l_SyncTimes = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["syncTimes (File Min/Max)"]

                                    l_TransferTypes = set()
                                    if "Files" in l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]:
                                        l_Files = l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Files"]

                                        # For each file...
                                        for l_File in l_Files:
                                            if "ReadCount" in l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Files"][l_File]:
                                                l_ReadTime = (l_ContribId, l_File,
                                                              (l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Files"][l_File]["ReadCount"],
                                                               l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Files"][l_File]["ReadTime"]))
                                                if l_ReadTimes == [None,None]:
                                                    l_ReadTimes = [l_ReadTime,l_ReadTime]
                                                else:
                                                    if (l_ReadTime[2][1] < l_ReadTimes[0][2][1]):
                                                        l_ReadTimes[0] = l_ReadTime
                                                    if (l_ReadTime[2][1] > l_ReadTimes[1][2][1]):
                                                        l_ReadTimes[1] = l_ReadTime
                                                l_WriteTime = (l_ContribId, l_File,
                                                               (l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Files"][l_File]["WriteCount"],
                                                                l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Files"][l_File]["WriteTime"]))
                                                if l_WriteTimes == [None,None]:
                                                    l_WriteTimes = [l_WriteTime,l_WriteTime]
                                                else:
                                                    if (l_WriteTime[2][1] < l_WriteTimes[0][2][1]):
                                                        l_WriteTimes[0] = l_WriteTime
                                                    if (l_WriteTime[2][1] > l_WriteTimes[1][2][1]):
                                                        l_WriteTimes[1] = l_WriteTime
                                                l_TransferTypes.add(l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Files"][l_File]["TransferType"])

                                                # NOTE:  The output will associate the sync count/time with the source file, when in fact that data is for the corresponding target files.
                                                if l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Files"][l_File]["SyncCount"] != None:
                                                    l_SyncTime = (l_ContribId, l_File,
                                                                  (l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Files"][l_File]["SyncCount"],
                                                                   l_Data[l_Server]["Handles"][l_Handle]["Connections"][l_Connection]["LVUuids"][l_LVUuid]["ContribIds"][l_ContribId]["Files"][l_File]["SyncTime"]))
                                                    if l_SyncTimes == [None, None]:
                                                        l_SyncTimes = [l_SyncTime, l_SyncTime]
                                                    else:
                                                        if (l_SyncTime[2][1] < l_SyncTimes[0][2][1]):
                                                            l_SyncTimes[0] = l_SyncTime
                                                        if (l_SyncTime[2][1] > l_SyncTimes[1][2][1]):
                                                            l_SyncTimes[1] = l_SyncTime
                                    # End of files...
                                    l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["readTimes (File Min/Max)"] = l_ReadTimes
                                    l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["writeTimes (File Min/Max)"] = l_WriteTimes
                                    l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["syncTimes (File Min/Max)"] = l_SyncTimes
                                    l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["transferTypes"] = [l_TransferType for l_TransferType in l_TransferTypes]
                                # End of contribids...
                                if l_NotSuccessfulContribIds:
                                    if "NotSuccessfulContribIds" not in l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]:
                                        l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["NotSuccessfulContribIds"] = []
                                    l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["NotSuccessfulContribIds"] = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["NotSuccessfulContribIds"] + l_NotSuccessfulContribIds
                                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["NumberOfContribIds"] = l_NumberOfContribIds
                                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["connections"][l_Connection]["contribIds"]["processingTimes (File Min/Max)"] = l_ProcessingTimes
                        # End of connections...
                        l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["NumberOfConnections"] = l_NumberOfConnections
                        if l_SizeTransferred:
                            l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["SizeTransferred"] = l_SizeTransferred
                # End of handles...
                if l_NotSuccessfulHandles:
                    l_Output[l_JobId][l_Server][l_JobStepId]["NotSuccessfulHandles"] = l_Output[l_JobId][l_Server][l_JobStepId]["NotSuccessfulHandles"] + l_NotSuccessfulHandles
                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["NumberOfHandles"] = l_NumberOfHandles
                l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["ProcessingTimes (ContribId Min/Max)"] = l_HandleProcessingTimes
            l_Output[l_JobId][l_Server]["NumberOfConnectionsForServer"] = l_NumberOfConnectionsForServer
            l_Output[l_JobId][l_Server]["NumberOfHandlesForServer"] = l_NumberOfHandlesForServer
        # End of servers...
        l_Output[l_JobId]["NumberOfConnectionsForAllServers"] = l_NumberOfConnectionsForAllServers
        l_Output[l_JobId]["NumberOfHandlesForAllServers"] = l_NumberOfHandlesForAllServers

        # Calculate the min/max processing times for all contribids, across all servers
        l_ServerProcessingTimes = [None, None]
        for l_Server in l_Output[l_JobId].keys():
            if type(l_Output[l_JobId][l_Server]) == dict:
                for l_JobStepId in l_Output[l_JobId][l_Server].keys():
                    if type(l_Output[l_JobId][l_Server][l_JobStepId]) == dict:
                        if l_ServerProcessingTimes == [None, None]:
                            l_ServerProcessingTimes = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["ProcessingTimes (ContribId Min/Max)"]
                        else:
                            if (l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["ProcessingTimes (ContribId Min/Max)"][0][1] < l_ServerProcessingTimes[0][1]):
                                l_ServerProcessingTimes[0] = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["ProcessingTimes (ContribId Min/Max)"][0]
                            if (l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["ProcessingTimes (ContribId Min/Max)"][1][1] > l_ServerProcessingTimes[1][1]):
                                l_ServerProcessingTimes[1] = l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["ProcessingTimes (ContribId Min/Max)"][1]
        l_Output[l_JobId]["ProcessingTimes (All Servers ContribId Min/Max)"] = l_ServerProcessingTimes

        # Format data/add average size transferred per handle data
        for l_JobId in l_Output:
            for l_Server in l_Output[l_JobId]:
                if type(l_Output[l_JobId][l_Server]) == dict:
                    for l_JobStepId in l_Output[l_JobId][l_Server]:
                        # NOTE: Not every l_Server element is a server name.
                        #       We only want to process those with a "Handles" key.
                        try:
                            if "SizeTransferred" in l_Output[l_JobId][l_Server][l_JobStepId]["handles"]:
                                if l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["SizeTransferred"]:
                                    l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["Avg SizeTransferred/Handle"] = cmn.numericFormat(round(float(l_Output[l_JobId][l_Server]["handles"]["SizeTransferred"])/float(l_Output[l_JobId][l_Server]["handles"]["NumberOfHandles"]),3))
                                    l_Output[l_JobId][l_Server][l_JobStepId]["handles"]["SizeTransferred"] = cmn.numericFormat(l_Output[l_JobId][l_Server]["handles"]["SizeTransferred"])
                        except Exception:
                            pass

        # Output the results
#        pprint.pprint(l_Output)

        cmn.ensure(os.path.join(l_Ctx["ROOTDIR"], "Analysis", `l_JobId`))
        l_PathFileName = os.path.join(l_Ctx["ROOTDIR"], "Analysis", `l_JobId`, "Details.txt")
        cmn.printFormattedFile(l_Ctx, l_PathFileName, l_Output)
        l_PathFileName = os.path.join(l_Ctx["ROOTDIR"], "Analysis", `l_JobId`, "Details.json")
        cmn.printFormattedFileAsJson(l_Ctx, l_PathFileName, l_Output)

    return
Example #20
def main(args):
    """ Compute the velocity correlations on one or many galaxy surveys. """
    np.seterr(divide='ignore', invalid='ignore')
    #Get setup information from the settings files
    settings =   common.getdict(args.settings)
    if settings['num_files'] != 10000 and settings['use_npy']:
        print("Sorry! We can only handle 100x100 surveys. Try turning off the use_npy flag.")
        exit()
    numpoints =    settings['numpoints']
    dr =           settings['dr']
    min_r =        settings['min_r']
    orig_outfile = settings['output_file_name']
    step_type =    settings['step_type']
    infile =       settings['input_file']
    unitslist =    settings['binunits']
    maxd_master =  settings['max_distance']
    numpy =        settings['use_npy']
    use_tmp =      settings['use_tmp']
    if settings['many_squared']:
        distance_args_master = list(zip(dr,min_r,numpoints))
        file_schemes  = list(zip(infile,orig_outfile,settings['readable_name']))
        xintervals = [common.genBins(x[1],x[2],x[0],step_type) for x in distance_args_master]
        xs_master = [a[0] for a in xintervals]
        intervals_master = [a[1] for a in xintervals]
    else:
        #Everything is built around lists now, so we just build lists of length one!
        distance_args_master = [(dr,min_r,numpoints)]
        file_schemes = [(infile,orig_outfile,settings['readable_name'])]
        xs_master,intervals_master = common.genBins(min_r,numpoints,dr,step_type)
        xs_master = [xs_master]
        intervals_master = [intervals_master]
    if numpy:
        if args.override:
            print(args.override)
            indices = args.override.split(':')
            a = int(indices[0])
            b = int(indices[1])
            file_schemes = file_schemes[a:b]
        print(file_schemes)
    else:    
        infileindices = [x + settings['offset'] for x in range(settings['num_files'])]
    for rawInFile, outfile, readName in file_schemes:
        for units in unitslist:
            if units == 'km/s':
                
                xs = [[x * 100 for x in y] for y in xs_master]
                intervals = [[x * 100 for x in y] for y in intervals_master]
                distance_args = [(x[0]*100,x[1]*100,x[2]) for x in distance_args_master]
                maxd = maxd_master * 100
            else:
                xs = xs_master
                intervals = intervals_master
                distance_args = distance_args_master
                maxd = maxd_master
                
            if settings['many'] and not numpy:
                d = filemanagement(rawInFile,infileindices)
                i = d
            elif not numpy:
                galaxies = common.loadData(rawInFile,dataType='CF2')
                d = [np.array([(g.normx,
                                g.normy,
                                g.normz,
                                g.redx,
                                g.redy,
                                g.redz,
                                g.dv,
                                g.d,
                                g.v) for g in galaxies])]
                i = ['nothing']
            else:
                print("Loading numpy file...")
                data = np.load(rawInFile)
                print("Handling NaNs")
                nansremoved = [ data[x][np.invert(np.isnan(data[x][:,0]))] for x in range(100)]
                del data
                #for x in range(100):
                #    np.save('/tmp/c156r133-{}/vcorr-{}'.format(b,x),nansremoved[x])
                #df = ['/tmp/c156r133-{}/vcorr-{}.npy'.format(b,x//100) for x in range(10000) ]
                d = [ nansremoved[x//100] for x in range(10000) ]
                i = [ x%100  for x in range(10000) ]
                #print(d[542].shape)
            print("Opening Pool...")
            gc.collect()
            with Pool(processes=NUM_PROCESSES) as pool:
                print("Generating Histograms...")
                histogramData = list(pool.starmap(turboRun,zip(d,i,itertools.repeat(numpy),
                                                               itertools.repeat(maxd),
                                                               itertools.repeat(units),
                                                               itertools.repeat(xs),
                                                               itertools.repeat(intervals),
                                                               itertools.repeat(use_tmp)
                                                           )))
                """
                Each turbo run returns a list of histograms [ 5-length histogram, 10-length histogram, 20-length etc]
                so histogramData is a list of turbo runs, which means data is a jagged array
                data = [
                
                [ [ ----- ],
                  [ ---------- ],
                  [ -------------------- ] ],
                
                [ [ ----- ],
                  [ ---------- ],
                  [ -------------------- ] ],
                
                ]
                """
            for scheme_index in range(len(intervals)):
                hist_for_scheme = np.array([turbo_data[scheme_index] for turbo_data in histogramData])
                saveOutput(hist_for_scheme,outfile.format('',distance_args[scheme_index][0],units.replace('/','')))
            print(" Done!")
Example #21
def singlerun(filename, outputFile, binsize, chop, modelOverride=None):
    fig = plt.figure()
    galaxies = common.loadData(filename, dataType="CF2")
    distances = [galaxy.d for galaxy in galaxies]
    #get a list of all the distances to galaxies. This will let us send it directly to the histogram

    bins_orig = genBins(binsize, chop)

    #Make a histogram using pylab histogram function.
    n, bins, patches = plt.hist(
        distances,
        bins_orig,
        histtype="stepfilled",
        label="Galaxy Distribution,\n binsize={:.2f}Mpc".format(binsize))

    #Change visual properties of the histogram
    plt.setp(patches, 'facecolor', 'g', 'alpha', 0.75)
    robot = chi_sq_solver(bins, n, selection_function)
    if modelOverride is None:
        #If we don't have an existing model to use, we find a best fit and plot it
        #Solve the chi squared optimization for the histogram and selection function
        params = robot.result.x
        #Plot the best fit
        domain = np.arange(0, chop, 1)
        model = [selection_function(r, *(robot.result.x)) for r in domain]
        plt.plot(
            domain,
            model,
            'k--',
            linewidth=1.5,
            label=
            "Model fit: $A = {:.3f}$\n$r_0 = {:.3f}$\n$n_1 = {:.3f}$\n$n_2={:.3f}$\n$\chi^2={chisq:.3f}$"
            .format(*(robot.result.x), chisq=robot.result.fun))
        chisq = robot.result.fun
    else:
        #Plot the model given in the settings function instead of calculating a new one
        mo = modelOverride["constants"]
        params = [mo['A'], mo['r_0'], mo['n_1'], mo['n_2']]
        chisq = robot.chi_sq(params)
        domain = np.arange(0, chop, 1)
        model = [selection_function(r, *params) for r in domain]
        plt.plot(
            domain,
            model,
            'k--',
            linewidth=1.5,
            label=
            "Model fit: $A = {:.3f}$\n$r_0 = {:.3f}$\n$n_1 = {:.3f}$\n$n_2={:.3f}$\n$\chi^2={chisq:.3f}$"
            .format(*params, chisq=chisq))

    #Add axis labels
    plt.ylabel("Galaxy count")
    plt.xlabel("Distance, Mpc/h")
    plt.title("Distribution of Galaxy Distance")
    plt.legend()
    plt.axis([0, chop, 0, 1300])
    fig2 = plt.figure()
    shellVolume = [
        common.shellVolCenter(robot.centerbins[i], binsize)
        for i in range(len(n))
    ]
    plt.title("Galaxies per Cubic Mpc")
    plt.xlabel("Distance, Mpc/h")
    plt.ylabel("Density, galaxies/(Mpc/h)^3")
    density = [n[i] / shellVolume[i] for i in range(len(n))]
    plt.plot(robot.centerbins, density, 'o')
    #Save figure
    with pdfback.PdfPages(outputFile + str(binsize) + '.pdf') as pdf:
        pdf.savefig(fig)
        pdf.savefig(fig2)
    if modelOverride is None:
        #Write parameters to a file for later use.
        common.writedict(
            outputFile + str(binsize) + '_params.json', {
                'constants': {
                    'A': params[0],
                    'r_0': params[1],
                    'n_1': params[2],
                    'n_2': params[3]
                },
                'info': {
                    'shell_thickness': binsize,
                    'max_radius': chop,
                    'chisq': chisq
                }
            })
    plt.close('all')
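chi_sq_solver is defined elsewhere; from the way it is used above (robot.result.x, robot.result.fun, robot.chi_sq, robot.centerbins) a rough stand-in might look like the sketch below. The starting guess and the chi-squared weighting are assumptions, not the original implementation:

import numpy as np
from scipy import optimize

class chi_sq_solver:
    def __init__(self, bins, ys, function):
        self.bins = np.asarray(bins)
        self.ys = np.asarray(ys)
        self.function = function
        self.centerbins = (self.bins[:-1] + self.bins[1:]) / 2
        # Assumed starting point for (A, r_0, n_1, n_2).
        self.result = optimize.minimize(self.chi_sq, [self.ys.max(), 30.0, 1.0, 1.0])

    def chi_sq(self, params):
        model = np.array([self.function(r, *params) for r in self.centerbins])
        mask = self.ys > 0  # skip empty bins to avoid dividing by zero
        return np.sum((self.ys[mask] - model[mask]) ** 2 / self.ys[mask])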
def perturb(infile, outfile, err, num, ptype, est, mod, lots):
    """Infile and outfile are filenames. Outfile always has a {} in it, and infile should have one too if you're
    using the 'lots' option.

    err is the error, usually like 0.2 for distance perturbations or 0.02 for modulus perturbations

    ptype, est, and mod are strings that describe the combination of perturbation type, modulus and estimator to use.

    ptype should be 'distance', 'modulus', or 'relative'
    est should be 'cz' or 'feldman' where cz=v+h*d and feldman is the unbiased log estimator.
    modulus only applies when using modulus and relative ptypes, and has choices 'ln','log', and 'textbook'
    
    if lots is not 0, False, or None then it should be an integer specifying the number of infiles. infile should
    then have a {} in it for the index. Outfiles will be numbered as follows: infile_index*num + outfile_index
    where infile_index goes from zero to lots and outfile_index goes from 0 to num
    """
    if lots:
        infiles = [infile.format(x) for x in range(lots)]
    else:
        infiles = [infile]
    surveys = []
    for in_i, infile in enumerate(infiles):
        num_acks = 0
        second_order_acks = 0
        num_errs = 0
        hubble_constant = 100
        galaxies = common.loadData(infile, 'CF2')
        perturbed_vs = []
        delta_vs = []

        for galaxy in galaxies:
            #q_0 = -0.595
            #z = galaxy.cz/(3*10**8)
            #zmod = z*(1 + 0.5*(1-q_0)*z + (1/6)*(2-q_0-3q_0**2)*z**2)
            if abs(galaxy.v) > galaxy.cz / 10:
                num_acks += 1

            if ptype == "distance":
                skewed_distance = np.random.normal(galaxy.d,
                                                   abs(galaxy.d * err), num)
            elif ptype == "modulus":
                inmod = modulusify(galaxy.d, mod)
                pmod = np.random.normal(inmod, err, num)
                skewed_distance = unmodulusify(pmod, mod)
            elif ptype == "relative":
                inmod = modulusify(galaxy.d, mod)
                pmod = np.random.normal(inmod, np.abs(err * inmod), num)
                skewed_distance = unmodulusify(pmod, mod)

            if est == "cz":
                try:
                    velocities = galaxy.cz - hubble_constant * skewed_distance
                    dv = galaxy.d * err * hubble_constant
                except FloatingPointError:  #I don't think it's possible to have a FP error here... Could be wrong?
                    num_errs += 1
                    print("I was wrong")
                    continue
            elif est == "feldman":
                try:
                    velocities = galaxy.cz * np.log(
                        galaxy.cz / (hubble_constant * skewed_distance))
                    dv = galaxy.cz * err  #calculate_error(distance_modulus,galaxy.d,frac_error,args)
                    for velocity in velocities:
                        if abs(velocity) > galaxy.cz / 10:
                            second_order_acks += 1
                except FloatingPointError:
                    num_errs += 1
                    continue
            perturbed_vs.append((velocities, dv, skewed_distance, galaxy))

        print(
            "{} out of {} galaxies ({:.2f}) had true velocity NOT much less than redshift,"
            .format(num_acks, len(galaxies), num_acks / len(galaxies)))
        print(
            "i.e. the condition on our estimator that v << cz was not satisfied."
        )
        print("This happened to the random data {} times out of {}.".format(
            second_order_acks, num * len(galaxies)))
        print(
            "Also, {} FloatingPoint errors happened, even after taking out the close-by galaxies."
            .format(num_errs))
        print()
        survey = []
        for v, dv, d, galaxy in perturbed_vs:
            np1 = np.array((galaxy.normx, galaxy.normy, galaxy.normz,
                            galaxy.redx, galaxy.redy, galaxy.redz, dv))

            survey.append(np.concatenate((np1, d, v)))
        surveys.append(survey)
    maxlength = max([len(survey) for survey in surveys])
    surveylength = len(surveys[0][0])
    for survey in surveys:
        for x in range(len(survey), maxlength):
            filler = np.empty(surveylength)
            filler[:] = np.NAN
            survey.append(filler)
    surveysnp = np.array(surveys)
    print(surveysnp.shape)
    np.save(outfile, surveysnp)
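modulusify and unmodulusify are not shown in this snippet. One plausible reading of the three mod choices named in the docstring ('ln', 'log', 'textbook'), with the textbook case being the usual distance modulus for d in Mpc, is sketched below; treat it as an assumption about those helpers, not their actual definitions:

import numpy as np

def modulusify(d, mod):
    # Hypothetical: map a distance into the chosen "modulus" space.
    if mod == 'ln':
        return np.log(d)
    elif mod == 'log':
        return np.log10(d)
    elif mod == 'textbook':
        return 5 * np.log10(d) + 25      # distance modulus, d in Mpc
    raise ValueError("mod must be 'ln', 'log', or 'textbook'")

def unmodulusify(mu, mod):
    # Inverse of modulusify.
    if mod == 'ln':
        return np.exp(mu)
    elif mod == 'log':
        return 10.0 ** mu
    elif mod == 'textbook':
        return 10.0 ** ((mu - 25) / 5)
    raise ValueError("mod must be 'ln', 'log', or 'textbook'")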
Example #23
def main(args):
    """ Compute the velocity correlations on one or many galaxy surveys. 
    """
    print("Incomplete function - see comments")
    exit()
    #Get setup information from the settings file
    settings = common.getdict(args.settings)
    numpoints = settings["numpoints"]
    outfolder = settings["output_data_folder"]
    outfile = settings["output_file_name"]
    rawInFile = settings["input_file"]
    step_type = settings["step_type"]
    dr = settings["dr"]
    min_r = settings["min_r"]
    if settings["many"]:
        #If there are lots of files, set them up accordingly.
        inFileList = [
            rawInFile.format(x + settings['offset'])
            for x in range(settings["num_files"])
        ]
    else:
        inFileList = [rawInFile]
    xs, intervals = common.genBins(min_r, numpoints, dr, step_type)
    for index, infile in enumerate(inFileList):
        #Load the survey
        galaxies = np.array(common.loadData(infile, dataType="millVel"))
        print(galaxies.shape)
        #Put just the galaxy positions into one array
        positions = galaxies[:, 0:3]  # [(x,y,z),...]
        velocities = galaxies[:, 3:6]

        kd = cKDTree(positions)
        pairs = kd.query_pairs(max(intervals))
        npPairs = np.array(list(pairs))
        g1pos = positions[npPairs[:, 0]]
        g2pos = positions[npPairs[:, 1]]

        g1vs = velocities[npPairs[:, 0]]
        g2vs = velocities[npPairs[:, 1]]

        distBetweenG1G2 = np.linalg.norm(g2pos - g1pos, axis=1)

        velocityCorrelation = inner1d(g1vs, g2vs) / 10**4

        c11 = g1vs[:, 0] * g2vs[:, 0]
        c12 = g1vs[:, 0] * g2vs[:, 1]
        c13 = g1vs[:, 0] * g2vs[:, 2]
        c21 = g1vs[:, 1] * g2vs[:, 0]
        c22 = g1vs[:, 1] * g2vs[:, 1]
        c23 = g1vs[:, 1] * g2vs[:, 2]
        c31 = g1vs[:, 2] * g2vs[:, 0]
        c32 = g1vs[:, 2] * g2vs[:, 1]
        c33 = g1vs[:, 2] * g2vs[:, 2]

        n, bins = np.histogram(distBetweenG1G2, bins=intervals)

        correlation11, bins = np.histogram(distBetweenG1G2,
                                           bins=intervals,
                                           weights=c11)
        correlation12, bins = np.histogram(distBetweenG1G2,
                                           bins=intervals,
                                           weights=c12)
        correlation13, bins = np.histogram(distBetweenG1G2,
                                           bins=intervals,
                                           weights=c13)
        correlation21, bins = np.histogram(distBetweenG1G2,
                                           bins=intervals,
                                           weights=c21)
        correlation22, bins = np.histogram(distBetweenG1G2,
                                           bins=intervals,
                                           weights=c22)
        correlation23, bins = np.histogram(distBetweenG1G2,
                                           bins=intervals,
                                           weights=c23)
        correlation31, bins = np.histogram(distBetweenG1G2,
                                           bins=intervals,
                                           weights=c31)
        correlation32, bins = np.histogram(distBetweenG1G2,
                                           bins=intervals,
                                           weights=c32)
        correlation33, bins = np.histogram(distBetweenG1G2,
                                           bins=intervals,
                                           weights=c33)

        a11 = correlation11 / n
        a12 = correlation12 / n
        a13 = correlation13 / n
        a21 = correlation21 / n
        a22 = correlation22 / n
        a23 = correlation23 / n
        a31 = correlation31 / n
        a32 = correlation32 / n
        a33 = correlation33 / n

        f, ((ax11, ax12, ax13), (ax21, ax22, ax23),
            (ax31, ax32, ax33)) = plt.subplots(3,
                                               3,
                                               sharex='col',
                                               sharey='row',
                                               figsize=(11, 8.5))

        ax11.plot(xs, a11)
        ax12.plot(xs, a12)
        ax13.plot(xs, a13)
        ax21.plot(xs, a21)
        ax22.plot(xs, a22)
        ax23.plot(xs, a23)
        ax31.plot(xs, a31)
        ax32.plot(xs, a32)
        ax33.plot(xs, a33)

        #set x axis and y axis to be the same
        #go out to until correlation is zero

        f.suptitle('3-D velocity correlation')
        ax31.set_xlabel('Distance, Mpc/h')
        ax32.set_xlabel('Distance, Mpc/h')
        ax33.set_xlabel('Distance, Mpc/h')

        ax11.set_ylabel('correlation, $(km/s)^2$')
        ax21.set_ylabel('correlation, $(km/s)^2$')
        ax31.set_ylabel('correlation, $(km/s)^2$')

        with pdfback.PdfPages(outfolder + outfile.format(index)) as pdf:
            pdf.savefig(f)
        pylab.close('all')