Exemple #1
0
 def __init__(self, df, params, distributed=True):
     '''Run the resampling on df and store covariance and correlation.

     df: catalog forwarded to runJK; only its length is read directly here.
     params: parameter object. NAME, CAT_QUERY, DO_VARIANCE_WEIGHTED,
         JK_NGROUPS and JK_RESAMPLING_METHOD are read in this method.
     distributed: forwarded unchanged to runJK.
     '''
     # Bookkeeping copied straight from the parameter object.
     self.params = params
     self.name = params.NAME
     self.query = params.CAT_QUERY
     self.do_variance_weighted = params.DO_VARIANCE_WEIGHTED
     self.N_objects_in_this_run = len(df)
     self.JK_Ngroups = params.JK_NGROUPS

     # runJK is expected to populate self.bin_names and
     # self.kSZ_curveJK_realizations, which are read below.
     self.runJK(df, self.params, distributed)

     method_is_tiled = 'tiled' in self.params.JK_RESAMPLING_METHOD
     if method_is_tiled:
         # For tiled methods the group count is taken from the number of
         # realizations actually produced instead of JK_NGROUPS.
         n_realizations = self.kSZ_curveJK_realizations.shape[0]
         self.JK_Ngroups = n_realizations

     names = self.bin_names
     realizations = self.kSZ_curveJK_realizations
     self.cov = JK_tools.getCovMatrix(names, realizations, params)
     self.corr = JK_tools.getCorrMatrix(names, realizations)
Exemple #2
0
    def runJK(self, df, params, distributed):
        '''Run the estimator plus its resampled realizations and store them.

        df: catalog handed to the selected runner.
        params: parameter object; JK_RESAMPLING_METHOD and BIN_EDGES are
            read here, the rest is consumed by the runners.
        distributed: only the literal True selects a distributed runner,
            any other value runs on the local machine.

        Sets rsep, bin_edges, bin_names, kSZ_curveFullDataset,
        kSZ_curveJK_realizations, errorbars and runtime on self.
        '''
        start = time.time()
        if distributed is not True:
            res = singleMachine_JK_kSZ.run_JK_local(df, params,
                                                    randomize=True)
        else:
            method = params.JK_RESAMPLING_METHOD.lower()
            massboosted_methods = ('bs_dt_mass_boosted_est',
                                   'bs_dt_mass_boosted_est_debiased')
            if method in massboosted_methods:
                res = run_JK_distributed_massboosted(df, params)  # noqa
            else:
                res = distributed_JK_kSZ.run_JK_distributed(df, params,
                                                            randomize=True)
        stop = time.time()

        full_results, raw_realizations = res
        rsep, p_uk = full_results[0], full_results[1]

        # Keep only element [1] of each realization and stack them.
        realizations = np.array([entry[1] for entry in raw_realizations])

        self.rsep = rsep
        self.bin_edges = params.BIN_EDGES
        self.bin_names = JK_tools.getBinNamesFromBinEdges(params.BIN_EDGES)
        self.kSZ_curveFullDataset = p_uk
        self.kSZ_curveJK_realizations = realizations
        self.errorbars = getErrorbars(realizations, params)
        # Runtime covers only the call to the runner, not the unpacking.
        self.runtime = stop - start
Exemple #3
0
def test_getCorrMatrix():
    '''Compare JK_tools.getCorrMatrix with np.corrcoef on random data.'''
    labels = ['0 - 5', '5 - 10', '10 - 15', '15 - 20']
    samples = np.random.random(size=[50, 4])

    expected = np.corrcoef(samples.T)
    measured = JK_tools.getCorrMatrix(labels, samples).values

    # Sum of squared element-wise differences must be numerically zero.
    residual = measured - expected
    assert (residual ** 2).flatten().sum() < 1e-10
Exemple #4
0
def test_indicesToDrop():
    '''Splitting 20000 rows into 4 groups gives 4 groups, the first of 5000.'''
    n_rows = 20000
    n_groups = 4
    frame = pd.DataFrame({'a': np.random.random(n_rows)})

    dropped = JK_tools.indicesToDrop(frame, n_groups)

    assert len(dropped) == 4
    assert len(dropped[0]) == 5000
Exemple #5
0
def run_JK_local(df, params, randomize=True, multithreading=False):
    '''Run the pairwise kSZ estimator and its jackknifes on this machine.

    The estimator is first evaluated on the full dataframe, then once per
    jackknife group with the rows of that group dropped.

    Everything runs locally, make sure you have requested the resources you
    are using.

    df: dataframe object with the variables for the calculation
    params: param file for this calculation; the number of jackknife
        groups is read from params.JK_NGROUPS
    randomize: forwarded to JK_tools.indicesToDrop
    multithreading: forwarded to pairwiser.get_pairwise_ksz

    Returns (fullDataset_results, jk_results), where jk_results is a list
    holding one estimator result per jackknife group.
    '''
    print("Running a JK run on the local machine, this will take a while.")
    Ngroups = params.JK_NGROUPS
    fullDataset_results = pairwiser.get_pairwise_ksz(
        df, params, multithreading=multithreading)
    indices_toDrop = JK_tools.indicesToDrop(df, Ngroups, randomize=randomize)
    jk_results = []
    for j in range(Ngroups):
        # Progress report (0-based). Written as a function call so the
        # module parses on Python 3 as well as Python 2.
        print("%i/%i" % (j, Ngroups))
        data_JK = df.drop(indices_toDrop[j], inplace=False)
        jk_results.append(pairwiser.get_pairwise_ksz(
            data_JK, params, multithreading=multithreading))
    return fullDataset_results, jk_results
Exemple #6
0
    def run_resample(self, df1, df2, params):
        '''Run the distributed error estimation for two catalogs.

        df1, df2: catalogs forwarded to
            cross_distributed.run_error_estimation_distributed.
        params: parameter object; JK_NGROUPS and BIN_EDGES are read here.

        Stores, for each of the '11', '12' and '22' combinations, the full
        dataset curve, the stacked resampled curves and their errorbars,
        plus rsep (taken from the '11' result), bin_edges, bin_names and
        runtime.
        '''
        start = time.time()
        res = cross_distributed.run_error_estimation_distributed(
            df1, df2, params)
        stop = time.time()

        full11, full12, full22 = res['full11'], res['full12'], res['full22']
        rsep = full11[0]
        p_uk11, p_uk12, p_uk22 = full11[1], full12[1], full22[1]

        def stack_realizations(key):
            # Element [1] of each of the first JK_NGROUPS entries of res[key].
            return np.array(
                [res[key][j][1] for j in range(params.JK_NGROUPS)])

        realizations11 = stack_realizations('resampled11')
        realizations12 = stack_realizations('resampled12')
        realizations22 = stack_realizations('resampled22')

        self.rsep = rsep
        self.bin_edges = params.BIN_EDGES
        self.bin_names = JK_tools.getBinNamesFromBinEdges(params.BIN_EDGES)

        self.kSZ_curveFullDataset11 = p_uk11
        self.kSZ_curveFullDataset12 = p_uk12
        self.kSZ_curveFullDataset22 = p_uk22

        self.kSZ_curveJK_realizations11 = realizations11
        self.kSZ_curveJK_realizations12 = realizations12
        self.kSZ_curveJK_realizations22 = realizations22

        self.errorbars11 = JK_tools.getErrorbars(realizations11, params)
        self.errorbars12 = JK_tools.getErrorbars(realizations12, params)
        self.errorbars22 = JK_tools.getErrorbars(realizations22, params)

        # Runtime covers only the distributed call, not the unpacking.
        self.runtime = stop - start
Exemple #7
0
def test_getCovMatrix():
    '''Compare JK_tools.getCovMatrix with a rescaled np.cov on random data.'''
    labels = ['0 - 5', '5 - 10', '10 - 15', '15 - 20']
    n_realizations = 50
    samples = np.random.random(size=[n_realizations, 4])

    measured = JK_tools.getCovMatrix(labels, samples).values

    # Evaluated left to right this scales np.cov by (N-1)**2 / N.
    expected = (np.cov(samples.T) * (n_realizations - 1)
                / n_realizations * (n_realizations - 1))

    residual = expected - measured
    assert (residual ** 2).flatten().sum() < 1e-10
Exemple #8
0
    def __init__(self, ds, params):
        '''Run the cross-catalog resampling and store covariances.

        ds: object exposing the two catalogs as ds.df1 and ds.df2.
        params: parameter object; NAME, CAT_QUERY, DO_VARIANCE_WEIGHTED and
            JK_NGROUPS are read in this method.
        '''
        df1 = ds.df1
        df2 = ds.df2

        # Bookkeeping copied straight from the parameter object.
        self.params = params
        self.name = params.NAME
        self.query = params.CAT_QUERY
        self.do_variance_weighted = params.DO_VARIANCE_WEIGHTED
        self.N_objects_in_this_run = len(df1)  # only the first catalog
        self.JK_Ngroups = params.JK_NGROUPS

        # run_resample is expected to populate self.bin_names and the three
        # self.kSZ_curveJK_realizationsXX arrays read below.
        self.run_resample(df1, df2, self.params)

        get_cov = JK_tools.getCovMatrix
        self.cov11 = get_cov(self.bin_names,
                             self.kSZ_curveJK_realizations11, params)
        self.cov12 = get_cov(self.bin_names,
                             self.kSZ_curveJK_realizations12, params)
        self.cov22 = get_cov(self.bin_names,
                             self.kSZ_curveJK_realizations22, params)

        get_corr = JK_tools.getCorrMatrix
        self.corr11 = get_corr(self.bin_names,
                               self.kSZ_curveJK_realizations11)
        self.corr12 = get_corr(self.bin_names,
                               self.kSZ_curveJK_realizations12)
        self.corr22 = get_corr(self.bin_names,
                               self.kSZ_curveJK_realizations22)
Exemple #9
0
def test_getErrorbars():
    '''Compare JK_tools.getErrorbars with np.std scaled by sqrt(N - 1).'''
    class FakeParams:
        # Minimal stand-in exposing only the JK_NGROUPS attribute.
        def __init__(self, ngroups):
            self.JK_NGROUPS = ngroups

    n_iterations = 50000
    fake_params = FakeParams(n_iterations)
    draws = np.random.normal(size=[n_iterations, 20])

    measured = JK_tools.getErrorbars(draws, fake_params)
    expected = np.std(draws, axis=0) * np.sqrt(n_iterations - 1)

    squared_diff = (expected - measured) ** 2
    assert squared_diff.sum() < 1e-10
Exemple #10
0
def test_getBinNames():
    '''Check the labels getBinNames returns for separations 5, 10, 15, 20.'''
    separations = np.array([5, 10, 15, 20])
    expected = ['0 - 5', '5 - 10', '10 - 15', '15 - 20']
    assert JK_tools.getBinNames(separations) == expected
Exemple #11
0
def run_JK_distributed(df, param, randomize=True):
    '''Run the pairwise kSZ estimator on the full dataframe and on a set of
    resampled dataframes, submitting every calculation to a dask cluster.

    An SGECluster is created and scaled to envVars.NWorkers workers, so the
    calling terminal does not need to request many cpus itself.

    The resampling scheme is selected by param.JK_RESAMPLING_METHOD
    (lower-cased) compared against the names JK, BS, BS_PW, BS_DT and TL_JK.
    Those names are not defined in this function; presumably they are
    module-level string constants -- confirm.

    df: dataframe object containing the variables for the calculation
    param: parameter object for this calculation; JK_NGROUPS and
        JK_RESAMPLING_METHOD are read here
    randomize: forwarded to JK_tools.indicesToDrop (only used by the JK
        method)

    Returns (fullDataset_results, jk_results): the result for the full
    dataframe and the gathered list of results for the resampled ones.
    jk_results stays empty if no method matches.'''

    Ncores = envVars.Ncores
    NWorkers = envVars.NWorkers
    Ngroups = param.JK_NGROUPS
    resampling_method = param.JK_RESAMPLING_METHOD.lower()

    # Set up the cluster: one single-core dask process per job, with the
    # scheduler directives and thread-count exports below built from Ncores.
    cluster = SGECluster(walltime='172800', processes=1, cores=1,
                         env_extra=['#$-pe sge_pe %i' % Ncores,
                                    '-l m_core=%i' % Ncores,
                                    'mkdir -p /tmp/pag227/dask/dask-scratch',
                                    'export NUMBA_NUM_THREADS=%i' % Ncores,
                                    'export OMP_NUM_THREADS=%i' % Ncores
#                                    'export OMP_NUM_THREADS=1',  # noqa
                                    ])
    cluster.scale(NWorkers)
    client = Client(cluster)
    # Fixed 30 s pause; presumably gives the workers time to come up
    # before data is scattered -- confirm.
    time.sleep(30)
    # End of cluster setup.

    # Send the full dataset and the parameters to the cluster and submit
    # the full-dataset calculation.
    future_fullDataset = client.scatter(df)
    future_params = client.scatter(param)
    res_fullDataset = client.submit(pairwiser.get_pairwise_ksz,
                                    future_fullDataset,
                                    future_params, multithreading=True)
    # Done with the full dataset.

    # Build the resampled datasets.
    # indices_toDrop only exists when the method is exactly JK.
    if JK == resampling_method:
        indices_toDrop = JK_tools.indicesToDrop(df, Ngroups,
                                                randomize=randomize)
    jk_results = []
    futureData = []  # data to be sent in jk or bootstrap in galaxy space

    if (JK == resampling_method) or (BS == resampling_method):
        for j in range(Ngroups):  # submit data to the cluster
            # NOTE(review): the outer test uses equality while these two use
            # substring membership; confirm both agree for the constants.
            if JK in resampling_method:  # jackknife: drop group j
                dataJK = df.drop(indices_toDrop[j], inplace=False)
                futureData.append(client.scatter(dataJK))
            elif BS in resampling_method:  # bootstrap: resample rows
                dataBS = df.sample(len(df), replace=True)
                futureData.append(client.scatter(dataBS))
        # Now submit one calculation per resampled dataset.
        for j in range(Ngroups):
            jk_results.append(client.submit(pairwiser.get_pairwise_ksz,
                              futureData[j],
                              future_params, multithreading=True))

    if BS_PW == resampling_method:  # submit the same dataset
        # futureData is rebound from a list to one broadcast future here.
        futureData = client.scatter(df, broadcast=True)

        for j in range(Ngroups):
            # pure=False presumably keeps dask from treating the Ngroups
            # identical submissions as a single task -- confirm.
            jk_results.append(client.submit(bs_pw.get_bootstrap_pairwise,
                                            futureData,
                                            future_params,
                                            multithreading=True,
                                            pure=False))
    if resampling_method == BS_DT:
        for j in range(Ngroups):
            # Copy df and replace only its dT column with values picked at
            # len(df) randomly chosen row positions of the original column.
            df_bs = df.copy()
            choose = np.random.choice(len(df), len(df))
            df_bs['dT'] = df.dT.values[choose]
            futureData.append(client.scatter(df_bs))
        for j in range(Ngroups):
            jk_results.append(client.submit(pairwiser.get_pairwise_ksz,
                                            futureData[j],
                                            future_params,
                                            multithreading=True))

    if resampling_method == TL_JK:
        # df is rebound to the output of remove_edge_galaxies; the full
        # dataset calculation above was already submitted with the df as
        # passed in.
        tiled_JK.classify_grid(df)
        df = tiled_JK.remove_edge_galaxies(df, tol_sigma=1.5)
        # The number of realizations is the tile count, not Ngroups.
        Ntiles = tiled_JK.how_many_tiles(df)
        for j in range(Ntiles):
            df_tosubmit = tiled_JK.remove_tile(df, j)
            futureData.append(client.scatter(df_tosubmit))
        for j in range(Ntiles):
            jk_results.append(client.submit(pairwiser.get_pairwise_ksz,
                                            futureData[j],
                                            future_params,
                                            multithreading=True))
    # Extract results: wait for the full-dataset result and gather the rest.
    fullDataset_results = res_fullDataset.result()
    jk_results = client.gather(jk_results)
    # Only the client is closed; the cluster close below is commented out.
    client.close()
#    cluster.close()

    return fullDataset_results, jk_results