Example #1
0
def test_indicesToDrop():
    N = 20000
    a = np.random.random(N)
    df = pd.DataFrame({'a': a})
    groups = JK_tools.indicesToDrop(df, 4)
    assert len(groups) == 4
    assert len(groups[0]) == 5000
Example #2
0
def run_JK_local(df, params, randomize=True, multithreading=False):
    '''Receives the pandas df with objects with temp decrements and the
    parameter file object.

    Runs the ksz estimator and runs jackknifes.

    Everything runs locally, make sure you have requested the resources you
    are using.

    df: dataframe object with the variables for the calculation
    params: param file for this calculation
    NJK: how many subgroups for the run_JK
    '''
    print("Running a JK run on the local machine, this will take a while.")
    Ngroups = params.JK_NGROUPS
    fullDataset_results = pairwiser.get_pairwise_ksz(df,
                                                     params,
                                  multithreading=multithreading) # noqa
    indices_toDrop = JK_tools.indicesToDrop(df, Ngroups, randomize=randomize)
    jk_results = []
    for j in range(Ngroups):
        print "%i/%i" % (j, Ngroups)
        data_JK = df.drop(indices_toDrop[j], inplace=False)
        jk_results.append(pairwiser.get_pairwise_ksz(data_JK,
                                                     params,
                                    multithreading=multithreading)) # noqa
    return fullDataset_results, jk_results
Example #3
0
def run_JK_distributed(df, param, randomize=True):
    '''Receives the pandas dataframe with the objects containing the
    temperature decrements and the parameter object and run the kSZ
    statistic and generate Jack Knifes.
    Everything runs in the cluster, so current terminal does not need
    to request many cpus.

    df: dataframe object containing the variables for the calculation
    params: param file for this calculation
    NJK: how many subgroups we will make to run the calculation
    randomize: shuffle data before running the JK'''

    Ncores = envVars.Ncores
    NWorkers = envVars.NWorkers
    Ngroups = param.JK_NGROUPS
    resampling_method = param.JK_RESAMPLING_METHOD.lower()

    #setup cluster
    cluster = SGECluster(walltime='172800', processes=1, cores=1,
                         env_extra=['#$-pe sge_pe %i' % Ncores,
                                    '-l m_core=%i' % Ncores,
                                    'mkdir -p /tmp/pag227/dask/dask-scratch',
                                    'export NUMBA_NUM_THREADS=%i' % Ncores,
                                    'export OMP_NUM_THREADS=%i' % Ncores
#                                    'export OMP_NUM_THREADS=1',  # noqa
                                    ])
    cluster.scale(NWorkers)
    client = Client(cluster)
    time.sleep(30)
    #end setting up cluster

    #send full dataset to the cluster
    future_fullDataset = client.scatter(df)
    future_params = client.scatter(param)
    res_fullDataset = client.submit(pairwiser.get_pairwise_ksz,
                                    future_fullDataset,
                                    future_params, multithreading=True)
    #done with the full dataset

    #iterate over partial dataset for the JK
    if JK == resampling_method:
        indices_toDrop = JK_tools.indicesToDrop(df, Ngroups,
                                                randomize=randomize)
    jk_results = []
    futureData = []  #data to be sent in jk or bootstrap in galaxy space

    if (JK == resampling_method) or (BS == resampling_method):
        for j in range(Ngroups):  # submit data to the cluster
            if JK in resampling_method:  # if method jk
                dataJK = df.drop(indices_toDrop[j], inplace=False)
                futureData.append(client.scatter(dataJK))
            elif BS in resampling_method:
                dataBS = df.sample(len(df), replace=True)
                futureData.append(client.scatter(dataBS))
        #Now do the JK calculation
        for j in range(Ngroups):
            jk_results.append(client.submit(pairwiser.get_pairwise_ksz,
                              futureData[j],
                              future_params, multithreading=True))

    if BS_PW == resampling_method:  # submit the same dataset
        futureData = client.scatter(df, broadcast=True)

        for j in range(Ngroups):
            jk_results.append(client.submit(bs_pw.get_bootstrap_pairwise,
                                            futureData,
                                            future_params,
                                            multithreading=True,
                                            pure=False))
    if resampling_method == BS_DT:
        for j in range(Ngroups):
            df_bs = df.copy()
            choose = np.random.choice(len(df), len(df))
            df_bs['dT'] = df.dT.values[choose]
            futureData.append(client.scatter(df_bs))
        for j in range(Ngroups):
            jk_results.append(client.submit(pairwiser.get_pairwise_ksz,
                                            futureData[j],
                                            future_params,
                                            multithreading=True))

    if resampling_method == TL_JK:
        tiled_JK.classify_grid(df)
        df = tiled_JK.remove_edge_galaxies(df, tol_sigma=1.5)
        Ntiles = tiled_JK.how_many_tiles(df)
        for j in range(Ntiles):
            df_tosubmit = tiled_JK.remove_tile(df, j)
            futureData.append(client.scatter(df_tosubmit))
        for j in range(Ntiles):
            jk_results.append(client.submit(pairwiser.get_pairwise_ksz,
                                            futureData[j],
                                            future_params,
                                            multithreading=True))
    #extract results
    fullDataset_results = res_fullDataset.result()
    jk_results = client.gather(jk_results)
    client.close()
#    cluster.close()

    return fullDataset_results, jk_results