Example #1
def compute_core(arg_list):
    """
    call to be distributed onto the cluster
    """

    signal, method, fold_idx, train_idx, dev_idx, test_idx, grid_idx, grid_point = arg_list

    data = data_loader.load_all(signal)

    data_train = subset_mtl_data(data, train_idx)
    data_dev = subset_mtl_data(data, dev_idx)
    data_test = subset_mtl_data(data, test_idx)

    # train a model for this grid point and score both held-out splits
    model = method(cost=grid_point["cost"])
    model.train(data_train)

    out_test = model.predict(data_test)
    out_dev = model.predict(data_dev)

    ret = []
    # return a list of key-value pairs: the key identifies the
    # (task, fold, grid point) combination, the value holds the
    # dev and test performance
    for key in data_test:
        perf_prc_dev = helper.calcroc(out_dev[key], data_dev[key]["labels"])[0]
        perf_prc_test = helper.calcroc(out_test[key], data_test[key]["labels"])[0]
        reduce_key = (key, fold_idx, grid_idx)
        ret.append({reduce_key: {"prc_dev": perf_prc_dev, "prc_test": perf_prc_test}})

    return ret
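
compute_core leans on helpers defined elsewhere in the project (data_loader, helper.calcroc, subset_mtl_data). As a rough idea of what subset_mtl_data might do, here is a minimal sketch, assuming each task stores parallel "examples" and "labels" sequences and that each index object maps task names to index arrays (both assumptions, not the project's actual code):

def subset_mtl_data(data, idx):
    # Hypothetical sketch, not the project's actual helper: restrict
    # each task's examples and labels to the given indices. Assumes
    # idx maps each task/organism name to an index array, matching
    # the per-task sizes the splitter is built with in setup_splits.
    return {
        org: {
            "examples": [data[org]["examples"][i] for i in idx[org]],
            "labels": [data[org]["labels"][i] for i in idx[org]],
        }
        for org in data
    }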
Example #2
def setup_splits(signal, method_name, method, param, num_folds, test_size,
                 random_state):
    """
    splitting the example data into train/test/validation group 
    """

    data = data_loader.load_all(signal)
    # number of labeled examples per task/organism
    sizes = {org: len(data[org]["labels"]) for org in data}

    # set up the three-way (train/dev/test) splitting strategy
    kf = MultitaskShuffleSplitThreeWay(sizes,
                                       n_iter=num_folds,
                                       indices=True,
                                       test_size=test_size * 2,
                                       random_state=random_state)

    param_grid = list(ParameterGrid(param))
    argument_list = []

    for fold_idx, (train_idx, dev_idx, test_idx) in enumerate(kf):
        for grid_idx, grid_point in enumerate(param_grid):
            arg = [
                signal, method, fold_idx, train_idx, dev_idx, test_idx,
                grid_idx, grid_point
            ]
            argument_list.append(arg)

    local = False

    if method_name in ['union', 'individual']:
        # cluster resource requests (renamed from `param`, which would
        # otherwise shadow the grid-parameter argument above)
        cluster_param = {
            'vmem': '4gb',
            'pvmem': '4gb',
            'pmem': '4gb',
            'mem': '4gb',
            'ppn': '1',
            'nodes': '1',
            'walltime': '2:00:00'
        }
        intermediate_ret = pg.pg_map(compute_core,
                                     argument_list,
                                     param=cluster_param,
                                     local=local,
                                     maxNumThreads=1,
                                     mem="4gb")
    else:
        # guard against a NameError on intermediate_ret below
        raise ValueError("unsupported method_name: %s" % method_name)


    print "DONE with computation"

    # flatten the per-job lists of key-value pairs, then reduce
    flat_intermediate = list(chain.from_iterable(intermediate_ret))
    perf_dev, perf_test = reduce_result(flat_intermediate)

    print "DONE reducing"

    return perf_dev, perf_test
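
reduce_result is also defined elsewhere in the project; given the pairs compute_core emits, a minimal sketch of the reduce step could look like this (the flat dict-of-tuples return layout is an assumption):

def reduce_result(flat_intermediate):
    # Hypothetical sketch, assuming the return layout: collect the
    # single-entry dicts emitted by compute_core into two flat dicts
    # keyed by (task, fold_idx, grid_idx).
    perf_dev = {}
    perf_test = {}
    for pair in flat_intermediate:
        for reduce_key, perf in pair.items():
            perf_dev[reduce_key] = perf["prc_dev"]
            perf_test[reduce_key] = perf["prc_test"]
    return perf_dev, perf_test

With that layout, model selection amounts to picking, per task and fold, the grid_idx with the best dev performance and reporting the matching test value.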