Example #1
def k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, cat_dissim,
                 gamma, init, n_init, verbose, random_state, n_jobs):
    """k-prototypes algorithm"""
    random_state = check_random_state(random_state)
    if sparse.issparse(X):
        raise TypeError("k-prototypes does not support sparse data.")

    if categorical is None or not categorical:
        raise NotImplementedError(
            "No categorical data selected, effectively doing k-means. "
            "Present a list of categorical columns, or use scikit-learn's "
            "KMeans instead.")
    if isinstance(categorical, int):
        categorical = [categorical]
    assert len(categorical) != X.shape[1], \
        "All columns are categorical, use k-modes instead of k-prototypes."
    assert max(categorical) < X.shape[1], \
        "Categorical index larger than number of columns."

    ncatattrs = len(categorical)
    nnumattrs = X.shape[1] - ncatattrs
    n_points = X.shape[0]
    assert n_clusters <= n_points, "Cannot have more clusters ({}) " \
                                   "than data points ({}).".format(n_clusters, n_points)

    Xnum, Xcat = _split_num_cat(X, categorical)
    Xnum, Xcat = check_array(Xnum), check_array(Xcat, dtype=None)

    # Convert the categorical values in Xcat to integers for speed.
    # Based on the unique values in Xcat, we can make a mapping to achieve this.
    Xcat, enc_map = encode_features(Xcat)

    # Are there more n_clusters than unique rows? Then set the unique
    # rows as initial values and skip iteration.
    unique = get_unique_rows(X)
    n_unique = unique.shape[0]
    if n_unique <= n_clusters:
        max_iter = 0
        n_init = 1
        n_clusters = n_unique
        init = list(_split_num_cat(unique, categorical))
        init[1], _ = encode_features(init[1], enc_map)

    # Estimate a good value for gamma, which determines the weighing of
    # categorical values in clusters (see Huang [1997]).
    if gamma is None:
        gamma = 0.5 * Xnum.std()  # .std() gives the standard deviation over the whole numeric matrix
        # print(gamma)
    results = []
    seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
    if n_jobs == 1:
        for init_no in range(n_init):
            results.append(
                k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs,
                                    n_clusters, n_points, max_iter, num_dissim,
                                    cat_dissim, gamma, init, init_no, verbose,
                                    seeds[init_no]))
    else:
        results = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(k_prototypes_single)
            (Xnum, Xcat, nnumattrs, ncatattrs, n_clusters, n_points, max_iter,
             num_dissim, cat_dissim, gamma, init, init_no, verbose, seed)
            for init_no, seed in enumerate(seeds))
    all_centroids, all_labels, all_costs, all_n_iters, all_epoch_costs = zip(
        *results)

    best = np.argmin(all_costs)
    if n_init > 1 and verbose:
        print("Best run was number {}".format(best + 1))

    # Note: return gamma in case it was automatically determined.
    return all_centroids[best], enc_map, all_labels[best], all_costs[best], \
        all_n_iters[best], all_epoch_costs[best], gamma
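This looks like the internal driver from the kmodes package. Assuming that, a minimal usage sketch goes through the public KPrototypes estimator, which forwards to this function (the toy data and column indices below are illustrative, not from the source):

import numpy as np
from kmodes.kprototypes import KPrototypes

# Mixed data: column 0 is numeric, columns 1 and 2 are categorical.
X = np.array([[1.0, 'a', 'x'],
              [2.0, 'a', 'y'],
              [8.0, 'b', 'y'],
              [9.0, 'b', 'x']], dtype=object)

kproto = KPrototypes(n_clusters=2, init='Cao', n_init=5, n_jobs=1, random_state=0)
labels = kproto.fit_predict(X, categorical=[1, 2])
print(labels)  # one cluster label per row; gamma defaults to 0.5 * std of the numeric part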
Example #2
    stream3c[1].stats['asdf'] = a2
    stream3c[2].stats['asdf'] = a3
    stream3c.trim2(-25, 75, 'onset')
    return stream3c


print "Lets start the show..."
data = read_rf('DATA/7X-event_waveforms_for_rf.h5', 'H5')
print "Data in..."
'''
# we can exclude bad stations
inc_set = list(set([tr.stats.inclination for tr in data]))
data_filtered = RFStream([tr for tr in data if tr.stats.inclination in inc_set and tr.stats.station not in ['MIJ2', 'MIL2']])
'''

stream = RFStream()

rf_streams = Parallel(n_jobs=-1,
                      verbose=1)(map(delayed(do_rf),
                                     IterMultipleComponents(data, 'onset', 3)))

for i, rf in enumerate(rf_streams):
    event_id = {'event_id': 0}
    event_id['event_id'] = i
    for tr in rf:
        tr.stats.update(event_id)
    stream.extend(rf)

stream.write('DATA/7X-rf_zrt', 'H5')
print "No worries, mate..."
Example #3
    def make_dataset_scada(self):
        X = dict()

        for project in self.projects:
            X[project['_id']] = pd.DataFrame()

        if self.isfortest:
            file_nwp = 'weather_data_test.csv'
        else:
            file_nwp = 'weather_data.csv'
        if not os.path.exists(
                os.path.join(self.projects[0]['static_data']['path_data'],
                             file_nwp)):

            lats, longs = self.lats_longs()

            nwp = self.stack_daily_nwps(self.dates[-1], self.data, lats, longs,
                                        self.path_nwp, self.nwp_model,
                                        self.projects, self.variables,
                                        self.compress)
            nwp_daily = Parallel(n_jobs=self.njobs)(
                delayed(self.stack_daily_nwps)(
                    t, self.data, lats, longs, self.path_nwp, self.nwp_model,
                    self.projects, self.variables, self.compress)
                for t in self.dates)

            for nwp in nwp_daily:
                for project in self.projects:
                    if nwp[0][project['_id']].shape[0] != 0:
                        X[project['_id']] = pd.concat(
                            [X[project['_id']], nwp[0][project['_id']]])

                        self.logger.info('All Inputs stacked for date %s',
                                         nwp[1])
            for project in self.projects:
                X[project['_id']].to_csv(
                    os.path.join(project['static_data']['path_data'],
                                 file_nwp))
        else:
            for project in self.projects:
                X[project['_id']] = pd.read_csv(os.path.join(
                    project['static_data']['path_data'], file_nwp),
                                                header=0,
                                                index_col=0,
                                                parse_dates=True,
                                                dayfirst=True)
        for project in self.projects:
            data_path = project['static_data']['path_data']

            if self.isfortest:
                dataset_X, dataset_y, X_3d = self.create_dataset(
                    X[project['_id']],
                    data_path,
                    start_index=9001,
                    test=self.isfortest)
                if dataset_y.isna().any().values[0]:
                    dataset_X = dataset_X.drop(dataset_y.index[np.where(
                        dataset_y.isna())[0]])

                    if len(X_3d.shape) > 1:
                        X_3d = np.delete(X_3d,
                                         np.where(dataset_y.isna())[0],
                                         axis=0)
                    dataset_y = dataset_y.drop(dataset_y.index[np.where(
                        dataset_y.isna())[0]])
                dataset_X.to_csv(
                    os.path.join(project['static_data']['path_data'],
                                 'dataset_X_test.csv'))
                dataset_y.to_csv(
                    os.path.join(project['static_data']['path_data'],
                                 'dataset_y_test.csv'))
                joblib.dump(
                    X_3d,
                    os.path.join(project['static_data']['path_data'],
                                 'dataset_lstm_test.pickle'))
                self.logger.info('Datasets saved for project %s',
                                 project['_id'])
            else:
                dataset_X, dataset_y, X_3d = self.create_dataset(
                    X[project['_id']],
                    data_path,
                    start_index=9001,
                    test=self.isfortest)
                if dataset_y.isna().any().values[0]:
                    dataset_X = dataset_X.drop(dataset_y.index[np.where(
                        dataset_y.isna())[0]])

                    if len(X_3d.shape) > 1:
                        X_3d = np.delete(X_3d,
                                         np.where(dataset_y.isna())[0],
                                         axis=0)
                    dataset_y = dataset_y.drop(dataset_y.index[np.where(
                        dataset_y.isna())[0]])
                dataset_X.to_csv(
                    os.path.join(project['static_data']['path_data'],
                                 'dataset_X.csv'))
                dataset_y.to_csv(
                    os.path.join(project['static_data']['path_data'],
                                 'dataset_y.csv'))
                joblib.dump(
                    X_3d,
                    os.path.join(project['static_data']['path_data'],
                                 'dataset_lstm.pickle'))
                self.logger.info('Datasets saved for project %s',
                                 project['_id'])
Example #4
        opfname = f.replace('.json', '.WL' + str(h))
    else:
        opfname = f.replace('.gexf', '.WL' + str(h))

    subgraph2vec_sentences = get_graph_as_bow(g, h)
    with open(opfname, 'w') as fh:
        for w in subgraph2vec_sentences:
            print(w, file=fh)

    logging.debug('dumped wlk file in {} sec'.format(round(time() - T0, 2)))


if __name__ == '__main__':
    # if sys.argv[1] in ['-h','--help']:
    #     print 'command line args: <gexf/json graph_dir> <height of WL kernel> <num of cpu cores for multi-processing>'
    #     exit (0)

    graph_dir = "/home/annamalai/OLMD/OLMD/MKLDroid/tmp/amd_dataset_graphs_wlfiles/adgs"  #folder containing the graph's gexf/json format files
    h = 2  # height of WL kernel (i.e., degree of neighbourhood to consider)
    n_cpus = 36  # number of cpus to be used for multiprocessing
    extn = '.gexf'

    files_to_process = get_files(dirname=graph_dir, extn=extn)
    print(files_to_process)
    input(
        'have to process a total of {} files with {} parallel processes... hit Enter to proceed...'
        .format(len(files_to_process), n_cpus))

    Parallel(n_jobs=n_cpus)(delayed(dump_subgraph2vec_sentences)(f, h)
                            for f in files_to_process)
Example #5
def parallelNelderMead(
    objFunc,
    guess,
    perturb=None,
    P=1,
    ftol=0.000001,
    xtol=0.00000001,
    maxiter=np.inf,
    maxeval=np.inf,
    r_param=1.0,
    e_param=1.0,
    c_param=0.5,
    s_param=0.5,
    maxthreads=None,
    name=None,
    resume=False,
    savefreq=None,
    verbose=1,
):
    """

    A parallel implementation of the Nelder-Mead minimization algorithm, as
    described in Lee and Wiswall.  For long optimization procedures, it can
    save progress between iterations and resume later.
    
    Parameters
    ----------
    objFunc : function
        The objective function to be minimized. Takes a single 1D array as input.
    guess : np.array
        Initial starting point for the simplex, representing an input for objFunc.
    perturb : np.array
        Perturbation vector for the simplex, of the same length as an input to
        objFunc.  If perturb[j] is non-zero, a simplex point will be created
        that perturbs the j-th element of guess by perturb[j]; if it is zero,
        then the j-th parameter of objFunc will not be optimized over.  By
        default, perturb=None, indicating that all parameters should be optimized,
        with an initial perturbation of 0.1*guess.
    P : int
        Degree of parallelization: the number of vertices of the simplex to try
        to update on each iteration of the process.
    ftol : float
        Absolute tolerance of the objective function for convergence.  If suc-
        cessive iterations return minimum function values that differ by less
        than ftol, the process terminates successfully.
    xtol : float
        Absolute tolerance of the input values for convergence.  If the maximum
        distance between the current minimum point and the worst point in the
        simplex is less than xtol, then the process terminates successfully.
    maxiter : int
        Maximum number of Nelder-Mead iterations; reaching iters=maxiter is
        reported as an "unsuccessful" minimization.
    maxeval : int
        Maximum number of evaluations of objFunc (across all processes); reaching
        evals=maxeval is reported as an "unsuccessful" minimization.
    r_param: float
        Parameter indicating magnitude of the reflection point calculation.
    e_param: float
        Parameter indicating magnitude of the expansion point calculation.
    c_param: float
        Parameter indicating magnitude of the contraction point calculation.
    s_param: float
        Parameter indicating magnitude of the shrink calculation.
    maxthreads : int
        The maximum number of CPU cores that the optimization should use,
        regardless of the size of the problem.
    name : string
        A filename for (optionally) saving the progress of the Nelder-Mead search,
        and for resuming a previous search (when resume=True).  Useful for long
        searches that could potentially be interrupted by computer down time.
    resume : boolean
        An indicator for whether the search should resume from earlier progress.
        When True, the process will load a progress file named in input name.
    savefreq : int
        When not None, search progress will be saved to name.txt every savefreq
        iterations, to be loaded later with resume=True).
    verbose : int
        Indicator for the verbosity of the optimization routine.  Higher values
        generate more text output; verbose=0 produces no text output.
        
    Returns
    -------
    min_point : np.array
        The input that minimizes objFunc, as found by the minimization.
    fmin : float
        The minimum of objFunc; fmin = objFunc(min_point).
    """
    # If this is a resumed search, load the data
    if resume:
        simplex, fvals, iters, evals = loadNelderMeadData(name)
        dim_count = fvals.size - 1
        N = dim_count + 1  # Number of points in simplex
        K = simplex.shape[1]  # Total number of parameters

    # Otherwise, construct the initial simplex and array of function values
    else:
        if perturb is None:  # Default: perturb each parameter by 10%
            perturb = 0.1 * guess
            guess[guess == 0] = 0.1

        params_to_opt = np.where(
            perturb != 0)[0]  # Indices of which parameters to optimize
        dim_count = params_to_opt.size  # Number of parameters to search over
        N = dim_count + 1  # Number of points in simplex
        K = guess.size  # Total number of parameters
        simplex = np.tile(guess, (N, 1))
        for j in range(
                dim_count
        ):  # Perturb each parameter to optimize by the specified distance
            simplex[j + 1,
                    params_to_opt[j]] = (simplex[j + 1, params_to_opt[j]] +
                                         perturb[params_to_opt[j]])

        # Initialize iteration and evaluation counts, plus a 1D array of function values
        fvals = np.zeros(dim_count + 1) + np.nan

        iters = 0
        evals = 0

    # Make sure degree of parallelization is not illegal
    if P > N - 1:
        print("Requested degree of simplex parallelization is " + str(P) +
              ", but dimension of optimization problem is only " + str(N - 1) +
              ".")
        print("Degree of parallelization has been reduced to " + str(N - 1) +
              ".")
        P = N - 1

    # Create the pool of worker processes
    cpu_cores = multiprocessing.cpu_count(
    )  # Total number of available CPU cores
    cores_to_use = min(cpu_cores, dim_count)
    if maxthreads is not None:  # Cap the number of cores if desired
        cores_to_use = min(cores_to_use, maxthreads)
    parallel = Parallel(n_jobs=cores_to_use)

    # Begin a new Nelder-Mead search
    if not resume:
        temp_simplex = list(simplex)  # Evaluate the initial simplex
        fvals = np.array(
            parallel(delayed(objFunc)(params) for params in temp_simplex))
        evals += N
        # Reorder the initial simplex
        order = np.argsort(fvals)
        fvals = fvals[order]
        simplex = simplex[order, :]
        fmin = fvals[0]
        f_dist = np.abs(fmin - fvals[-1])
        x_dist = np.max(
            np.sqrt(
                np.sum((simplex - np.tile(simplex[0, :], (N, 1)))**2.0,
                       axis=1)))
        if verbose > 0:
            print("Evaluated the initial simplex: fmin=" + str(fmin) +
                  ", f_dist=" + str(f_dist) + ", x_dist=" + str(x_dist))
        if savefreq is not None:
            saveNelderMeadData(name, simplex, fvals, iters, evals)
            if verbose > 0:
                print("Saved search progress in " + name + ".txt")
    else:  # Resume an existing search that was cut short
        if verbose > 0:
            print("Resuming search after " + str(iters) + " iterations and " +
                  str(evals) + " function evaluations.")

    # Initialize some inputs for the multithreader
    j_list = range(N - P, N)
    opt_params = [r_param, c_param, e_param]

    # Run the Nelder-Mead algorithm until a terminal condition is met
    go = True
    while go:
        t_start = time()
        iters += 1
        if verbose > 0:
            print("Beginning iteration #" + str(iters) + " now.")

        # Update the P worst points of the simplex
        output = parallel(
            delayed(parallelNelderMeadWorker)(objFunc, simplex, fvals, j, P,
                                              opt_params) for j in j_list)
        new_subsimplex = np.zeros((P, K)) + np.nan
        new_vals = np.zeros(P) + np.nan
        new_evals = 0
        for i in range(P):
            new_subsimplex[i, :] = output[i][0]
            new_vals[i] = output[i][1]
            new_evals += output[i][2]
        evals += new_evals

        # Check whether any updates actually happened
        old_subsimplex = simplex[(N - P):N, :]
        if np.max(np.abs(new_subsimplex - old_subsimplex)) == 0:
            if verbose > 0:
                print("Updated the simplex, but must perform a shrink step.")
            # If every attempted update was unsuccessful, must shrink the simplex
            simplex = (s_param * np.tile(simplex[0, :],
                                         (N, 1)) + (1.0 - s_param) * simplex)
            temp_simplex = list(simplex[1:N, :])
            fvals = np.array([fvals[0]] + parallel(
                delayed(objFunc)(params) for params in temp_simplex))
            new_evals += N - 1
            evals += N - 1
        else:
            if verbose > 0:
                print("Updated the simplex successfully.")
            # Otherwise, update the simplex with the new results
            simplex[(N - P):N, :] = new_subsimplex
            fvals[(N - P):N] = new_vals

        # Reorder the simplex from best to worst
        order = np.argsort(fvals)
        fvals = fvals[order]
        simplex = simplex[order, :]
        fmin = fvals[0]
        f_dist = np.abs(fmin - fvals[-1])
        x_dist = np.max(
            np.sqrt(
                np.sum((simplex - np.tile(simplex[0, :], (N, 1)))**2.0,
                       axis=1)))
        t_end = time()
        if verbose > 0:
            t_iter = t_end - t_start
            print("Finished iteration #" + str(iters) + " with " +
                  str(new_evals) + " evaluations (" + str(evals) +
                  " cumulative) in " + str(t_iter) + " seconds.")
            print("Simplex status: fmin=" + str(fmin) + ", f_dist=" +
                  str(f_dist) + ", x_dist=" + str(x_dist))

        # Check for terminal conditions
        if iters >= maxiter:
            go = False
            print("Maximum iterations reached, terminating unsuccessfully.")
        if evals >= maxeval:
            go = False
            print("Maximum evaluations reached, terminating unsuccessfully.")
        if f_dist < ftol:
            go = False
            print("Function tolerance reached, terminating successfully.")
        if x_dist < xtol:
            go = False
            print("Parameter tolerance reached, terminating successfully.")

        # Save the progress of the estimation if desired
        if savefreq is not None:
            if (iters % savefreq) == 0:
                saveNelderMeadData(name, simplex, fvals, iters, evals)
                if verbose > 0:
                    print("Saved search progress in " + name + ".txt")

    # Return the results
    xopt = simplex[0, :]
    return xopt, fmin
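The core of the parallel scheme above is evaluating several simplex vertices at once with joblib and then re-sorting. A self-contained sketch of that step with a toy objective (the sphere function and the 0.1 perturbation are illustrative, not from the source):

import numpy as np
from joblib import Parallel, delayed

def sphere(x):
    # Toy objective with its minimum at the origin.
    return float(np.sum(x ** 2))

guess = np.array([1.0, -2.0, 0.5])
# One vertex per row: the guess plus one perturbed copy per parameter.
simplex = np.vstack([guess] + [guess + 0.1 * np.eye(3)[j] for j in range(3)])

# Evaluate every vertex in parallel, mirroring the fvals computation above.
fvals = np.array(Parallel(n_jobs=2)(delayed(sphere)(v) for v in simplex))
order = np.argsort(fvals)
simplex, fvals = simplex[order], fvals[order]  # best vertex first
print(fvals)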
Example #6
clistfile_h.close()
# print(cfile_lines[0])
cline_basename = cfile_lines[0]
cline_path = os.path.join(preclinedir, cline_basename)
if not os.path.isfile(cline_path):
    print("cannot find", cline_path)
    sys.exit(-1)
clinedata, header = nrrd.read(cline_path)
clines = get_center_lines(clinedata, point_cnt=500)

slistfile_h = open(slistfile, "r")
sfile_lines = slistfile_h.readlines()
sfile_lines = [line.rstrip() for line in sfile_lines]
slistfile_h.close()

# if progress:
#     bar = Bar('Processing', max=len(sfile_lines))

if parallel:
    # Parallel(n_jobs=n_jobs, backend="multiprocessing", require='sharedmem')(
    #     delayed(crop_along_cline)(sfilename) for sfilename in sfile_lines)
    Parallel(n_jobs=n_jobs,
             require='sharedmem')(delayed(crop_along_cline)(sfilename)
                                  for sfilename in sfile_lines)
else:
    for sfilename in sfile_lines:
        crop_along_cline(sfilename)

# if progress:
#     bar.finish()
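require='sharedmem' forces joblib onto a thread-based backend so that the workers can mutate shared state in place, which is presumably what crop_along_cline does here. The same pattern in isolation, with a made-up worker:

from joblib import Parallel, delayed

results = []  # shared state, mutated by every worker

def crop_stub(name):
    # Hypothetical stand-in for crop_along_cline.
    results.append(name.upper())

Parallel(n_jobs=4, require='sharedmem')(
    delayed(crop_stub)(name) for name in ['a.nrrd', 'b.nrrd', 'c.nrrd'])
print(results)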
Example #7
def main():
    data_path = '/mnt/nvme102/alex/sat/any/data'

    log_dir = '/mnt/nvme102/alex/sat/any/logs2022/only_sat'

    batch_sz = 250
    epochs = 30
    seed = 42
    n_vars = 10
    n_ops = 55

    use_cuda = True

    data_debug = False

    print('loading train data')
    train_dataset = TreeFormulasDataset(os.path.join(data_path, 'train.txt'),
                                        n_ops,
                                        n_vars,
                                        data_debug,
                                        only_sat=True)
    print('loading test data')
    test_dataset = TreeFormulasDataset(os.path.join(data_path, 'test.txt'),
                                       n_ops, n_vars, data_debug)
    print('loading validation data')
    validation_dataset = TreeFormulasDataset(
        os.path.join(data_path, 'validation.txt'), n_ops, n_vars, data_debug)

    experiment_target = 'steps_dim_cl'

    dim_options = [  # 1, 2, 4, 8, 16, 32,
        16
    ]
    cl_options = [  # 1, 2, 3,
        5, 6, 7
    ]  # 5=min, 6=prod,7=luk
    rnn_steps_options = [  # 0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50
        10
    ]

    experiments = itertools.product(dim_options, cl_options, rnn_steps_options)

    opt = 'adam'
    lr = 0.005 if opt == 'sgd' else 0.0005
    momentum = 0.75
    nesterov = False
    wdecay = 1e-4
    eps = 1e-8
    grad_clip = 0.2

    n_gpus = 1

    gpus = ['cuda:{}'.format(i) for i in range(n_gpus)]

    works = []

    skip_e = None

    for i, e in enumerate(experiments):
        if skip_e is not None:
            if e == skip_e:
                skip_e = None
            else:
                continue
        works.append((i, e))

    works_by_gpus = [(gpus[i], works[i::len(gpus)]) for i in range(len(gpus))]
    wlist = [
        delayed(list_worker)(gpu, wks, seed, opt, epochs, batch_sz, lr, wdecay,
                             nesterov, train_dataset, test_dataset,
                             validation_dataset, momentum, eps, log_dir,
                             grad_clip) for gpu, wks in works_by_gpus
    ]

    Parallel(len(wlist), 'threading')(wlist)
Example #8
        f"{args.feature_dir}/forecasting_features_{args.mode}.pkl")


if __name__ == "__main__":
    """Load sequences and save the computed features."""
    args = parse_arguments()

    start = time.time()

    map_features_utils_instance = MapFeaturesUtils()
    social_features_utils_instance = SocialFeaturesUtils()

    sequences = os.listdir(args.data_dir)
    temp_save_dir = tempfile.mkdtemp()

    num_sequences = _FEATURES_SMALL_SIZE if args.small else len(sequences)

    Parallel(n_jobs=-2)(delayed(load_seq_save_features)(
        i,
        sequences,
        temp_save_dir,
        map_features_utils_instance,
        social_features_utils_instance,
    ) for i in range(0, num_sequences, args.batch_size))
    merge_saved_features(temp_save_dir)
    shutil.rmtree(temp_save_dir)

    print(
        f"Feature computation for {args.mode} set completed in {(time.time()-start)/60.0} mins"
    )
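Note the n_jobs=-2 above: negative values count back from the total core count, so -1 uses every core and -2 uses all but one. A trivial illustration:

from joblib import Parallel, delayed

# n_jobs=-2 leaves one CPU core free for the rest of the system.
squares = Parallel(n_jobs=-2)(delayed(pow)(i, 2) for i in range(8))
print(squares)  # [0, 1, 4, 9, 16, 25, 36, 49]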
Example #9
                'shape_i': (data_i.shape[0], ys_i[i + 1] - ys_i[i],
                            xs_i[j + 1] - xs_i[j]),
                'shape_m': (data_m.shape[0], ys_m[i + 1] - ys_m[i],
                            xs_m[j + 1] - xs_m[j]),
                #'shape_p': (data_p.shape[0], ys_p[i+1] - ys_p[i], xs_p[j+1] - xs_p[j])
            }

            save_pickle('cache/meta/%s_%d_%d.pickle' % (loc, i, j), meta)

    write_location_images(loc, data_i, xs_i, ys_i, 'I')
    write_location_images(loc, data_m, xs_m, ys_m, 'M')
    #write_location_images(loc, data_p, xs_p, ys_p, 'P')

    write_location_images(loc, normalize(data_m), xs_m, ys_m,
                          'MN')  # Write location-normalized M channels

    write_location_images(loc, compute_filters(data_i), xs_i, ys_i, 'IF')
    write_location_images(loc, compute_indices(data_m), xs_m, ys_m, 'MI')

    #data_a, xs_a, ys_a = read_location_images(loc, 'sixteen_band', 'A', resize_to='shape_m')

    #write_location_images(loc, data_a, xs_a, ys_a, 'A')


print "Preparing image data..."

# Prepare locations
Parallel(n_jobs=2)(delayed(prepare_location)(loc) for loc in locations)

print "Done."
Example #10
    for arg in sys.argv[1:]:
        data = vtkio.getBlockByName(reader.GetOutput(), arg)
        merger.AddInputData(data)
    merger.Update()
    ds = dsa.WrapDataObject(merger.GetOutput())

    times = []
        
    for i in range(reader.GetTimeSets().GetNumberOfItems()):
        array = reader.GetTimeSets().GetItem(i)
        for j in range(array.GetNumberOfTuples()):
            times.append(array.GetComponent(j, 0))
    eigvalues = []
    timecoefficients = []
    del merger, ds, reader
    snaps = Parallel(n_jobs=6, max_nbytes=1e9, verbose=30)(delayed(compute_snapshot)(files[0], time, sys.argv[1:]) for time in times[1:])
    #pdb.set_trace()
    N = len(times)-1
    np.savez('cache_snapshots.npz', snaps=snaps)
    fft_values = np.empty((snaps[0].shape[0], N//2))
    xf = fftfreq(N, 0.001)
    snapshots = np.empty((snaps[0].shape[0], N))
    for i, snap in enumerate(snaps):
        snapshots[:,i] = snap
    ffts = Parallel(n_jobs=6, max_nbytes=1e9, verbose=30, prefer='threads')(delayed(fft)(snapshots[i,:]) for i in range(snapshots.shape[0]))
    for i, snap in enumerate(ffts):
        fft_values[i, :] = 2.0 / N * np.abs(snap[:N//2])
    np.savez(cacheFile, fft_values=fft_values, xf=xf)

else:
    data = np.load(cacheFile)
Example #11
def node2vec(
    G,
    dimensions=128,
    walk_length=80,
    num_walks=10,
    p=1.0,
    q=1.0,
    weight_key=None,
    workers=None,
    **skip_gram_params,
):
    """Graph embedding via Node2Vec.

    Parameters
    ----------
    G : easygraph.Graph or easygraph.DiGraph

    dimensions : int
        Embedding dimensions, optional(default: 128)

    walk_length : int
        Number of nodes in each walk, optional(default: 80)

    num_walks : int
        Number of walks per node, optional(default: 10)

    p : float
        The return hyper parameter, optional(default: 1.0)

    q : float
        The in-out hyper parameter, optional(default: 1.0)

    weight_key : string or None (default: None)
        On weighted graphs, this is the key for the weight attribute

    workers : int or None, optional(default : None)
        The number of parallel workers used to generate random walks. If None,
        the walks are generated in a single process.

    skip_gram_params : dict
        Parameters for gensim.models.Word2Vec - do not supply 'size', it is taken from the 'dimensions' parameter

    Returns
    -------
    embedding_vector : dict
        The embedding vector of each node

    most_similar_nodes_of_node : dict
        The most similar nodes of each node and its similarity

    Examples
    --------

    >>> node2vec(G,
    ...          dimensions=128, # The graph embedding dimensions.
    ...          walk_length=80, # Walk length of each random walks.
    ...          num_walks=10, # Number of random walks.
    ...          p=1.0, # The `p` possibility in random walk in [1]_
    ...          q=1.0, # The `q` possibility in random walk in [1]_
    ...          weight_key='weight',
    ...          skip_gram_params=dict( # The skip_gram parameters in Python package gensim.
    ...          window=10,
    ...             min_count=1,
    ...             batch_words=4
    ...          ))

    References
    ----------
    .. [1] https://arxiv.org/abs/1607.00653

    """
    G_index, index_of_node, node_of_index = G.to_index_node_graph()

    if workers is None:
        walks = simulate_walks(
            G_index,
            walk_length=walk_length,
            num_walks=num_walks,
            p=p,
            q=q,
            weight_key=weight_key,
        )
    else:
        from joblib import Parallel
        from joblib import delayed

        num_walks_lists = np.array_split(range(num_walks), workers)
        walks = Parallel(n_jobs=workers)(delayed(simulate_walks)(
            G_index, walk_length, len(num_walks), p, q, weight_key)
                                         for num_walks in num_walks_lists)
        # Change multidimensional array to one dimensional array
        walks = [walk for walk_group in walks for walk in walk_group]

    model = learn_embeddings(walks=walks,
                             dimensions=dimensions,
                             **skip_gram_params)

    (
        embedding_vector,
        most_similar_nodes_of_node,
    ) = _get_embedding_result_from_gensim_skipgram_model(
        G=G,
        index_of_node=index_of_node,
        node_of_index=node_of_index,
        model=model)

    del G_index
    return embedding_vector, most_similar_nodes_of_node
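The workers branch splits the requested number of walks across processes with np.array_split and flattens the per-worker results. The splitting pattern on its own, with a stub in place of the real simulate_walks:

import numpy as np
from joblib import Parallel, delayed

def simulate_walks_stub(walk_length, num_walks):
    # Stand-in for simulate_walks: returns num_walks dummy walks.
    return [[0] * walk_length for _ in range(num_walks)]

workers, total_walks = 4, 10
chunks = np.array_split(range(total_walks), workers)
walks = Parallel(n_jobs=workers)(
    delayed(simulate_walks_stub)(80, len(chunk)) for chunk in chunks)
walks = [walk for group in walks for walk in group]  # flatten per-worker lists
print(len(walks))  # 10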
Example #12
    def classify(self, sequences_fname: str, verbose=False) -> List[SingleResult]:
        """Perform a two-step classification.

        Parameters
        ----------
            sequences_fname : a path to fasta file to classify

        Returns
        -------
            predictions: a list of lists containing SingleResult objects.
        """
        with open(sequences_fname, "r") as sequences_handle:
            seqs = list(SimpleFastaParser(sequences_handle))
        # seqs = [(desc, "".join([l for l in seq.upper() if l in allowed_letters])) for desc, seq in seqs]
        seqs = [x for x in seqs if len(x[1]) >= self.min_len]

        do = delayed(fun)
        executor = Parallel(n_jobs=self.threads)
        tasks = (
            do(x[1], 0, self.predictors[0].transformer, self.params[0]["fragment_len"])
            for x in seqs
        )
        cont_manager = (
            time_context_manager("Calculating first stage sequence representations")
            if verbose
            else suppress()
        )
        with cont_manager:
            seqs = list(
                zip([x[0] for x in seqs], [x[1] for x in seqs], executor(tasks))
            )

        # Two-step classification
        if verbose:
            print("Performing first stage of classification.")
            fst_stage_results = []
            for seq in tqdm(seqs):
                fst_stage_results.append(self.predictors[0].make_prediction(seq))
        else:
            # tasks = (do(seq) for seq in seqs)
            # fst_stage_results = executor(tasks)
            fst_stage_results = [
                self.predictors[0].make_prediction(seq) for seq in seqs
            ]
        if verbose:
            print("Done")
        predictions = []
        to_second_stage = []
        for prediction in fst_stage_results:
            if prediction.cls[0] == "organelle":
                to_second_stage.append(prediction)
            else:
                predictions.append(prediction)
        if to_second_stage:
            tasks = (
                do(
                    record.seq,
                    1,
                    self.predictors[1].transformer,
                    self.params[1]["fragment_len"],
                )
                for record in to_second_stage
            )
            cont_manager = (
                time_context_manager(
                    "Calculating second stage sequence representations"
                )
                if verbose
                else suppress()
            )
            with cont_manager:
                seqs2 = list(
                    zip(
                        [record.desc for record in to_second_stage],
                        [record.seq for record in to_second_stage],
                        executor(tasks),
                    )
                )
            if verbose:
                print("Performing second stage of classification.")
                snd_stage_results = []
                for seq in tqdm(seqs2):
                    snd_stage_results.append(self.predictors[1].make_prediction(seq))
            else:
                # tasks = (do_second_stage(seq) for seq in seqs2)
                # snd_stage_results = executor(tasks)
                snd_stage_results = [
                    self.predictors[1].make_prediction(seq) for seq in seqs2
                ]
            for fst, snd in zip(to_second_stage, snd_stage_results):
                assert fst.desc == snd.desc, "Descriptions not the same"
                assert fst.seq == snd.seq, "Sequences not the same"
                predictions.append(
                    SingleResult(
                        desc=fst.desc,
                        seq=fst.seq,
                        cls=[fst.cls[0], snd.cls[1]],
                        probs=[fst.probs[0], snd.probs[1]],
                    )
                )
        return predictions
Example #13
import pandas as pd
import numpy as np
from utils import particle
from joblib import delayed, Parallel
from tqdm import tqdm

part_dir = '../output/pipeline/particles/parts_filtered.csv'
savedir = '../output/pipeline/particles/parts_allframesimputed.csv'

parts = pd.read_csv(part_dir)
# forward-fill particle coordinates for missing frames
coords_complete = Parallel(n_jobs=12)(
    delayed(particle.impute_coords)(coords_df)
    for _, coords_df in tqdm(parts.groupby(['roi', 'mov_name'])))
coords_complete = pd.concat(coords_complete, ignore_index=True)
# add the rest of the columns back; new rows get NaNs in all of these
coords_complete = pd.merge(coords_complete,
                           parts,
                           on=list(coords_complete.columns),
                           how='outer')
coords_complete.to_csv(savedir, index=False)
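particle.impute_coords is applied to each (roi, mov_name) group in parallel and the pieces are concatenated afterwards. A minimal sketch of that per-group pattern, with a hypothetical stand-in for the imputation function:

import pandas as pd
from joblib import Parallel, delayed

def impute_coords_stub(df):
    # Hypothetical stand-in: insert missing frames and forward-fill the coordinates.
    full = pd.RangeIndex(int(df['frame'].min()), int(df['frame'].max()) + 1, name='frame')
    return df.set_index('frame').reindex(full).ffill().reset_index()

parts_demo = pd.DataFrame({'roi': [1, 1, 2, 2],
                           'frame': [0, 2, 0, 3],
                           'x': [5.0, 6.0, 1.0, 4.0]})
filled = Parallel(n_jobs=2)(
    delayed(impute_coords_stub)(group) for _, group in parts_demo.groupby('roi'))
print(pd.concat(filled, ignore_index=True))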
Example #14
def main():
    train = read_train_data(path=os.path.join(
        input.__path__[0], 'petfinder-adoption-prediction/train/'))
    test = read_test_data(path=os.path.join(
        input.__path__[0], 'petfinder-adoption-prediction/test/'))
    if adoption_shuffle:
        train['AdoptionSpeed'] = random.sample(
            train['AdoptionSpeed'].values.tolist(), len(train))
    if densenet_predict:
        dnet_model = densenet_model(weight_path=os.path.join(
            input.__path__[0], 'densenet-keras/DenseNet-BC-121-32-no-top.h5'))
        train_feats = predict_using_img(
            dnet_model,
            train,
            img_path=os.path.join(
                input.__path__[0],
                'petfinder-adoption-prediction/train_images/'))
        test_feats = predict_using_img(
            dnet_model,
            test,
            img_path=os.path.join(
                input.__path__[0],
                'petfinder-adoption-prediction/test_images/'))
        train_feats.to_pickle('densenet_train_predict.pkl')
        test_feats.to_pickle('densenet_test_predict.pkl')
    else:
        with open('./densenet_train_predict.pkl', 'rb') as f:
            train_feats = pickle.load(f)
        with open('./densenet_test_predict.pkl', 'rb') as f:
            test_feats = pickle.load(f)

    all_ids = pd.concat([train, test], axis=0, ignore_index=True,
                        sort=False)[['PetID']]

    svd_col = adopt_svd(train_feats, test_feats)

    img_features = pd.concat([all_ids, svd_col], axis=1)

    labels_breed = pd.read_csv(
        os.path.join(input.__path__[0],
                     'petfinder-adoption-prediction/breed_labels.csv'))
    labels_color = pd.read_csv(
        os.path.join(input.__path__[0],
                     'petfinder-adoption-prediction/color_labels.csv'))
    labels_state = pd.read_csv(
        os.path.join(input.__path__[0], 'my_state_labels/my_state_labels.csv'))

    train_image_files = sorted(
        glob.glob(
            os.path.join(input.__path__[0],
                         'petfinder-adoption-prediction/train_images/*.jpg')))
    train_metadata_files = sorted(
        glob.glob(
            os.path.join(
                input.__path__[0],
                'petfinder-adoption-prediction/train_metadata/*.json')))
    train_sentiment_files = sorted(
        glob.glob(
            os.path.join(
                input.__path__[0],
                'petfinder-adoption-prediction/train_sentiment/*.json')))

    test_image_files = sorted(
        glob.glob(
            os.path.join(input.__path__[0],
                         'petfinder-adoption-prediction/test_images/*.jpg')))
    test_metadata_files = sorted(
        glob.glob(
            os.path.join(
                input.__path__[0],
                'petfinder-adoption-prediction/test_metadata/*.json')))
    test_sentiment_files = sorted(
        glob.glob(
            os.path.join(
                input.__path__[0],
                'petfinder-adoption-prediction/test_sentiment/*.json')))

    # Metadata:
    train_df_metadata = pd.DataFrame(train_metadata_files)
    train_df_metadata.columns = ['metadata_filename']
    train_df_sentiment = pd.DataFrame(train_sentiment_files)
    train_df_sentiment.columns = ['sentiment_filename']
    # Metadata:
    test_df_metadata = pd.DataFrame(test_metadata_files)
    test_df_metadata.columns = ['metadata_filename']
    test_df_sentiment = pd.DataFrame(test_sentiment_files)
    test_df_sentiment.columns = ['sentiment_filename']

    train_pet_ids = train.PetID.unique()
    test_pet_ids = test.PetID.unique()

    if exe_extract_additional_feature:
        dfs_train = Parallel(n_jobs=12, verbose=1)(
            delayed(extract_additional_features)(i, mode='train')
            for i in train_pet_ids)
        dfs_test = Parallel(n_jobs=12, verbose=1)(
            delayed(extract_additional_features)(i, mode='test')
            for i in test_pet_ids)
        train_dfs_sentiment = [
            x[0] for x in dfs_train if isinstance(x[0], pd.DataFrame)
        ]
        train_dfs_metadata = [
            x[1] for x in dfs_train if isinstance(x[1], pd.DataFrame)
        ]
        train_dfs_sentiment = pd.concat(train_dfs_sentiment,
                                        ignore_index=True,
                                        sort=False)
        train_dfs_metadata = pd.concat(train_dfs_metadata,
                                       ignore_index=True,
                                       sort=False)
        test_dfs_sentiment = [
            x[0] for x in dfs_test if isinstance(x[0], pd.DataFrame)
        ]
        test_dfs_metadata = [
            x[1] for x in dfs_test if isinstance(x[1], pd.DataFrame)
        ]
        test_dfs_sentiment = pd.concat(test_dfs_sentiment,
                                       ignore_index=True,
                                       sort=False)
        test_dfs_metadata = pd.concat(test_dfs_metadata,
                                      ignore_index=True,
                                      sort=False)
        train_dfs_metadata.to_pickle('train_dfs_metadata.pkl')
        train_dfs_sentiment.to_pickle('train_dfs_sentiment.pkl')
        test_dfs_metadata.to_pickle('test_dfs_metadata.pkl')
        test_dfs_sentiment.to_pickle('test_dfs_sentiment.pkl')

    else:
        with open('./train_dfs_metadata.pkl', 'rb') as f:
            train_dfs_metadata = pickle.load(f)
        with open('./train_dfs_sentiment.pkl', 'rb') as f:
            train_dfs_sentiment = pickle.load(f)
        with open('./test_dfs_metadata.pkl', 'rb') as f:
            test_dfs_metadata = pickle.load(f)
        with open('./test_dfs_sentiment.pkl', 'rb') as f:
            test_dfs_sentiment = pickle.load(f)

    # ### group extracted features by PetID:
    train_proc = agg_feature(train, train_dfs_metadata, train_dfs_sentiment)
    test_proc = agg_feature(test, test_dfs_metadata, test_dfs_sentiment)
    train_proc = merge_labels_breed(train_proc, labels_breed)
    test_proc = merge_labels_breed(test_proc, labels_breed)
    train_proc, test_proc = merge_labels_state(train_proc, test_proc,
                                               labels_state)
    train_proc = fill_and_drop_feature(train_proc)
    test_proc = fill_and_drop_feature(test_proc)
    train_proc = add_feature(train_proc)
    test_proc = add_feature(test_proc)

    X = pd.concat([train_proc, test_proc], ignore_index=True, sort=False)
    X_temp = X.copy()
    text_columns = [
        'Description', 'metadata_annots_top_desc', 'sentiment_entities'
    ]
    categorical_columns = ['main_breed_BreedName', 'second_breed_BreedName']
    to_drop_columns = ['PetID', 'Name', 'RescuerID']

    rescuer_count = X.groupby(['RescuerID'])['PetID'].count().reset_index()
    rescuer_count.columns = ['RescuerID', 'RescuerID_COUNT']

    X_temp = X_temp.merge(rescuer_count, how='left', on='RescuerID')

    for i in categorical_columns:
        try:
            X_temp.loc[:, i] = pd.factorize(X_temp.loc[:, i])[0]
        except:
            pass

    X_text = X_temp[text_columns]
    for i in X_text.columns:
        X_text.loc[:, i] = X_text.loc[:, i].fillna('none')

    X_temp['Length_Description'] = X_text['Description'].map(len)
    X_temp['Length_metadata_annots_top_desc'] = X_text[
        'metadata_annots_top_desc'].map(len)
    X_temp['Lengths_sentiment_entities'] = X_text['sentiment_entities'].map(
        len)
    X_temp = parse_tfidf(X_temp, X_text)

    X_temp = X_temp.merge(img_features, how='left', on='PetID')

    agg_train_imgs = agg_img_feature(train_image_files)
    agg_test_imgs = agg_img_feature(test_image_files)
    agg_imgs = pd.concat([agg_train_imgs, agg_test_imgs],
                         axis=0).reset_index(drop=True)
    X_temp = X_temp.merge(agg_imgs, how='left', on='PetID')

    # ### Drop ID, name and rescuerID
    X_temp = X_temp.drop(to_drop_columns, axis=1)

    X_train = X_temp.loc[np.isfinite(X_temp.AdoptionSpeed), :]
    X_test = X_temp.loc[~np.isfinite(X_temp.AdoptionSpeed), :]

    X_test = X_test.drop(['AdoptionSpeed'], axis=1)
    assert X_train.shape[0] == train.shape[0]
    assert X_test.shape[0] == test.shape[0]
    train_cols = X_train.columns.tolist()
    train_cols.remove('AdoptionSpeed')

    test_cols = X_test.columns.tolist()

    assert np.all(train_cols == test_cols)

    X_train_non_null = X_train.fillna(-1)
    X_test_non_null = X_test.fillna(-1)
    X_train_non_null.isnull().any().any(), X_test_non_null.isnull().any().any()

    xgb_params = {
        'eval_metric': 'rmse',
        'objective': 'reg:squarederror',
        'seed': 1337,
        'eta': 0.0123,
        'subsample': 0.8,
        'colsample_bytree': 0.85,
        'tree_method': 'gpu_hist',
        'device': 'gpu',
        'silent': 1,
    }
    X_train_non_null = fill_and_drop_feature_end(X_train_non_null)
    X_test_non_null = fill_and_drop_feature_end(X_test_non_null)

    X_train_non_null.to_csv('./X_train.csv')

    model, oof_train, oof_test, feature_score = run_xgb(
        xgb_params, X_train_non_null, X_test_non_null)

    optR = OptimizedRounder()
    optR.fit(oof_train, X_train['AdoptionSpeed'].values)
    coefficients = optR.coefficients()
    valid_pred = optR.predict(oof_train, coefficients)
    qwk = quadratic_weighted_kappa(X_train['AdoptionSpeed'].values, valid_pred)
    print("QWK = ", qwk)

    coefficients_ = coefficients.copy()
    coefficients_[0] = 1.66
    coefficients_[1] = 2.13
    coefficients_[3] = 2.85
    train_predictions = optR.predict(oof_train, coefficients_).astype(np.int8)
    test_predictions = optR.predict(oof_test.mean(axis=1),
                                    coefficients_).astype(np.int8)

    valid_pred = optR.predict(oof_train, coefficients_)
    qwk_change = quadratic_weighted_kappa(X_train['AdoptionSpeed'].values,
                                          valid_pred)
    print("QWK_change = ", qwk_change)

    submission = pd.DataFrame({
        'PetID': test['PetID'].values,
        'AdoptionSpeed': test_predictions
    })
    submission.to_csv('submission.csv', index=False)
    str_metric_score = 'qwk' + '_0' + str(int(qwk * 100000))
    storage_process(submission, str_metric_score, qwk, qwk_change,
                    feature_score)
Example #15
    def get_items(self, indexes):
        items = Parallel(n_jobs=1)(delayed(self.get_item)(i) for i in indexes)
        images, meta_info = zip(*items)
        images = torch.stack(images, dim=0)
        return images, meta_info
Example #16
    def fit(
        self,
        train_loader,
        epochs=100,
        log_interval=100,
        test_loader=None,
        save_model=True,
        save_dir=None,
    ):

        self._validate_parameters(epochs, log_interval)
        self.n_outputs = self._decide_n_outputs(train_loader)

        # Instantiate a pool of base estimators, optimizers, and schedulers.
        estimators = []
        for _ in range(self.n_estimators):
            estimators.append(self._make_estimator())

        optimizers = []
        for i in range(self.n_estimators):
            optimizers.append(
                set_module.set_optimizer(estimators[i], self.optimizer_name,
                                         **self.optimizer_args))

        if self.use_scheduler_:
            scheduler_ = set_module.set_scheduler(optimizers[0],
                                                  self.scheduler_name,
                                                  **self.scheduler_args)

        # Utils
        criterion = nn.CrossEntropyLoss()
        best_acc = 0.0

        # Internal helper function for a pseudo forward pass
        def _forward(estimators, data):
            outputs = [
                F.softmax(estimator(data), dim=1) for estimator in estimators
            ]
            proba = op.average(outputs)

            return proba

        # Maintain a pool of workers
        with Parallel(n_jobs=self.n_jobs) as parallel:

            # Training loop
            for epoch in range(epochs):
                self.train()

                if self.use_scheduler_:
                    cur_lr = scheduler_.get_last_lr()[0]
                else:
                    cur_lr = None

                if self.n_jobs and self.n_jobs > 1:
                    msg = "Parallelization on the training epoch: {:03d}"
                    self.logger.info(msg.format(epoch))

                rets = parallel(
                    delayed(_parallel_fit_per_epoch)(
                        train_loader,
                        estimator,
                        cur_lr,
                        optimizer,
                        criterion,
                        idx,
                        epoch,
                        log_interval,
                        self.device,
                        True,
                    ) for idx, (
                        estimator,
                        optimizer) in enumerate(zip(estimators, optimizers)))

                estimators, optimizers = [], []
                for estimator, optimizer in rets:
                    estimators.append(estimator)
                    optimizers.append(optimizer)

                # Validation
                if test_loader:
                    self.eval()
                    with torch.no_grad():
                        correct = 0
                        total = 0
                        for _, (data, target) in enumerate(test_loader):
                            data = data.to(self.device)
                            target = target.to(self.device)
                            output = _forward(estimators, data)
                            _, predicted = torch.max(output.data, 1)
                            correct += (predicted == target).sum().item()
                            total += target.size(0)
                        acc = 100 * correct / total

                        if acc > best_acc:
                            best_acc = acc
                            self.estimators_ = nn.ModuleList()
                            self.estimators_.extend(estimators)
                            if save_model:
                                io.save(self, save_dir, self.logger)

                        msg = ("Epoch: {:03d} | Validation Acc: {:.3f}"
                               " % | Historical Best: {:.3f} %")
                        self.logger.info(msg.format(epoch, acc, best_acc))
                        if self.tb_logger:
                            self.tb_logger.add_scalar("voting/Validation_Acc",
                                                      acc, epoch)

                # Update the scheduler
                with warnings.catch_warnings():

                    # UserWarning raised by PyTorch is ignored because
                    # scheduler does not have a real effect on the optimizer.
                    warnings.simplefilter("ignore", UserWarning)

                    if self.use_scheduler_:
                        scheduler_.step()

        self.estimators_ = nn.ModuleList()
        self.estimators_.extend(estimators)
        if save_model and not test_loader:
            io.save(self, save_dir, self.logger)
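Wrapping Parallel in a with-block, as this fit method does, keeps one worker pool alive across all training epochs instead of spawning a new pool on every call. The pattern in isolation, with a trivial stand-in for _parallel_fit_per_epoch:

from joblib import Parallel, delayed

def fit_one_epoch_stub(estimator_id, epoch):
    # Hypothetical stand-in for _parallel_fit_per_epoch.
    return estimator_id, epoch

with Parallel(n_jobs=2) as parallel:  # the pool is reused for every epoch
    for epoch in range(3):
        rets = parallel(delayed(fit_one_epoch_stub)(i, epoch) for i in range(4))
print(rets)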
Example #17
    data = get_data(filename)
    print('Loaded data from: {}\nLength: {}'.format(filename,
                                                    len(data['mass'])))

    plot_distributions(data, scenario)
    print('[{}] Plotted Distributions (1/7)'.format(scenario))
    plot_abundances(data, scenario)
    print('[{}] Plotted Abundances (2/7)'.format(scenario))
    plot_chisq_distribution(data, scenario)
    print('[{}] Plotted Chi Squared Distributions (3/7)'.format(scenario))
    plot_mchi_omegab_contours(data, scenario, 'BBN')
    plot_mchi_omegab_contours(data, scenario, 'CMB')
    plot_mchi_omegab_contours(data, scenario, 'BBN+CMB')
    print('[{}] Plotted Omegab vs Mchi Contours (4/7)'.format(scenario))
    plot_joint_mchi_omegab(data, scenario)
    print('[{}] Plotted joint contours (5/7)'.format(scenario))
    plot_deltachisq(data, scenario, zoom=False)
    plot_deltachisq(data, scenario, zoom=True)
    print('[{}] Plotted Delta Chi curves (6/7)'.format(scenario))
    print('[{}] Saving results (7/7)'.format(scenario))
    save_results(data, scenario)


if __name__ == '__main__':
    scenarios = [
        'EE_Neutral_Scalar', 'EE_Maj', 'Nu_Complex_Scalar',
        'Nu_Neutral_Scalar', 'Nu_Zp'
    ]
    Parallel(n_jobs=-1)(delayed(run_scenario)(scenario=scenario)
                        for scenario in scenarios)
Example #18
    def fit(
        self,
        train_loader,
        epochs=100,
        log_interval=100,
        test_loader=None,
        save_model=True,
        save_dir=None,
    ):

        self._validate_parameters(epochs, log_interval)
        self.n_outputs = self._decide_n_outputs(train_loader)

        # Instantiate a pool of base estimators, optimizers, and schedulers.
        estimators = []
        for _ in range(self.n_estimators):
            estimators.append(self._make_estimator())

        optimizers = []
        for i in range(self.n_estimators):
            optimizers.append(
                set_module.set_optimizer(estimators[i], self.optimizer_name,
                                         **self.optimizer_args))

        if self.use_scheduler_:
            scheduler_ = set_module.set_scheduler(optimizers[0],
                                                  self.scheduler_name,
                                                  **self.scheduler_args)

        # Utils
        criterion = nn.MSELoss()
        best_mse = float("inf")

        # Internal helper function for a pseudo forward pass
        def _forward(estimators, data):
            outputs = [estimator(data) for estimator in estimators]
            pred = op.average(outputs)

            return pred

        # Maintain a pool of workers
        with Parallel(n_jobs=self.n_jobs) as parallel:

            # Training loop
            for epoch in range(epochs):
                self.train()

                if self.use_scheduler_:
                    cur_lr = scheduler_.get_last_lr()[0]
                else:
                    cur_lr = None

                if self.n_jobs and self.n_jobs > 1:
                    msg = "Parallelization on the training epoch: {:03d}"
                    self.logger.info(msg.format(epoch))

                rets = parallel(
                    delayed(_parallel_fit_per_epoch)(
                        train_loader,
                        estimator,
                        cur_lr,
                        optimizer,
                        criterion,
                        idx,
                        epoch,
                        log_interval,
                        self.device,
                        False,
                    ) for idx, (
                        estimator,
                        optimizer) in enumerate(zip(estimators, optimizers)))

                estimators, optimizers = [], []
                for estimator, optimizer in rets:
                    estimators.append(estimator)
                    optimizers.append(optimizer)

                # Validation
                if test_loader:
                    self.eval()
                    with torch.no_grad():
                        mse = 0.0
                        for _, (data, target) in enumerate(test_loader):
                            data = data.to(self.device)
                            target = target.to(self.device)
                            output = _forward(estimators, data)
                            mse += criterion(output, target)
                        mse /= len(test_loader)

                        if mse < best_mse:
                            best_mse = mse
                            self.estimators_ = nn.ModuleList()
                            self.estimators_.extend(estimators)
                            if save_model:
                                io.save(self, save_dir, self.logger)

                        msg = ("Epoch: {:03d} | Validation MSE:"
                               " {:.5f} | Historical Best: {:.5f}")
                        self.logger.info(msg.format(epoch, mse, best_mse))
                        if self.tb_logger:
                            self.tb_logger.add_scalar("voting/Validation_MSE",
                                                      mse, epoch)

                # Update the scheduler
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", UserWarning)

                    if self.use_scheduler_:
                        scheduler_.step()

        self.estimators_ = nn.ModuleList()
        self.estimators_.extend(estimators)
        if save_model and not test_loader:
            io.save(self, save_dir, self.logger)
Example #19
            pi.set_epsilon(epsilon)
            mdp.set_episode_end(True)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency,
                       quiet=args.quiet)

            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # evaluation step
            pi.set_eval(True)
            pi.set_epsilon(epsilon_test)
            mdp.set_episode_end(False)
            dataset = core.evaluate(n_steps=test_samples,
                                    render=args.render,
                                    quiet=args.quiet)
            scores.append(get_stats(dataset))

            np.save(folder_name + '/scores.npy', scores)

    return scores


if __name__ == '__main__':
    n_experiments = 1

    out = Parallel(n_jobs=-1)(delayed(experiment)()
                              for _ in range(n_experiments))
    tf.reset_default_graph()
Example #20
0
        homoFile = 
        energyFile = 
        corrFile =
        ASMFile = 



        merge, xx, yy, gt = read_raster(in_raster)

        merge[np.isnan(merge)] = 0

        Z,ind = sliding_window(merge,(win,win),(win,win))

        Ny, Nx = np.shape(merge)

        w = Parallel(n_jobs = cpu_count(), verbose=0)(delayed(p_me)(Z[k]) for k in xrange(len(Z)))

        cont = [a[0] for a in w]
        diss = [a[1] for a in w]
        homo = [a[2] for a in w]
        eng  = [a[3] for a in w]
        corr = [a[4] for a in w]
        ASM  = [a[5] for a in w]


        #Reshape to match number of windows
        plt_cont = np.reshape(cont , ( ind[0], ind[1] ) )
        plt_diss = np.reshape(diss , ( ind[0], ind[1] ) )
        plt_homo = np.reshape(homo , ( ind[0], ind[1] ) )
        plt_eng = np.reshape(eng , ( ind[0], ind[1] ) )
        plt_corr = np.reshape(corr , ( ind[0], ind[1] ) )
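The six per-window statistics gathered above (contrast, dissimilarity, homogeneity, energy, correlation, ASM) are the standard GLCM texture properties. A hedged sketch of what a worker like p_me might compute with scikit-image (graycomatrix/graycoprops in recent releases; older releases spell them greycomatrix/greycoprops), shown only as one plausible implementation:

import numpy as np
from skimage.feature import graycomatrix, graycoprops

def p_me_sketch(window):
    # one grey-level co-occurrence matrix per window; assumes 8-bit
    # grey levels, returns the six properties in the order unpacked above
    glcm = graycomatrix(window.astype(np.uint8), distances=[1], angles=[0],
                        levels=256, symmetric=True, normed=True)
    props = ('contrast', 'dissimilarity', 'homogeneity',
             'energy', 'correlation', 'ASM')
    return tuple(graycoprops(glcm, p)[0, 0] for p in props)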
# Script for grabbing congressional graph data
from joblib import Parallel, delayed
import multiprocessing
import time
from src import write_graph

data_dir = 'graphs'
num_cores = multiprocessing.cpu_count()

print('Running on {} cores.'.format(num_cores))


def grab_data(congress):
    start = time.time()
    write_graph(congress)
    end = time.time()
    print(
        'Finished graph for congress {}.\nElapsed time {:.03f} s.\n\n'.format(
            congress, end - start))


Parallel(n_jobs=num_cores)(delayed(grab_data)(congress)
                           for congress in range(101, 115))
Example #22
0
        "yRange": (0, YRAN[1] * 1)
    }
    # print('YRAN: {}'.format(STYLE))
    ###############################################################################
    # Plot
    ###############################################################################
    (fNum, digs) = monet.lenAndDigits(subset)
    Parallel(n_jobs=JOB)(
        delayed(dbg.exportPstTracesParallel)(exIx,
                                             fNum,
                                             aux.STABLE_T,
                                             0,
                                             QNT,
                                             STYLE,
                                             pt_img,
                                             digs=digs,
                                             border=True,
                                             autoAspect=True,
                                             labelPos=(.8, .15),
                                             poePrint=False,
                                             mnfPrint=False,
                                             ticksHide=TICKS_HIDE,
                                             transparent=True,
                                             sampRate=aux.SAMP_RATE)
        for exIx in subset)
# Export gene legend ------------------------------------------------------
# repDta = pkl.load(expsIter[0][1])
# monet.exportGeneLegend(
#     repDta['genotypes'], [i[:-2]+'cc' for i in CLR],
#     PT_IMG+'/legend_{}.png'.format(TRC), 500
# )
    def fit(
        self,
        train_loader,
        epochs=100,
        use_reduction_sum=True,
        log_interval=100,
        test_loader=None,
        save_model=True,
        save_dir=None,
    ):

        # Instantiate base estimators and set attributes
        for _ in range(self.n_estimators):
            self.estimators_.append(self._make_estimator())
        self._validate_parameters(epochs, log_interval)
        self.n_outputs = self._decide_n_outputs(train_loader)

        # Utils
        criterion = (nn.MSELoss(reduction="sum")
                     if use_reduction_sum else nn.MSELoss())
        total_iters = 0

        # Set up optimizer and learning rate scheduler
        optimizer = set_module.set_optimizer(self, self.optimizer_name,
                                             **self.optimizer_args)

        if self.use_scheduler_:
            scheduler = set_module.set_scheduler(
                optimizer,
                self.scheduler_name,
                **self.scheduler_args  # noqa: E501
            )

        for epoch in range(epochs):
            self.train()
            for batch_idx, elem in enumerate(train_loader):

                data, target = io.split_data_target(elem, self.device)
                output = [estimator(*data) for estimator in self.estimators_]

                # Compute pseudo residuals in parallel
                rets = Parallel(n_jobs=self.n_jobs)(
                    delayed(_parallel_compute_pseudo_residual)(
                        output,
                        target,
                        i,
                        self.shrinkage_rate,
                        self.n_outputs,
                        self.is_classification,
                    ) for i in range(self.n_estimators))

                # Compute sGBM loss
                loss = torch.tensor(0.0, device=self.device)
                for idx, estimator in enumerate(self.estimators_):
                    loss += criterion(output[idx], rets[idx])

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Print training status
                if batch_idx % log_interval == 0:
                    with torch.no_grad():
                        msg = "Epoch: {:03d} | Batch: {:03d} | RegLoss: {:.5f}"
                        self.logger.info(msg.format(epoch, batch_idx, loss))
                        if self.tb_logger:
                            self.tb_logger.add_scalar("sGBM/Train_Loss", loss,
                                                      total_iters)
                total_iters += 1

            # Validation
            if test_loader:
                flag = self._evaluate_during_fit(test_loader, epoch)
                if save_model and flag:
                    io.save(self, save_dir, self.logger)

            # Update the scheduler
            if self.use_scheduler_:
                scheduler.step()

        if save_model and not test_loader:
            io.save(self, save_dir, self.logger)
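The parallel step above asks each worker for the pseudo residual its estimator should fit. One plausible reading for the regression case, written as a hedged sketch rather than the library's actual _parallel_compute_pseudo_residual:

import torch

def pseudo_residual_reg_sketch(output, target, index, shrinkage_rate):
    # residual for the index-th estimator: the target minus the shrunken
    # sum of the predictions of all earlier estimators (assumes float tensors)
    accumulated = torch.zeros_like(target)
    for prior in output[:index]:
        accumulated = accumulated + shrinkage_rate * prior
    return target - accumulated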
Example #24
0
def returnMatchTable(rootDir,
                     visit,
                     ccdList,
                     outfile=None,
                     fakeCat=None,
                     overwrite=False,
                     filt=None,
                     tol=1.0,
                     pixMatch=False,
                     multiband=False,
                     reffMatch=False,
                     pix=0.168,
                     multijobs=1,
                     includeMissing=True,
                     minRad=None,
                     raCol='RA',
                     decCol='Dec'):
    """
    Driver (main function) for return match to fakes.

    INPUT: rootDir = rerun directory
           visit = visit id (int) (or tracts)
           ccdList = list of ccds to look at (or patches)
           outdir = output directory for matched file,
                    None means no output written
           fakeCat = fake catalog to match to,
                     None means the fake sources are just
                     extracted from the header of the CCDs based on
                     position but no matching is done
           overwrite = whether to overwrite the existing output file,
                       default is False
           pixMatch = do pixel matching instead of ra/dec matching
                      even if there is a catalog supplied
           multiband = whether match to forced photometry catalogs
                       from multiband process
           reffMatch = whether match fake sources in pixel radius
                       or using tol x Reff (Only for Ra, Dec match)
    OUTPUT: returns an astropy.table.Table with all the entries
            from the source catalog for objects which match in pixel
            position to the fake sources
    """
    butler = dafPersist.Butler(rootDir)
    slist = None

    if multijobs > 1:
        try:
            from joblib import Parallel, delayed
            mlist = Parallel(n_jobs=multijobs)(
                delayed(returnMatchSingle)(butler,
                                           None,
                                           visit,
                                           ccd,
                                           filt=filt,
                                           fakeCat=fakeCat,
                                           includeMissing=includeMissing,
                                           pixMatch=pixMatch,
                                           reffMatch=reffMatch,
                                           tol=tol,
                                           multiband=multiband,
                                           minRad=minRad,
                                           pix=pix,
                                           decCol=decCol,
                                           raCol=raCol) for ccd in ccdList)
            for m in mlist:
                if m is not None:
                    if slist is None:
                        slist = m.copy(True)
                    else:
                        slist.extend(m, True)
                    del m
        except ImportError:
            print("# Can not import joblib, stop multiprocessing!")
            for ccd in ccdList:
                slist = returnMatchSingle(butler,
                                          slist,
                                          visit,
                                          ccd,
                                          filt=filt,
                                          fakeCat=fakeCat,
                                          includeMissing=includeMissing,
                                          pixMatch=pixMatch,
                                          reffMatch=reffMatch,
                                          tol=tol,
                                          pix=pix,
                                          multiband=multiband,
                                          minRad=minRad,
                                          raCol=raCol,
                                          decCol=decCol)
    else:
        for ccd in ccdList:
            slist = returnMatchSingle(butler,
                                      slist,
                                      visit,
                                      ccd,
                                      filt=filt,
                                      fakeCat=fakeCat,
                                      includeMissing=includeMissing,
                                      pixMatch=pixMatch,
                                      reffMatch=reffMatch,
                                      tol=tol,
                                      pix=pix,
                                      multiband=multiband,
                                      minRad=minRad,
                                      raCol=raCol,
                                      decCol=decCol)

    if slist is None:
        print("Returns no match....!")

        return None
    else:
        astroTable = getAstroTable(slist, mags=True)

        if fakeCat is not None:
            astroTable = matchToFakeCatalog(astroTable, fakeCat)

        if outfile is not None:
            try:
                astroTable.write(outfile + '.fits',
                                 format='fits',
                                 overwrite=overwrite)
            except IOError:
                print("Try setting the option -w to overwrite the file.")
                raise

        return astroTable
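The try/except around the joblib import above gives the function a serial fallback when joblib is not available. A minimal, generic sketch of that pattern outside the LSST-specific code (process_one/process_all are hypothetical names, not from the original source):

def process_one(item):
    return item * item

def process_all(items, n_jobs=4):
    # prefer joblib when available, otherwise fall back to a plain loop
    try:
        from joblib import Parallel, delayed
        return Parallel(n_jobs=n_jobs)(delayed(process_one)(x) for x in items)
    except ImportError:
        return [process_one(x) for x in items]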
Example #25
0
def create_tarenc_agg_features(mode_target_persons, mode_target_cols):
    logger.info(f"tarenc_agg_tp{mode_target_persons}_tc{mode_target_cols}")
    mprof_timestamp(f"tarenc_agg_tp{mode_target_persons}_tc{mode_target_cols}")
    _ = Parallel(n_jobs=args.n_jobs // 2, verbose=args.verbose_joblib)(
        [delayed(create_tarenc_agg_features_1fold)(fold, mode_target_persons,
                                                   mode_target_cols, "tarenc")
         for fold in range(args.FOLD_NUM)])
    os.makedirs(savedir)

size = 512,512
colors = ['red','green','blue','yellow']
imgList = pd.read_csv("../input/HPAv18/HPAv18RBGY_wodpl.csv")
print(len(imgList))


#for i in tqdm(imgList['Id']):

def save_image(i):
    for color in colors:
        img_path = i + "_" + color + ".jpg"
        img_name = i + "_" + color + ".png"
        img_full_path = img_dir + img_path

        fname = os.path.join(savedir,img_name)
        if not os.path.exists(fname):
            x = Image.open(img_full_path, 'r')
            x = x.convert('L')  # makes it greyscale
            y = np.asarray(x.getdata(), dtype=np.float64).reshape((x.size[1], x.size[0]))
            y = np.asarray(y, dtype=np.uint8)  # if values still in range 0-255!
            w = Image.fromarray(y, mode='L')
            w.thumbnail(size, Image.ANTIALIAS)
            w.save(os.path.join(savedir,img_name))

num_cores = 6
Parallel(n_jobs=num_cores, prefer="threads")(delayed(save_image)(i) \
                                             for i in tqdm(imgList['Id']))

print('done')
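Passing prefer="threads" above keeps the workers in a single process, which suits a disk-bound conversion loop like save_image. A minimal sketch of the same thread-backed pattern with a hypothetical stand-in task:

from joblib import Parallel, delayed

def convert_one(name):
    # hypothetical stand-in for an I/O-bound task such as save_image
    return name.lower() + ".png"

names = ["IMG_A", "IMG_B", "IMG_C"]
converted = Parallel(n_jobs=2, prefer="threads")(
    delayed(convert_one)(n) for n in names)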
Example #27
0
import subprocess as sp
from joblib import Parallel, delayed


def job(i):
    sp.call([
        "python",
        "/home/mszul/git/DANC_MEG_learning_beta/pipeline_08_epoch_qc.py",
        str(i), "settings_hdd.json"
    ])


Parallel(n_jobs=-1)(delayed(job)(i) for i in range(0, 38))
    mask_2d = np.zeros_like(np_fix)
    ellipse = opt_ellipse(np_fix, ellipse_tuple, sigma=sigma, steps=steps)

    ((cx, cy), (M, m), theta) = ellipse
    (M, m) = (M + dilate, m + dilate)
    ellipse = ((cx, cy), (M, m), theta)

    np_cnt_help = np.zeros_like(np_fix, dtype=np.uint8)
    cv2.ellipse(np_cnt_help, ellipse, 255, -1)
    contours, _ = cv2.findContours(np_cnt_help, cv2.RETR_TREE,
                                   cv2.CHAIN_APPROX_SIMPLE)
    cnt = contours[0]
    cv2.drawContours(mask_2d, [cnt], 0, 1, -1)
    mask_2d = cv2.resize(mask_2d, (xS, yS), interpolation=cv2.INTER_NEAREST)
    mask_2d = np.expand_dims(mask_2d, -1)
    mask = mask + mask_2d  # broadcast
    # print(mask.shape)

    nrrd.write(dstpath, mask)


filelist = os.listdir(srcdir)

if parallel:
    Parallel(n_jobs=n_jobs, backend="multiprocessing")(
        delayed(draw_ellipse)(filename, ellipse_tuple)
        for filename in filelist)
else:
    for filename in filelist:
        draw_ellipse(filename, ellipse_tuple)
def Prediction(best_parameters_dct, df_dict, n_jobs=-1):
    """Predication of the model for the best parameters.

       Take as argument a list of the best parameters for the three datasets.
       """

    # Extract the best model
    data_aug = best_parameters_dct["Data Augmentation"]["Function"]
    embedding = best_parameters_dct["Embedding"]["Function"]
    model = best_parameters_dct["Model"]["Function"]

    # Array of prediction
    pred = np.zeros((3 * 1000, 2), dtype=int)

    def subPredictions(k, df_dct=df_dict):
        """Compute predictions for a given dataset."""

        # Extraction of the data
        X_train_k = df_dct[k][0]["seq"].values
        y_train_k = df_dct[k][2]["Bound"].values
        X_test_k = df_dct[k][1]["seq"].values

        # Data augmentation
        X_train_k, y_train_k = data_aug.call(X_train_k, y_train_k)

        # Embedding
        X_train_k = embedding.call(X_train_k, train=True)

        # Training of the model
        model.fit(X_train_k, y_train_k)

        # Compute average score
        score = model.score(X_train_k, y_train_k) / 3

        # Prediction of test data
        X_test_k = embedding.call(X_test_k, train=False, X_train=df_dct[k][0]["seq"].values)
        y_pred_k = model.predict(X_test_k)

        return y_pred_k, score

    # Parallelisation of the per-dataset predictions
    if n_jobs != 1:
        preds_score = Parallel(n_jobs=n_jobs)(delayed(subPredictions)(k)
                                              for k in tqdm(range(3)))
    else:
        preds_score = [subPredictions(k) for k in range(3)]

    # Initialisation of average score
    final_score = 0

    # Loop to extract the predictions and scores
    for k in range(3):

        # Update pred
        pred[1000 * k: 1000 * (k + 1), 0] = df_dict[k][1]["Id"].values
        pred[1000 * k: 1000 * (k + 1), 1] = preds_score[k][0].reshape(-1)

        # Update final_score
        final_score += preds_score[k][1]

    # Display average score
    print(final_score)

    return pred
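The loop above pulls the prediction and the score out of each (y_pred_k, score) pair by index. For illustration only, the same unpacking can be done in one step with zip (toy values, not from the original run):

preds_score = [([0, 1, 1], 0.31), ([1, 0, 1], 0.29), ([0, 0, 1], 0.33)]
predictions, scores = zip(*preds_score)
final_score = sum(scores)  # equivalent to the score accumulation in the loop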
Example #30
0
def preprocessData_v3(img_size, RGB):
    # %%

    train_perc = 0.7

    filelist = glob.glob('data/train/*')
    filelist_train = filelist[:int(len(filelist) * train_perc)]
    filelist_val = filelist[int(len(filelist) * train_perc):]

    def processImageBN(fname, store_dir, store_dir_full, source):
        fname = fname.split('/')[2].split('.')[0]
        img = Image.open(source + fname + '.jpg').resize(
            img_size, Image.ANTIALIAS).convert('L')
        img.save(store_dir + fname + '.png')
        if store_dir_full: img.save(store_dir_full + fname + '.png')

    def processImageRGB(fname, store_dir, store_dir_full, source):
        fname = fname.split('/')[2].split('.')[0]
        img = Image.open(source + fname + '.jpg').resize(
            img_size, Image.ANTIALIAS)
        img.save(store_dir + fname + '.png')
        if store_dir_full: img.save(store_dir_full + fname + '.png')

    def processImageBN_mask(fname, store_dir_full, store_dir):
        fname = fname.split('/')[2].split('.')[0]
        img = Image.open('data/train_masks/' + fname + '_mask.gif').resize(
            img_size, Image.ANTIALIAS)
        img.save(store_dir + fname + '.png')
        if store_dir_full: img.save(store_dir_full + fname + '.png')

    data_func = processImageRGB if RGB else processImageBN
    rgb_sufix = '_RGB' if RGB else ''

    # Get training data
    t = time.time()
    store_dir = 'data/train_' + str(img_size) + rgb_sufix + '/data/'
    store_dir_full = 'data/full_' + str(img_size) + rgb_sufix + '/data/'
    if not os.path.exists(store_dir): os.makedirs(store_dir)
    if not os.path.exists(store_dir_full): os.makedirs(store_dir_full)
    Parallel(n_jobs=8)(
        delayed(data_func)(fname, store_dir, store_dir_full, 'data/train/')
        for fname in filelist_train)

    store_dir = 'data/train_mask_' + str(img_size) + '/data/'
    store_dir_full = 'data/full_mask_' + str(img_size) + '/data/'
    if not os.path.exists(store_dir): os.makedirs(store_dir)
    if not os.path.exists(store_dir_full): os.makedirs(store_dir_full)
    Parallel(n_jobs=8)(
        delayed(processImageBN_mask)(fname, store_dir, store_dir_full)
        for fname in filelist_train)
    print "Train. Time elapsed:", (time.time() - t) / 60

    # Get validation data
    t = time.time()
    store_dir = 'data/val_' + str(img_size) + rgb_sufix + '/data/'
    store_dir_full = 'data/full_' + str(img_size) + rgb_sufix + '/data/'
    if not os.path.exists(store_dir): os.makedirs(store_dir)
    if not os.path.exists(store_dir_full): os.makedirs(store_dir_full)
    Parallel(n_jobs=8)(
        delayed(data_func)(fname, store_dir, store_dir_full, 'data/train/')
        for fname in filelist_val)

    store_dir = 'data/val_mask_' + str(img_size) + '/data/'
    store_dir_full = 'data/full_mask_' + str(img_size) + '/data/'
    if not os.path.exists(store_dir): os.makedirs(store_dir)
    Parallel(n_jobs=8)(
        delayed(processImageBN_mask)(fname, store_dir, store_dir_full)
        for fname in filelist_val)
    print "Validation. Time elapsed:", (time.time() - t) / 60

    # Get test data
    t = time.time()
    print "Processing Test..."
    filelist = glob.glob('data/test/*')
    store_dir = 'data/test_' + str(img_size) + rgb_sufix + '/data/'
    if not os.path.exists(store_dir): os.makedirs(store_dir)
    step = len(filelist) / 10
    for i in np.arange(0, len(filelist), step):
        st = time.time()
        Parallel(n_jobs=8)(
            delayed(data_func)(fname, store_dir, None, 'data/test/')
            for fname in filelist[i:i + step])
        #        print i,'/', 10, '\t', time.strftime("%H:%M:%S"), '-', (time.time()-st)
        print '{0}/{1}\t{2} - {3:.2f}'.format(i, 10, time.strftime("%H:%M:%S"),
                                              (time.time() - st))
    print "Test. Time elapsed:", (time.time() - t) / 60