Example No. 1
def training_all():
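    """Load the precomputed properties and the FCHL18 kernel from the
    scratch directory. Stub: the data is loaded but no training is done.
    Assumes a module-level `args` namespace providing `args.scratch`."""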

    # properties
    properties = misc.load_npy(args.scratch + "properties")

    # fchls
    kernel = misc.load_npy(args.scratch + "kernel." + "fchl18")


    return
Example No. 2

def get_avg_repr(idx, scr="_tmp_ensemble_/", **kwargs):
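    """Boltzmann-average the SLATM representations of all conformers of
    molecule `idx`, read from `scr` as `<idx>.energies` / `<idx>.sdf`.

    If `kwargs` contains a shared "array", the averaged representation is
    written into row `idx` of it; otherwise `(idx, avgrep)` is returned."""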

    name = "slatm"

    energies = misc.load_npy(scr + str(idx) + ".energies")
    molobjs = cheminfo.read_sdffile(scr + str(idx) + ".sdf")
    molobjs = [mol for mol in molobjs]

    xyzs = molobjs_to_xyzs(molobjs)
    reprs = xyzs_to_representations(*xyzs, **kwargs)

    # Boltzmann weights (conformer energies are assumed to be in units of kT)
    factors = np.exp(-energies)
    factors /= np.sum(factors)

    length = reprs.shape[1]
    avgrep = np.zeros(length)

    for rep, factor in zip(reprs, factors):
        avgrep += factor * rep

    print(idx, avgrep.shape)

    if "array" in kwargs:
        results = kwargs["array"]
        results[idx, :] = avgrep

    else:
        return idx, avgrep
Example No. 3
def main():
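    """Baseline model: predict melting points with the Joback
    group-contribution method (`thermo.joback.Joback`), report the RMSE
    against the reference properties, and save a true-vs-predicted
    scatter plot colored by heavy-atom count."""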

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch',
                        action='store',
                        help='scratch directory',
                        metavar="dir",
                        default="_tmp_")
    parser.add_argument('-j',
                        '--procs',
                        action='store',
                        help='parallelize',
                        metavar="int",
                        default=0,
                        type=int)

    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # Read properties
    properties = misc.load_npy(args.scratch + "properties")
    molecules = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")
    molecules = list(molecules)

    heavy_atoms = []
    predictions = []
    errors = []

    for mol, prop in zip(molecules, properties):

        smi = cheminfo.molobj_to_smiles(mol, remove_hs=True)
        J = thermo.joback.Joback(smi)
        # J = thermo.joback.Joback('CC(=O)C')
        # J = thermo.joback.Joback('CCC(=O)OC(=O)CC')

        status = J.status

        atoms, coord = cheminfo.molobj_to_xyz(mol)
        idx = np.where(atoms != 1)
        atoms = atoms[idx]
        N = len(atoms)
        heavy_atoms.append(N)

        if "Did not match all atoms present" in status:
            errors.append(1)
            predictions.append(float("nan"))
            continue

        try:
            estimate = J.estimate()
        except TypeError:
            errors.append(1)
            predictions.append(float("nan"))
            continue

        errors.append(0)

        # Joback returns several property estimates; only the melting point is used
        T_b = estimate["Tb"]  # boiling point (unused)
        T_m = estimate["Tm"]  # melting point

        predictions.append(T_m)

    errors = np.array(errors, dtype=int)

    idx_success, = np.where(errors == 0)

    heavy_atoms = np.array(heavy_atoms)
    predictions = np.array(predictions)
    properties = np.array(properties)

    predictions = predictions[idx_success]
    properties = properties[idx_success]
    heavy_atoms = heavy_atoms[idx_success]

    print("total", errors.shape[0], "filter", idx_success.shape[0])
    print()
    print(rmse(properties, predictions))

    plt.plot(properties, properties, "-k")
    plt.scatter(properties, predictions, s=0.95, alpha=0.8, c=heavy_atoms)

    plt.xlabel("True")
    plt.ylabel("Predicted")

    plt.savefig("_fig_joback")
    plt.clf()

    return
Example No. 4
def dump_distances_and_kernels(scr, name, procs=0):
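    """Generate and save the similarity data for the representation `name`:
    full kernels for fchl18/fchl19, L2-distance matrices for the
    coordinate-based representations (cm, slatm, bob) and for the
    fingerprints (morgan, rdkitfp)."""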

    # TODO Properties should be read from scr!!
    # properties
    # print("Saving properties")
    # with open(scr + 'properties.csv', 'r') as f:
    #     properties = f.readlines()
    #     properties = [x.split()[0] for x in properties]
    #     properties = [float(x) for x in properties]
    #     properties = np.array(properties)

    # print(properties.shape)
    # misc.save_npy(scr + "properties", properties)

    representation_names_coordbased = ["cm", "slatm", "bob"]
    representation_names_molbased = ["morgan", "rdkitfp"]

    if procs != 0:
        os.environ["OMP_NUM_THREADS"] = str(procs)

    # Prepare fchl kernels
    if name == "fclh18":
        print("Generating fchl18 kernel")
        start = time.time()
        reps = misc.load_npy(scr + "repr." + "fchl18")
        print("shape:", reps.shape)
        sigmas, kernels = get_fchl18_kernels(reps, return_sigmas=True)
        end = time.time()
        print("time:", end - start)
        misc.save_npy(scr + "fchl18." + "sigmas", sigmas)
        misc.save_npy(scr + "kernels." + "fchl18", kernels)

        reps = None
        del reps
        kernels = None
        del kernels

    elif name == "fchl19":
        print("Generating fchl19 kernel")
        reps = misc.load_npy(scr + "repr." + "fchl19")
        print("shape:", reps.shape)
        atoms = misc.load_obj(scr + "atoms")
        start = time.time()
        sigmas, kernels = get_fchl19_kernels(reps, atoms, return_sigmas=True)
        end = time.time()
        print("time:", end - start)
        misc.save_npy(scr + "fchl19." + "sigmas", sigmas)
        misc.save_npy(scr + "kernels." + "fchl19", kernels)

    elif name in representation_names_coordbased:
        print("Distance", name)
        representations = misc.load_npy(scr + "repr." + name)
        print(representations.shape)
        dist = generate_l2_distances(representations)
        misc.save_npy(scr + "dist." + name, dist)

        dist = None
        del dist

    elif name == "rdkitfp" or name == "morgan":

        print("Generating fingerprint kernel", name)
        representations_fp = misc.load_npy(scr + "repr." + name)
        representations_fp = np.asarray(representations_fp, dtype=float)

        # t = time.time()
        # print("jaccard numpy")
        # kernel = fingerprints.bitmap_jaccard_kernel(representations_fp)
        # print("time", time.time()-t)
        # print("saving kernel")
        #
        # kernel = None
        # del kernel

        print(os.environ.get("OMP_NUM_THREADS", "OMP_NUM_THREADS not set"))

        n_items = representations_fp.shape[0]

        # FORTRAN KERNEL
        # t = time.time()
        # print("jaccard fortran")
        # representations_fp = np.array(representations_fp, dtype=int).T
        # kernel = bitmap_kernels.symmetric_jaccard_kernel(n_items, representations_fp)
        # print("time", time.time()-t)

        # kernel = fingerprints.fingerprints_to_kernel(representations_fp, representations_fp, procs=procs)
        # misc.save_npy(scr + "kernel." + name, kernel)

        # DISTANCE
        print("make dist")
        dist = generate_l2_distances(representations_fp)
        print("save dist")
        misc.save_npy(scr + "dist." + name, dist)
        print("saved")

        print(dist.shape)

        kernel = None
        del kernel

    else:
        print("error: unknown representation", name)
        quit()

    return
Example No. 5
def main():
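    """Baseline model: describe each molecule by the mean pairwise atomic
    distance, fit a cubic polynomial to the properties, report the RMSE,
    and plot the fit (convex-hull volumes are computed but not used)."""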

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch',
                        action='store',
                        help='scratch directory',
                        metavar="dir",
                        default="_tmp_")
    parser.add_argument('-j',
                        '--procs',
                        action='store',
                        help='parallelize',
                        metavar="int",
                        default=0,
                        type=int)

    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    properties = misc.load_npy(args.scratch + "properties")
    molecules = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")

    heavy_atoms = []
    distances = []
    volumes = []

    for mol in molecules:

        # atoms = cheminfo.molobj_to_atoms(mol)
        atoms, coord = cheminfo.molobj_to_xyz(mol)

        idx = np.where(atoms != 1)
        atoms = atoms[idx]
        N = len(atoms)
        heavy_atoms.append(N)

        hull = ConvexHull(coord, qhull_options="QJ")

        vol = hull.volume
        volumes.append(vol)

        avgdist = distance.pdist(coord)
        avgdist = np.mean(avgdist)

        distances.append(avgdist)

    heavy_atoms = np.array(heavy_atoms)
    volumes = np.array(volumes)
    distances = np.array(distances)

    #
    #
    #

    representation = distances

    # cubic polynomial fit
    p = np.polyfit(representation, properties, 3)
    p = np.poly1d(p)

    results = p(representation)
    rmse_error = rmse(results, properties)

    print(rmse_error)

    plt.scatter(representation, properties, c=heavy_atoms, s=0.8)
    x_prop = np.linspace(min(representation), max(representation), 80)
    plt.plot(x_prop, p(x_prop), "k-")

    plt.savefig("i_can_member_it")
    plt.clf()

    return
Example No. 6
def dump_distances_and_kernels(scr):
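    """Parse the properties from properties.csv and save them as npy, then
    compute and save L2-distance matrices for cm/bob/slatm and the
    fingerprint kernel (the fchl kernel branches are toggled off)."""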

    # TODO Properties should be read from scr!!

    # properties
    print("Saving properties")
    with open(scr + 'properties.csv', 'r') as f:
        properties = f.readlines()
        properties = [x.split()[0] for x in properties]
        properties = [float(x) for x in properties]
        properties = np.array(properties)

    print("properties", properties.shape)

    misc.save_npy(scr + "properties", properties)

    # Prepare distances
    representation_names = ["cm", "bob", "slatm"] # + ["avgslatm"]
    for name in representation_names:
        print("Distance", name)
        representations = misc.load_npy(scr + "repr." + name)
        print(representations.shape)
        dist = generate_l2_distances(representations)
        misc.save_npy(scr + "dist." + name, dist)

        dist = None
        del dist

    # Prepare fchl kernels
    if False:
        print("Generating fchl18 kernel")
        start = time.time()
        reps = misc.load_npy(scr + "repr." + "fchl18")
        print("shape:", reps.shape)
        sigmas, kernels = get_fchl18_kernels(reps, return_sigmas=True)
        end = time.time()
        print("time:", end-start)
        misc.save_npy(scr + "fchl18." + "sigmas", sigmas)
        misc.save_npy(scr + "kernels." + "fchl18", kernels)

        reps = None
        del reps
        kernels = None
        del kernels

    if False:
        print("Generating fchl19 kernel")
        reps = misc.load_npy(scr + "repr." + "fchl19")
        print("shape:", reps.shape)
        atoms = misc.load_obj(scr + "atoms")
        start = time.time()
        sigmas, kernels = get_fchl19_kernels(reps, atoms, return_sigmas=True)
        end = time.time()
        print("time:", end-start)
        misc.save_npy(scr + "fchl19." + "sigmas", sigmas)
        misc.save_npy(scr + "kernels." + "fchl19", kernels)

    if True:
        print("Generating fingerprint kernel")
        representations_fp = misc.load_obj(scr + "repr.fp")
        kernel = get_fp_kernel(representations_fp)
        misc.save_npy(scr + "kernel.fp", kernel)

    return
Example No. 7
def dump_kernel_scores(scr, names=[]):
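    """Score kernel models by cross-validation for every representation in
    `names`, scanning regularization (and, where applicable, sigma) values,
    and save the per-training-size scores and winning hyperparameters."""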

    # Predefined reg
    l2regs = [10**-x for x in range(1, 6, 2)] + [0.0]
    n_l2regs = len(l2regs)

    # Define n_training
    # n_trains=[2**x for x in range(4, 12)]
    n_trains=[2**x for x in range(4, 17)]
    n_trains = np.array(n_trains, dtype=int)
    n_items = misc.load_txt(scr + "n_items")

    n_train_idx, = np.where(n_trains < n_items*4.0/5.0)
    n_trains = n_trains[n_train_idx]
    n_trains = list(n_trains) # + [-1]

    print("Assume total items", n_items,
            "N train", "{:5.1f}".format(np.floor(n_items*4/5)),
            "N test", "{:5.1f}".format(np.ceil(n_items*1/5)))
    print("Training:", list(n_trains))
    misc.save_npy(scr + "n_train", n_trains)

    # Load properties
    try:
        properties = misc.load_npy(scr + "properties")
    except FileNotFoundError:
        with open(scr + "properties.csv", 'r') as f:
            lines = f.readlines()
            properties = []
            for line in lines:

                values = [float(x) for x in line.split()]
                values = values[1:]
                value = np.median(values)
                properties.append(value)

            properties = np.array(properties)
            misc.save_npy(scr + "properties", properties)


    print(n_items, "==", len(properties))
    assert n_items == len(properties)

    # Load done kernel
    this_names = ["rdkitfp", "morgan"]
    for name in names:

        # NOTE: this block is disabled by the unconditional break; remove it to score these kernels
        break

        if name not in this_names:
            continue

        print("scoring", name)

        now = time.time()

        print("load kernel", name)
        kernel = misc.load_npy(scr + "kernel." + name)

        n_len = kernel.shape[0]
        diaidx = np.diag_indices(n_len)

        def scan_kernels(debug=True):
            kernel[diaidx] += l2regs[0]
            yield kernel
            # for i in tqdm.tqdm(range(1, n_l2regs), ncols=47, ascii=True, desc=name):
            for i in range(1, n_l2regs):
                kernel[diaidx] += -l2regs[i-1] +l2regs[i]
                yield kernel

        generator = functools.partial(tqdm.tqdm, scan_kernels(), ncols=75, ascii=True, desc=name + " kernels", total=n_l2regs)

        print("scan kernels", name)
        idx_winners, scores = cross_validation(generator(), properties, training_points=n_trains)
        misc.save_npy(scr + "score."+name, scores)
        scores = np.around(np.mean(scores, axis=1), decimals=2)

        # Save parameters
        winner_parameters = {}
        for ni, index in enumerate(idx_winners):

            n = n_trains[ni]
            l2reg = l2regs[index]

            parameters = {
                "reg": l2reg,
            }

            winner_parameters[str(n)] = parameters

        nower = time.time()

        print("time: {:10.1f}s".format(nower-now))
        print(name, list(scores))

        misc.save_json(scr + "parameters."+name, winner_parameters)

        print("saved")

        kernel = None
        del kernel

    # Load multi kernels (reg search)
    this_names = ["fchl19", "fchl18"]
    for name in names:
        # NOTE: this block is disabled by the unconditional break; remove it to score these kernels
        break
        kernels = misc.load_npy(scr + "kernels." + name)

        n_l2regs = len(l2regs)
        n_kernels = kernels.shape[0]
        n_len = kernels[0].shape[0]

        diaidx = np.diag_indices(n_len)

        def scan_kernels():
            for kernel in kernels:
                kernel[diaidx] += l2regs[0]
                yield kernel
                for i in range(1, n_l2regs):
                    kernel[diaidx] += -l2regs[i-1] +l2regs[i]
                    yield kernel

        idx_winners, scores = cross_validation(scan_kernels(), properties, training_points=n_trains)
        misc.save_npy(scr + "score."+name, scores)
        scores = np.around(np.mean(scores, axis=1), decimals=2)

        # Clean
        kernels = None
        del kernels

        # Save parameters
        winner_parameters = {}
        for ni, index in enumerate(idx_winners):

            # convert linear index to multi-dimensions
            idx_parameters = np.unravel_index([index], (n_kernels, n_l2regs))
            i, j = idx_parameters
            i = int(i[0])
            j = int(j[0])

            n = n_trains[ni]
            sigma = i
            l2reg = l2regs[j]

            parameters = {
                "sigma": sigma,
                "reg": l2reg,
            }

            winner_parameters[str(n)] = parameters

        misc.save_json(scr + "parameters."+name, winner_parameters)

        print(name, scores)


    # Load distance kernels
    models = []
    parameters = {
        "name": "rdkitfp",
        "sigma": [2**x for x in range(1, 12, 2)],
        # "sigma": [2**x for x in np.arange(20, 40, 0.5)],
        # "lambda": l2regs,
        # "lambda":  [10.0**-x for x in np.arange(1, 10, 1)]
        "lambda":  [10.0**-6],
    }
    models.append(parameters)
    parameters = {
        "name": "slatm",
        "sigma": [2**x for x in range(1, 12, 2)],
        # "sigma": [2**x for x in np.arange(20, 40, 0.5)],
        # "lambda": l2regs,
        # "lambda":  [10.0**-x for x in np.arange(1, 10, 1)]
        "lambda":  [10.0**-6],
    }
    models.append(parameters)
    parameters = {
        "name": "cm",
        "sigma": [2**x for x in range(1, 12, 2)],
        "lambda": l2regs,
    }
    models.append(parameters)
    parameters = {
        "name": "bob",
        "sigma": [2**x for x in range(1, 12, 2)],
        "lambda": l2regs,
    }
    models.append(parameters)
    parameters = {
        "name": "avgslatm",
        "sigma": [2**x for x in range(1, 20, 2)],
        "lambda": l2regs,
    }
    # models.append(parameters)

    for model in models:
        name = model["name"]

        if name not in names:
            continue

        print("scoring", name)

        parameters = model

        n_sigma = len(parameters["sigma"])
        n_lambda = len(parameters["lambda"])

        print("parameter range")
        print("sigma", min(parameters["sigma"]), max(parameters["sigma"]))

        dist = misc.load_npy(scr + "dist." + name)
        kernels = get_kernels_l2distance(dist, parameters)

        # Cross validate
        idx_winners, scores = cross_validation(kernels, properties, training_points=n_trains)

        # Save scores
        misc.save_npy(scr + "score."+name, scores)
        scores = np.around(np.mean(scores, axis=1), decimals=2)

        # Save parameters
        winner_parameters = {}
        for ni, index in enumerate(idx_winners):

            # convert linear index to multi-dimensions
            idx_parameters = np.unravel_index([index], (n_sigma, n_lambda))
            i, j = idx_parameters
            i = int(i[0])
            j = int(j[0])

            n = n_trains[ni]
            sigma = parameters["sigma"][i]
            l2reg = parameters["lambda"][j]

            this_parameters = {
                "sigma": str(sigma),
                "reg": str(l2reg),
            }

            winner_parameters[str(n)] = this_parameters


        print(name, scores)
        misc.save_json(scr + "parameters."+name, winner_parameters)



    # NOTE: quit() stops the whole script here; the return below is never reached
    quit()

    return
Example No. 8
def main():
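    """Learning curve for the OLS model (`fit_model`): extract (or load
    cached) features, run 5-fold cross-validation over growing training
    sets, and save the per-fold RMSEs as score.ols."""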

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch',
                        action='store',
                        help='scratch directory',
                        metavar="dir",
                        default="_tmp_")
    parser.add_argument('--randomseed',
                        action='store',
                        help='random seed',
                        type=int,
                        metavar="int",
                        default=1)
    parser.add_argument('-j',
                        '--procs',
                        action='store',
                        help='parallelize',
                        type=int,
                        metavar="int",
                        default=0)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # Not that random
    np.random.seed(args.randomseed)

    # Get properties
    properties = misc.load_npy(args.scratch + "properties")
    molobjs = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")

    # Get features
    filename = "repr.ols"
    if os.path.exists(args.scratch + filename + ".pkl"):
        features = misc.load_obj(args.scratch + filename)

    else:
        features = extract_features(properties, molobjs, procs=args.procs)
        features = pd.DataFrame(features)
        features = features.fillna(0)
        misc.save_obj(args.scratch + filename, features)

    n_items = len(features)
    X = np.arange(n_items)

    assert len(properties) == n_items

    # Train
    n_splits = 5
    n_train = misc.load_npy(args.scratch + "n_train")

    fold_five = sklearn.model_selection.KFold(n_splits=n_splits,
                                              random_state=45,
                                              shuffle=True)

    scores = []

    for i, (idxs_train, idxs_test) in enumerate(fold_five.split(X)):

        # un-ordered idxs_train
        np.random.seed(45 + i)
        np.random.shuffle(idxs_train)

        learning_curve = []

        for n in n_train:
            idxs = idxs_train[:n]

            # signed difference
            sign_diff = fit_model(features, idxs, idxs_test)

            # rmse
            diff = sign_diff**2
            rmse_test = np.sqrt(diff.mean())

            # save
            learning_curve.append(rmse_test)

        scores.append(learning_curve)

    scores = np.array(scores)
    scores = scores.T

    mean_score = np.mean(scores, axis=1)
    print(mean_score)
    misc.save_npy(args.scratch + "score.ols", scores)

    return
Example No. 9
def main():
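    """Learning curve for a random-forest regressor (Breiman, 2001) on
    RDKit fingerprints: cross-validate over growing training sets and
    save the per-fold RMSEs as score.rfr."""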

    # L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001.

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='scratch directory', metavar="dir", default="_tmp_")
    parser.add_argument('--randomseed', action='store', help='random seed', type=int, metavar="int", default=1)
    parser.add_argument('-j', '--procs', action='store', help='parallelize', type=int, metavar="int", default=0)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # Not that random
    np.random.seed(args.randomseed)

    # Get properties
    properties = misc.load_npy(args.scratch + "properties")
    molobjs = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")

    X = []

    try:
        X = misc.load_npy(args.scratch + "repr.rdkitfp")
        print("loaded")
    except FileNotFoundError:
        for molobj in molobjs:
            bitmap = fingerprints.get_rdkitfp(molobj)
            X.append(bitmap)

    X = np.asarray(X)
    y = properties

    # load predefined training points
    n_train = misc.load_npy(args.scratch + "n_train")


    # CV
    idxs = np.arange(len(properties), dtype=int)
    scores = []

    for idxs_train, idxs_test in cv.cross_view(idxs):

        learning_curve = []

        for n in n_train:
            idxs = idxs_train[:n]

            clf = get_best_rfr(X[idxs], y[idxs])

            # training error
            # predictions = clf.predict(X)

            # predictions
            predictions = clf.predict(X[idxs_test])
            diff = predictions-y[idxs_test]
            diff = diff**2
            rmse_test = np.sqrt(diff.mean())
            learning_curve.append(rmse_test)
            print(n, rmse_test)

        scores.append(learning_curve)

    scores = np.array(scores)
    scores = scores.T

    mean_score = np.mean(scores, axis=1)
    print(mean_score)
    misc.save_npy(args.scratch + "score.rfr", scores)
Example No. 10
def main():
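    """Null-model learning curve: predict the training-set mean for every
    test molecule and save the per-fold RMSEs as score.null."""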

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch',
                        action='store',
                        help='scratch directory',
                        metavar="dir",
                        default="_tmp_")
    parser.add_argument('--randomseed',
                        action='store',
                        help='random seed',
                        type=int,
                        metavar="int",
                        default=1)
    parser.add_argument('-j',
                        '--procs',
                        action='store',
                        help='parallelize',
                        type=int,
                        metavar="int",
                        default=0)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # Not that random
    # np.random.seed(args.randomseed)

    # Get properties
    properties = misc.load_npy(args.scratch + "properties")
    # molobjs = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")

    n_items = len(properties)
    X = np.arange(n_items)

    # Train
    n_splits = 5
    n_train = misc.load_npy(args.scratch + "n_train")

    fold_five = sklearn.model_selection.KFold(n_splits=n_splits,
                                              random_state=45,
                                              shuffle=True)

    scores = []

    for i, (idxs_train, idxs_test) in enumerate(fold_five.split(X)):

        # un-ordered idxs_train
        np.random.seed(45 + i)
        np.random.shuffle(idxs_train)

        learning_curve = []

        for n in n_train:
            idxs = idxs_train[:n]

            train = properties[idxs]
            model = train.mean()

            test = properties[idxs_test]

            # predict
            sign_diff = model - test

            # rmse
            diff = sign_diff**2
            rmse_test = np.sqrt(diff.mean())

            # save
            learning_curve.append(rmse_test)

        scores.append(learning_curve)

    scores = np.array(scores)
    scores = scores.T

    mean_score = np.mean(scores, axis=1)
    print(mean_score)
    misc.save_npy(args.scratch + "score.null", scores)

    return
Example No. 11
def plot_errors(scr):
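    """Plot learning curves (RMSE versus training-set size) for every
    score.* file in the scratch directory and save the figure as
    learning_curves.png/.pdf."""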

    fig, axes = plt.subplots(1, 1, figsize=(8, 4))
    # fig, axes = plt.subplots(1, 1, figsize=(4,4))
    ax = axes

    # n_trains=[2**x for x in range(4, 4+7)]
    try:
        n_trains = misc.load_npy(scr + "n_train")
    except FileNotFoundError:
        n_trains = misc.load_txt(scr + "n_train")

    print(n_trains)

    names = ["cm", "bob", "fchl18", "fchl19", "fp", "slatm"]
    names = glob.glob(scr + "score.*")

    fix_name = lambda x: x.replace(scr, "").replace(".npy", "").replace(
        "score.", "")

    names = [fix_name(x) for x in names]

    lines = []
    last_points = []

    y_min = np.inf
    y_max = -np.inf

    for name in names:

        scores = misc.load_npy(scr + "score." + name)
        mean = scores.mean(axis=1)
        std = scores.std(axis=1)

        if "ols" in name:
            view, = np.where(n_trains > 250)
            x_mean = n_trains[view]
            mean = mean[view]
            std = std[view]
        else:
            valid_scores, = np.where(mean < 200)
            x_mean = n_trains[valid_scores]
            mean = mean[valid_scores]
            std = std[valid_scores]

        line = ax.errorbar(
            x_mean,
            mean,
            std,
            fmt='-o',
            # color="k",
            capsize=3,
            lw=1,
            markersize=4,
            label=name.upper())

        lines.append(line)
        last_points.append(mean[-1])

        max_mean = max(mean) + max(std)
        if max_mean > y_max:
            y_max = max_mean

        min_mean = min(mean) - max(std)
        if min_mean < y_min:
            y_min = min_mean

        print(name, list(mean))

    y_min = np.floor(y_min)
    y_min = int(np.floor(y_min / 10.0)) * 10
    y_max = int(np.ceil(y_max / 10.0)) * 10

    ykeys = []

    y_min = 40

    print("y", y_min, y_max)

    diff = y_max - y_min
    if diff < 50:
        y_min -= 40

    if y_min < 0.0:
        y_min = 50

    if y_max > 120:
        y_max = 120

    # ykeys = np.arange(y_min, y_max, 30)

    # y_max = 100
    y_min = 30

    ykeys = np.geomspace(y_min, y_max, num=5)

    ykeys = [int(np.ceil(y / 5.0)) * 5 for y in ykeys]

    # ykeys = [40 +10*x for x in range(0, 12, 2)]
    xkeys = n_trains

    print("x", n_trains)

    views.learning_curve_error(ax,
                               xkeys,
                               ykeys,
                               x_range=(10, max(n_trains) * 1.3),
                               y_range=(y_min * 0.95, y_max * 1.12))

    views.legend_colorcoded(ax, lines, names)

    # learning legends

    # idxs = np.argsort(last_points)
    # idxs = np.flip(idxs, axis=0)
    # offset = 0.06
    #
    # for n, idx in enumerate(idxs):
    #
    #     name = names[idx]
    #     point = last_points[idx]
    #     color = plt.getp(lines[idx][0], 'color')
    #
    #     ax.text(0.8, 0.46-offset*n, name.upper(),
    #         fontweight='bold',
    #         color=color,
    #         transform=ax.transAxes)
    #

    # help(ax.grid)
    # ax.grid( linestyle='-', linewidth=.5, axis="x")

    # ax.grid(True)

    ax.set_xlabel('Training set size', fontweight='medium', fontsize=11)
    ax.set_ylabel('RMSE [Kelvin]', fontweight='medium', fontsize=11)

    plt.savefig(scr + "learning_curves.png", bbox_inches="tight")
    plt.savefig(scr + "learning_curves.pdf", bbox_inches="tight")

    print(scr + "learning_curves.png")

    return
Example No. 12

def generate_conformer_representation(scr="_tmp_ensemble_/", procs=0):
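    """Compute the Boltzmann-averaged SLATM representation for every
    molecule, either serially or in a multiprocessing pool that writes
    into a shared array, and save the result as repr.avgslatm."""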

    names = ["cm", "slatm", "bob"]
    name = "slatm"

    mbtypes = misc.load_npy(scr + "slatm.mbtypes")

    # TODO Calculate max_size
    mol_atoms = misc.load_obj(scr + "atoms")
    max_atoms = [len(atoms) for atoms in mol_atoms]
    max_atoms = max(max_atoms)

    kwargs = {
        "name": name,
        "mbtypes": mbtypes,
        "debug": False,
        "max_atoms": max_atoms,
    }

    # n_total = 1285
    n_total = 3456
    idxs = range(n_total)

    avgreps = [0] * n_total

    if procs == 0:

        for idx in idxs:

            idx, avgrep = get_avg_repr(idx, **kwargs)
            avgreps[idx] = avgrep

    else:

        idx, rep = get_avg_repr(0, **kwargs)
        rep_size = rep.shape[0]
        print("rep size", rep_size)

        m = MyManager()
        m.start()

        results = m.np_zeros((n_total, rep_size))

        # use the requested number of workers (previously hard-coded to 32)
        pool = Pool(procs)

        kwargs["array"] = results
        func = partial(get_avg_repr, **kwargs)
        pool.map(func, idxs)
        avgreps = results

        # results = misc.parallel(idxs, get_avg_repr, [], kwargs, procs=nprocs)
        #
        # for result in results:
        #     idx, avgrep = result
        #     avgreps[idx] = avgrep
        #     print(idx, avgrep.mean())

    avgreps = np.array(avgreps)
    misc.save_npy(scr + "repr.avgslatm", avgreps)

    return