Beispiel #1
0
    def test_values_real_to_scaled_multiple_bounds(self):
        """Rows of two values are scaled column-wise, one bound pair each."""
        bounds = [[2.0, 4.0], [2.0, 3.0]]

        # (input rows, expected scaled rows); checked in this order
        cases = (
            ([[2.0, 3.0]], [[0.0, 1.0]]),
            ([[2.0, 3.0], [3.0, 2.0]], [[0.0, 1.0], [0.5, 0.0]]),
        )
        for values, expected in cases:
            scaled_values = values_real_to_scaled(values, bounds)
            assert np.isclose(scaled_values, expected).all()
Beispiel #2
0
    def test_values_real_to_scaled_multiple_values(self):
        """Several values with one bound pair compare equal to a column."""
        bounds = [2.0, 4.0]

        # (input values, expected scaled column); the last case includes a
        # value above the upper bound, which scales to more than 1
        cases = (
            ([2.0, 4.0], [[0.0], [1.0]]),
            ([2.0, 4.0, 5.0], [[0.0], [1.0], [1.5]]),
        )
        for values, expected in cases:
            scaled_values = values_real_to_scaled(values, bounds)
            assert np.isclose(scaled_values, expected).all()
Beispiel #3
0
def prepare_df(df_csv, AP):
    """Prepare a pandas dataframe for fitting GP models to AP data

    Parameters
    ----------
    df_csv : pd.DataFrame
        Raw results. Must contain the columns "uc_mean_distance" and
        "temperature" plus one column per entry of ``AP.param_names``
    AP : object
        Constants object providing ``param_names``, ``param_bounds`` and
        ``temperature_bounds``

    Returns
    -------
    df_all : pd.DataFrame
        New dataframe (``df_csv`` is not modified) without the rows whose
        "uc_mean_distance" is missing, with the parameter columns replaced
        by their scaled values and an added "scaled_temperature" column
    """

    # Drop simulations that failed (no uc_mean_distance recorded). The
    # explicit copy makes the result independent of df_csv so the column
    # assignments below cannot raise pandas' SettingWithCopyWarning.
    df_all = df_csv.dropna(subset=['uc_mean_distance']).copy()

    # Add scaled_temperature
    df_all["scaled_temperature"] = values_real_to_scaled(
        df_all["temperature"].values, AP.temperature_bounds
    )

    # Replace the physical parameter values with their scaled counterparts
    param_cols = list(AP.param_names)
    df_all[param_cols] = values_real_to_scaled(df_all[param_cols],
                                               AP.param_bounds)

    return df_all
Beispiel #4
0
    def test_values_real_to_scaled_single(self):
        """Single values are scaled linearly relative to one bound pair."""
        # (bounds, input value, expected scaled value); checked in order.
        # The fourth case passes a one-element list below the lower bound.
        cases = (
            ([2.0, 4.0], 2.0, 0.0),
            ([2.0, 4.0], 4.0, 1.0),
            ([2.0, 4.0], 3.0, 0.5),
            ([2.0, 4.0], [1.0], -0.5),
            ([-5.0, -4.0], -4.5, 0.5),
        )
        for bounds, value, expected in cases:
            scaled_value = values_real_to_scaled(value, bounds)
            assert np.isclose(scaled_value, expected)
Beispiel #5
0
def _calc_gp_mse(gp_model, samples, expt_property, property_bounds,
                 temperature_bounds):
    """Calculate the MSE between the GP model and experiment for samples

    Parameters
    ----------
    gp_model : object
        Model exposing ``predict_f(x)`` that returns (mean, variance)
    samples : np.ndarray, shape=(n_samples, n_params)
        Parameter sets at which the model is evaluated
    expt_property : dict
        Experimental value of the property keyed by temperature
    property_bounds : array-like
        Bounds used to convert the predicted means back to real values
    temperature_bounds : array-like
        Bounds used to scale each temperature before prediction

    Returns
    -------
    np.ndarray, shape=(n_samples,)
        Mean over all temperatures of the squared (model - experiment) error
    """

    n_samples = samples.shape[0]
    # One column of errors per experimental temperature
    all_errs = np.empty(shape=(n_samples, len(expt_property)))

    for col_idx, (temp, expt_value) in enumerate(expt_property.items()):
        # Append the scaled temperature as the last model input column
        scaled_temp = values_real_to_scaled(temp, temperature_bounds)
        temp_column = np.tile(scaled_temp, (n_samples, 1))
        xx = np.hstack((samples, temp_column))

        # Only the predicted mean is needed; the variance is discarded
        means_scaled, _ = gp_model.predict_f(xx)
        means = values_scaled_to_real(means_scaled, property_bounds)
        all_errs[:, col_idx] = (means - expt_value)[:, 0]

    return np.mean(all_errs**2, axis=1)
Beispiel #6
0
    for result_csv_name in result_csv_names
]

# Concatenate all parameter sets and results into single dataframes with a
# fresh 0..n-1 index (df_params / df_results are collections of dataframes
# at this point)
df_params = pd.concat(df_params).reset_index(drop=True)
df_results = pd.concat(df_results).reset_index(drop=True)

# Create a df with the MSE for each parameter set
# and add the parameter set idx
# Look up the experimental liquid density for each row's temperature
df_results["expt_density"] = df_results["temperature"].apply(
    lambda x: R125.expt_liq_density[int(x)])
df_results["sq_err"] = (df_results["density"] - df_results["expt_density"])**2
# Average the squared error over all rows sharing the same parameter values
df_mse = (df_results.groupby(list(
    R125.param_names))["sq_err"].mean().reset_index(name="mse"))

# Scale the grouped parameter values so they can be compared with df_params
# (assumes df_params stores scaled parameter values -- TODO confirm)
scaled_param_values = values_real_to_scaled(df_mse[list(R125.param_names)],
                                            R125.param_bounds)
param_idxs = []
param_vals = []
# For every scaled parameter set, record the index and values of the first
# row in df_params that matches within np.allclose tolerance.
# NOTE(review): a parameter set without a match appends nothing, leaving
# the lists shorter than df_mse, so the assignments below would presumably
# fail on the length mismatch.
for params1 in scaled_param_values:
    for idx, params2 in enumerate(df_params[list(R125.param_names)].values):
        if np.allclose(params1, params2):
            param_idxs.append(idx)
            param_vals.append(params2)
            break
df_mse["param_idx"] = param_idxs
# Overwrite the parameter columns with the matched values from df_params
df_mse[list(R125.param_names)] = param_vals

# Plot all with MSE < 625 (only the parameter columns are passed on)
g = seaborn.pairplot(
    pd.DataFrame(df_mse[df_mse["mse"] < 625.0],
                 columns=list(R125.param_names)))
Beispiel #7
0
def main():
    """Fit the surrogate models and select parameter sets for the next iteration

    Steps, in order:
      1. Fit a GP model for "uc_mean_distance" (10 K rows below
         ``ucmd_clf_threshold``) and one for "lattice_ape" (all rows, with
         "scaled_temperature" as an extra input).
      2. Train an RBF-kernel SVM that classifies whether "uc_mean_distance"
         is below ``ucmd_clf_threshold`` and print its accuracy.
      3. Evaluate the classifier and both GP models on the samples loaded
         from "LHS_1e6x8.csv".
      4. Mark the Pareto-efficient samples and save two pair plots in "figs".
      5. Keep the Pareto-efficient samples that meet both next-iteration
         thresholds, then add randomly chosen dominated samples that are at
         least ``removal_distance`` (L1 norm) away from each other.
      6. Plot the selected samples and write them to
         ``csv_path + out_csv_name``.

    Relies on module-level names that are not defined in this function
    (``AP``, ``df_all``, ``temperatures``, the thresholds, the seeds,
    ``csv_path``, ``out_csv_name`` and the imported helpers).
    """

    # Make sure we have a folder for figures
    os.makedirs("figs", exist_ok=True)

    ###########################################################
    ####################   Fit GP models    ###################
    ###########################################################
    ## UCMD Model
    param_names = list(AP.param_names)
    property_name = "uc_mean_distance"
    # Only train on 10 K data that meets ucmd clf threshold
    df_ucmd = df_all.loc[(df_all["temperature"] == 10)
                         & (df_all["uc_mean_distance"] < ucmd_clf_threshold)]
    # Get train/test
    x_train, y_train, x_test, y_test = shuffle_and_split(
        df_ucmd, param_names, property_name, shuffle_seed=gp_shuffle_seed)
    # Fit model
    ucmd_gp = run_gpflow_scipy(
        x_train,
        y_train,
        gpflow.kernels.Matern32(lengthscales=np.ones(AP.n_params)),
    )

    ## Lattice APE Model
    param_names = list(AP.param_names) + ["scaled_temperature"]
    property_name = "lattice_ape"
    # Get train/test
    x_train, y_train, x_test, y_test = shuffle_and_split(
        df_all, param_names, property_name, shuffle_seed=gp_shuffle_seed)
    # Fit model (one extra lengthscale for the temperature input)
    lattice_gp = run_gpflow_scipy(
        x_train,
        y_train,
        gpflow.kernels.Matern32(lengthscales=np.ones(AP.n_params + 1)),
    )

    ###########################################################
    ##################   Train classifiers   ##################
    ###########################################################

    x_train, y_train, x_test, y_test = shuffle_and_split(
        df_all.loc[df_all["temperature"] == 10],
        list(AP.param_names),
        "uc_mean_distance",
        shuffle_seed=clf_shuffle_seed,
    )

    # Class 1 = uc_mean_distance below the threshold, class 0 = otherwise
    y_train = np.array(y_train < ucmd_clf_threshold, dtype=np.int32)
    y_test = np.array(y_test < ucmd_clf_threshold, dtype=np.int32)
    ucmd_clf = svm.SVC(kernel="rbf")
    ucmd_clf.fit(x_train, y_train)
    y_pred = ucmd_clf.predict(x_train)
    print("Training accuracy:", metrics.accuracy_score(y_train, y_pred))
    y_pred = ucmd_clf.predict(x_test)
    print("Testing accuracy:", metrics.accuracy_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))

    ###########################################################
    ###################   Find new points   ###################
    ###########################################################

    # Load large hypercube
    latin_hypercube = np.loadtxt("LHS_1e6x8.csv", delimiter=",")
    n_points = latin_hypercube.shape[0]

    # Apply UCMD classifier
    ucmd_pred = ucmd_clf.predict(latin_hypercube)

    # Predict UCMD with GP model (the predicted variance is not used)
    gp_means_ucmd, _ = ucmd_gp.predict_f(latin_hypercube)

    # Predict Lattice APE with GP model at each temperature
    all_errs = np.empty(shape=(n_points, len(temperatures)))
    for col_idx, temperature in enumerate(temperatures):
        scaled_temperature = values_real_to_scaled(temperature,
                                                   AP.temperature_bounds)
        xx = np.hstack((latin_hypercube,
                        np.tile(scaled_temperature, (n_points, 1))))
        gp_means_ape, _ = lattice_gp.predict_f(xx)
        all_errs[:, col_idx] = gp_means_ape[:, 0]
    # Compute MAPE across all temperatures
    mean_errs = np.mean(all_errs, axis=1)

    ## Save all the results to a dataframe
    LH_results = pd.DataFrame(latin_hypercube, columns=AP.param_names)
    LH_results["ucmd_clf"] = ucmd_pred.astype(np.bool_)
    LH_results["ucmd"] = gp_means_ucmd.numpy()
    LH_results["lattice_mape"] = mean_errs

    # Only take points where structure classifier is satisfied. Copy so the
    # "is_pareto" column can be added without touching LH_results and
    # without triggering pandas' SettingWithCopyWarning.
    LH_results_pass_ucmd_clf = LH_results.loc[LH_results.ucmd_clf == True].copy()
    costs = LH_results_pass_ucmd_clf[["ucmd", "lattice_mape"]].to_numpy()

    # Find pareto efficient points (only the per-point flags are needed)
    result, _, _ = find_pareto_set(costs, is_pareto_efficient)
    LH_results_pass_ucmd_clf["is_pareto"] = result

    # Plot pareto points vs. costs
    g = seaborn.pairplot(
        LH_results_pass_ucmd_clf,
        vars=["ucmd", "lattice_mape"],
        hue="is_pareto",
    )
    g.savefig("figs/pareto-mses.png", dpi=300)

    # Plot pareto points vs. params
    g = seaborn.pairplot(LH_results_pass_ucmd_clf,
                         vars=list(AP.param_names),
                         hue="is_pareto")
    g.set(xlim=(-0.1, 1.1), ylim=(-0.1, 1.1))
    g.savefig("figs/pareto-params.png", dpi=300)

    # For next iteration: 1. All non-dominated points that meet the thresholds
    #                     2. "Separated" dominated points that meet the thresholds
    is_pareto = LH_results_pass_ucmd_clf.is_pareto == True
    is_dominated = LH_results_pass_ucmd_clf.is_pareto == False
    meets_thresholds = (
        (LH_results_pass_ucmd_clf.ucmd < ucmd_next_itr_threshold)
        & (LH_results_pass_ucmd_clf.lattice_mape <
           lattice_mape_next_itr_threshold))

    next_iteration_points = LH_results_pass_ucmd_clf.loc[
        is_pareto & meets_thresholds]
    dominated_points = LH_results_pass_ucmd_clf.loc[
        is_dominated & meets_thresholds]

    print(f"{len(LH_results_pass_ucmd_clf)} points meet the ucmd classifier.")
    print(
        f"{len(LH_results_pass_ucmd_clf[is_pareto])} are non-dominated."
    )
    print(
        f"{len(dominated_points)} are dominated with ucmd < {ucmd_next_itr_threshold} and lattice_mape < {lattice_mape_next_itr_threshold}"
    )

    removal_distance = 1.034495
    # DataFrame.sample draws from NumPy's global random state when no
    # random_state is given, so this seed makes the selection reproducible
    np.random.seed(distance_seed)
    param_cols = list(AP.param_names)

    # Every pass removes at least the selected point itself (distance 0),
    # so the loop terminates.
    while len(dominated_points) > 0:
        # Shuffle the top parameter sets
        dominated_points = dominated_points.sample(frac=1)

        # Select one off the top
        # Note: here we use a random one rather than the "best" one; we don't have
        # confidence that the GP models are more accurate than our thresholds
        selected = dominated_points.iloc[[0]]
        next_iteration_points = pd.concat([next_iteration_points, selected])

        # Remove anything within given distance of the selected point
        l1_norm = np.sum(
            np.abs(dominated_points[param_cols].values -
                   selected[param_cols].values),
            axis=1,
        )
        points_to_remove = np.where(l1_norm < removal_distance)[0]
        dominated_points = dominated_points.drop(
            index=dominated_points.index[points_to_remove])

    print(
        f"After removing similar points, we are left with {len(next_iteration_points)} final top points."
    )

    # Keep at most the first 250 points and only the columns to be saved
    next_iteration_points = next_iteration_points[:250].drop(
        columns=["ucmd", "lattice_mape", "is_pareto"])

    # Plot new points
    g = seaborn.pairplot(next_iteration_points, vars=list(AP.param_names))
    g.set(xlim=(-0.1, 1.1), ylim=(-0.1, 1.1))
    g.savefig("figs/new-points-params.png", dpi=300)

    # Save the final new parameters
    next_iteration_points.to_csv(csv_path + out_csv_name)
def main():
    """Draw the AP parallel-coordinates figure and save it as a PDF

    Reads the candidate parameter sets (10 K rows only) and the final
    parameter sets from CSV, plots every row as a line across one vertical
    axis per column (eight parameters, UCMD and lattice MAPE), overlays four
    reference markers and writes "pdfs/fig4-ap-parallel.pdf".
    """
    candidates = pd.read_csv("../analysis/csv/uc-lattice-final-params.csv",
                             index_col=0)
    candidates = candidates[candidates.temperature == 10]
    finalists = pd.read_csv("../analysis/csv/ap-final-2.csv", index_col=0)
    seaborn.set_palette('bright', n_colors=len(candidates))

    param_cols = list(AP.param_names)
    result_cols = ["uc_mean_distance", "lattice_ape"]
    result_bounds = np.array([[0, 0.5], [0, 5]])

    # Parameter columns are plotted as stored in the CSV files; only the two
    # result columns are mapped through result_bounds.
    results = values_real_to_scaled(candidates[result_cols].values,
                                    result_bounds)
    results_f = values_real_to_scaled(finalists[result_cols].values,
                                      result_bounds)

    param_bounds = AP.param_bounds
    # The unit conversions are disabled, so these slice assignments write
    # the existing values back unchanged.
    param_bounds[:4] = param_bounds[:4]  #* NM_TO_ANGSTROM
    param_bounds[4:] = param_bounds[4:]  #* KJMOL_TO_K

    data = np.hstack((candidates[param_cols].values, results))
    data_f = np.hstack((finalists[param_cols].values, results_f))
    bounds = np.vstack((param_bounds, result_bounds))
    print(data.shape)

    col_names = [
        r"$\sigma_{Cl}$",
        r"$\sigma_H$",
        r"$\sigma_N$",
        r"$\sigma_O$",
        r"$\epsilon_{Cl}$",
        r"$\epsilon_H$",
        r"$\epsilon_N$",
        r"$\epsilon_O$",
        r"UCMD",
        "Lattice\nMAPE",
    ]
    n_axis = len(col_names)
    assert data.shape[1] == n_axis
    x_vals = list(range(n_axis))

    # Create (N-1) subplots along x axis
    fig, axes = plt.subplots(1, n_axis - 1, sharey=False, figsize=(12, 5))

    # Every subplot draws all lines but only shows the segment between its
    # own column and the next one
    for i, ax in enumerate(axes):
        for row in data:
            ax.plot(x_vals, row, alpha=0.35)
        ax.set_xlim([x_vals[i], x_vals[i + 1]])
        for row in data_f:
            ax.plot(x_vals, row, alpha=1.0, linewidth=3)

    # Box drawn behind the y tick labels
    label_box = {
        "facecolor": 'white',
        "edgecolor": 'none',
        "alpha": 0.45,
        "boxstyle": mpatch.BoxStyle("round4"),
    }
    for dim, ax in enumerate(axes):
        ax.xaxis.set_major_locator(ticker.FixedLocator([dim]))
        set_ticks_for_axis(ax, bounds[dim], nticks=6)
        ax.set_xticklabels([col_names[dim]], fontsize=30)
        ax.tick_params(axis="x", pad=10)
        ax.set_ylim(-0.05, 1.05)
        # Add white background behind labels
        for label in ax.get_yticklabels():
            label.set_bbox(dict(label_box))
        ax.spines['top'].set_visible(False)
        ax.spines['bottom'].set_visible(False)
        ax.spines['left'].set_linewidth(2.0)

    # The last subplot carries the tick labels of the final two columns
    last_ax = axes[-1]
    last_ax.xaxis.set_major_locator(
        ticker.FixedLocator([n_axis - 2, n_axis - 1]))
    last_ax.set_xticklabels([col_names[-2], col_names[-1]], fontsize=24)
    last_ax.tick_params(axis="x", pad=14)

    # Twin axis on the right shows the scale of the last column
    twin_ax = plt.twinx(axes[-1])
    twin_ax.set_ylim(-0.05, 1.05)
    set_ticks_for_axis(twin_ax, bounds[-1], nticks=6)
    twin_ax.spines['top'].set_visible(False)
    twin_ax.spines['bottom'].set_visible(False)
    twin_ax.spines['right'].set_linewidth(2.0)

    # Reference markers: (column offset, value, marker size, color, marker).
    # Each value is divided by the upper bound of its column.
    reference_points = (
        # class I data
        (-1, 1.42, 15, "tab:purple", "o"),
        (-2, 0.156, 15, "tab:purple", "o"),
        # class II data
        (-1, 3.55, 20, "tab:red", "*"),
        (-2, 0.3485, 20, "tab:red", "*"),
    )
    for col, value, size, color, marker in reference_points:
        twin_ax.plot(x_vals[col],
                     value / bounds[col][1],
                     markersize=size,
                     color=color,
                     marker=marker,
                     clip_on=False,
                     zorder=200)

    # Remove space between subplots
    plt.subplots_adjust(wspace=0, bottom=0.2, left=0.05, right=0.95)

    fig.savefig("pdfs/fig4-ap-parallel.pdf")
Beispiel #9
0
def plot_slices_temperature(
    models,
    n_params,
    temperature_bounds,
    property_bounds,
    plot_bounds=(220.0, 340.0),
    property_name="property",
):
    """Plot the model predictions as a function of temperature

    Slices are plotted where the values of the other parameters
    are all set to 0.0 --> 1.0 in increments of 0.1

    Parameters
    ----------
    models : dict
        models to plot, key=label, value=gpflow.model
    n_params : int
        number of non-temperature parameters in the model
    temperature_bounds: array-like
        bounds for scaling temperature between physical
        and dimensionless values
    property_bounds: array-like
        bounds for scaling the property between physical
        and dimensionless values
    plot_bounds : array-like, optional
        temperature bounds (lower, upper) for the plot
    property_name : str, optional, default="property"
        property name with units for axis label

    Returns
    -------
    figs : list or None
        list of matplotlib.figure.Figure objects, one per slice. Nothing is
        returned (None) when the module-level ``mpl_is_inline`` flag is set.
    """

    n_samples = 100
    vals = np.linspace(plot_bounds[0], plot_bounds[1],
                       n_samples).reshape(-1, 1)
    vals_scaled = values_real_to_scaled(vals, temperature_bounds)

    figs = []
    for other_vals in np.arange(0, 1.1, 0.1):
        # Model inputs: every parameter fixed at other_vals, with the scaled
        # temperature as the last column
        other = np.tile(other_vals, (n_samples, n_params))
        xx = np.hstack((other, vals_scaled))

        fig, ax = plt.subplots()
        for (label, model) in models.items():
            mean_scaled, var_scaled = model.predict_f(xx)
            mean = values_scaled_to_real(mean_scaled, property_bounds)
            var = variances_scaled_to_real(var_scaled, property_bounds)

            ax.plot(vals, mean, lw=2, label=label)
            # Shaded band of +/- 1.96 standard deviations around the mean
            ax.fill_between(
                vals[:, 0],
                mean[:, 0] - 1.96 * np.sqrt(var[:, 0]),
                mean[:, 0] + 1.96 * np.sqrt(var[:, 0]),
                alpha=0.3,
            )

        ax.set_title(f"Other vals = {other_vals:.2f}")
        ax.set_xlabel("Temperature")
        ax.set_ylabel(property_name)
        fig.legend()
        figs.append(fig)

    if not mpl_is_inline:
        return figs
Beispiel #10
0
def plot_model_vs_test(
    models,
    param_values,
    train_points,
    test_points,
    temperature_bounds,
    property_bounds,
    plot_bounds=(220.0, 340.0),
    property_name="property",
):
    """Plots the GP model(s) as a function of temperature with all other parameters
    taken as param_values. Overlays training and testing points with the same
    param_values.

    Parameters
    ----------
    models : dict {"label" : gpflow.model }
        GPFlow models to plot
    param_values : np.ndarray, shape=(n_params)
        The parameters at which to evaluate the GP model
    train_points : np.ndarray, shape=(n_points, 2)
        The temperature (scaled) and property (scaled) of each training point
    test_points : np.ndarray, shape=(n_points, 2)
        The temperature (scaled) and property (scaled) of each test point
    temperature_bounds: array-like
        bounds for scaling temperature between physical
        and dimensionless values
    property_bounds: array-like
        bounds for scaling property between physical
        and dimensionless values
    plot_bounds : array-like, optional
        temperature bounds (lower, upper) for the plot
    property_name : str, optional, default="property"
        property name with units for axis label

    Returns
    -------
    matplotlib.figure.Figure or None
        The figure. Nothing is returned (None) when the module-level
        ``mpl_is_inline`` flag is set.
    """

    n_samples = 100
    vals = np.linspace(plot_bounds[0], plot_bounds[1],
                       n_samples).reshape(-1, 1)
    vals_scaled = values_real_to_scaled(vals, temperature_bounds)

    # Model inputs: param_values repeated on every row, with the scaled
    # temperature as the last column
    other = np.tile(param_values, (n_samples, 1))
    xx = np.hstack((other, vals_scaled))

    fig, ax = plt.subplots()
    for (label, model) in models.items():
        mean_scaled, var_scaled = model.predict_f(xx)

        mean = values_scaled_to_real(mean_scaled, property_bounds)
        var = variances_scaled_to_real(var_scaled, property_bounds)
        # NOTE(review): the label is appended without a separator --
        # confirm callers pass labels with their own leading space
        ax.plot(vals, mean, lw=2, label="GP model" + label)
        # Shaded band of +/- 1.96 standard deviations around the mean
        ax.fill_between(
            vals[:, 0],
            mean[:, 0] - 1.96 * np.sqrt(var[:, 0]),
            mean[:, 0] + 1.96 * np.sqrt(var[:, 0]),
            alpha=0.25,
        )

    # Overlay the simulation points, converted back to real units
    if train_points.shape[0] > 0:
        md_train_temp = values_scaled_to_real(train_points[:, 0],
                                              temperature_bounds)
        md_train_property = values_scaled_to_real(train_points[:, 1],
                                                  property_bounds)
        ax.plot(md_train_temp,
                md_train_property,
                "s",
                color="black",
                label="Train")
    if test_points.shape[0] > 0:
        md_test_temp = values_scaled_to_real(test_points[:, 0],
                                             temperature_bounds)
        md_test_property = values_scaled_to_real(test_points[:, 1],
                                                 property_bounds)
        ax.plot(md_test_temp, md_test_property, "ro", label="Test")

    ax.set_xlabel("Temperature")
    ax.set_ylabel(property_name)
    fig.legend()

    if not mpl_is_inline:
        return fig
Beispiel #11
0
def plot_slices_params(
    models,
    param_to_plot,
    param_names,
    temperature,
    temperature_bounds,
    property_bounds,
    property_name="property",
):
    """Plot the model predictions while varying a single parameter

    The chosen parameter is swept over scaled values from -0.1 to 1.1 at a
    fixed temperature. One figure is made for each value (0.0 to 1.0 in
    steps of 0.1) shared by all remaining parameters.

    Parameters
    ----------
    models : dict {"label" : gpflow.model }
        GPFlow models to plot
    param_to_plot : string
        Parameter to vary
    param_names : list, tuple
        list of parameter names
    temperature : float
        temperature at which to plot the surface
    temperature_bounds: array-like
        bounds for scaling temperature between physical
        and dimensionless values
    property_bounds: array-like
        bounds for scaling property between physical
        and dimensionless values
    property_name : string, optional, default="property"
        name of property to plot

    Returns
    -------
    figs : list
        list of matplotlib.figure.Figure objects; nothing is returned
        (None) when the module-level ``mpl_is_inline`` flag is set

    Raises
    ------
    ValueError
        If ``param_to_plot`` is not one of ``param_names``
    """

    try:
        param_idx = param_names.index(param_to_plot)
    except ValueError:
        raise ValueError(
            f"parameter: {param_to_plot} not found in parameter_names: {param_names}"
        )

    n_samples = 100
    # Number of fixed parameter columns before / after the swept one
    n_before = param_idx
    n_after = len(param_names) - 1 - param_idx

    swept_scaled = np.linspace(-0.1, 1.1, n_samples).reshape(-1, 1)
    temperature_column = values_real_to_scaled(
        np.tile(temperature, (n_samples, 1)), temperature_bounds)

    # Axis label / title fragment, e.g. "$\<param_to_plot>$"
    math_parameter = "$\\" + param_to_plot + "$"

    figs = []
    for other_vals in np.arange(0, 1.1, 0.1):
        # Column order: fixed params, swept param, fixed params, temperature
        columns = (
            np.tile(other_vals, (n_samples, n_before)),
            swept_scaled,
            np.tile(other_vals, (n_samples, n_after)),
            temperature_column,
        )
        xx = np.hstack(columns)

        fig, ax = plt.subplots()
        for label, model in models.items():
            mean_scaled, var_scaled = model.predict_f(xx)
            mean = values_scaled_to_real(mean_scaled, property_bounds)
            var = variances_scaled_to_real(var_scaled, property_bounds)

            ax.plot(swept_scaled, mean, lw=2, label=label)
            # Shaded band of +/- 1.96 standard deviations around the mean
            ax.fill_between(
                swept_scaled[:, 0],
                mean[:, 0] - 1.96 * np.sqrt(var[:, 0]),
                mean[:, 0] + 1.96 * np.sqrt(var[:, 0]),
                alpha=0.3,
            )

        ax.set_title(
            f"{math_parameter} at T = {temperature:.0f} K. Other vals = {other_vals:.2f}."
        )
        ax.set_xlabel(math_parameter)
        ax.set_ylabel(property_name)
        fig.legend()
        figs.append(fig)

    if not mpl_is_inline:
        return figs
def main():
    """Draw the R125 parallel-coordinates figure and save it as a PDF

    Reads the Pareto parameter sets and the final parameter sets from CSV,
    plots every row as a line across one vertical axis per column (ten
    parameters followed by four MAPE results), overlays the GAFF reference
    markers and writes "pdfs/fig_r125-parallel.pdf".
    """
    df = pd.read_csv("../csv/r125-pareto.csv", index_col=0)
    dff = pd.read_csv("../csv/r125-final-4.csv", index_col=0)

    seaborn.set_palette('bright', n_colors=len(df))

    param_cols = list(R125.param_names)
    mape_cols = [
        "mape_liq_density", "mape_vap_density", "mape_Pvap", "mape_Hvap"
    ]
    result_bounds = np.array([[0, 25], [0, 50], [0, 50], [0, 25]])

    # Parameter columns are plotted as stored in the CSV files; only the
    # MAPE columns are mapped through result_bounds.
    results = values_real_to_scaled(df[mape_cols].values, result_bounds)
    results_f = values_real_to_scaled(dff[mape_cols].values, result_bounds)

    # Convert the bounds to plotting units on a private copy so the values
    # held by R125 are never modified in place.
    param_bounds = np.array(R125.param_bounds, dtype=float)
    param_bounds[:5] *= NM_TO_ANGSTROM
    param_bounds[5:] *= KJMOL_TO_K

    data = np.hstack((df[param_cols].values, results))
    data_f = np.hstack((dff[param_cols].values, results_f))
    bounds = np.vstack((param_bounds, result_bounds))

    col_names = [
        r"$\sigma_{C1}$",
        r"$\sigma_{C2}$",
        r"$\sigma_{F1}$",
        r"$\sigma_{F2}$",
        r"$\sigma_{H}$",
        r"$\epsilon_{C1}$",
        r"$\epsilon_{C2}$",
        r"$\epsilon_{F1}$",
        r"$\epsilon_{F2}$",
        r"$\epsilon_{H}$",
        "MAPE\n" + r"$\rho^l_{\mathrm{sat}}$",
        "MAPE\n" + r"$\rho^v_{\mathrm{sat}}$",
        "MAPE\n" + r"$P_{\mathrm{vap}}$",
        "MAPE\n" + r"$\Delta H_{\mathrm{vap}}$",
    ]
    n_axis = len(col_names)
    assert data.shape[1] == n_axis
    x_vals = list(range(n_axis))

    # Create (N-1) subplots along x axis
    fig, axes = plt.subplots(1, n_axis - 1, sharey=False, figsize=(20, 5))

    # Every subplot draws all lines but only shows the segment between its
    # own column and the next one
    for i, ax in enumerate(axes):
        for line in data:
            ax.plot(x_vals, line, alpha=0.45)
        ax.set_xlim([x_vals[i], x_vals[i + 1]])
        for line in data_f:
            ax.plot(x_vals, line, alpha=1.0, linewidth=3)

    for dim, ax in enumerate(axes):
        ax.xaxis.set_major_locator(ticker.FixedLocator([dim]))
        set_ticks_for_axis(ax, bounds[dim], nticks=6)
        # The first ten columns (parameters) get a larger label than the
        # two-line MAPE labels
        fontsize = 24 if dim < 10 else 20
        ax.set_xticklabels([col_names[dim]], fontsize=fontsize)
        ax.set_ylim(-0.05, 1.05)
        # Add white background behind labels
        for label in ax.get_yticklabels():
            label.set_bbox(
                dict(facecolor='white',
                     edgecolor='none',
                     alpha=0.45,
                     boxstyle=mpatch.BoxStyle("round4")))
        ax.spines['top'].set_visible(False)
        ax.spines['bottom'].set_visible(False)
        ax.spines['left'].set_linewidth(2.0)

    # The last subplot carries the tick labels of the final two columns
    ax = axes[-1]
    ax.xaxis.set_major_locator(ticker.FixedLocator([n_axis - 2, n_axis - 1]))
    ax.set_xticklabels([col_names[-2], col_names[-1]], fontsize=20)

    # Twin axis on the right shows the scale of the last column
    ax = plt.twinx(axes[-1])
    ax.set_ylim(-0.05, 1.05)
    set_ticks_for_axis(ax, bounds[-1], nticks=6)
    ax.spines['top'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    ax.spines['right'].set_linewidth(2.0)

    # Add GAFF: (column offset, value), each value divided by the upper
    # bound of its column
    gaff_points = ((-1, 22.37), (-2, 46.05), (-3, 50.52), (-4, 2.92))
    for col, value in gaff_points:
        ax.plot(x_vals[col],
                value / bounds[col][1],
                markersize=12,
                color="gray",
                marker="s",
                clip_on=False,
                zorder=200)

    # Remove space between subplots
    plt.subplots_adjust(wspace=0, bottom=0.2)

    fig.savefig("pdfs/fig_r125-parallel.pdf")
Beispiel #13
0
def prepare_df_vle(df_csv, molecule):
    """Prepare a pandas dataframe for fitting GP models to VLE data

    Performs the following actions:
       - Renames "liq_density" to "sim_liq_density"
       - Renames "vap_density" to "sim_vap_density"
       - Renames "Pvap" to "sim_Pvap"
       - Renames "Hvap" to "sim_Hvap" and multiplies it by
         1000 / molecule.molecular_weight
       - Removes "liq_enthalpy" and "vap_enthalpy"
       - Adds "expt_liq_density"
       - Adds "expt_vap_density"
       - Adds "expt_Pvap"
       - Adds "expt_Hvap"
       - Converts all values from physical values to scaled values

    Parameters
    ----------
    df_csv : pd.DataFrame
        The dataframe as loaded from a CSV file with the signac results
    molecule : R32Constants, R125Constants
        An instance of a molecule constants class

    Returns
    -------
    df_all : pd.DataFrame
        The dataframe with scaled parameters and MD/expt. properties

    Raises
    ------
    ValueError
        If ``df_csv`` lacks a required property column, "temperature", or
        a column for one of ``molecule.param_names``
    """
    # Validate in a fixed order so the first missing column is reported
    required_columns = (
        "liq_density",
        "vap_density",
        "Pvap",
        "Hvap",
        "liq_enthalpy",
        "vap_enthalpy",
        "temperature",
    )
    for column in required_columns:
        if column not in df_csv.columns:
            raise ValueError(f"df_csv must contain column '{column}'")
    param_cols = list(molecule.param_names)
    for param in param_cols:
        if param not in df_csv.columns:
            raise ValueError(
                f"df_csv must contain a column for parameter: '{param}'")

    # Per property: the scaling bounds and the experimental lookup table
    # (indexed by integer temperature)
    property_bounds = {
        "liq_density": molecule.liq_density_bounds,
        "vap_density": molecule.vap_density_bounds,
        "Pvap": molecule.Pvap_bounds,
        "Hvap": molecule.Hvap_bounds,
    }
    expt_tables = {
        "liq_density": molecule.expt_liq_density,
        "vap_density": molecule.expt_vap_density,
        "Pvap": molecule.expt_Pvap,
        "Hvap": molecule.expt_Hvap,
    }

    # Rename simulated properties with a "sim_" prefix and drop the raw
    # enthalpies; rename/drop return new frames, so df_csv is not modified
    df_all = df_csv.rename(
        columns={prop: f"sim_{prop}" for prop in property_bounds})
    df_all = df_all.drop(columns=["vap_enthalpy", "liq_enthalpy"])

    # Convert Hvap to kJ/kg
    # (assumes Hvap is per mole and molecular_weight is in g/mol -- confirm)
    df_all["sim_Hvap"] = (df_all["sim_Hvap"] / molecule.molecular_weight *
                          1000.0)

    # Add the experimental value of each property at every row's temperature
    for prop, table in expt_tables.items():
        df_all[f"expt_{prop}"] = df_all["temperature"].apply(
            lambda temp, table=table: table[int(temp)])

    # Scale all values. Everything is computed from the unscaled frame
    # first and only written back afterwards.
    scaled_param_values = values_real_to_scaled(df_all[param_cols],
                                                molecule.param_bounds)
    scaled_temperature = values_real_to_scaled(df_all["temperature"],
                                               molecule.temperature_bounds)
    scaled_properties = {}
    for prefix in ("sim", "expt"):
        for prop, bounds in property_bounds.items():
            column = f"{prefix}_{prop}"
            scaled_properties[column] = values_real_to_scaled(
                df_all[column], bounds)

    df_all[param_cols] = scaled_param_values
    df_all["temperature"] = scaled_temperature
    for column, scaled_values in scaled_properties.items():
        df_all[column] = scaled_values

    return df_all
Beispiel #14
0
def prepare_df_density(df_csv, molecule, liquid_density_threshold):
    """Build the scaled dataframes used to fit a GP model to density data

    Steps:
       - "density" is renamed to "md_density"
       - "expt_density" is added from ``molecule.expt_liq_density``
       - "is_liquid" is added (md_density above the threshold)
       - parameters, temperature and both densities are converted from
         physical values to scaled values

    Parameters
    ----------
    df_csv : pd.DataFrame
        The dataframe as loaded from a CSV file with the signac results
    molecule : R32Constants, R125Constants
        An instance of a molecule constants class
    liquid_density_threshold : float
        Density threshold (kg/m^3) for distinguishing liquid and vapor

    Returns
    -------
    df_all : pd.DataFrame
        The dataframe with scaled parameters, temperature, density, and is_liquid
    df_liquid : pd.DataFrame
        `df_all` where `is_liquid` is True
    df_vapor : pd.DataFrame
        `df_all` where `is_liquid` is False

    Raises
    ------
    ValueError
        If ``df_csv`` lacks "density", "temperature", or a column for one
        of ``molecule.param_names``
    """
    param_cols = list(molecule.param_names)

    # Validate the input columns up front, in a fixed order
    for column in ("density", "temperature"):
        if column not in df_csv.columns:
            raise ValueError(f"df_csv must contain column '{column}'")
    for param in param_cols:
        if param not in df_csv.columns:
            raise ValueError(
                f"df_csv must contain a column for parameter: '{param}'")

    # Add expt density and is_liquid (classified on the unscaled density)
    df_all = df_csv.rename(columns={"density": "md_density"})
    expt_table = molecule.expt_liq_density
    df_all["expt_density"] = df_all["temperature"].apply(
        lambda temp: expt_table[int(temp)])
    df_all["is_liquid"] = df_all["md_density"].apply(
        lambda x: x > liquid_density_threshold)

    # Scale all values: compute everything from the unscaled frame first,
    # then write the results back
    scaled_columns = {
        "temperature": molecule.temperature_bounds,
        "md_density": molecule.liq_density_bounds,
        "expt_density": molecule.liq_density_bounds,
    }
    scaled_param_values = values_real_to_scaled(df_all[param_cols],
                                                molecule.param_bounds)
    scaled_values = {
        column: values_real_to_scaled(df_all[column], bounds)
        for column, bounds in scaled_columns.items()
    }
    df_all[param_cols] = scaled_param_values
    for column, values in scaled_values.items():
        df_all[column] = values

    # Split out vapor and liquid samples
    liquid_rows = df_all["is_liquid"] == True
    vapor_rows = df_all["is_liquid"] == False
    df_liquid = df_all[liquid_rows]
    df_vapor = df_all[vapor_rows]

    return df_all, df_liquid, df_vapor