# Loads the diffusion data set, the saved GPR kernel, and the RF calibration
# parameters used in the threshold analysis below.
import statistics

import pandas as pd
from sklearn.model_selection import LeaveOneGroupOut
from tabulate import tabulate

from package import gpr, io, rf, testhelper as th


def checkAlreadyDone(element, alreadylist):
    # Return True if `element` is already present in `alreadylist`.
    for x in alreadylist:
        if element == x:
            return True
    return False


# Data Collection
data = io.importdata('../data/Diffusion_Data_allfeatures.csv')
# data = io.importdata('../data/temp.csv')
groups = data['Material compositions 1'].values
data = io.sanitizedata(data)
gprsavedkernel = io.loadmodelobj('../models/GPR_data_Diffusion_Data_allfeatures_csv_02-24-20_18-32-12') \
    .getGPRkernel()

X = data.iloc[:, 1:]
Y = data.iloc[:, 0]

rfslope = 0.919216
rfintercept = -0.025370
y_std = statistics.stdev(Y.to_numpy(dtype=float))

# Setup thresholds
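# A minimal sketch (not part of the original script) of how the LeaveOneGroupOut
# import and the `groups` array built above could be used to generate the
# leave-one-group-out splits; the loop body is a placeholder, not the original
# model fitting and evaluation.
logo = LeaveOneGroupOut()
for train_index, test_index in logo.split(X, Y, groups=groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
    # fit the GPR / RF models on the training split and score the held-out group here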
# This script imports the PV data set from its CSV file, removes unnecessary
# columns, and saves the x- and y-values as np arrays.
import numpy as np

from package import io

# import data
# data = io.importdata('perovskite_data/PVstability_Weipaper_alldata_featureselected.csv')
# data = io.sanitizedata(data, user_list=['is_testdata', 'Material Composition'])
data = io.importdata('perovskite_data/Perovskite_stability_Wei_updated.csv')
data = io.sanitizedata(data, user_list=['Compositions'])

# separate x- and y-values and save as numpy arrays
X_values = data.iloc[:, 1:]
y_values = data.iloc[:, 0]
X_values = X_values.to_numpy(dtype=float)
y_values = y_values.to_numpy(dtype=float)

# save arrays for later use
np.save('perovskite_data/all_x_values.npy', X_values)
np.save('perovskite_data/all_y_values.npy', y_values)
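# Minimal sketch of how a later script might reload the arrays saved above.
# The file names match the np.save calls; everything past the load is assumed.
import numpy as np

X_values = np.load('perovskite_data/all_x_values.npy')
y_values = np.load('perovskite_data/all_y_values.npy')
print(X_values.shape, y_values.shape)  # quick sanity check of the reloaded shapes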
# Settings for the threshold / contour analysis.
import statistics

import numpy as np

from package import io

normalityTests = ['MetricOne', 'MetricTwo']
# bin_sizes = [10, 50, 100, 200, 500]
bin_sizes = [200]
contour_plot_same_scale = True
make_counts_plot = True

# Candidate GPR / RF cutoff ranges and the infinite-cutoff flag. These are
# placeholder example values: the ranges used in the original run are not
# shown in this excerpt and should be set to the study's actual values.
include_INF = True
gpr_thresholds_range = np.arange(0.1, 1.01, 0.1)
rf_thresholds_range = np.arange(0.1, 1.01, 0.1)

# Resources
trainfile = '../data/Diffusion_Data_allfeatures.csv'
# trainfile = '../data/temp.csv'
rfslope = 0.919216
rfintercept = -0.025370
gprsavedkernel = io.loadmodelobj('../models/GPR_data_Diffusion_Data_allfeatures_csv_02-24-20_18-32-12') \
    .getGPRkernel()

# Data
data = io.importdata(trainfile)
groups = data['Material compositions 1'].values
data = io.sanitizedata(data)
X = data.iloc[:, 1:]
Y = data.iloc[:, 0]
y_std = statistics.stdev(Y.to_numpy(dtype=float))

# For Infinite Cutoffs
INF = np.inf
if include_INF:
    gpr_thresholds_range = np.append(gpr_thresholds_range, INF)
    rf_thresholds_range = np.append(rf_thresholds_range, INF)

gpr_thresholds, rf_thresholds = np.meshgrid(gpr_thresholds_range, rf_thresholds_range)
accumulator = {(r, g, 1): [] for g in gpr_thresholds_range for r in rf_thresholds_range}
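# Hedged sketch of how the threshold grid and accumulator built above might be
# consumed: each (RF cutoff, GPR cutoff) pair is visited and its result appended
# under the (rf_thresh, gpr_thresh, 1) key. The per-pair computation shown here
# is a placeholder, not the original analysis.
for rf_thresh in rf_thresholds_range:
    for gpr_thresh in gpr_thresholds_range:
        result = None  # placeholder for the metric computed at this cutoff pair
        accumulator[(rf_thresh, gpr_thresh, 1)].append(result)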
# Adds one standard deviation of per-feature noise to every feature column of
# the data set and writes the result to a new CSV file.
import csv

from package import io

data = io.importdata('_haijinlogfeaturesnobarrier_alldata.csv')
data_features = data.iloc[:, :-4]   # feature columns
data_useless = data.iloc[:, -4:]    # trailing non-feature columns
feature_fields = data_features.columns
useless_fields = data_useless.columns

# per-feature standard deviations used as the noise magnitude
feature_std = data_features.to_numpy(dtype=float).std(axis=0)

filename = "NOISE_1STD_haijinlogfeaturesnobarrier_alldata.csv"
with open(filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    # header: feature columns followed by the untouched trailing columns
    csvwriter.writerow(feature_fields.append(useless_fields))
    for j in range(0, len(data)):
        row = []
        for i in range(0, len(feature_fields)):
            row.append(float(data_features.iloc[j, i]) + feature_std[i])
        for i in range(len(useless_fields)):
            row.append(data_useless.iloc[j, i])
        csvwriter.writerow(row)
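# An equivalent, vectorized sketch of the same operation using pandas. It assumes
# the feature columns are numeric so the per-column standard deviations broadcast
# across rows; the output file name is hypothetical.
import pandas as pd

noisy_features = data_features + feature_std          # add 1 std to every feature column
noisy = pd.concat([noisy_features, data_useless], axis=1)
noisy.to_csv("NOISE_1STD_vectorized_example.csv", index=False)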