# Beispiel #1 (Example #1)
# 0
import statistics

import pandas as pd
from sklearn.model_selection import LeaveOneGroupOut
from tabulate import tabulate

from package import gpr, io, rf, testhelper as th


def checkAlreadyDone(element, alreadylist):
    """Return True if *element* is already in *alreadylist*, else False.

    Equality is tested with ``==``, exactly as the ``in`` operator does,
    so this replaces the original hand-rolled linear scan with the
    idiomatic membership test (same semantics, same O(n) cost).
    """
    return element in alreadylist


# Data Collection
# Load the diffusion dataset, capture per-material group labels (presumably
# for the LeaveOneGroupOut CV imported above — confirm against later usage),
# and recover a previously fitted GPR kernel from disk.
data = io.importdata('../data/Diffusion_Data_allfeatures.csv')
# data = io.importdata('../data/temp.csv')

# Group labels must be read BEFORE sanitizedata(), which may drop this column.
groups = data['Material compositions 1'].values
data = io.sanitizedata(data)
# Reuse the kernel from a GPR model saved earlier (path encodes the training timestamp).
gprsavedkernel = io.loadmodelobj('../models/GPR_data_Diffusion_Data_allfeatures_csv_02-24-20_18-32-12') \
    .getGPRkernel()

# Column 0 is used as the target; all remaining columns are features.
X = data.iloc[:, 1:]
Y = data.iloc[:, 0]
# Hard-coded slope/intercept — presumably a calibration of RF error estimates;
# TODO confirm where these were fitted.
rfslope = 0.919216
rfintercept = -0.025370
# Sample standard deviation of the target, used for normalizing residuals.
y_std = statistics.stdev(Y.to_numpy(dtype=float))

# Setup thresholds
# This script imports the PV data set from its CSV file, removes unnecessary columns, and saves the x- and y-values as np arrays.
# NOTE(review): this file appears to be several scripts concatenated together.
# The section below is a separate perovskite-stability script: it re-imports
# `io`, rebinds the module-level `data` (shadowing the diffusion data above),
# and writes .npy files as a side effect at import time.
from package import io
import numpy as np

# import data
#data = io.importdata('perovskite_data/PVstability_Weipaper_alldata_featureselected.csv')
#data = io.sanitizedata(data, user_list=['is_testdata', 'Material Composition'])
data = io.importdata('perovskite_data/Perovskite_stability_Wei_updated.csv')
data = io.sanitizedata(data, user_list=['Compositions'])

# separate x- and y-values and save as numpy arrays
# (column 0 is the target; the rest are features)
X_values = data.iloc[:, 1:]
y_values = data.iloc[:, 0]
X_values = X_values.to_numpy(dtype=float)
y_values = y_values.to_numpy(dtype=float)

# save arrays for later use
np.save('perovskite_data/all_x_values.npy', X_values)
np.save('perovskite_data/all_y_values.npy', y_values)
# Names of the metrics to run; semantics defined elsewhere — TODO confirm consumer.
normalityTests = ['MetricOne', 'MetricTwo']
# bin_sizes = [10, 50, 100, 200, 500]
bin_sizes = [200]
# Plotting switches consumed by downstream plotting code (not visible here).
contour_plot_same_scale = True
make_counts_plot = True

# Resources
# NOTE(review): this section duplicates the diffusion-data setup from the top
# of the file (same trainfile, same rfslope/rfintercept, same saved kernel) —
# another sign of concatenated scripts.
trainfile = '../data/Diffusion_Data_allfeatures.csv'
# trainfile = '../data/temp.csv'
rfslope = 0.919216
rfintercept = -0.025370
gprsavedkernel = io.loadmodelobj('../models/GPR_data_Diffusion_Data_allfeatures_csv_02-24-20_18-32-12') \
    .getGPRkernel()

# Data
data = io.importdata(trainfile)
# Group labels read before sanitizedata() possibly drops the column.
groups = data['Material compositions 1'].values
data = io.sanitizedata(data)
X = data.iloc[:, 1:]
Y = data.iloc[:, 0]
y_std = statistics.stdev(Y.to_numpy(dtype=float))

# For Infinite Cutoffs
INF = np.inf
# NOTE(review): `include_INF`, `gpr_thresholds_range` and `rf_thresholds_range`
# are never defined in this file — as written this section raises NameError.
# They presumably come from the script this fragment was copied from; restore
# their definitions before running.
if include_INF:
    gpr_thresholds_range = np.append(gpr_thresholds_range, INF)
    rf_thresholds_range = np.append(rf_thresholds_range, INF)
# Cartesian grid of (GPR, RF) cutoff pairs for the sweep below.
gpr_thresholds, rf_thresholds = np.meshgrid(gpr_thresholds_range,
                                            rf_thresholds_range)
# One empty result list per (rf, gpr, 1) cell; the fixed third key element is
# 1 — TODO confirm its meaning (looks like a run/fold index).
accumulator = {(r, g, 1): []
               for g in gpr_thresholds_range for r in rf_thresholds_range}
from package import io
import csv


# Build a "noised" copy of the dataset: every feature value is shifted up by
# that feature column's standard deviation (the last 4 columns are copied
# through unchanged), then written to a new CSV.
data = io.importdata('_haijinlogfeaturesnobarrier_alldata.csv')
data_features = data.iloc[:, :-4]
data_useless = data.iloc[:, -4:]
feature_fields = data_features.columns
useless_fields = data_useless.columns
# Per-column standard deviation of the feature block (1-D array, one entry per feature).
data_np = data_features.to_numpy(dtype=float).std(axis=0)
filename = "NOISE_1STD_haijinlogfeaturesnobarrier_alldata.csv"
with open(filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    # Header: feature names followed by the pass-through column names.
    csvwriter.writerow(feature_fields.append(useless_fields))
    for row_idx in range(len(data)):
        # Each feature value gets +1 std of its own column added.
        shifted = [float(data_features.iloc[row_idx, col]) + data_np[col]
                   for col in range(len(feature_fields))]
        # Trailing columns are copied verbatim.
        passthrough = [data_useless.iloc[row_idx, col]
                       for col in range(len(useless_fields))]
        csvwriter.writerow(shifted + passthrough)