# -- Fragment of a regression-modeling script (excerpt; continues beyond this chunk).
# Validates the method name, loads the resin dataset, splits it into training and
# test sets, optionally augments X with nonlinear terms, and autoscales everything
# with training-set statistics.

# Reject unsupported method names up front.
# (The exit message is a runtime string and is deliberately left untranslated.)
if method_name != 'pls' and method_name != 'svr':
    sys.exit('\'{0}\' という回帰分析手法はありません。method_name を見直してください。'.format(method_name))

dataset = pd.read_csv('virtual_resin.csv', index_col=0)
y = dataset.iloc[:, y_number]  # objective variable (column chosen by y_number)
x = dataset.iloc[:, 2:]  # explanatory variables

# Random split into training and test data.
# Giving random_state a fixed number makes the "random" split reproducible.
x_train_tmp, x_test_tmp, y_train, y_test = train_test_split(
    x, y, test_size=number_of_test_samples, random_state=0)

if add_nonlinear_terms_flag:
    # Add squared terms and cross terms of the explanatory variables
    x_train_tmp = sample_functions.add_nonlinear_terms(x_train_tmp)
    x_test_tmp = sample_functions.add_nonlinear_terms(x_test_tmp)
    # Delete explanatory variables whose training-set standard deviation is 0;
    # note the TRAINING mask (x_train_tmp) is applied to both frames on purpose.
    x_train = x_train_tmp.drop(x_train_tmp.columns[x_train_tmp.std() == 0], axis=1)
    x_test = x_test_tmp.drop(x_train_tmp.columns[x_train_tmp.std() == 0], axis=1)
else:
    x_train = x_train_tmp.copy()
    x_test = x_test_tmp.copy()

# Autoscaling: standardize with the TRAINING-set mean and standard deviation
# (the test set is scaled with training statistics, not its own).
autoscaled_x_train = (x_train - x_train.mean()) / x_train.std()
autoscaled_y_train = (y_train - y_train.mean()) / y_train.std()
autoscaled_x_test = (x_test - x_train.mean()) / x_train.std()

if method_name == 'pls':  # body of this branch lies outside this excerpt
number_of_y = 2 # 目的変数の数 k_in_knn = 1 # k-NN における k fold_number = 10 # N-fold CV の N max_number_of_principal_components = 20 # 使用する主成分の最大数 svr_cs = 2 ** np.arange(-5, 11, dtype=float) # C の候補 svr_epsilons = 2 ** np.arange(-10, 1, dtype=float) # ε の候補 svr_gammas = 2 ** np.arange(-20, 11, dtype=float) # γ の候補 if method_name != 'pls' and method_name != 'svr': sys.exit('\'{0}\' という回帰分析手法はありません。method_name を見直してください。'.format(method_name)) dataset = pd.read_csv('virtual_resin.csv', index_col=0) ys = dataset.iloc[:, 0:number_of_y] # 目的変数 if add_nonlinear_terms_flag: x_tmp = sample_functions.add_nonlinear_terms(dataset.iloc[:, number_of_y:]) # 説明変数の二乗項や交差項を追加 x = x_tmp.drop(x_tmp.columns[x_tmp.std() == 0], axis=1) # 標準偏差が 0 の説明変数を削除 else: x = dataset.iloc[:, number_of_y:] # 説明変数 autoscaled_x = (x - x.mean(axis=0)) / x.std(axis=0, ddof=1) # オートスケーリング models = [] # ここに y ごとの回帰モデルを格納 for y_number in range(number_of_y): y = ys.iloc[:, y_number] autoscaled_y = (y - y.mean()) / y.std(ddof=1) # オートスケーリング if method_name == 'pls': # CV による成分数の最適化 components = [] # 空の list の変数を作成して、成分数をこの変数に追加していきます同じく成分数をこの変数に追加 r2_in_cv_all = [] # 空の list の変数を作成して、成分数ごとのクロスバリデーション後の r2 をこの変数に追加 for component in range(1, min(np.linalg.matrix_rank(autoscaled_x), max_number_of_principal_components) + 1): # PLS
# -- Fragment: collect measured test samples, preprocess, and begin SVR branch
#    (excerpt; dataset_test, x_train, y_train and x_test_prediction are defined
#    before this chunk, and the 'svr' branch body lies after it). --

# Keep only the first row of each run of consecutive rows whose first column
# (the measured y value) repeats, so each measured sample appears once.
y_measured_dataset_test = dataset_test.iloc[0:1, :]
measured_index = [0]  # row positions (within dataset_test) of the retained samples
for sample_number in range(1, dataset_test.shape[0]):
    if y_measured_dataset_test.iloc[-1, 0] != dataset_test.iloc[sample_number, 0]:
        y_measured_dataset_test = pd.concat([
            y_measured_dataset_test,
            dataset_test.iloc[sample_number:sample_number + 1, :]
        ], axis=0)
        measured_index.append(sample_number)

y_test = y_measured_dataset_test.iloc[:, 0]  # objective variable (first column)
x_test = y_measured_dataset_test.iloc[:, 1:]  # explanatory variables

if add_nonlinear_terms_flag:
    # Add squared terms and cross terms of the explanatory variables
    x_train = sample_functions.add_nonlinear_terms(x_train)
    x_test = sample_functions.add_nonlinear_terms(x_test)
    x_test_prediction = sample_functions.add_nonlinear_terms(x_test_prediction)

# Delete explanatory variables whose training-set standard deviation is 0.
# NOTE(review): block structure was reconstructed from a collapsed one-line
# source; this deletion is assumed to run unconditionally — confirm against
# the original file.
std_0_variable_flags = x_train.std() == 0
x_train = x_train.drop(x_train.columns[std_0_variable_flags], axis=1)
x_test = x_test.drop(x_test.columns[std_0_variable_flags], axis=1)
x_test_prediction = x_test_prediction.drop(
    x_test_prediction.columns[std_0_variable_flags], axis=1)

# Autoscaling: test data are standardized with TRAINING-set statistics
autoscaled_x_train = (x_train - x_train.mean()) / x_train.std()
autoscaled_y_train = (y_train - y_train.mean()) / y_train.std()
autoscaled_x_test = (x_test - x_train.mean()) / x_train.std()

if method_name == 'svr':  # body of this branch lies outside this excerpt
# -- Fragment: applicability-domain (AD) filtering of prediction candidates and
#    classification of the survivors (excerpt; the trailing pd.concat call is
#    cut off; original_x, ad_model, model, x, predicted_positive_dataset and
#    std_0_nonlinear_variable_flags are defined before this chunk). --
original_x_prediction.columns = original_x.columns  # align column labels with the training X
original_x_prediction = (original_x_prediction.T / original_x_prediction.T.sum()).T  # normalize each row to sum to 1

# Judge, for the prediction data, whether each sample is inside or outside the AD
autoscaled_original_x_prediction = (original_x_prediction - original_x.mean()) / original_x.std()
data_density_prediction = ad_model.decision_function(
    autoscaled_original_x_prediction)  # data density (the value of f(x))
original_x_prediction_inside_ad = original_x_prediction.iloc[
    data_density_prediction >= 0, :]  # keep only samples inside the AD (f(x) >= 0)
original_x_prediction_inside_ad = original_x_prediction_inside_ad.reset_index(
    drop=True)

if add_nonlinear_terms_flag:
    # Add squared terms and cross terms of the explanatory variables
    x_prediction = sample_functions.add_nonlinear_terms(
        original_x_prediction_inside_ad)
    # Delete the explanatory variables whose standard deviation was 0 at training time
    x_prediction = x_prediction.drop(
        x_prediction.columns[std_0_nonlinear_variable_flags], axis=1)
else:
    x_prediction = original_x_prediction_inside_ad.copy()

# Autoscaling with the training-set statistics of x
autoscaled_x_prediction = (x_prediction - x.mean(axis=0)) / x.std(axis=0, ddof=1)

# Predict, then keep only the samples classified 'positive'.
predicted_y = model.predict(autoscaled_x_prediction)
# NOTE(review): predicted_y has one entry per row of original_x_prediction_inside_ad,
# but the boolean mask below indexes original_x_prediction (the UNFILTERED frame) —
# a length mismatch that pandas .iloc rejects whenever any sample fell outside the
# AD. This most likely should be original_x_prediction_inside_ad; confirm against
# the original file before changing.
predicted_positive_dataset = pd.concat([
    predicted_positive_dataset,
    original_x_prediction.iloc[predicted_y == 'positive', :]
],  # concat call continues beyond this excerpt
# -*- coding: utf-8 -*-
"""
Prepare superconductor training/test explanatory-variable tables.

Reads 'unique_m.csv', keeps the 4000 samples with the highest critical
temperature, normalizes each composition row to sum to 1, splits into
training and test sets, removes zero-variance columns (mask computed on
the training set only), augments both sets with squared and cross terms,
and saves the two tables as CSV files.
"""
import pandas as pd
import sample_functions
from sklearn.model_selection import train_test_split

number_of_test_samples = 800  # samples held out for the test set

# Load and keep the top-4000 samples by critical temperature
raw = pd.read_csv('unique_m.csv', index_col=-1)
raw = raw.sort_values('critical_temp', ascending=False).iloc[:4000, :]

targets = raw.iloc[:, 86].copy()  # critical temperature (objective variable)
features = raw.iloc[:, :86]       # elemental composition (explanatory variables)
features = (features.T / features.T.sum()).T  # normalize each row to sum to 1

# Random split into training and test data; a fixed random_state makes the
# "random" partition reproducible across runs.
features_train, features_test, targets_train, targets_test = train_test_split(
    features, targets, test_size=number_of_test_samples, shuffle=True,
    random_state=21)

# Drop columns whose TRAINING-set standard deviation is zero; apply the same
# column mask to the test set so both frames stay aligned.
zero_std_flags = features_train.std() == 0
features_train = features_train.drop(features_train.columns[zero_std_flags], axis=1)
features_test = features_test.drop(features_test.columns[zero_std_flags], axis=1)

# Augment with squared terms and cross terms of the explanatory variables
features_train = sample_functions.add_nonlinear_terms(features_train)
features_test = sample_functions.add_nonlinear_terms(features_test)

# Save
features_train.to_csv('x_train_superconductor.csv')
features_test.to_csv('x_test_superconductor.csv')
# -*- coding: utf-8 -*-
"""
Build the full superconductor explanatory-variable table.

Reads 'unique_m.csv', keeps the 4000 samples with the highest critical
temperature, normalizes each composition row to sum to 1, drops
zero-variance columns, augments with squared and cross terms, and writes
the result to 'x_superconductor.csv'.
"""
import pandas as pd
import sample_functions

# Load and keep the top-4000 samples by critical temperature
table = pd.read_csv('unique_m.csv', index_col=-1)
table = table.sort_values('critical_temp', ascending=False).iloc[:4000, :]

# Explanatory variables: first 86 columns, each row normalized to sum to 1
features = table.iloc[:, :86]
features = (features.T / features.T.sum()).T

# Remove explanatory variables whose standard deviation is zero
features = features.drop(features.columns[features.std() == 0], axis=1)

# Augment with squared terms and cross terms of the explanatory variables
features = sample_functions.add_nonlinear_terms(features)

# Save
features.to_csv('x_superconductor.csv')
# -- Fragment: deduplicate consecutive measured samples, preprocess, and begin
#    SVR setup (excerpt; 'dataset' is initialized before this chunk and the
#    trailing gamma_optimization_with_variance call is cut off). --

# Append each dataset_train row whose first-column value differs from the last
# row already collected (drops consecutive duplicates of the measured y value).
for sample_number in range(1, dataset_train.shape[0]):
    if dataset.iloc[-1, 0] != dataset_train.iloc[sample_number, 0]:
        dataset = pd.concat(
            [dataset, dataset_train.iloc[sample_number:sample_number + 1, :]],
            axis=0)

# Same consecutive-duplicate filtering over the test rows
dataset = pd.concat([dataset, dataset_test.iloc[0:1, :]], axis=0)
for sample_number in range(1, dataset_test.shape[0]):
    if dataset.iloc[-1, 0] != dataset_test.iloc[sample_number, 0]:
        dataset = pd.concat(
            [dataset, dataset_test.iloc[sample_number:sample_number + 1, :]],
            axis=0)

y = dataset.iloc[:, 0]  # objective variable (first column)
x = dataset.iloc[:, 1:]  # explanatory variables

if add_nonlinear_terms_flag:
    # Add squared terms and cross terms of the explanatory variables
    x = sample_functions.add_nonlinear_terms(x)
    x_prediction = sample_functions.add_nonlinear_terms(x_prediction)

# Delete explanatory variables whose standard deviation is 0.
# NOTE(review): structure reconstructed from a collapsed one-line source; this
# deletion is assumed to run unconditionally — confirm against the original file.
std_0_variable_flags = x.std() == 0
x = x.drop(x.columns[std_0_variable_flags], axis=1)
x_prediction = x_prediction.drop(
    x_prediction.columns[std_0_variable_flags], axis=1)

# Autoscaling: prediction data are standardized with the statistics of x
autoscaled_x = (x - x.mean()) / x.std()
autoscaled_y = (y - y.mean()) / y.std()
autoscaled_x_prediction = (x_prediction - x.mean()) / x.std()

if method_name == 'svr':
    # To save time, optimize gamma once up front by maximizing the variance of
    # the Gram matrix.
    optimal_svr_gamma = sample_functions.gamma_optimization_with_variance(  # call cut off by excerpt