            result_matrix[first_header][second_header])
    print("Pearson correlation coefficients have been calculated!")
    return result_matrix


def write_to_csv(file_name, headers, data):
    # Write the correlation matrix as a semicolon-separated table:
    # one row and one column per header, each cell "coefficient p-value".
    with open(file_name, 'w') as outfile:
        # the first cell is empty
        outfile.write(";" + ";".join(headers) + "\n")
        for first_header in headers:
            outfile.write(first_header)
            for second_header in headers:
                outfile.write(";" + str(data[first_header][second_header][0]) +
                              " " + str(data[first_header][second_header][1]))
            outfile.write("\n")


data_handler = DataHandler()
signals, backgrounds = data_handler.get_separate_training_data()
headers = data_handler.get_headers()

# signals_backgrounds_correlation(headers, signals, backgrounds)

correlation_matrix_signals = pearson_correlation_matrix(headers, signals)
# file_name = "./reports/correlation/Pearson_correlation_signals.csv"
# write_to_csv(file_name, headers, correlation_matrix_signals)

# correlation_matrix_backgrounds = pearson_correlation_matrix(headers, backgrounds)
# file_name = "./reports/correlation/Pearson_correlation_backgrounds.csv"
# write_to_csv(file_name, headers, correlation_matrix_backgrounds)
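# Illustrative sketch, not from the source: the head of
# pearson_correlation_matrix is truncated above. Assuming scipy.stats.pearsonr,
# columns ordered as in `headers`, and the dict-of-dicts layout of
# (coefficient, p-value) tuples that write_to_csv reads, such a function could
# look roughly like this:
from scipy.stats import pearsonr


def pearson_correlation_matrix(headers, data):
    # One (coefficient, p-value) cell per header pair; data is a 2D array
    # with one column per header.
    result_matrix = {header: {} for header in headers}
    for i, first_header in enumerate(headers):
        for j, second_header in enumerate(headers):
            result_matrix[first_header][second_header] = pearsonr(data[:, i],
                                                                  data[:, j])
    return result_matrix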
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
    estimator = GridSearchCV(svm.SVC(kernel='rbf'), param_grid)
    return cross_validation_for_grid(estimator, data, targets)


def learn_by_one_feature(data, targets, estimator):
    # Evaluate the estimator on every feature in isolation to see how much
    # each single column contributes on its own.
    for column_number in range(data.shape[1]):
        mean, standard_deviation, time = cross_validation(
            estimator, data[:, column_number:column_number + 1], targets)
        print("Column number: %d" % column_number)
        print("Accuracy: %0.2f (+/- %0.2f)" % (mean, standard_deviation))
        print("Time: %0.2f" % time)


if __name__ == "__main__":
    data_handler = DataHandler()
    all_data, all_targets = data_handler.get_training_data()

    samples_size = 5000
    data = all_data[-samples_size:]
    targets = all_targets[-samples_size:]

    # estimator = svm.SVC(kernel='linear', C=1)
    # estimator = svm.SVC(kernel='rbf', C=1, gamma=0.0001)
    # mean, standard_deviation, time = cross_validation(estimator, data, targets)

    # mean, standard_deviation, time = find_best_linear_param(data, targets)
    # mean, standard_deviation, time = find_best_rbf_param(data, targets)

    # print("Accuracy: %0.2f (+/- %0.2f)" % (mean, standard_deviation))
    # print("Time: %0.2f" % time)
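# Hedged sketch, not from the source: the fragment above is the tail of what
# appears to be find_best_rbf_param, and the main block also references a
# find_best_linear_param that is not shown. Under the same GridSearchCV
# pattern it could look like this (the C grid values are illustrative):
from sklearn import svm
from sklearn.grid_search import GridSearchCV
from SVM.EvaluatingEstimator import cross_validation_for_grid


def find_best_linear_param(data, targets):
    # A linear kernel has no gamma parameter, so only C is searched.
    param_grid = {'C': [0.1, 1, 10, 100, 1000]}  # illustrative values
    estimator = GridSearchCV(svm.SVC(kernel='linear'), param_grid)
    return cross_validation_for_grid(estimator, data, targets)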
from sklearn import tree
from SVM.DataHandler import DataHandler
from SVM.EvaluatingEstimator import cross_validation
from sklearn.grid_search import GridSearchCV
from SVM.EvaluatingEstimator import cross_validation_for_grid
from sklearn.decomposition import RandomizedPCA

dt = DataHandler()
training_data, targets = dt.get_training_data(samples_size=5000)
# training_data, targets, test_data = dt.get_pretreated_data(training_samples_size=5000,
#                                                            test_samples_size=5000)

# pca = RandomizedPCA(n_components=5, whiten=False).fit(training_data)
# training_data = pca.transform(training_data)

estimator = tree.DecisionTreeClassifier(max_depth=6, min_samples_leaf=9)
mean, standard_deviation, time = cross_validation(estimator, training_data, targets)

# param_grid = [{'max_depth': list(range(3, 20)), 'min_samples_leaf': list(range(5, 10)),
#                'min_samples_split': list(range(1, 5))}]
#
# estimator = tree.DecisionTreeClassifier()
# estimator = GridSearchCV(estimator, param_grid)
# mean, standard_deviation, time = cross_validation_for_grid(estimator, training_data, targets)

print("Accuracy: %0.2f (+/- %0.2f)" % (mean, standard_deviation))
print("Time: %0.2f" % time)
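# Note: sklearn.grid_search and RandomizedPCA were removed in scikit-learn 0.20.
# Against a current scikit-learn, the equivalents of the imports above are:
#
#     from sklearn.model_selection import GridSearchCV
#     from sklearn.decomposition import PCA
#
#     # RandomizedPCA(n_components=5, whiten=False) becomes:
#     pca = PCA(n_components=5, whiten=False, svd_solver='randomized')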
        plt.xlim(min_first_value, max_first_value)
        ax.set_xlabel(headers[first_feature], size=12)
        plt.ylim(min_second_value, max_second_value)
        ax.set_ylabel(headers[second_feature], size=12)
        ax.scatter(test_corrected_first_feature, test_corrected_second_feature,
                   alpha=0.5, color='yellow')
        fig.tight_layout()
        fig.savefig('reports/by_pair_features/' + str(first_feature) + " " +
                    headers[first_feature] + "-" + str(second_feature) + " " +
                    headers[second_feature] + '.png', dpi=120)
        print("Columns # " + str(first_feature) + "_" + headers[first_feature] +
              "-" + str(second_feature) + "_" + headers[second_feature] + ": ok!")


def split_training_data(data, targets):
    # Separate signal ('s') and background ('b') rows by their target labels.
    signals_indices = [index for index, value in enumerate(targets) if value == 's']
    backgrounds_indices = [index for index, value in enumerate(targets) if value == 'b']
    return data[signals_indices], data[backgrounds_indices]


if __name__ == "__main__":
    data_handler = DataHandler()
    training_data, training_targets = data_handler.get_training_data()
    test_data = data_handler.get_test_data()
    headers = data_handler.get_headers()

    # by_one_features(training_data, training_targets, test_data, headers)
    by_pair_features(training_data, training_targets, test_data, headers)
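# Minimal self-contained sketch (not from the source) of how one
# signal/background pair plot can be drawn with split_training_data and the
# same matplotlib calls as above; feature indices and colors are illustrative.
import matplotlib.pyplot as plt


def plot_feature_pair(signals, backgrounds, headers, first_feature, second_feature):
    # Overlay signal points on background points for two columns.
    fig, ax = plt.subplots()
    ax.scatter(backgrounds[:, first_feature], backgrounds[:, second_feature],
               alpha=0.5, color='blue', label='background')
    ax.scatter(signals[:, first_feature], signals[:, second_feature],
               alpha=0.5, color='red', label='signal')
    ax.set_xlabel(headers[first_feature], size=12)
    ax.set_ylabel(headers[second_feature], size=12)
    ax.legend()
    fig.tight_layout()
    return fig

# Usage:
# signals, backgrounds = split_training_data(training_data, training_targets)
# plot_feature_pair(signals, backgrounds, headers, 0, 1).savefig('pair.png', dpi=120)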
""" Testing different ideas """ from SVM.DataHandler import DataHandler from SVM.EvaluatingEstimator import cross_validation from sklearn import neighbors from sklearn import svm data_handler = DataHandler() # Columns with big correlation (more 90%): # "DER_sum_pt", "PRI_met_sumet", "PRI_jet_all_pt" remove_columns_names = ("EventId", "PRI_met_sumet", "PRI_jet_all_pt") # 0.73+0.03 # remove_columns_names = ("EventId", "DER_sum_pt", "PRI_jet_all_pt") # 0.74+0.03 # remove_columns_names = ("EventId", "DER_sum_pt", "PRI_met_sumet") # 0.74 (+/- 0.04) 2369.93 training_data, targets, test_data = data_handler.get_pretreated_data( training_samples_size=5000, test_samples_size=5000, remove_columns_names=remove_columns_names) # estimator = svm.SVC(kernel='linear', C=1) # estimator = neighbors.KNeighborsClassifier(n_neighbors=12) estimator = svm.SVC(kernel='rbf', C=1, gamma=0.0001) mean, standart_deviation, time = cross_validation(estimator, training_data, targets) print("Accuracy: %0.2f (+/- %0.2f)" % (mean, standart_deviation)) print("Time: %0.2f" % time)