Example #1
0
                      result_matrix[first_header][second_header])
    print("Pearson correlation coefficients have calculated!")
    return result_matrix


def write_to_csv(file_name, headers, data):
    """Dump a square header-by-header table to a semicolon-separated file.

    The first line holds an empty corner cell followed by every header.
    Each following line starts with a row header and then contains, for
    every column header, the first two items of
    ``data[row][column]`` separated by a single space.

    :param file_name: path of the file to (over)write.
    :param headers: names used both as row and as column labels.
    :param data: nested mapping indexed as ``data[row][column]`` whose
        values can be indexed with ``0`` and ``1``.
    """
    with open(file_name, 'w') as outfile:
        # Header line: the leading ";" leaves the top-left cell empty.
        outfile.write(";" + ";".join(headers) + "\n")
        for row_name in headers:
            line_parts = [row_name]
            for column_name in headers:
                pair = data[row_name][column_name]
                line_parts.append(str(pair[0]) + " " + str(pair[1]))
            outfile.write(";".join(line_parts) + "\n")


# Script section. NOTE(review): in the visible part of the file these
# statements are not protected by an ``if __name__ == "__main__":`` guard,
# so they run whenever the module is executed or imported.
data_handler = DataHandler()
# Training data comes back already separated into two collections.
signals, backgrounds = data_handler.get_separate_training_data()
headers = data_handler.get_headers()

# Disabled experiment: correlation between the signal and background sets.
# signals_backgrounds_correlation(headers, signals, backgrounds)

# Only the signal matrix is computed; exporting it to CSV is disabled.
correlation_matrix_signals = pearson_correlation_matrix(headers, signals)
# file_name = "./reports/correlation/Pearson_correlation_signals.csv"
# write_to_csv(file_name, headers, correlation_matrix_signals)

# Disabled: the same computation and export for the background set.
# correlation_matrix_backgrounds = pearson_correlation_matrix(headers, backgrounds)
# file_name = "./reports/correlation/Pearson_correlation_backgrounds.csv"
# write_to_csv(file_name, headers, correlation_matrix_backgrounds)
Example #2
0
File: SVM.py Project: himl/boson
                  'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
    estimator = GridSearchCV(svm.SVC(kernel='rbf'), param_grid)
    return cross_validation_for_grid(estimator, data, targets)


def learn_by_one_feature(data, targets, estimator):
    """Run ``cross_validation`` once per column of ``data`` and print the results.

    For every column index the estimator is evaluated on that single column
    only, and the three values returned by ``cross_validation`` are printed
    as accuracy, its spread and the time.

    :param data: two-dimensional table exposing ``shape`` and supporting
        ``data[:, a:b]`` slicing (presumably a NumPy array - confirm with
        the caller).
    :param targets: labels passed unchanged to ``cross_validation``.
    :param estimator: estimator passed unchanged to ``cross_validation``.
    """
    column_count = data.shape[1]
    for column in range(column_count):
        # A one-wide slice is passed instead of a plain column index.
        single_feature = data[:, column:column + 1]
        score_mean, score_spread, elapsed = cross_validation(
            estimator, single_feature, targets)
        print("Column number: %d" % column)
        print("Accuracy: %0.2f (+/- %0.2f)" % (score_mean, score_spread))
        print("Time: %0.2f" % elapsed)


# Entry point: prepare a reduced training set for the SVM experiments below.
if __name__ == "__main__":
    data_handler = DataHandler()
    all_data, all_targets = data_handler.get_training_data()

    # Keep only the last ``samples_size`` entries of the data and the targets.
    samples_size = 5000
    data = all_data[-samples_size:]
    targets = all_targets[-samples_size:]

    # Every experiment below is currently disabled, so ``data`` and
    # ``targets`` are not used in the visible part of this block.

    # Fixed-parameter estimators:
    # estimator = svm.SVC(kernel='linear', C=1)
    # estimator = svm.SVC(kernel='rbf', C=1, gamma=0.0001)
    # mean, standart_deviation, time = cross_validation(estimator, data, targets)

    # Parameter searches:
    # mean, standart_deviation, time = find_best_linear_param(data, targets)
    # mean, standart_deviation, time = find_best_rbf_param(data, targets)

    # Reporting of the result of whichever experiment was enabled:
    # print("Accuracy: %0.2f (+/- %0.2f)" % (mean, standart_deviation))
    # print("Time: %0.2f" % time)
Example #3
0
from sklearn import tree
from SVM.DataHandler import DataHandler
from SVM.EvaluatingEstimator import cross_validation
from sklearn.grid_search import GridSearchCV
from SVM.EvaluatingEstimator import cross_validation_for_grid
from sklearn.decomposition import RandomizedPCA


# Script section: cross-validate a decision tree on a 5000-sample training set.
dt = DataHandler()
training_data, targets = dt.get_training_data(samples_size=5000)

# Disabled alternative: use the pretreated data set instead.
# training_data, targets, test_data = dt.get_pretreated_data(training_samples_size=5000,
#                                                            test_samples_size=5000)

# Disabled: project the training data onto 5 PCA components first.
# pca = RandomizedPCA(n_components=5, whiten=False).fit(training_data)
# training_data = pca.transform(training_data)


# Active experiment: a tree with fixed depth and leaf-size limits.
estimator = tree.DecisionTreeClassifier(max_depth=6, min_samples_leaf=9)
mean, standart_deviation, time = cross_validation(estimator, training_data, targets)


# Disabled alternative: grid search over the tree parameters. If enabled it
# would overwrite ``estimator`` and the three result variables set above.
# param_grid = [{'max_depth': list(range(3, 20)), 'min_samples_leaf': list(range(5, 10)),
#                'min_samples_split': list(range(1, 5))}]
#
# estimator = tree.DecisionTreeClassifier()
# estimator = GridSearchCV(estimator, param_grid)
# mean, standart_deviation, time = cross_validation_for_grid(estimator, training_data, targets)

# Report the values returned by the active cross-validation call.
print("Accuracy: %0.2f (+/- %0.2f)" % (mean, standart_deviation))
print("Time: %0.2f" % time)
Example #4
0
            plt.xlim(min_first_value, max_first_value)
            ax.set_ylabel(headers[first_feature], size='12')

            plt.ylim(min_second_value, max_second_value)
            ax.set_xlabel(headers[second_feature], size='12')

            ax.scatter(test_corrected_first_feature, test_corrected_second_feature, alpha=0.5,
                       color='yellow')

            fig.tight_layout()
            fig.savefig('reports/by_pair_features/' +
                        str(first_feature) + " " + headers[first_feature] + "-" +
                        str(second_feature) + " " + headers[second_feature] + '.png', dpi=120)
            print("Columns # " + str(first_feature) + "_" + headers[first_feature] + "-" +
                  str(second_feature) + "_" + headers[second_feature] + ": ok!")


def split_training_data(data, targets):
    """Separate ``data`` into the rows labelled ``'s'`` and the rows labelled ``'b'``.

    :param data: container that accepts a list of integer positions as an
        index (presumably a NumPy array - confirm with the caller).
    :param targets: iterable of labels, aligned by position with ``data``.
    :return: tuple ``(rows labelled 's', rows labelled 'b')``; positions
        with any other label appear in neither part.
    """
    def positions_of(wanted_label):
        # Collect the positions whose label equals ``wanted_label``.
        positions = []
        for position, label in enumerate(targets):
            if label == wanted_label:
                positions.append(position)
        return positions

    signal_positions = positions_of('s')
    background_positions = positions_of('b')
    return data[signal_positions], data[background_positions]


# Entry point: load the training data, test data and headers, then run the
# pairwise-feature routine.
if __name__ == "__main__":
    data_handler = DataHandler()
    training_data, training_targets = data_handler.get_training_data()
    test_data = data_handler.get_test_data()
    headers = data_handler.get_headers()

    # The single-feature variant is disabled; only the pairwise one runs.
    # by_one_features(training_data, training_targets, test_data, headers)
    by_pair_features(training_data, training_targets, test_data, headers)
Example #5
0
File: Test.py Project: himl/boson
""" Testing different ideas """
from SVM.DataHandler import DataHandler
from SVM.EvaluatingEstimator import cross_validation
from sklearn import neighbors
from sklearn import svm


# Script section: cross-validate one estimator on pretreated data with some
# columns removed.
data_handler = DataHandler()
# Columns that are highly correlated with each other (more than 90%):
# "DER_sum_pt", "PRI_met_sumet", "PRI_jet_all_pt"

# Each variant removes "EventId" plus two of the three columns listed above.
# NOTE(review): the trailing numbers look like previously observed results
# (accuracy +/- deviation, and a time for the last variant) - confirm.
remove_columns_names = ("EventId", "PRI_met_sumet", "PRI_jet_all_pt") # 0.73+0.03
# remove_columns_names = ("EventId", "DER_sum_pt", "PRI_jet_all_pt") # 0.74+0.03
# remove_columns_names = ("EventId", "DER_sum_pt", "PRI_met_sumet") # 0.74 (+/- 0.04) 2369.93
training_data, targets, test_data = data_handler.get_pretreated_data(
    training_samples_size=5000, test_samples_size=5000, remove_columns_names=remove_columns_names)

# Only the RBF-kernel SVM is active; the other estimators are disabled.
# estimator = svm.SVC(kernel='linear', C=1)
# estimator = neighbors.KNeighborsClassifier(n_neighbors=12)
estimator = svm.SVC(kernel='rbf', C=1, gamma=0.0001)
mean, standart_deviation, time = cross_validation(estimator, training_data, targets)
print("Accuracy: %0.2f (+/- %0.2f)" % (mean, standart_deviation))
print("Time: %0.2f" % time)