Example #1
def process_pca():

    if not os.path.exists(get_pca_dir()):
        os.makedirs(get_pca_dir())

    pca = PCA(0.95)  # keep the smallest number of components that together explain at least 95% of the variance

    data, target, headers = get_data(get_transformed_dir() + 'dataset.csv')

    data = StandardScaler().fit_transform(data)
    pca.fit(data)
    pca_data = pca.transform(data)

    plt.semilogy(pca.explained_variance_ratio_, '--o')
    plt.show()

    pca_headers = ['target']
    for i in range(pca.n_components_):
        pca_headers.append('PCA_component_' + str(i + 1))

    write_to_csv(get_pca_dir() + 'dataset.csv', pca_data, target, pca_headers)

    print pca
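As a quick check of what PCA(0.95) actually retains, here is a minimal standalone sketch; the synthetic data, shapes and values are made up purely for illustration and are not taken from the project:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
base = rng.randn(200, 3)
# ten features: three originals, three near-duplicates of them, four pure noise columns
data = np.hstack([base, base + 0.01 * rng.randn(200, 3), rng.randn(200, 4)])

data = StandardScaler().fit_transform(data)
pca = PCA(0.95)  # keep the smallest number of components explaining >= 95% of the variance
pca.fit(data)

print pca.n_components_                          # how many components were retained
print np.cumsum(pca.explained_variance_ratio_)   # cumulative explained variance of the retained components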
Example #2
def get_data(location=get_transformed_dir()):
    t1 = time()
    data = []
    target = []
    header = []
    with open(location, 'rb') as f:
        reader = csv.reader(f)
        for i, row in enumerate(reader):
            if i == 0:
                header = row
            else:
                try:
                    data.append(map(int, row))
                except ValueError:
                    data.append(map(float, row))

    for i in range(len(data)):
        target.append(data[i][0])
        # del data[i][30]
        del data[i][0]

    data = np.array(data)

    t2 = time()

    print 'Getting data from ' + location + '\ntook: ' + str(t2 - t1) + ' sec\n'

    return data, target, header
def evaluate_sequential_nn(method, dataset_location=get_transformed_dir()):

    data, target = get_test_data(dataset_location)

    if data.dtype != float:
        data = StandardScaler().fit_transform(data)
        print 'Dataset scaled'

    prediction = list(
        np.rint(method.predict(data)).astype(int).astype(str).flatten('F'))
    target = map(np.str, target)

    print('Sequential neural network: ')
    print_f_measure(target, prediction)
Example #4
def select_k_best():

    if not os.path.exists(get_k_best_dir()):
        os.makedirs(get_k_best_dir())

    data, target, headers = get_data(get_transformed_dir() + 'dataset.csv')

    # data = normalize(data, axis=0, norm='max')

    data = SelectKBest(chi2, k=40).fit_transform(data, target)

    headers = ['target']
    for i in range(data.shape[1]):
        headers.append('PCA_component_' + str(i + 1))

    write_to_csv(get_k_best_dir() + 'dataset.csv', data, target, headers)
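Note that the headers written above reuse the 'PCA_component_' prefix even though the retained columns are original features chosen by SelectKBest, not PCA components, and that chi2 scoring requires non-negative feature values. If the original column names are wanted in the output instead, a hedged sketch using the selector's get_support could look like the following; original_headers is a hypothetical name for the header list returned by get_data (which, as in the rest of the code, is assumed to start with 'target' followed by the feature names):

    selector = SelectKBest(chi2, k=40).fit(data, target)
    data = selector.transform(data)

    # map the indices of the 40 kept columns back to the original header names;
    # feature column j of data corresponds to original_headers[j + 1] because index 0 is 'target'
    kept = selector.get_support(indices=True)
    headers = ['target'] + [original_headers[j + 1] for j in kept]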
def evaluate(method,
             method_name,
             scaled=False,
             dataset_location=get_transformed_dir()):

    data, target = get_test_data(dataset_location)

    if scaled:
        data = StandardScaler().fit_transform(data)
        print 'Dataset scaled'

    prediction = method.predict(data).astype(float)
    target = np.array(target, dtype=float)

    print(method_name + ':')
    print_f_measure(target, prediction)
def evaluate_neural_network(network, dataset_location=get_transformed_dir()):

    data, target = get_test_data(dataset_location)

    if data.dtype != float:
        data = StandardScaler().fit_transform(data)
        print 'Dataset scaled'

    test = []
    for i in range(len(data)):
        temp = np.array(data[i], dtype='float64')
        test.append(Instance(temp))

    prediction = list(
        np.rint(network.predict(test)).astype(int).astype(str).flatten('F'))
    target = map(np.str, target)
    print('Scaled conjugate network: ')
    print_f_measure(target, prediction)
Example #7
from os.path import isfile

from keras import Sequential, optimizers
from keras.layers import Dense
from keras.models import load_model
from sklearn.preprocessing import StandardScaler

from dataset_process.data_io import get_train_data, get_validation_data
from evaluation.evaluation import evaluate_sequential_nn
from global_variables import get_transformed_dir

dataset_location = get_transformed_dir()


def learn_nn():
    data, target = get_train_data(dataset_location)

    if data.dtype != float:
        print 'Scaling dataset'
        data = StandardScaler().fit_transform(data)

    model = Sequential()
    model.add(Dense(80, input_dim=data.shape[1], activation='tanh'))
    model.add(Dense(70, activation='relu'))
    model.add(Dense(60, activation='tanh'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(40, activation='tanh'))
    model.add(Dense(30, activation='relu'))
    model.add(Dense(20, activation='tanh'))
    model.add(Dense(10, activation='tanh'))
    model.add(Dense(1, activation='relu'))
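The model definition above stops after the output layer; the compile, train and evaluate steps are not shown. Judging from the imports (optimizers, load_model, isfile, get_validation_data, evaluate_sequential_nn), a plausible continuation is sketched below, assuming numpy is imported as np at the top of the script; the optimizer, loss, epoch count, batch size and file name are assumptions, not values from the original script:

    model.compile(loss='mean_squared_error',
                  optimizer=optimizers.SGD(lr=0.01),
                  metrics=['accuracy'])

    # note: the validation data is not re-scaled here; if the training data was
    # scaled above, the same StandardScaler treatment would be needed
    val_data, val_target = get_validation_data(dataset_location)
    model.fit(np.array(data), np.array(target),
              validation_data=(np.array(val_data), np.array(val_target)),
              epochs=50, batch_size=128)

    model.save('sequential_nn.h5')  # hypothetical path; isfile/load_model suggest the trained model is cached

    evaluate_sequential_nn(model, dataset_location)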
from dataset_process.data_io import get_data
from global_variables import get_transformed_dir


# This script checks the distribution of accident severity in the train, validate and test splits
def view_severity_distribution(location='../dataset/transformed/dataset.csv'):
    data, target, header = get_data(location)
    values = {}
    for value in target:
        if value in values:
            values[value] += 1
        else:
            values[value] = 1

    for key, value in values.iteritems():
        print str(key) + ': ' + str(round(float(value) / len(target) * 100, 2)) + '%'


def print_distribution_for_train_validate_test(root_dir):
    view_severity_distribution(location=root_dir + 'dataset.csv')

    view_severity_distribution(location=root_dir + 'train.csv')
    view_severity_distribution(location=root_dir + 'validate.csv')
    view_severity_distribution(location=root_dir + 'test.csv')


print_distribution_for_train_validate_test(get_transformed_dir())
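The block that follows is a fragment of the missing-value handling step of the transformation script; the first lines of its loop are not shown above. A plausible sketch of the missing head, assuming data is a NumPy array of strings in which missing entries are the literal 'NaN', would be:

for i in range(data.shape[1]):
    temp = data[:, i]                                  # view of one column, so writes go back into data
    non_null = temp[temp != 'NaN'].astype(np.float)    # the usable values in this column
    if len(non_null) == data.shape[0]:
        continue                                       # no missing values in this column

and the surviving fragment then continues with the elif/else branches below.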
    elif float(len(non_null)) / data.shape[0] < 0.5:
        print 'This index should be altered: ' + str(i)
    else:
        temp[temp == 'NaN'] = np.median(non_null)

print '4:6 - Removed missing values and replaced with median values'

# vehicle_type = data[:, 2].reshape(-1, 1)
#
# one_hot_encoder = OneHotEncoder(sparse=False)
# one_hot_encoded = one_hot_encoder.fit_transform(vehicle_type)
# data = np.concatenate((data, one_hot_encoded), axis=1)
# data = np.delete(data, 2, 1)
# header = np.delete(header, 2)
#
# print "4:6 - Transformed categorical data"

# cast the string values to float (some carry decimals), truncate to int64, then back to strings for CSV output
data = data.astype(np.float)
data = data.astype(np.int64)
data = data.astype(np.str)

print '5:6 - Conversion done'
with open(get_transformed_dir() + 'dataset.csv', 'w') as trans_file:
    trans_file.write(",".join(header))
    trans_file.write('\n')
    for index in range(len(data)):
        trans_file.write(",".join(data[index]))
        trans_file.write('\n')

print '6:6 - Data written in transformed/dataset.csv file'
Example #10
def get_test_data(root_dir=get_transformed_dir()):
    data, target, header = get_data(root_dir + 'test.csv')
    return data, target
Example #11
def get_validation_data(root_dir=get_transformed_dir()):
    data, target, header = get_data(root_dir + 'validate.csv')
    return data, target
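The block that follows is the tail of a divide_dataset function whose opening lines are not shown. A plausible sketch of the missing head, assuming the data is loaded with get_data and split with scikit-learn's train_test_split (an absolute test size of one tenth of the dataset, mirroring the validation split below), would be:

def divide_dataset(root_folder):
    data, target, header = get_data(root_folder + 'dataset.csv')

    # first carve off the test set; the surviving code below then splits a
    # validation set out of the remaining training data
    train_data, test_data, train_target, test_target = train_test_split(
        data, target, test_size=len(target) / 10)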
    train_data, validate_data, train_target, validate_target = train_test_split(
        train_data, train_target, test_size=len(target) / 10)

    with open(root_folder + 'train.csv', 'w') as train_file:
        train_file.write(",".join(header))
        train_file.write('\n')
        for i, row in enumerate(train_data):
            train_file.write(
                str(train_target[i]) + "," + str(",".join(map(str, row))))
            train_file.write('\n')

    with open(root_folder + 'validate.csv', 'w') as validate_file:
        validate_file.write(",".join(header))
        validate_file.write('\n')
        for i, row in enumerate(validate_data):
            validate_file.write(
                str(validate_target[i]) + "," + str(",".join(map(str, row))))
            validate_file.write('\n')

    with open(root_folder + 'test.csv', 'w') as test_file:
        test_file.write(",".join(header))
        test_file.write('\n')
        for i, row in enumerate(test_data):
            test_file.write(
                str(test_target[i]) + "," + str(",".join(map(str, row))))
            test_file.write('\n')


divide_dataset(get_transformed_dir())