Example #1
def train(nb_epochs, batch_size, learning_rate, save_path=os.getcwd(), split_data=True):
    model = create_model()
    adam = Adam(learning_rate=learning_rate)  # `lr` is a deprecated alias in recent Keras releases
    model.compile(optimizer=adam, loss='mse', metrics=['acc'])

    train_path = os.path.join(os.getcwd(), "training")
    val_path = os.path.join(os.getcwd(), "validation")
    train_images_path = os.path.join(train_path, 'images')
    val_images_path = os.path.join(val_path, 'images')
    nb_train = data_process.get_number_of_data(train_images_path)
    nb_val = data_process.get_number_of_data(val_images_path)
    print('Number of training samples before the split:', nb_train)
    print('Number of test samples before the split:', nb_val)
    if split_data:
        data_process.split_data(train_images_path, val_images_path, 0.1)
        nb_train = data_process.get_number_of_data(train_images_path)
        nb_val = data_process.get_number_of_data(val_images_path)
        print('Number of training samples after the split:', nb_train)
        print('Number of test samples after the split:', nb_val)
    # NOTE: the generators use a hard-coded batch size of 2, while the step
    # counts below are derived from the `batch_size` argument.
    train_generator = data_process.generator(2, train_path)
    validation_generator = data_process.generator(2, val_path)

    train_steps = data_process.get_number_of_data(train_images_path) // batch_size
    val_steps = data_process.get_number_of_data(val_images_path) // batch_size

    # batch_size is not passed to fit() here: the generator already yields
    # complete batches, and Keras rejects batch_size for generator inputs.
    model.fit(train_generator, steps_per_epoch=train_steps, epochs=nb_epochs,
              validation_data=validation_generator, validation_steps=val_steps)
    save_path = os.path.join(save_path, 'model.h5')
    model.save(save_path)
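
A minimal way to call this function, assuming the training/ and validation/ directory layout it expects already exists under the current working directory (the hyperparameter values below are purely illustrative):

# Hypothetical invocation; directory layout and hyperparameter values are assumptions.
if __name__ == '__main__':
    train(nb_epochs=50, batch_size=2, learning_rate=1e-4, split_data=True)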
Example #2
def main():
    # Get the data
    data_train = pd.read_csv('dataset/train.csv')
    data_test = pd.read_csv('dataset/test.csv')
    # Transforming and dividing features
    Id_test = data_test['PassengerId']
    selected_features = ['Pclass','Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    df_train, df_test = data_process.transform_features(data_train, data_test, selected_features)
    df_train, df_test = data_process.features_scaling(df_train, df_test, selected_features)
    X_train, Y_train, X_test, Y_test, test_X = data_process.split_data(df_train, df_test, selected_features)
    # Set parameters
    parameters = {}
    parameters['model_path'] = 'model/Titanic.ckpt'
    parameters['n_input'], parameters['n_features'] = X_train.shape
    parameters['n_hidden'] = 2
    parameters['hidden_dim'] = 40
    parameters['n_class'] = 1
    parameters['learning_rate'] = 0.01
    parameters['training_epochs'] = 15000
    parameters['visualize'] = False
    if '-v' in argv[1:3]:  # -v passed as the first or second CLI argument
        parameters['visualize'] = True

    # Get model & train
    titanic_model = model.make_model(parameters)
    if '-n' in argv[1:3]:  # -n passed as the first or second CLI argument
        model.neural_network(X_train, Y_train, parameters, titanic_model, X_test, Y_test)
    # Print accuracy
    if os.path.isfile(parameters['model_path']):
        accuracy_estimation.Accuracy(parameters, titanic_model, X_train, Y_train, X_test, Y_test)
    # Output the submission to estimation.csv
    if os.path.isfile(parameters['model_path']):
        accuracy_estimation.Estimation(parameters, titanic_model, test_X, Id_test)
    else:
        print("\nNo model found, please create a new file named 'Titanic.ckpt' in a directory named 'model' and launch the programme with th folowing commande :\n'python3 main.py -n'\n")
Example #3
def t1():
    num_samp_per_class = 2
    dim = 2
    N_class = 4

    X, labels = gen_toy_data(dim, N_class, num_samp_per_class)

    X_norm, mean, std = normalize(X)

    X_norm, mean, U, S = PCA_white(X_norm)

    layer_param = [dim, 100, 100, N_class]

    overfit_tinydata(X_norm, labels, layer_param)

    X_train, labels_train, X_val, labels_val, X_test, labels_test = split_data(
        X_norm, labels)

    # Gradient check runs on the raw (unnormalized) X with a smaller,
    # hard-coded layer layout of [dim, 100, N_class].
    check_gradient(X, labels, [2, 100, 4], True)
Example #4
    def __init__(self, train_data_path, output_test_path, max_iter=50, max_time=10, C=9, tolerance=0.0001, kernel=SMO.linear_kernel):
        self.data = read_data(train_data_path)
        self.output_test_data = read_data(output_test_path)

        # TODO change to submit format
        self.training_data, self.testing_data = split_data(self.data)
        self.train_X, self.train_Y = self.training_data[:, :-1], np.squeeze(self.training_data[:, -1:])
        self.test_X, self.test_Y = self.testing_data[:, :-1], np.squeeze(self.testing_data[:, -1:])

        # print(self.train_X.shape, self.train_Y.shape)

        # self.alphas = np.random.randn(len(self.train_X))
        self.alphas = np.zeros(len(self.train_X))
        self.b = 0.0
        self.m = len(self.train_X)

        self.max_iter = max_iter
        self.max_time = max_time
        self.kernel = kernel
        self.C = C
        self.tolerance = tolerance
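
The default kernel=SMO.linear_kernel suggests the kernel is a plain inner product. A minimal sketch of such a kernel, with a hypothetical construction of the class (the file paths and keyword values are placeholders, not taken from the original project):

import numpy as np

# Hypothetical linear kernel; assumed to be a plain inner product.
def linear_kernel(x1, x2):
    return np.dot(x1, x2)

# Hypothetical construction (paths are placeholders):
# smo = SMO('data/train.csv', 'data/test.csv', max_iter=50, max_time=10,
#           C=9, tolerance=1e-4, kernel=linear_kernel)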
Example #5
def t2():
    num_samp_per_class = 200
    dim = 2
    N_class = 4

    # Generate the data
    X, labels = gen_toy_data(dim, N_class, num_samp_per_class)
    X_norm, mean, std = normalize(X)
    X_norm, mean, U, S = PCA_white(X_norm)
    X_train, labels_train, X_val, labels_val, X_test, labels_test = split_data(
        X_norm, labels)

    lr = 10**(-2.1)
    lr_decay = 1
    reg = 10**(-4.3)
    mu = 0.9
    max_epoch = 10000

    # Train the network
    layer_param = [dim, 100, 100, N_class]
    train_net(X_train, labels_train, layer_param, lr, lr_decay, reg, mu,
              max_epoch, X_val, labels_val)
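
The exponents -2.1 and -4.3 look like the outcome of a coarse log-scale sweep. A minimal sketch of such a sweep, reusing the names defined above and assuming train_net reports validation metrics itself (nothing is returned and captured here):

import numpy as np

# Hypothetical coarse log-scale sweep over lr and reg (not part of the original).
rng = np.random.default_rng(0)
for _ in range(10):
    lr_exp = rng.uniform(-4, -1)    # sample the exponent, not the value
    reg_exp = rng.uniform(-6, -2)
    train_net(X_train, labels_train, layer_param, 10 ** lr_exp, lr_decay,
              10 ** reg_exp, mu, 100, X_val, labels_val)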
Example #6
output, edges, edges_attr, se_name = load_data(args.modular_file,
                                               args.ddi_file, 'onehot')
print(len(output))  # number of entries in the loaded output dict
args.num_edge_features = edges_attr.size(1)
args.device = 'cpu'

# split data into train val test.
num_edges = edges_attr.size(0) // 2
train_num = int(num_edges * args.train_ratio)
val_num = int(num_edges * args.val_ratio)
test_num = int(num_edges * args.test_ratio)
nums = [train_num, val_num, test_num]

# change the input to the side-effect name
train_edges, train_edges_attr, val_edges, val_edges_attr, test_edges, test_edges_attr \
    = split_data(edges, se_name, nums)
# print(train_edges_attr)
train_name = train_edges_attr
val_name = val_edges_attr
test_name = test_edges_attr
train_edges_attr = name_to_feature(train_edges_attr)
val_edges_attr = name_to_feature(val_edges_attr)
test_edges_attr = name_to_feature(test_edges_attr)

# read negative samples from file
neg_train_edges, neg_train_attr, neg_val_edges, neg_val_attr, neg_test_edges, neg_test_attr = \
    read_negative()
print('negative samples generated')

print(args.device)
if args.feature_type == 'onehot':
Example #7
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import numpy as np
from data_process import load_data, split_data
dat_dir = '../data/'

ratings = load_data(dat_dir + "data_train.csv")
print(np.shape(ratings))

_, train, test = split_data(ratings, p_test=0.1)

from SGD_helpers import init_MF, matrix_factorization_SGD

from MF_helpers import get_bias_train, get_bias_test

bias_train, overal_bias, bias_u_train, bias_i_train = get_bias_train(
    train)  # ratings for final submissions
bias_test = get_bias_test(test, overal_bias, bias_u_train, bias_i_train)

# Grid Search:
grid = np.zeros((3, 4, 4))
gamma = 0.025
num_features = np.array([20, 50, 100])
lambda_user = np.logspace(-3, 0, 4)[::-1]
lambda_item = np.logspace(-3, 0, 4)[::-1]
num_epochs = 20

best_user_features = []
best_item_features = []
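
The snippet stops right after allocating grid. A plausible continuation is a triple loop over the three hyperparameter axes, storing the test error of each run; the signature and return value of matrix_factorization_SGD below are assumptions, not the actual API of SGD_helpers:

# Hypothetical grid-search loop; signature and return value of
# matrix_factorization_SGD are assumed.
for i, k in enumerate(num_features):
    for j, lu in enumerate(lambda_user):
        for m, li in enumerate(lambda_item):
            rmse = matrix_factorization_SGD(train, test, gamma, k, lu, li,
                                            num_epochs)
            grid[i, j, m] = rmse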
Example #8
            6: [10, 11, 23, 32],
            7: [14, 25, 26, 35],
            8: [15, 18, 28, 37],
            9: [16, 21, 27, 36]
        }
    else:
        cluster_disease = None

    cluster_label = random_cluster(args.n_tasks,
                                   args.n_outputs,
                                   cluster_disease=cluster_disease)

    train_data, test_data, vocabulary, embedding = \
        gen_data(train_path, test_path, args, fasttext_path)

    split_train = split_data(train_data, args.n_tasks, cluster_label)
    split_test = split_data(test_data, args.n_tasks, cluster_label)
    print("Loading and preprocessing done")

    memory_sent_data = []
    memory_word_data = []
    memory_sent_embed = []
    memory_word_embed = []
    save_word_embed = []
    cur_model = None
    word_alignment_model = None
    sent_alignment_model = None
    results = []
    random.seed(args.seed)

    for t in range(args.n_tasks):
Example #9
import json  # we need to use the JSON package to load the data, since the data is stored in JSON format
from data_process import split_data, preprocess, load_data, save_data

with open("data/reddit.json") as fp:
    data = json.load(fp)

# Now the data is loaded.
# It is a list of data points, where each data point is a dictionary with the following attributes:
# popularity_score : a popularity score for this comment (based on the number of upvotes) (type: float)
# children : the number of replies to this comment (type: int)
# text : the text of this comment (type: string)
# controversiality : a score for how "controversial" this comment is (automatically computed by Reddit)
# is_root : if True, then this comment is a direct reply to a post; if False, this is a direct reply to another comment

# Example:
data_point = data[0]  # select the first data point in the dataset

# Now we print all the information about this datapoint
for info_name, info_value in data_point.items():
    print(info_name + " : " + str(info_value))

features = ['text', 'is_root', 'controversiality', 'children']  # list of features to preprocess
train, val, test = split_data(data)

train_ = preprocess(train, feature_list=features, max=500)
val_ = preprocess(val, feature_list=features)
test_ = preprocess(test, feature_list=features)

save_data(train_, val_, test_)