Example no. 1
def test_mse(neighborhood_size=5, filtertype="collaborative filtering"):
    """Tests the mse of predictions based on a given number of neighborhood sizes

    neighborhood_size -- the sizes of neighborhoods between the number and 1 (so 5 tests for neighborhood of length 1, 2, 3, 4, 5)
    filtertype -- the type of similarity you want to test the mse of   
    """
    # init variables
    all_df = helpers.json_to_df()
    df = helpers.split_data(all_df)
    ut = helpers.create_utility_matrix(df[0])

    if filtertype == "collaborative filtering":
        print("Creating needed variables...")
        sim = helpers.similarity_matrix_cosine(ut)
    elif filtertype == "content based":
        print("Creating needed variables...")
        cats = helpers.json_to_df_categories()
        fancy_cats = helpers.extract_genres(cats)
        ut_cats = helpers.pivot_genres(fancy_cats)
        sim = helpers.create_similarity_matrix_categories(ut_cats)
    elif filtertype == "spacy":
        print("Creating needed variables...")
        sim = pd.read_msgpack("spacy_similarity.msgpack")
    else:
        print("Please enter a valid filtertype")
        return

    print("Starting calculations...")
    mses = {}
    # test the mse based on the length of the neighborhood
    for i in range(1, neighborhood_size + 1):
        predictions = helpers.predict_ratings(sim, ut, df[1], i).dropna()
        amount = len(predictions)
        mses[i] = helpers.mse(predictions)
    return mses, amount
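A minimal usage sketch (hypothetical; it assumes the helpers module used above is importable and, for the "spacy" option, that spacy_similarity.msgpack exists):

mses, n_predicted = test_mse(neighborhood_size=5, filtertype="collaborative filtering")
for size, mse in sorted(mses.items()):
    print("neighborhood size", size, "MSE:", mse)
# n_predicted is the number of non-NaN predictions from the last (largest) neighborhood tested
print("predictions in final run:", n_predicted)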
Example no. 2
def reg_logistic_implementation(y, x, degrees, ratio, seed, max_iters, gamma):
    
    from helpers import build_poly, split_data
    
    # Split the data based on the input ratio into training and testing data
    x_tr, y_tr, x_te, y_te = split_data(x,y,ratio,seed)
    
    losses_tr = []
    losses_te = []
    
    
    for degree in degrees:
        print('degree = ',degree)
        
        # Build a training polynomial basis based on the choice of degree
        tx_tr = build_poly(x_tr, degree)
        
        # Initialize starting point of the gradient descent
        initial_w = np.zeros(tx_tr.shape[1])
        
        # Perform iteration - calculate w(t+1) and calculate the new loss
        w, loss_tr = reg_logistic_regression(y_tr, tx_tr, initial_w, max_iters, gamma)
        
        losses_tr.append(loss_tr)
        
        # Build a testing polynomial basis based on the choice of degree
        tx_te = build_poly(x_te, degree)
        
        # Test the validity of the predictions with the help of the test data
        correct_percentage, loss_te = reg_logistic_test(y_te,tx_te,w,degree)
        
        losses_te.append(loss_te)
    
    
    return losses_tr, losses_te
Example no. 3
def logistic_trials(y, tx, tx_sub, degree_range, partitions=2):
    ## Split data into test and training sets
    ## If partitions > 2, use k-fold cross-validation
    glob_tx_tr, glob_tx_te, glob_y_tr, glob_y_te = split_data(tx, y, 0.8)

    ## Result containers: models (weights), test losses, accuracies and predictions
    models = []
    losses = []
    accuracies = []
    predictions = []

    ## Loops over range of degrees
    degrees = range(degree_range[0], degree_range[1])
    for degree in degrees:
        print("Trying degree", degree, ":")

        tx_tr, tx_te, tx_pred = expand(degree, glob_tx_tr, glob_tx_te, tx_sub)
        initial_w = np.ones(tx_tr.shape[1])

        w, loss = logistic_regression(glob_y_tr, tx_tr, initial_w, MAX_ITERS,
                                      GAMMA)
        print("\tTraining Loss = ", loss)

        y_test = predict_labels(w, tx_te)
        test_loss = compute_loss(glob_y_te, tx_te, w, func="logistic")
        accuracy = compute_accuracy((y_test + 1) / 2, glob_y_te)
        y_pred = predict_labels(w, tx_pred)

        print("\tTest Loss = ", test_loss, " Test Accuracy = ", accuracy)
        models.append(("logistic_SGD", degree, w))
        losses.append(test_loss)
        accuracies.append(accuracy)
        predictions.append(y_pred)
    return models, losses, accuracies, predictions
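The four returned lists are index-aligned, so the best configuration can be selected afterwards. A minimal sketch, assuming numpy is imported as np and that y, tx and tx_sub have already been prepared by the caller:

models, losses, accuracies, predictions = logistic_trials(y, tx, tx_sub, degree_range=(1, 5))
best = int(np.argmin(losses))          # index of the lowest test loss
print("best model:", models[best][:2], "accuracy:", accuracies[best])
submission_labels = predictions[best]  # predictions of the best model on tx_sub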
Example no. 4
def train(batch_size=2):
    # Load DATA
    fixed_image, moving_image, dvf_label = load.data_reader(fixed_dir, moving_dir, dvf_dir)

    # Turn into numpy arrays
    fixed_array, fixed_affine = fixed_image.get_data()
    moving_array, moving_affine = moving_image.get_data()
    dvf_array, dvf_affine = dvf_label.get_data(is_image=False)
    # Shuffle arrays
    fixed_array, moving_array, dvf_array = helper.shuffle_inplace(
        fixed_array, moving_array, dvf_array)
    fixed_affine, moving_affine, dvf_affine = helper.shuffle_inplace(
        fixed_affine, moving_affine, dvf_affine)
    # Split into test and training set
    # Training/Validation/Test = 80/15/5 split
    test_fixed, test_moving, test_dvf, train_fixed, train_moving, train_dvf = helper.split_data(
        fixed_array, moving_array, dvf_array, split_ratio=0.05)
    # Test affine
    test_fixed_affine, test_moving_affine, test_dvf_affine, train_fixed_affine, train_moving_affine, train_dvf_affine = helper.split_data(
        fixed_affine, moving_affine, dvf_affine, split_ratio=0.05)
    # Split training into validation and training set
    validation_fixed, validation_moving, validation_dvf, train_fixed, train_moving, train_dvf = helper.split_data(
        train_fixed, train_moving, train_dvf, split_ratio=0.15)

    print("PCT Shape:", train_fixed.shape)
    print("PET Shape:", train_moving.shape)
    print("DVF Shape:", train_dvf.shape)
    outputPath = './transfer_logs/'
    # Callbacks
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                                  patience=5)
    history = LossHistory()
    checkpoint = ModelCheckpoint(outputPath + 'best_model.h5', monitor='val_loss',
                                 verbose=1, save_best_only=True, period=1)
    tensorboard = TrainValTensorBoard(write_graph=False, log_dir=outputPath)
    callbacks = [reduce_lr, history, checkpoint, tensorboard]

    # Train
    model = buildNet(train_fixed.shape[1:])
    for layer in model.layers:
        print(layer.name, layer.output_shape)

    print(model.summary())
    plot_model(model, to_file=outputPath + 'model.png', show_shapes=True)
    opt = optimizers.Adam(lr=0.0001)
    model.compile(optimizer=opt, loss='mean_squared_error')
    model.fit_generator(generator=helper.generator(inputs=[train_fixed, train_moving], label=train_dvf, batch_size=batch_size),
                        steps_per_epoch=math.ceil(train_fixed.shape[0]/batch_size),
                        epochs=500, verbose=1,
                        callbacks=callbacks,
                        validation_data=helper.generator(
                            inputs=[validation_fixed, validation_moving], label=validation_dvf, batch_size=batch_size),
                        validation_steps=math.ceil(validation_fixed.shape[0]/batch_size))

    # accuracy = model.evaluate_generator(generator(
    #    inputs=[validation_fixed, validation_moving], label=validation_dvf, batch_size=batch_size), steps=1, verbose=1)
    model.save(outputPath + 'model.h5')
Example no. 5
def solve(tX, y):
    tX_tr, y_tr, tX_te, y_te = split_data(tX, y, ratio=0.8, seed=2019)

    lambda_ = 1
    w, _ = ridge_regression(y_tr, tX_tr, lambda_)
    y_pr_tr = predict_labels(w, tX_tr)
    y_pr_te = predict_labels(w, tX_te)
    acc_tr = compute_accuracy(y_tr, y_pr_tr)
    acc_te = compute_accuracy(y_te, y_pr_te)

    return acc_tr, acc_te
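For context, split_data(x, y, ratio, seed) in these snippets is a project helper that returns x_tr, y_tr, x_te, y_te. The sketch below only illustrates that assumed contract (shuffle with a fixed seed, then cut at the given ratio); it is not the project's actual implementation:

import numpy as np

def split_data_sketch(x, y, ratio, seed=1):
    """Illustrative stand-in for the split_data helper used above."""
    rng = np.random.RandomState(seed)
    indices = rng.permutation(len(y))
    cut = int(ratio * len(y))
    tr, te = indices[:cut], indices[cut:]
    return x[tr], y[tr], x[te], y[te]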
Example no. 6
def algorithm_test(path_dataset, kwargs):
    '''Test the given algorithms.'''
    ratings = load_data(path_dataset)

    train, test = split_data(ratings)

    alg = kwargs['algorithms']
    n_features = kwargs['k_range']
    lambda_user = kwargs['lambda_u']
    lambda_item = kwargs['lambda_i']

    if alg[0].lower() == 'als':
        X, RMSE_test, RMSE_train = get_ALS_predictions(ratings,
                                                       train,
                                                       test,
                                                       n_features,
                                                       lambda_user,
                                                       lambda_item,
                                                       kwargs=kwargs)

    elif alg[0].lower() == 'als_ours':
        X, RMSE_test, RMSE_train = get_ALS_predictions(ratings,
                                                       train,
                                                       test,
                                                       n_features,
                                                       lambda_user,
                                                       lambda_item,
                                                       kwargs=kwargs)

    elif alg[0].lower() == 'sgd':
        X, RMSE_test, RMSE_train = get_SGD_predictions(ratings,
                                                       train,
                                                       test,
                                                       n_features,
                                                       lambda_user,
                                                       lambda_item,
                                                       kwargs=kwargs)

    elif alg[0].lower() == 'svd' or alg[0].lower() == 'knn' or alg[0].lower(
    ) == 'cluster':

        X, RMSE_test, RMSE_train = get_splib_predictions(alg[0].lower(),
                                                         train,
                                                         test,
                                                         kwargs=kwargs)

    else:
        print('Algorithm', alg, 'is not supported in this project!')
        sys.exit(1)

    return X, RMSE_test, RMSE_train
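A hypothetical call; the keys below match the ones read inside the function, the values are placeholders, and additional keys may be consumed downstream by the prediction helpers that receive kwargs:

params = {
    'algorithms': ['als'],
    'k_range': range(1, 31),
    'lambda_u': 0.2,
    'lambda_i': 0.02,
}
X, rmse_test, rmse_train = algorithm_test("data/data_train.csv", params)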
Example no. 7
def train_3models(tX, y):
    # Preprocess data together to have the same shifts while creating log or root features
    prep_param = {
        "bias": True,
        "fill": True,
        "standardize": False,
        "degree": 8,
        "log": True,
        "root": True
    }
    tX_new, y_new, _ = preprocess_data(tX, y, prep_param)

    tX_tr, y_tr, tX_te, y_te = split_data(tX_new, y_new, ratio=0.8, seed=2019)

    # Split data according to PRI_jet_num value
    tX_tr_splitted, indices_tr = divide_data(tX_tr)
    tX_te_splitted, indices_te = divide_data(tX_te)
    n_models = len(tX_tr_splitted)

    y_tr_splitted = []
    for i in range(len(indices_tr)):
        y_tr_splitted.append(y_tr[indices_tr[i]])
        print(tX_tr_splitted[i].shape)

    # Train
    weights = []
    for i in range(n_models):
        lambda_ = lambda_cv(tX_tr_splitted[i], y_tr_splitted[i])
        print(f"Class {i}, lambda: {lambda_}")
        weights.append(
            ridge_regression(y_tr_splitted[i], tX_tr_splitted[i], lambda_)[0])
        print(len(weights[-1]))

    # Predict
    y_pr_tr = np.zeros(y_tr.shape)
    y_pr_te = np.zeros(y_te.shape)
    for i in range(n_models):
        y_pr_tr[indices_tr[i]] = predict_labels(weights[i], tX_tr_splitted[i])
        y_pr_te[indices_te[i]] = predict_labels(weights[i], tX_te_splitted[i])

    # Get accuracy
    acc_tr = compute_accuracy(y_tr, y_pr_tr)
    acc_te = compute_accuracy(y_te, y_pr_te)
    print(f"Total accuracy tr: {acc_tr}, te: {acc_te}")

    for i in range(n_models):
        acc_tr = compute_accuracy(y_tr[indices_tr[i]], y_pr_tr[indices_tr[i]])
        acc_te = compute_accuracy(y_te[indices_te[i]], y_pr_te[indices_te[i]])
        print(f"Class {i}, Accuracy tr: {acc_tr}, te: {acc_te}")
Example no. 8
def predict_all():
    """Fills an entire test set with predictions"""

    mses = []

    # predict cf based
    all_df = helpers.json_to_df()
    df = helpers.split_data(all_df)
    ut = helpers.create_utility_matrix(df[0])
    sim = helpers.similarity_matrix_cosine(ut)
    predictions = helpers.predict_ratings(sim, ut, df[1], 0)
    mses.append(helpers.mse(predictions))

    # find which values are still np.nan
    to_predict = predictions.loc[~predictions.index.isin(predictions.dropna().
                                                         index)]

    # predict content-based (normal) for those rows
    cats = helpers.json_to_df_categories()
    fancy_cats = helpers.extract_genres(cats)
    ut_cats = helpers.pivot_genres(fancy_cats)
    sim = helpers.create_similarity_matrix_categories(ut_cats)

    predictions = predictions.append(
        helpers.predict_ratings(sim, ut, to_predict, 0))
    mses.append(helpers.mse(predictions))

    # find which values are still np.nan
    to_predict = predictions.loc[~predictions.index.isin(predictions.dropna().
                                                         index)]

    # predict content-based (spacy) for those rows
    sim = pd.read_msgpack("spacy_similarity.msgpack")
    predictions = predictions.append(
        helpers.predict_ratings(sim, ut, to_predict, 0))
    to_predict = predictions.loc[~predictions.index.isin(predictions.dropna().
                                                         index)]
    mses.append(helpers.mse(predictions))

    # for the rows which have no neighborhood in any of the methods, predict the average rating of the test set
    predictions = predictions.fillna(predictions["stars"].mean())
    mses.append(helpers.mse(predictions))

    return mses
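The returned list records how the MSE evolves as each fallback stage fills in more of the missing predictions (collaborative filtering, then category-based, then spaCy similarity, then the mean rating). A minimal usage sketch:

stage_mses = predict_all()
for stage, mse in zip(["collaborative", "content-based", "spacy", "mean fill"], stage_mses):
    print(stage, "MSE:", mse)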
Example no. 9
    def transform(self, X):
        config = get_config()
        data_2, processed_2 = config.DATA_READER.read()
        X_2, y_2, test_2 = split_data(data_2)

        for column in list(X.select_dtypes(include=['object']).columns):
            if not X_2[column].nunique() == test_2[column].nunique(
            ) == y_2[column].nunique():
                set_X_2 = set(X_2[column].unique())
                set_y_2 = set(y_2[column].unique())
                set_test_2 = set(test_2[column].unique())
                remove_X_2 = set_X_2 - (
                    set_X_2.intersection(set_test_2)).intersection(set_y_2)
                remove_test_2 = set_test_2 - (
                    set_X_2.intersection(set_test_2)).intersection(set_y_2)
                remove_y_2 = set_y_2 - (
                    set_X_2.intersection(set_test_2)).intersection(set_y_2)
                remove = remove_X_2.union(remove_test_2).union(remove_y_2)

                X[column] = X[column].apply(lambda x: filter_cat(x, remove), 1)
        return X
Example no. 10
def ridge_trials(y, tx, tx_sub, degree_range, lambda_range, partitions=2):
    ## Split data into test and training sets
    ## If partitions > 2, use k-fold cross-validation
    glob_tx_tr, glob_tx_te, glob_y_tr, glob_y_te = split_data(tx, y, 0.8)

    ## Result containers: models (weights), test losses, accuracies and predictions
    models = []
    losses = []
    accuracies = []
    predictions = []

    ## Loops over range of degrees
    degrees = range(degree_range[0], degree_range[1])
    lambdas = np.logspace(lambda_range[0],
                          lambda_range[1],
                          num=1 + (lambda_range[1] - lambda_range[0]))
    for degree in degrees:
        ## Loops over range of lambdas
        for lambda_ in lambdas:
            print("Trying degree", degree, "with lambda =", lambda_, ":")

            tx_tr, tx_te, tx_pred = expand(degree, glob_tx_tr, glob_tx_te,
                                           tx_sub)

            w, loss = ridge_regression(glob_y_tr, tx_tr, lambda_)
            print("\tTraining Loss = ", loss)

            y_test = predict_labels(w, tx_te)
            test_loss = compute_loss(glob_y_te, tx_te, w)
            accuracy = compute_accuracy((y_test + 1) / 2, glob_y_te)
            y_pred = predict_labels(w, tx_pred)

            print("\tTest Loss = ", test_loss, " Test Accuracy = ", accuracy)
            models.append(("ridge_regression", degree, lambda_, w))
            losses.append(test_loss)
            accuracies.append(accuracy)
            predictions.append(y_pred)
    return models, losses, accuracies, predictions
Example no. 11
def auto_load_data():
    '''
    This function loads the raw data from the .csv file
    and reloads (or creates) the split training and testing sets
    '''
    path_dataset = "data/data_train.csv"

    print('Loading the data...\n')
    ratings = load_data(path_dataset)

    # Split in training and testing sets
    train_file_path = 'split/sparse_trainset.npz'
    test_file_path = 'split/sparse_testset.npz'

    if os.path.exists(train_file_path) and os.path.exists(test_file_path):
        train = sp.lil_matrix(sp.load_npz(train_file_path))
        test = sp.lil_matrix(sp.load_npz(test_file_path))
    else:
        print('\nSplitting the data in train and test sets...')
        train, test = split_data(ratings, p_test=0.1)
        sp.save_npz(train_file_path, sp.csr_matrix(train))
        sp.save_npz(test_file_path, sp.csr_matrix(test))
    return ratings, train, test
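Caching the split as sparse .npz files keeps the train/test partition identical across runs. A minimal usage sketch (it assumes the split/ directory already exists, since sp.save_npz will not create missing directories):

ratings, train, test = auto_load_data()
print("train non-zeros:", train.nnz, "test non-zeros:", test.nnz)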
Example no. 12
def logistic_implementation(y,
                            x,
                            degrees,
                            ratio,
                            seed,
                            max_iters,
                            gamma,
                            Newton=False):
    from helpers import build_poly, split_data

    x_tr, y_tr, x_te, y_te = split_data(x, y, ratio, seed)

    losses_tr = []
    losses_te = []

    for degree in degrees:
        print('degree = ', degree)
        tx_tr = build_poly(x_tr, degree)
        initial_w = np.zeros(tx_tr.shape[1])

        if not Newton:
            w, loss_tr = logistic_regression(y_tr, tx_tr, initial_w, max_iters,
                                             gamma)
        else:
            w, loss_tr = logistic_newton(y_tr, tx_tr, initial_w, max_iters)

        losses_tr.append(loss_tr)

        tx_te = build_poly(x_te, degree)
        correct_percentage, loss_te = logistic_test(y_te, tx_te, w, degree)

        losses_te.append(loss_te)

    #plt.plot(degrees,losses_tr,'r',degrees,losses_te,'b')

    return losses_tr, losses_te
Example no. 13
def train():
    # Load DATA
    fixed_image, moving_image, dvf_label = load.data_reader(
        fixed_dir, moving_dir, dvf_dir)

    # Turn into numpy arrays
    fixed_array, fixed_affine = fixed_image.get_data()
    moving_array, moving_affine = moving_image.get_data()
    dvf_array, dvf_affine = dvf_label.get_data(is_image=False)
    # Shuffle arrays
    fixed_array, moving_array, dvf_array = helper.shuffle_inplace(
        fixed_array, moving_array, dvf_array)
    fixed_affine, moving_affine, dvf_affine = helper.shuffle_inplace(
        fixed_affine, moving_affine, dvf_affine)
    # Split into test and training set
    # Training/Validation/Test = 80/15/5 split
    test_fixed, test_moving, test_dvf, train_fixed, train_moving, train_dvf = helper.split_data(
        fixed_array, moving_array, dvf_array, split_ratio=0.05)
    # Test affine
    test_fixed_affine, test_moving_affine, test_dvf_affine, train_fixed_affine, train_moving_affine, train_dvf_affine = helper.split_data(
        fixed_affine, moving_affine, dvf_affine, split_ratio=0.05)
    # Split training into validation and training set
    validation_fixed, validation_moving, validation_dvf, train_fixed, train_moving, train_dvf = helper.split_data(
        train_fixed, train_moving, train_dvf, split_ratio=0.15)

    print("PCT Shape:", train_fixed.shape)
    print("PET Shape:", train_moving.shape)
    print("DVF Shape:", train_dvf.shape)

    # CNN Structure
    fixed_image = Input(
        shape=(train_fixed.shape[1:]))  # Ignore batch but include channel
    moving_image = Input(shape=(train_moving.shape[1:]))

    # Correlation layers
    correlation_out = myLayer.correlation_layer(fixed_image,
                                                moving_image,
                                                shape=train_fixed.shape[1:4],
                                                max_displacement=20,
                                                stride=2)

    x1 = Conv3D(64, (3, 3, 3),
                strides=2,
                activation=activation,
                padding='same',
                name='downsample1')(correlation_out)
    x1 = Conv3D(32, (3, 3, 3),
                strides=2,
                activation=activation,
                padding='same',
                name='downsample2')(x1)
    x1 = Conv3D(16, (3, 3, 3),
                strides=2,
                activation=activation,
                padding='same',
                name='downsample3')(x1)
    x1 = BatchNormalization(axis=-1, momentum=momentum)(x1)

    x1 = Conv3D(64, (3, 3, 3),
                activation=activation,
                padding='same',
                name='down_1a')(x1)
    x1 = Conv3D(64, (3, 3, 3),
                activation=activation,
                padding='same',
                name='down_1b')(x1)
    x1 = Conv3D(64, (3, 3, 3),
                activation=activation,
                padding='same',
                name='down_1c')(x1)
    x1 = BatchNormalization(axis=-1, momentum=momentum)(x1)

    x = MaxPooling3D(pool_size=(2, 2, 2), padding='same', name='Pool_1')(x1)

    x2 = Conv3D(128, (3, 3, 3),
                activation=activation,
                padding='same',
                name='down_2a')(x)
    x2 = Conv3D(128, (3, 3, 3),
                activation=activation,
                padding='same',
                name='down_2b')(x2)
    x2 = Conv3D(128, (3, 3, 3),
                activation=activation,
                padding='same',
                name='down_2c')(x2)
    x2 = BatchNormalization(axis=-1, momentum=momentum)(x2)

    x = MaxPooling3D(pool_size=(2, 2, 2), padding='same', name='Pool_2')(x2)

    x3 = Conv3D(256, (3, 3, 3),
                activation=activation,
                padding='same',
                name='down_3a')(x)
    x3 = Conv3D(256, (3, 3, 3),
                activation=activation,
                padding='same',
                name='down_3b')(x3)
    x3 = BatchNormalization(axis=-1, momentum=momentum)(x3)

    x = MaxPooling3D(pool_size=(2, 2, 2), padding='same', name='Pool_3')(x3)

    x4 = Conv3D(512, (3, 3, 3),
                activation=activation,
                padding='same',
                name='down_4a')(x)

    x = UpSampling3D(size=(2, 2, 2), name='UpSamp_4')(x4)
    y3 = Conv3DTranspose(256, (3, 3, 3),
                         activation=activation,
                         padding='same',
                         name='Up_3a')(x)
    y3 = Conv3DTranspose(256, (3, 3, 3),
                         activation=activation,
                         padding='same',
                         name='Up_3b')(y3)
    y3 = Conv3DTranspose(256, (3, 3, 3),
                         activation=activation,
                         padding='same',
                         name='Up_3c')(y3)
    y3 = BatchNormalization()(y3)

    merge3 = concatenate([x3, y3])

    x = UpSampling3D(size=(2, 2, 2), name='UpSamp_3')(merge3)
    y2 = Conv3DTranspose(128, (3, 3, 3),
                         activation=activation,
                         padding='same',
                         name='Up_2a')(x)
    y2 = Conv3DTranspose(128, (3, 3, 3),
                         activation=activation,
                         padding='same',
                         name='Up_2b')(y2)
    y2 = Conv3DTranspose(128, (3, 3, 3),
                         activation=activation,
                         padding='same',
                         name='Up_2c')(y2)
    y2 = BatchNormalization(axis=-1, momentum=momentum)(y2)

    merge2 = concatenate([x2, y2])

    x = UpSampling3D(size=(2, 2, 2), name='UpSamp_2')(merge2)
    y1 = Conv3DTranspose(64, (3, 3, 3),
                         activation=activation,
                         padding='same',
                         name='Up_1a')(x)
    y1 = Conv3DTranspose(64, (3, 3, 3),
                         activation=activation,
                         padding='same',
                         name='Up_1b')(y1)
    y1 = Conv3DTranspose(64, (3, 3, 3),
                         activation=activation,
                         padding='same',
                         name='Up_1c')(y1)
    y1 = BatchNormalization(axis=-1, momentum=momentum)(y1)

    merge1 = concatenate([x1, y1])

    # Transform into flow field (from VoxelMorph Github)
    upsample = Conv3DTranspose(64, (3, 3, 3),
                               strides=2,
                               activation=activation,
                               padding='same',
                               name='upsample_dvf1')(merge1)
    upsample = Conv3DTranspose(64, (3, 3, 3),
                               strides=2,
                               activation=activation,
                               padding='same',
                               name='upsample_dvf2')(upsample)
    upsample = Conv3DTranspose(64, (3, 3, 3),
                               strides=2,
                               activation=activation,
                               padding='same',
                               name='upsample_dvf3')(upsample)
    upsample = BatchNormalization(axis=-1, momentum=momentum)(upsample)

    dvf = Conv3D(64,
                 kernel_size=3,
                 activation=activation,
                 padding='same',
                 name='dvf_64features')(upsample)
    #dvf = Conv3D(3, kernel_size=3, activation=activation, padding='same', name='dvf')(dvf)
    dvf = Conv3D(3, kernel_size=1, activation=None, padding='same',
                 name='dvf')(dvf)

    # Callbacks
    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  factor=0.2,
                                  patience=5,
                                  min_lr=0.00001)
    history = LossHistory()
    checkpoint = ModelCheckpoint('best_model.h5',
                                 monitor='val_loss',
                                 verbose=1,
                                 save_best_only=True,
                                 period=1)
    tensorboard = TrainValTensorBoard(write_graph=False)
    callbacks = [reduce_lr, history, checkpoint, tensorboard]

    # Train
    model = Model(inputs=[fixed_image, moving_image], outputs=dvf)
    for layer in model.layers:
        print(layer.name, layer.output_shape)

    # print(model.summary())
    plot_model(model, to_file='model.png')
    #Adam = optimizers.Adam(lr=0.001)
    model.compile(optimizer='Adam', loss='mean_squared_error')
    model.fit_generator(
        generator=helper.generator(inputs=[train_fixed, train_moving],
                                   label=train_dvf,
                                   batch_size=batch_size),
        steps_per_epoch=math.ceil(train_fixed.shape[0] / batch_size),
        epochs=75,
        verbose=1,
        callbacks=callbacks,
        validation_data=helper.generator(
            inputs=[validation_fixed, validation_moving],
            label=validation_dvf,
            batch_size=batch_size),
        validation_steps=math.ceil(validation_fixed.shape[0] / batch_size))

    # accuracy = model.evaluate_generator(generator(
    #    inputs=[validation_fixed, validation_moving], label=validation_dvf, batch_size=batch_size), steps=1, verbose=1)
    model.save('model.h5')
    """Testing to see where issue with DVF is """
    dvf = model.predict(helper.generator([test_fixed, test_moving],
                                         label=test_dvf,
                                         predict=True,
                                         batch_size=1),
                        steps=math.ceil(test_fixed.shape[0] / batch_size),
                        verbose=1)
    helper.write_images(test_fixed,
                        test_fixed_affine,
                        file_path='./outputs/',
                        file_prefix='fixed')
    helper.write_images(test_moving,
                        test_moving_affine,
                        file_path='./outputs/',
                        file_prefix='moving')
    helper.write_images(dvf,
                        test_fixed_affine,
                        file_path='./outputs/',
                        file_prefix='dvf')
Example no. 14
from helpers import load_data, split_data, calculate_rmse, get_linear_blend_clf
from helpers import generate_submission
from ALS import get_ALS_predictions

# Load the data
path_dataset = "data/data_train.csv"

print('Loading the data...')
ratings = load_data(path_dataset)

# Split in training and testing sets
print('\nSplitting the data in train and test sets...')
train, test = split_data(ratings, p_test=0.1)

# Generate predictions for 6 different models
X, X_train, y_train, X_test, y_test = get_ALS_predictions(
    ratings,
    train,
    test,
    n_features_array=range(1, 31),
    lambda_user=0.2,
    lambda_item=0.02
)

# Linear blend of the previous models computed on the test set.
clf = get_linear_blend_clf(X_test, y_test)

print('\nRMSE Train: %f' % calculate_rmse(clf.predict(X_train.T), y_train))

print('RMSE Test: %f' % calculate_rmse(clf.predict(X_test.T), y_test))
Example no. 15
                    print("Parameters already computed, keep searching...".
                          format(u, i),
                          end="\r")
            else:
                print("Point too far from current best, keep searching...",
                      end="\r")
        _, _, c = ALS(train, test, u, i, num_features=num_features)
        costs[(u, i)] = c
        with open(costs_filename, "wb") as f:
            pkl.dump(costs, f)
    return get_best_lambdas(num_features)


def get_best_lambdas(num_features):
    """Return the best paramters."""
    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
    costs_filename = CURRENT_DIR + "/cache/als_costs_{}.pkl".format(
        num_features)
    costs = deserialize_costs(costs_filename)
    if len(costs) == 0:
        return (None, None)
    return min(costs, key=lambda x: costs[x])


if __name__ == "__main__":
    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
    path_dataset = CURRENT_DIR + "/../data/data_train.csv"
    ratings = load_data(path_dataset)
    tr, te = split_data(ratings, p_test=0.1)
    optimizer_lambdas(int(sys.argv[1]), tr, te, close_to_best=True)
Example no. 16
torch.manual_seed(1)
inputs, targets = h.generate_disc_data(n=1000)
'''
#Plot the distribution of the generated data, to see what it looks like

plt.scatter(inputs[:,0].tolist(), inputs[:,1].tolist(), c = targets.tolist(), cmap = 'cividis')
#plt.savefig('data.png')
plt.xlabel("X coordinate")
plt.ylabel("Y coordinate")
plt.title("Distribution of data points")
plt.show()
'''

#Split the data into train, validation and test sets
train_data, train_targets,\
  validation_data, validation_targets, test_data, test_targets = h.split_data(inputs, targets, 0.7, 0.1, 0.2)

#Data normalization
mean, std = inputs.mean(), inputs.std()

train_data.sub_(mean).div_(std)
validation_data.sub_(mean).div_(std)
test_data.sub_(mean).div_(std)

#Instantiate the model

Input_Units = 2
Output_Units = 2
Hidden_Units = 25

model = m.Sequential(m.Linear(Input_Units, Hidden_Units), m.ReLU(),
Example no. 17
def inference():
    print('Load data to Transform')
    fixed_predict, moving_predict, dvf_label = load.data_reader(
        fixed_dir, moving_dir, dvf_dir)

    print('Turn into numpy arrays')
    fixed_array, fixed_affine = fixed_predict.get_data()
    moving_array, moving_affine = moving_predict.get_data()
    dvf_array, dvf_affine = dvf_label.get_data(is_image=False)

    print('Shuffle')
    fixed_array, moving_array, dvf_array = helper.shuffle_inplace(
        fixed_array, moving_array, dvf_array)
    fixed_affine, moving_affine, dvf_affine = helper.shuffle_inplace(
        fixed_affine, moving_affine, dvf_affine)

    print('Split into test/training data')
    test_fixed, test_moving, test_dvf, train_fixed, train_moving, train_dvf = helper.split_data(
        fixed_array, moving_array, dvf_array, split_ratio=0.05)
    test_fixed_affine, test_moving_affine, test_dvf_affine, train_fixed_affine, train_moving_affine, train_dvf_affine = helper.split_data(
        fixed_affine, moving_affine, dvf_affine, split_ratio=0.05)

    print('Load models')
    print("Fixed input", test_fixed.shape)
    print("Moving input", test_moving.shape)
    model = load_model('best_model.h5')
    model.compile(optimizer='Adam',
                  loss='mean_squared_error',
                  metrics=["accuracy"])
    dvf = model.predict_generator(helper.generator([test_fixed, test_moving],
                                                   label=test_dvf,
                                                   predict=True,
                                                   batch_size=batch_size),
                                  steps=math.ceil(test_fixed.shape[0] /
                                                  batch_size),
                                  verbose=1)
    test_loss = model.evaluate_generator(
        helper.generator([test_fixed, test_moving],
                         label=test_dvf,
                         predict=True,
                         batch_size=batch_size),
        steps=math.ceil(test_fixed.shape[0] / batch_size),
        verbose=1)

    print('Save DVF')
    # Save images
    helper.write_images(test_fixed,
                        test_fixed_affine,
                        file_path='./outputs/',
                        file_prefix='fixed')
    helper.write_images(test_moving,
                        test_moving_affine,
                        file_path='./outputs/',
                        file_prefix='moving')
    helper.write_images(dvf,
                        test_fixed_affine,
                        file_path='./outputs/',
                        file_prefix='dvf')
    print("Test Loss:", test_loss)
    # Save warped
    print("Test Loss Shape:", test_loss.shape)
Example no. 18
"""

from als import run_als_asynchronously
from sgd import run_sgd_asynchronously
from helpers import load_data, split_data
import numpy as np

if __name__ == '__main__':
    # Initializing dataset
    path_dataset = "data/data_train.csv"
    ratings = load_data(path_dataset)

    num_items_per_user = np.array((ratings != 0).sum(axis=0)).flatten()
    num_users_per_item = np.array((ratings != 0).sum(axis=1).T).flatten()

    valid_ratings, train, test = split_data(ratings,
                                            num_items_per_user,
                                            num_users_per_item,
                                            min_num_ratings=10,
                                            p_test=0.1)

    # Uncomment these lines if you want to train on ALS and input your parameter tuples
    # args_list = [(train, test, 9, 0.1, 0.014), (train, test, 9, 0.1, 0.016), (train, test, 9, 0.105, 0.01)]
    # run_als_asynchronously(args_list)

    # Uncomment these lines if you want to train on SGD and input your parameter tuples
    args_list = [(train, test, 0.04, 9, 0.1, 0.014),
                 (train, test, 0.04, 9, 0.1, 0.016),
                 (train, test, 0.04, 9, 0.105, 0.01)]
    run_sgd_asynchronously(args_list)
Example no. 19
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.show()
    plt.plot(history.history['acc'])
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.savefig(id + '.png')
else:
    from sklearn.externals.joblib import dump
    print('Loading', observation_batches, 'batches from', input_observations,
          '...')
    x = load_observations(observation_batches,
                          input_observations,
                          mode='sample')
    print('Splitting data in training and test dataset', train_pct)
    x_tr, x_te, y_tr, y_te = split_data(x, train_pct)
    del x
    class_weights = {0: 1, 1: args.weight}
    if 'svm' in algorithms:
        print('Using support vector machine')
        from sklearn import svm
        s = svm.LinearSVC(C=0.5,
                          class_weight=class_weights,
                          max_iter=1e6,
                          verbose=1)
        s.fit(x_tr, y_tr)
        tpr, fpr = measure_performance(s.predict(x_te), y_te)
        print('True positive rate', tpr, 'False positive rate', fpr)
        id = str(int(time()))
        if isfile(svm_models_path):
            with open(svm_models_path, 'a') as file:
Example no. 20
def generateAugmentedImage(image_path, groundtruth_path,
                           aug_image_path, aug_groundtruth_path,
                           val_image_path, val_groundtruth_path,
                           howMany):
    """ Generate new images using rotation, shear, zoom, etc transformation
        Load the images from the disk (image_path) and writes the augmented images
        to write path
    """

    # Check if the directories exist; if not, create them
    if not os.path.exists(aug_image_path):
        os.makedirs(aug_image_path)
    if not os.path.exists(aug_groundtruth_path):
        os.makedirs(aug_groundtruth_path)
    if not os.path.exists(val_image_path):
        os.makedirs(val_image_path)
    if not os.path.exists(val_groundtruth_path):
        os.makedirs(val_groundtruth_path)

    tr_images, gt_images = load_all_images(100,'training/')
    tr_images = np.array(tr_images)
    gt_images = np.array(gt_images)

    Im_tr, GT_tr, Im_val, GT_val = split_data(tr_images, gt_images, training_ratio,10)

    # Create 2 generator with same seed for training and groundtruth transformation
    data_gen_args = dict(featurewise_center=False,
                         samplewise_center=False,
                         featurewise_std_normalization=False,
                         samplewise_std_normalization=False,
                         zca_whitening=False,
                         rotation_range=90,
                         width_shift_range=0.,
                         height_shift_range=0.,
                         shear_range=0.3,
                         zoom_range=0.,
                         fill_mode='reflect',
                         horizontal_flip=True,
                         vertical_flip=True)


    image_gen = ImageDataGenerator(**data_gen_args)
    gt_gen = ImageDataGenerator(**data_gen_args)


    # Provide the same seed and keyword arguments to the fit and flow methods
    seed = np.random.randint(2**32 - 1)

    # a hack to get the transformed images without giving a vector of labels
    Y_trash = np.ones(Im_tr.shape[0])


    iter_tr = image_gen.flow(Im_tr,
                             Y_trash,
                             batch_size = 1,
                             shuffle = None,
                             seed=seed,
                             save_to_dir=aug_image_path,
                             save_prefix='aug', save_format='png')

    iter_gt = gt_gen.flow(GT_tr[...,np.newaxis],
                          Y_trash,
                          batch_size = 1,
                          shuffle = None,
                          seed=seed,
                          save_to_dir=aug_groundtruth_path,
                          save_prefix='aug', save_format='png')


    # Save the augmented images
    for i in range(howMany):
        if i % 100 == 0:
            print("Saving image " + str(i) + "...")
        iter_tr.next()
        iter_gt.next()

    # Save the validation images
    for i in range(Im_val.shape[0]):
        Image.fromarray(((255*Im_val[i]).astype(np.uint8))).save(os.path.normpath(val_image_path) + '/val_Sat_Image_' + str(i) + '.png')
        Image.fromarray(((255*GT_val[i]).astype(np.uint8))).save(os.path.normpath(val_groundtruth_path) + '/val_Sat_Image_' + str(i) + '.png')
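A hypothetical invocation; the paths and count below are placeholders, and the globals used inside the function (for example training_ratio) are assumed to be defined elsewhere in the project:

generateAugmentedImage(image_path='training/images/',
                       groundtruth_path='training/groundtruth/',
                       aug_image_path='training/augmented/images/',
                       aug_groundtruth_path='training/augmented/groundtruth/',
                       val_image_path='training/validation/images/',
                       val_groundtruth_path='training/validation/groundtruth/',
                       howMany=2000)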
Example no. 21
def get_prediction(neural_net,
                   global_vectors,
                   full_corpus,
                   total_training_tweets,
                   nr_pos_tweets,
                   kaggle_name,
                   epochs,
                   patience,
                   split=0.8):
    """ Creates a csv file with kaggle predictions and returns the predictions.
    Input:
        neural_net: Name of a neural net model
        global_vectors: global vectors created from the gensim .txt files.
        total_training_tweets: (int) Number of tweets that are training tweets. Assumes that the first portion of the corpus is
        training tweets and the second part is the unseen test set.
        nr_pos_tweets: (int) number of training tweets that are positive
        kaggle_name: Name for the csv file, must end in '.csv'.

    Output:
        prediction: the predictions (1 or -1)
        a .csv file with name 'kaggle_name'
    """
    num_of_dim = global_vectors.syn0.shape[1]

    # separate training data and test data
    train_corpus = full_corpus[:total_training_tweets]
    predict_corpus = full_corpus[total_training_tweets:]

    # Build a vector of all the words in a tweet
    train_document_vecs = np.concatenate([
        GM.buildDocumentVector(doc, num_of_dim, global_vectors)
        for doc in train_corpus
    ])
    train_document_vecs = sk.preprocessing.scale(train_document_vecs)

    labels = HL.create_labels(nr_pos_tweets, nr_pos_tweets, kaggle=False)

    train_document_vecs, labels = HL.shuffle_data(train_document_vecs, labels)
    train_x, val_x, train_y, val_y = HL.split_data(train_document_vecs, labels,
                                                   split)

    test_document_vecs = np.concatenate([
        GM.buildDocumentVector(doc, num_of_dim, global_vectors)
        for doc in predict_corpus
    ])
    test_document_vecs = sk.preprocessing.scale(test_document_vecs)

    model = neural_net(num_of_dim)

    # Defining callbacks to be used under fitting process
    early_stopping = early_stopping_callback(patience_=patience, verbose_=1)
    model_checkpoint = model_checkpoint_callback(
        "neural_model_prediction.hdf5", verbose_=1)

    history = model.fit(train_x,
                        train_y,
                        epochs=epochs,
                        batch_size=1024,
                        verbose=1,
                        callbacks=[early_stopping, model_checkpoint],
                        validation_data=(val_x, val_y))

    # Loading the best model found during training
    model = load_model('neural_model_prediction.hdf5')

    prediction = model.predict(test_document_vecs)

    prediction = [1 if i > 0.5 else -1 for i in prediction]

    # Creating prediction
    ids = list(range(1, 10000 + 1))
    HL.create_csv_submission(ids, prediction, kaggle_name)

    return prediction
Example no. 22
def infer(batch_size=2):
    # On server with PET and PCT in
    image_dir = "/hepgpu3-data1/dmcsween/DataTwoWay128/fixed"
    print("Load Data")
    image_data, __image, __label = load.data_reader(image_dir, image_dir, image_dir)

    image_array, image_affine = image_data.get_data()
    moving_array, moving_affine = __image.get_data()
    dvf_array, dvf_affine = __label.get_data()

    list_avail_keys = help.get_moveable_keys(image_array)
    # Get hamming set
    print("Load hamming Set")
    hamming_set = pd.read_csv("hamming_set.txt", sep=",", header=None)
    print(hamming_set)
    # Ignore moving and dvf
    validation_dataset, validation_moving, validation_dvf, train_dataset, train_moving, train_dvf = helper.split_data(
        image_array, moving_array, dvf_array, split_ratio=0.15)
    print("Valid Shape:", validation_dataset.shape)
    normalised_dataset = helper.normalise(validation_dataset)
    print('Load models')
    idx_list = [0, 9]
    K.clear_session()
    model = load_model('./logs/best_model.h5')
    myPredictGen = gen.predict_generator(
        normalised_dataset, list_avail_keys, hamming_set, hamming_idx=idx_list, batch_size=batch_size, N=10)
    opt = optimizers.SGD(lr=0.01, momentum=0.9)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=["accuracy"])
    output = model.predict_generator(generator=myPredictGen, steps=1, verbose=1)
    print(output)
Example no. 23
def train(tileSize=64, numPuzzles=23, num_permutations=10, batch_size=16):
    # On server with PET and PCT in
    image_dir = "/hepgpu3-data1/dmcsween/Data128/ResampleData/PlanningCT"

    print("Load Data")
    image_data, __image, __label = load.data_reader(image_dir, image_dir,
                                                    image_dir)

    image_array, image_affine = image_data.get_data()
    moving_array, moving_affine = __image.get_data()
    dvf_array, dvf_affine = __label.get_data()
    """
    list_avail_keys = help.get_moveable_keys(image_array)
    hamming_set = pd.read_csv(
        "hamming_set_PCT.txt", sep=",", header=None)
    """
    avail_keys = pd.read_csv("avail_keys_both.txt", sep=",", header=None)
    print("Len keys:", len(avail_keys))
    list_avail_keys = [(avail_keys.loc[i, 0], avail_keys.loc[i, 1],
                        avail_keys.loc[i, 2]) for i in range(len(avail_keys))]
    print(list_avail_keys)
    # Get hamming set
    print("Load hamming Set")
    hamming_set = pd.read_csv("hamming_set.txt", sep=",", header=None)

    #hamming_set = hamming_set.loc[:9]
    print("Ham Len", len(hamming_set))
    print(hamming_set)

    fixed_array, moving_array, dvf_array = helper.shuffle_inplace(
        image_array, moving_array, dvf_array)

    # Ignore moving and dvf
    validation_dataset, validation_moving, validation_dvf, train_dataset, train_moving, train_dvf = helper.split_data(
        fixed_array, moving_array, dvf_array, split_ratio=0.15)

    normalised_train = helper.norm(train_dataset)
    normalised_val = helper.norm(validation_dataset)
    # Output all data from a training session into a dated folder
    outputPath = "./logs"
    # hamming_list = [0, 1, 2, 3, 4]
    # img_idx = [0, 1, 2, 3, 4]
    # callbacks
    checkpoint = ModelCheckpoint(outputPath + '/best_model.h5',
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 period=1)
    reduce_lr_plateau = ReduceLROnPlateau(monitor='val_acc',
                                          patience=10,
                                          verbose=1)
    # early_stop = EarlyStopping(monitor='val_acc', patience=5, verbose=1)
    tensorboard = TrainValTensorBoard(write_graph=False)
    callbacks = [checkpoint, reduce_lr_plateau, tensorboard]
    # BUILD Model
    model = createSharedAlexnet3D_onemodel()
    # for layer in model.layers:
    #     print(layer.name, layer.output_shape)
    opt = optimizers.SGD(lr=0.01)
    plot_model(model, to_file='model.png')
    print(model.summary())
    model.compile(optimizer=opt,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    model.fit_generator(generator=gen.generator(normalised_train,
                                                list_avail_keys,
                                                hamming_set,
                                                batch_size=batch_size,
                                                N=num_permutations),
                        epochs=1000,
                        verbose=1,
                        steps_per_epoch=normalised_train.shape[0] //
                        batch_size,
                        validation_data=gen.generator(normalised_val,
                                                      list_avail_keys,
                                                      hamming_set,
                                                      batch_size=batch_size,
                                                      N=num_permutations),
                        validation_steps=normalised_val.shape[0] // batch_size,
                        callbacks=callbacks,
                        shuffle=False)
    model.save('model_best.h5')
Example no. 24
def cross_validation(ratings,
                     fold,
                     solver,
                     lambda_usr=0.02,
                     lambda_item=0.02,
                     k_features=5,
                     name=None,
                     kwargs=None):

    A = ratings.copy()
    train_array = []
    test_array = []
    # Check if the data have been generated and whether the data is correct;
    # if file_flag is False, then new data must be generated
    file_flag = filecheck(fold)

    for i in range(fold):

        trainset_file_path = 'trainset_%s_th.npz' % (i)
        testset_file_path = 'testset_%s_th.npz' % (i)

        if (os.path.exists(trainset_file_path)
                and os.path.exists(testset_file_path) and file_flag):
            # if data have been generated before, just load them
            train = sp.load_npz(trainset_file_path)
            test = sp.load_npz(testset_file_path)

            train_array.append(train)
            test_array.append(test)
        else:
            # else generate k-fold dataset for training and testing
            p_test = 1 / (fold - i)
            _, test = split_data(A, p_test, seed=998)
            train = sp.lil_matrix(ratings.shape)
            train = ratings - test
            test = test.tocsc()

            train_array.append(train)
            test_array.append(test)
            A = A - test

            print('\n Crossvalidation dataset has been created')
            sp.save_npz(trainset_file_path, train.tocsr())
            sp.save_npz(testset_file_path, test.tocsr())
    '''
    11 Dec. 2018, Zhantao Deng
        1. Removed the RMSE calculation, since SGD already provides the RMSE.
        2. Choose the model with the best error rather than returning the last model.
    '''
    # train period
    # Generate predictions by particular models specified by parameters

    # define rmse variables, initialized with a large number
    rmse_test = [100]
    rmse_train = [100]
    for ind, item in enumerate(train_array):
        train = item
        test = test_array[ind]

        if name is None:
            X_whole, trainERR, testERR = solver(ratings,
                                                train,
                                                test,
                                                lambda_usr,
                                                lambda_item,
                                                k_features,
                                                ind,
                                                kwargs=kwargs)
        else:
            sys.exit('Only SGD, ALS and ALS_ours support our cross validation')

        # store rmse
        rmse_test.append(testERR)
        rmse_train.append(trainERR)

        # if the train error is not larger than any previous train error, keep this model as the best one
        if trainERR <= min(rmse_train[:-1]):
            Best_X = X_whole

    # transform from float to integer
    Best_X = transform(Best_X)

    return Best_X, np.mean(rmse_test[1:]), np.mean(rmse_train[1:])
Example no. 25
def find_min_num_ratings(min_num_ratings_array, ratings, num_items_per_user,
                         num_users_per_item, p_test, lambda_item, lambda_user,
                         num_features):
    """ Compute the train and test RMSE of ALS for a set of minimum number of ratings for users and items

    :param min_num_ratings_array: array of minimum number of ratings
    :param ratings: sparse matrix containing the data, i.e. the ratings
    :param num_items_per_user: number of items per user
    :param num_users_per_item: number of user per item
    :param p_test: probability that a ratings is in the test set
    :param lambda_item: the regularization ALS parameter for the item features matrix
    :param lambda_user: the regularization ALS parameter for the user features matrix
    :param num_features: the number of features of our item's and user's matrices
    :return: the full reconstructed item and user features matrices and the train RMSE and the test RMSE for each
             minimum number of ratings
    """
    # Initialization of the arrays to store the rmse and the fully reconstructed features matrices
    full_user_features_array = []
    full_item_features_array = []
    rmse_test_full_array = []
    rmse_train_full_array = []

    for min_num_ratings in min_num_ratings_array:
        print("Minimum number of ratings : {}".format(min_num_ratings))

        # Split the data ratings with probability p_test and delete the ratings with less than min_num_ratings
        valid_ratings, train, test, valid_users, valid_items, train_full, test_full = split_data(
            ratings, num_items_per_user, num_users_per_item, min_num_ratings,
            p_test)

        # Call ALS to get the predicted features matrices to fill
        predicted_user_features, predicted_item_features, _, _ = ALS(
            train, test, lambda_user, lambda_item, num_features)

        # Reconstruct the full features matrices with the selected minimum number of ratings
        full_user_features, full_item_features = construct_full_features(
            predicted_user_features, predicted_item_features, valid_users,
            valid_items, min_num_ratings, train_full, lambda_user, lambda_item)

        # Add the features matrices in the array to return
        full_user_features_array.append(full_user_features)
        full_item_features_array.append(full_item_features)

        nz_row_te, nz_col_te = test_full.nonzero()
        nz_full_test = list(zip(nz_row_te, nz_col_te))
        nz_row_tr, nz_col_tr = train_full.nonzero()
        nz_full_train = list(zip(nz_row_tr, nz_col_tr))

        # Compute the RMSE for the test and train sets and add them to the arrays
        rmse_train_full = compute_error(train_full, full_user_features,
                                        full_item_features, nz_full_train)
        rmse_test_full = compute_error(test_full, full_user_features,
                                       full_item_features, nz_full_test)
        rmse_train_full_array.append(rmse_train_full)
        rmse_test_full_array.append(rmse_test_full)
    return full_item_features_array, full_user_features_array, rmse_train_full_array, rmse_test_full_array
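A hypothetical call, reusing the per-user and per-item rating counts computed as in Example no. 18 above; the regularization strengths, feature count and minimum-rating values are placeholders:

num_items_per_user = np.array((ratings != 0).sum(axis=0)).flatten()
num_users_per_item = np.array((ratings != 0).sum(axis=1).T).flatten()
item_feats, user_feats, rmse_train_arr, rmse_test_arr = find_min_num_ratings(
    min_num_ratings_array=[1, 5, 10],
    ratings=ratings,
    num_items_per_user=num_items_per_user,
    num_users_per_item=num_users_per_item,
    p_test=0.1,
    lambda_item=0.02,
    lambda_user=0.2,
    num_features=20)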
def train(
        input_file="clean_train.csv",
        text_col="question_text",
        label_col="target",
        valid_ratio=0.2,
        max_sentence_length=91,
        sample_percent=1,
        class_weights=None,
        cell_type="gru",
        embedding="word2vec",
        embedding_path="GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin",
        embedding_dim=300,
        rnn_layers=3,
        hidden_size=128,
        one_minus_dropout=0.5,
        l2_reg=3.0,
        batch_size=32,
        epochs=5,
        learning_rate=1e-3,
        allow_soft_placement=True,
        log_device_placement=False,
        display_every=10,
        evaluate_every=100,
        checkpoint_every=100,
        num_checkpoints=5):
    # Load and split data
    print("Loading data..")
    X, Y = read_data(input_file,
                     text_col,
                     label_col,
                     sample_percent=sample_percent)

    # Create a vocabulary processor.
    # Its job is to assign each unique word an integer; each word in our sentences is then replaced by its corresponding integer.
    # These mappings are later used again to substitute each word with its embedding.
    # The processor also trims sentences or adds trailing zeros to pad each sentence to a specific length.
    print("Setting up vocabulary..")
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        max_sentence_length)
    X = np.array(list(vocab_processor.fit_transform(X)))
    print("Vocabulary Size: ", len(vocab_processor.vocabulary_))
    num_classes = len(Y[0])

    # split in to train and validation
    X, Y, x_val, y_val = split_data(X, Y, valid_ratio)

    # initialize tensorflow config
    print("Initializing tensorflow session..")
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=allow_soft_placement,
            log_device_placement=log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            print("Initializing our RNN:")
            print("\nseq_length : ", X.shape[1], "\nnum_classes : ",
                  Y.shape[1], "\nvocab_size : ",
                  len(vocab_processor.vocabulary_), "\nembedding_size : ",
                  embedding_dim, "\ncell_type : ", cell_type,
                  "\nhidden_size : ", hidden_size, "\nl2 : ", l2_reg,
                  "\nclass_weights :  ", class_weights, "\nbatch_size : ",
                  batch_size, "\nrnn_layers :  ", rnn_layers)
            # Initialize our RNN
            rnn = RNN(seq_length=X.shape[1],
                      num_classes=Y.shape[1],
                      vocab_size=len(vocab_processor.vocabulary_),
                      embedding_size=embedding_dim,
                      cell_type=cell_type,
                      hidden_size=hidden_size,
                      l2=l2_reg,
                      class_weights=class_weights,
                      batch_size=batch_size,
                      rnn_layers=rnn_layers)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            train_op = tf.train.AdamOptimizer(learning_rate).minimize(
                rnn.loss, global_step=global_step)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", rnn.loss)
            acc_summary = tf.summary.scalar("accuracy", rnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Validation summaries
            val_summary_op = tf.summary.merge([loss_summary, acc_summary])
            val_summary_dir = os.path.join(out_dir, "summaries", "val")
            val_summary_writer = tf.summary.FileWriter(val_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")

            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=num_checkpoints)

            # Write vocabulary
            vocab_processor.save(os.path.join(out_dir, "text_vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Initialize pretrained embeddings if an embedding type is specified
            if embedding:
                # initialize the matrix with random uniform values
                initW = np.random.uniform(
                    -0.25, 0.25,
                    (len(vocab_processor.vocabulary_), embedding_dim))

                # In the case of GloVe, loading the embeddings is straightforward:
                # just read each line; the first token is the word
                # and everything else on the line is the embedding vector for that word
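                # (Illustrative line format, assumed rather than read from the file:
                #  "<word> 0.013 -0.241 0.552 ..." -- the token followed by its
                #  space-separated float components.)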
                if "glove" in embedding:
                    with open(embedding_path, "r", encoding="utf8") as f:
                        for line in f:
                            first_word = line.partition(' ')[0]
                            rest = line[line.index(' ') + 1:]
                            # Check whether the word is in our vocabulary
                            idx = vocab_processor.vocabulary_.get(first_word)
                            if idx != 0:
                                # If so, substitute the GloVe embedding for the random one
                                initW[idx] = np.fromstring(rest,
                                                           dtype='float32',
                                                           sep=" ")
                # In the case of word2vec, we are given a binary file
                elif "word2vec" in embedding:
                    with open(embedding_path, "rb") as f:
                        # The first line is a header containing the number of records and the size of one record
                        header = f.readline()
                        vocab_size, layer1_size = map(int, header.split())
                        # The number of bytes in each record = (size of a float) * (size of one record)
                        binary_len = np.dtype('float32').itemsize * layer1_size
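                        # Illustrative layout (assumed, not taken from the actual file):
                        # a header line such as b"3000000 300\n", followed by one record
                        # per word of the form b"word " plus 300 float32 values as raw bytes.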
                        # for each record
                        for line in range(vocab_size):
                            word = []
                            while True:
                                # Keep reading one character at a time
                                ch = f.read(1).decode('latin-1')
                                if ch == ' ':
                                    # until you find a space, then the first word is complete
                                    word = ''.join(word)
                                    break
                                if ch != '\n':
                                    word.append(ch)
                            # Try to find that first word in our vocabulary
                            idx = vocab_processor.vocabulary_.get(word)
                            if idx != 0:
                                # if found, substitute the corresponding embedding vector for the random one
                                initW[idx] = np.fromstring(f.read(binary_len),
                                                           dtype='float32')
                            else:
                                f.read(binary_len)

                sess.run(rnn.W_text.assign(initW))
                print("Successful to load ", embedding, "!\n")

            # Once we are done with the embeddings and basic tensorflow settings,
            # we start the actual training routine

            # Generate batches
            itr = batch_iterator(X, Y, batch_size, epochs)
            # For each batch
            for x_batch, y_batch, start, end in itr:
                # Train
                feed_dict = {
                    rnn.input_text: x_batch,
                    rnn.input_label: y_batch,
                    rnn.keep_prob: one_minus_dropout
                }
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, rnn.loss,
                    rnn.accuracy
                ], feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display
                if step % display_every == 0:
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, loss, accuracy))

                # Evaluation
                if step % evaluate_every == 0:
                    print("\nEvaluation:")
                    total_preds = np.zeros(y_val.shape)
                    itr2 = batch_iterator(x_val,
                                          y_val,
                                          batch_size,
                                          1,
                                          shuffle=False)
                    avg_acc = 0
                    avg_loss = 0
                    steps = 0
                    for x_eval_batch, y_eval_batch, s, e in itr2:
                        feed_dict_val = {
                            rnn.input_text: x_eval_batch,
                            rnn.input_label: y_eval_batch,
                            rnn.keep_prob: 1.0
                        }
                        summaries_val, loss, accuracy, preds = sess.run([
                            val_summary_op, rnn.loss, rnn.accuracy,
                            rnn.predictions
                        ], feed_dict_val)
                        val_summary_writer.add_summary(summaries_val, step)
                        k = np.array([
                            one_hot_encode(num_classes, label)
                            for label in preds
                        ])
                        avg_acc += accuracy
                        avg_loss += loss
                        steps += 1
                        total_preds[s:e] = k
                    cf, f_score = confusion_matrix(y_val, total_preds, 2)
                    avg_acc /= steps
                    avg_loss /= steps
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: loss {:g}, acc {:g}, fscore {:g}\n".format(
                        time_str, avg_loss, avg_acc, f_score))
                    print("Confusion Matrix")
                    print(cf)
                # Model checkpoint
                if step % checkpoint_every == 0:
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=step)
                    print("Saved model checkpoint to {}\n".format(path))
Example no. 27
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/11/22 21:53
# @Author : Qiangz
# @File : evaluation.py

from predata import allData
from helpers import split_data
from sklearn.model_selection import train_test_split
from model import *


# Split the data set for the mind-reading test; train:test:val = 4:1:1
trainData, testData, valData = split_data(allData)
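The `split_data` helper imported above is not shown either. Since the snippet already imports scikit-learn's train_test_split, one way to obtain the same 4:1:1 ratio, sketched purely as an assumption, is to split twice:

from sklearn.model_selection import train_test_split

def split_data_sketch(data, seed=42):
    """Return (train, test, val) subsets in a 4:1:1 ratio (a sketch, not the project helper)."""
    train, rest = train_test_split(data, test_size=2 / 6, random_state=seed)
    test, val = train_test_split(rest, test_size=0.5, random_state=seed)
    return train, test, val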
Example no. 28
0
def infer(batch_size=2):
    # On server with PET and PCT in
    image_dir = "/hepgpu3-data1/dmcsween/DataTwoWay128/fixed"
    #image_dir = "/hepgpu3-data1/dmcsween/Data128/ResampleData/PlanningCT"
    inputPath = "./all_logs/both_logs100perms"
    #inputPath = './mixed_hamming_logs'
    print("Load Data")
    image_data, __image, __label = load.data_reader(image_dir, image_dir,
                                                    image_dir)

    image_array, image_affine = image_data.get_data()
    moving_array, moving_affine = __image.get_data()
    dvf_array, dvf_affine = __label.get_data()
    """
    list_avail_keys = help.get_moveable_keys(image_array)
    # Get hamming set
    print("Load hamming Set")
    hamming_set = pd.read_csv("hamming_set.txt", sep=",", header=None)
    print(hamming_set)
    """
    avail_keys = pd.read_csv("avail_keys_both.txt", sep=",", header=None)
    list_avail_keys = [(avail_keys.loc[i, 0], avail_keys.loc[i, 1],
                        avail_keys.loc[i, 2]) for i in range(len(avail_keys))]
    # Get hamming set
    print("Load hamming Set")
    hamming_set = pd.read_csv("mixed_hamming_set.txt", sep=",", header=None)

    hamming_set = hamming_set.loc[:99]
    # Ignore moving and dvf
    test_dataset, validation_moving, validation_dvf, trainVal_dataset, train_moving, train_dvf = helper.split_data(
        image_array, moving_array, dvf_array, split_ratio=0.05)
    print("Valid Shape:", test_dataset.shape)
    normalised_dataset = helper.normalise(test_dataset)
    print('Load models')
    scores = np.zeros((15, 20))
    blank_idx = [n for n in range(23)]
    print(blank_idx)
    K.clear_session()
    model = load_model(inputPath + '/final_model.h5')
    opt = optimizers.SGD(lr=0.01)
    model.compile(optimizer=opt,
                  loss='categorical_crossentropy',
                  metrics=["accuracy"])
    idx_list = []
    # i is border size
    for i in range(15):
        for j in range(20):
            idx_list = [10, 10]
            print("Pre Eval:", i, j)
            myPredictGen = gen.evaluate_generator(normalised_dataset,
                                                  list_avail_keys,
                                                  hamming_set,
                                                  hamming_idx=idx_list,
                                                  batch_size=batch_size,
                                                  blank_idx=blank_idx,
                                                  border_size=i,
                                                  image_idx=[10, 10],
                                                  full_crop=False,
                                                  out_crop=True,
                                                  inner_crop=False,
                                                  N=100)
            accuracy = model.evaluate_generator(generator=myPredictGen,
                                                steps=5,
                                                verbose=1)
            print("%s: %.2f%%" % (model.metrics_names[1], accuracy[1] * 100))
            scores[i, j] = (accuracy[1] * 100)

    np.savetxt("scores_diff_border.txt", scores, delimiter=",", fmt='%1.2i')
    avg_score = np.mean(scores, axis=1)
    avg_perm = np.mean(scores, axis=0)
    error_score = np.std(scores, axis=1)
    error_perm = np.std(scores, axis=0)
    var_score = np.var(scores, axis=1)
    var_perm = np.var(scores, axis=0)
    print("Scores:", avg_score, error_score, var_score)
    print("Perms:", avg_perm, error_perm, var_perm)
    print("Done")
Example no. 29
0
def main(input_, format_, rounded=False, num_features=40, cache_name="test"):
    """Trains ALS and returns predictions.

    Performs ALS and predicts entries of 'format_'.

    To find optimal hyperparameters, samples the hyperparameter space
    and takes the best set of parameters found.

    Arguments:
        input_ -- Training dataset
        format_ -- Entries for which predictions are emitted

    Keyword Arguments:
        rounded {bool} -- Whether to round and clip predictions to integers in [1, 5] (default: {False})
        num_features {int} -- Number of latent features used by ALS (default: {40})
        cache_name {str} -- Suffix of the cached factorization file (default: {"test"})

    Returns:
        list -- List of predictions, tuples of format ("id", rating)

    """
    # preprocess data
    ratings = preprocess_data(input_.copy())
    final = preprocess_data(format_.copy())

    # directory of this file, used to locate the cache
    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))

    # try to retrieve best matrix factorization
    print("Trying to retrieve cached optimal matrix factorization")
    factorized_filename = CURRENT_DIR +\
        "/cache/factorized_{}.pkl".format(cache_name)
    try:
        with open(factorized_filename, "rb") as f:
            print("Successfully retrieved cached optimal matrix factorization")
            factorized = pkl.load(f)
    except FileNotFoundError:
        # if failed, recompute and cache
        print("Unable to retrieve cached optimal matrix "
              "factorization, computing")
        min_ulambda, min_ilambda = get_best_lambdas(num_features)
        if min_ulambda is None or min_ilambda is None:
            print("Splitting train/test")
            train, test = split_data(ratings, p_test=0.1)
            min_ulambda, min_ilambda = optimizer_lambdas(150, train, test)
        factorized, _ = ALS(ratings,
                            lambda_user=min_ulambda,
                            lambda_item=min_ilambda,
                            max_steps=100,
                            num_features=num_features)
        with open(factorized_filename, "wb") as f:
            print("Caching optimal matrix factorization")
            pkl.dump(factorized, f)
    ufeats, ifeats = factorized

    # emitting predictions
    nnz_row, nnz_col = final.nonzero()
    nnz_final = list(zip(nnz_row, nnz_col))
    ret = []
    i = 1
    for row, col in nnz_final:
        if i % 100 == 0 or i == len(nnz_final):
            print("Emitting predictions {}/{}".format(i, len(nnz_final)),
                  end="\r")
            sys.stdout.flush()
        item_info = ifeats[:, row]
        user_info = ufeats[:, col]
        r = user_info.T.dot(item_info)
        ret.append(("r{}_c{}".format(row+1, col+1),
                    (int(np.clip(np.round(r), 1, 5)) if rounded else r)))
        i += 1
    print("")
    ret_df = pd.DataFrame(ret, columns=["Id", "ALS"])
    ret_df.set_index("Id", inplace=True)
    assert sorted(format_['Id'].values) == sorted(ret_df.index.values)
    return ret_df
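For reference, each prediction in the loop above reduces to a dot product between one user factor column and one item factor column; the toy snippet below, with made-up shapes and values, illustrates that step together with the optional rounding and clipping to the 1-5 rating range:

import numpy as np

# Toy illustration only: K latent features, 4 items, 5 users, random factors.
K, n_items, n_users = 3, 4, 5
rng = np.random.default_rng(0)
ifeats = rng.normal(size=(K, n_items))     # item factors, one column per item
ufeats = rng.normal(size=(K, n_users))     # user factors, one column per user

row, col = 2, 1                            # item index, user index
r = ufeats[:, col].dot(ifeats[:, row])     # raw prediction
print(r, int(np.clip(np.round(r), 1, 5)))  # raw vs. rounded-and-clipped rating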