Esempio n. 1
0
def lin_models(lasso=True,
               traits=['height', 'BMI', 'WHR', 'BHMD', 'SBP'],
               nbsnps=10000,
               verbose=0,
               hot=False,
               unif=False,
               reps=1):
    alpha = [0.01]
    R = {}
    for t in traits:
        print(t)
        x_tr, x_tst, y_tr, y_tst = retrieve_data(t, nbsnps, unif=unif)
        x_tr, x_val, y_tr, y_val = train_test_split(x_tr, y_tr, test_size=0.33)
        if hot:
            x_tr = convert_to_individual_alleles(x_tr)
            x_val = convert_to_individual_alleles(x_val)
            x_tst = convert_to_individual_alleles(x_tst)

        nb_snps = x_tr.shape[1]
        res = np.zeros((len(alpha), 3))
        n = 0
        for a in alpha:
            print(a)
            for i in range(0, reps):
                m = Sequential()
                if lasso:
                    m.add(Dense(1, input_dim=nb_snps,
                                kernel_regularizer=l1(a)))
                else:
                    m.add(Dense(1, input_dim=nb_snps,
                                kernel_regularizer=l2(a)))

                m.compile(loss='mse', optimizer='adam')
                m.fit(x_tr,
                      y_tr,
                      epochs=1000,
                      callbacks=[EarlyStopping()],
                      validation_data=(x_val, y_val),
                      verbose=verbose)
                if r(m.predict(x_val).ravel(), y_val)[0] > res[n, 0]:
                    print(r(m.predict(x_val).ravel(), y_val)[0])
                    print(i)
                    res[n, 0] = r(m.predict(x_val).ravel(), y_val)[0]
                    res[n, 1] = r(m.predict(x_tst).ravel(), y_tst)[0]

                K.clear_session()

            print(res[n, 1])
            n += 1

        R[t + "val"] = res[:, 0]
        R[t + "tst"] = res[:, 1]

    R["alpha"] = alpha
    print(pd.DataFrame(R).to_csv(float_format='%.3f', index=False))
    logging.info(pd.DataFrame(R).to_csv(float_format='%.3f', index=False))
 def score(self, X, y_prob_true, sample_weight=None):
     """Return the (optionally weighted) mean per-sample correlation.

     ``self.predict_proba(X)`` is compared entry by entry with
     ``y_prob_true``: for each index ``r`` is applied to the predicted and
     the true entry and its ``.correlation`` attribute is collected.  The
     collected values are averaged with ``np.average``, using
     ``sample_weight`` as weights when it is given.

     NOTE(review): ``r`` is defined outside this block; it is presumably a
     scipy.stats correlation function returning a result object with a
     ``correlation`` field -- confirm against the file's imports.
     """
     predicted = self.predict_proba(X)
     per_sample = []
     # Index-based lookup is kept deliberately: it follows whatever
     # ``__getitem__`` semantics the two containers provide.
     for idx in range(len(y_prob_true)):
         per_sample.append(r(predicted[idx], y_prob_true[idx]).correlation)
     return np.average(per_sample, weights=sample_weight)
Esempio n. 3
0
    def calculate_individual_rmse(self):
        """Compute per-participant RMSE of predicted speed and plot correlations.

        ``self.data`` is walked in consecutive groups of five rows.  For each
        group the "Counts" values are turned into predictions with
        ``self.calculate_value`` (using the "BMI" of the group's first row)
        and compared against the group's "Speed" values.  The RMSE is stored
        both as-is and as a percentage of the "Speed" in the group's third
        row (offset ``start_ind + 2``).

        NOTE(review): the five-row grouping presumably corresponds to one
        participant per group with the preferred speed in the middle row --
        confirm against how ``self.data`` is built.  If the last group has
        fewer than three rows, the ``start_ind + 2`` lookup raises.

        Returns:
            tuple: ``(rmse_df, corr_mat)`` where ``rmse_df`` holds one row per
            entry of ``self.pref_data`` with the RMSE columns appended, and
            ``corr_mat`` is the correlation matrix of "Pref Speed", "RMSE"
            and "RMSE (% Pref)".  A heatmap of ``corr_mat`` is drawn as a
            side effect.
        """
        rmse_list = []
        rmse_perc_pref = []

        for start_ind in range(0, self.data.shape[0], 5):
            group = self.data.iloc[start_ind:start_ind + 5]
            # The first row's BMI is used for every prediction in the group.
            bmi = self.data.iloc[start_ind]["BMI"]

            pred_values = [self.calculate_value(count_value=count, bmi=bmi)
                           for count in group["Counts"]]
            true_speed = list(group["Speed"])

            rmse = np.sqrt(metrics.mean_squared_error(y_true=true_speed, y_pred=pred_values))
            rmse_list.append(float(rmse))
            rmse_perc_pref.append(float(100 * rmse / self.data.iloc[start_ind + 2]["Speed"]))

        pref_speeds = list(self.pref_data["Speed"])

        # NOTE(review): r_value is computed but never used or returned.  The
        # call is kept because r() may raise on malformed input, which
        # callers could currently be relying on.
        r_value = round(r(pref_speeds, rmse_perc_pref)[0], 3)

        rmse_df = pd.DataFrame(list(zip(self.pref_data["Subject"], self.pref_data["Age"],
                                        self.pref_data["Height"], self.pref_data["Weight"],
                                        self.pref_data["BMI"], pref_speeds, rmse_list, rmse_perc_pref)),
                               columns=["Subject", "Age", "Height", "Weight", "BMI",
                                        "Pref Speed", "RMSE", "RMSE (% Pref)"])

        corr_mat = rmse_df[["Pref Speed", "RMSE", "RMSE (% Pref)"]].corr()

        sb.heatmap(corr_mat, cmap="RdYlGn", annot=True)
        plt.title("Individual Participant Correlations")

        return rmse_df, corr_mat
    # Body of a per-user loop whose header is outside this excerpt; `userid`,
    # `group`, `inputMovies` and `pearsonCorrelationDict` come from that
    # enclosing context.

    # Sort both frames by movieId so the two rating lists built below line up
    # position by position.
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')

    # Keep only the input user's movies that also appear in this user's group
    temp_df = inputMovies[inputMovies['movieId'].isin(
        group['movieId'].tolist())]

    # Input user's ratings for the shared movies, as a plain list
    tempRatingList = temp_df['rating'].tolist()

    # The other user's ratings, also as a plain list.
    # NOTE(review): `group` is not filtered here, so the two lists only have
    # equal length if every movie in `group` is also in `inputMovies` --
    # presumably guaranteed by how `group` was built; confirm upstream.
    tempGroupList = group['rating'].tolist()

    # Store the first element of r(...)'s result (the correlation
    # coefficient) for this user.
    pearsonCorrelationDict[userid] = r(tempRatingList, tempGroupList)[0]

#%% Exploring the created dictionary
# Turn {userId: correlation} into a two-column frame: the dict keys start as
# the index, get copied into a "userId" column, and the index is then replaced
# by a plain 0..n-1 range.
corr_df = pd.DataFrame.from_dict(pearsonCorrelationDict, orient="index")
corr_df.columns = ["similarity_index"]
corr_df["userId"] = corr_df.index
corr_df.index = range(len(corr_df))
corr_df.head()

#%% Top 50 similar users to input user
# Highest similarity first; the slice keeps at most the first 50 rows.
topUsers_df = corr_df.sort_values(by="similarity_index", ascending=False)[0:50]
topUsers_df.head()

#%% Now we're going to extract the movies ratings of the selected most similar users
topUsers_ratings = topUsers_df.merge(ratings_df,
                                     left_on="userId",
Esempio n. 5
0
def CNN(traits=['height', 'BMI', 'WHR', 'BHMD', 'SBP'],
        verbose=0,
        unif=False,
        nbsnps=10000,
        p=None,
        reps=1):
    """Train the predefined CNN configurations per trait and report correlations.

    For every trait the data returned by ``retrieve_data`` is split again
    (33% held out for validation) and a trailing axis is added to the feature
    arrays before they are fed to the models built by ``compile_model_cnn``.

    * ``p is None``: every configuration is trained ``reps`` times.  For each
      configuration the best validation correlation ``r(...)[0]`` and the
      matching test correlation are kept, and whenever a run beats the best
      validation correlation seen so far for the trait the model is saved.
    * ``p`` given: only configuration ``param[p]`` is trained, ``reps``
      times, and the validation/test correlation of every run is recorded.
      No model is saved in this mode.

    Results are printed and logged as CSV; nothing is returned.  Note that
    the ``"<trait>_tr"`` columns hold *validation* correlations.

    Args:
        traits: iterable of trait names forwarded to ``retrieve_data``.
            The default list is never mutated here.
        verbose: forwarded to ``fit``.
        unif: forwarded to ``retrieve_data``; also selects the ``_unif`` /
            ``_best`` suffix of the saved model's file name.
        nbsnps: forwarded to ``retrieve_data``.
        p: index (0-2) of a single configuration to run, or ``None`` for all.
        reps: number of trainings per configuration.
    """
    # A list *literal* is required here: ``list({...})`` would iterate the
    # dict and yield its key strings instead of the configuration itself.
    param = [
        # cnn1
        {
            'optimizer': 'nadam',
            'size_window': 2,
            'activation': 'softplus',
            'nb_neurons': 64,
            'stride': 'one',
            'nb_cnn_layers': 1,
            'filters': 16,
            'weight_decay': 0.0,
            'nb_layers': 3,
            'dropout': 0.01,
            'batch_norm': True
        },
        # cnn2
        {
            'optimizer': 'nadam',
            'size_window': 2,
            'activation': 'elu',
            'nb_neurons': 32,
            'stride': 'one',
            'nb_cnn_layers': 1,
            'filters': 32,
            'weight_decay': 0.0,
            'nb_layers': 3,
            'dropout': 0.01,
            'batch_norm': False
        },
        # cnn3
        {
            'optimizer': 'rmsprop',
            'size_window': 3,
            'activation': 'linear',
            'nb_neurons': 32,
            'stride': 'one',
            'nb_cnn_layers': 1,
            'filters': 16,
            'weight_decay': 0.0,
            'nb_layers': 1,
            'dropout': 0.01,
            'batch_norm': False
        },
    ]
    R = {}
    for t in traits:
        best = 0
        print(t)
        x_tr, x_tst, y_tr, y_tst = retrieve_data(t, nbsnps, unif=unif)
        x_tr, x_val, y_tr, y_val = train_test_split(x_tr, y_tr, test_size=0.33)
        n_snps = x_tr.shape[1]
        x_tr = np.expand_dims(x_tr, axis=2)
        x_val = np.expand_dims(x_val, axis=2)
        x_tst = np.expand_dims(x_tst, axis=2)
        # NOTE(review): under Python 3 ``n_snps / 1000`` is a float, so the
        # name contains e.g. "10.0k"; left unchanged so existing model files
        # keep resolving -- confirm which form is intended.
        f = os.path.join(
            os.path.expanduser("~"), 'Code/genomic_cnn/models',
            "Model_" + t + "_cnn_" + str(n_snps / 1000) + "k" +
            ("_unif" if unif else "_best") + ".h5")
        if p is None:
            # One row per configuration: [best validation r, matching test r].
            res = np.zeros((len(param), 2))
            for n, g in enumerate(param):
                print(g)
                for rep in range(reps):
                    m = compile_model_cnn(g, (n_snps, 1))
                    m.fit(x_tr,
                          y_tr,
                          epochs=1200,
                          verbose=verbose,
                          validation_data=(x_val, y_val),
                          callbacks=[early_stopper])
                    # Score the validation set once rather than three times.
                    val_r = r(m.predict(x_val).ravel(), y_val)[0]
                    if val_r > res[n, 0]:
                        print(val_r)
                        print(rep)
                        res[n, 0] = val_r
                        res[n, 1] = r(m.predict(x_tst).ravel(), y_tst)[0]

                    # Only true right after the current model improved
                    # res[n, 0], so ``m`` is the model being saved.
                    if res[n, 0] > best:
                        print("A better network was found with r: %.3f" %
                              res[n, 0])
                        print(g)
                        m.save(f)
                        best = res[n, 0]

        else:
            # One row per repetition of the single selected configuration.
            res = np.zeros((reps, 2))
            g = param[p]
            for i in range(reps):
                m = compile_model_cnn(g, (n_snps, 1))
                m.fit(x_tr,
                      y_tr,
                      epochs=1200,
                      verbose=verbose,
                      validation_data=(x_val, y_val),
                      callbacks=[early_stopper])
                res[i, :] = (r(m.predict(x_val).ravel(), y_val)[0],
                             r(m.predict(x_tst).ravel(), y_tst)[0])

        R[t + "_tr"] = res[:, 0]
        R[t + "_tst"] = res[:, 1]

    report = pd.DataFrame(R).to_csv(float_format='%.3f', index=False)
    print(report)
    logging.info(report)
Esempio n. 6
0
def MLP(traits=['height', 'BMI', 'WHR', 'BHMD', 'SBP'],
        verbose=0,
        unif=False,
        nbsnps=10000,
        p=None,
        reps=1,
        hot=False):
    """Train the predefined MLP configurations per trait and report correlations.

    For every trait the data returned by ``retrieve_data`` is split again
    (33% held out for validation), optionally converted with
    ``convert_to_individual_alleles``, and fed to the models built by
    ``compile_model_mlp``.

    * ``p is None``: every configuration is trained ``reps`` times.  For each
      configuration the best validation correlation ``r(...)[0]`` and the
      matching test correlation are kept, and whenever a run beats the best
      validation correlation seen so far for the trait the model is saved.
    * ``p`` given: only configuration ``geneparam[p]`` is trained, ``reps``
      times, and the validation/test correlation of every run is recorded.
      No model is saved in this mode.

    Results are printed and logged as CSV; nothing is returned.  Note that
    the ``"<trait>_tr"`` columns hold *validation* correlations.

    Args:
        traits: iterable of trait names forwarded to ``retrieve_data``.
            The default list is never mutated here.
        verbose: forwarded to ``fit``.
        unif: forwarded to ``retrieve_data``; also selects the ``_unif`` /
            ``_best`` suffix of the saved model's file name.
        nbsnps: forwarded to ``retrieve_data``.
        p: index (0-2) of a single configuration to run, or ``None`` for all.
        reps: number of trainings per configuration.
        hot: when true, convert all feature matrices with
            ``convert_to_individual_alleles`` and tag the model file "kHot".
    """
    # A list *literal* is required here: ``list({...})`` would iterate the
    # dict and yield its key strings instead of the configuration itself.
    geneparam = [
        # mlp1
        {
            'optimizer': 'rmsprop',
            'activation': 'elu',
            'nb_neurons': 32,
            'weight_decay': 0.01,
            'nb_layers': 1,
            'dropout': 0.02
        },
        # mlp2
        {
            'optimizer': 'adagrad',
            'activation': 'elu',
            'nb_neurons': 64,
            'weight_decay': 0.01,
            'nb_layers': 2,
            'dropout': 0.03
        },
        # mlp3
        {
            'optimizer': 'adam',
            'activation': 'softplus',
            'nb_neurons': 32,
            'weight_decay': 0.01,
            'nb_layers': 5,
            'dropout': 0.02
        },
    ]

    R = {}
    for t in traits:
        print(t)
        best = 0
        x_tr, x_tst, y_tr, y_tst = retrieve_data(t, nbsnps, unif=unif)
        x_tr, x_val, y_tr, y_val = train_test_split(x_tr, y_tr, test_size=0.33)
        if hot:
            x_tr = convert_to_individual_alleles(x_tr)
            x_val = convert_to_individual_alleles(x_val)
            x_tst = convert_to_individual_alleles(x_tst)
        # Measured after the optional conversion, so it reflects the width
        # actually fed to the network.
        n_snps = x_tr.shape[1]
        # NOTE(review): under Python 3 ``n_snps / 1000`` is a float, so the
        # name contains e.g. "10.0k"; left unchanged so existing model files
        # keep resolving -- confirm which form is intended.
        f = os.path.join(
            os.path.expanduser("~"), 'Code/genomic_cnn/models',
            "Model_" + t + "_mlp_" + str(n_snps / 1000) +
            ("kHot" if hot else "k") +
            ("_unif" if unif else "_best") + ".h5")
        if p is None:
            # One row per configuration: [best validation r, matching test r].
            res = np.zeros((len(geneparam), 2))
            for n, g in enumerate(geneparam):
                print(g)
                for rep in range(reps):
                    m = compile_model_mlp(g, n_snps)
                    m.fit(x_tr,
                          y_tr,
                          epochs=1200,
                          validation_data=(x_val, y_val),
                          callbacks=[early_stopper],
                          verbose=verbose)
                    # Score the validation set once rather than three times.
                    val_r = r(m.predict(x_val).ravel(), y_val)[0]
                    if val_r > res[n, 0]:
                        print(val_r)
                        print(rep)
                        res[n, 0] = val_r
                        res[n, 1] = r(m.predict(x_tst).ravel(), y_tst)[0]

                    # Only true right after the current model improved
                    # res[n, 0], so ``m`` is the model being saved.
                    if res[n, 0] > best:
                        print("A better network was found with r: %.3f" %
                              res[n, 0])
                        print(g)
                        m.save(f)
                        best = res[n, 0]

                K.clear_session()

        else:
            # One row per repetition of the single selected configuration.
            res = np.zeros((reps, 2))
            g = geneparam[p]
            for i in range(reps):
                m = compile_model_mlp(g, n_snps)
                m.fit(x_tr,
                      y_tr,
                      epochs=1200,
                      verbose=verbose,
                      validation_data=(x_val, y_val),
                      callbacks=[early_stopper])
                res[i, :] = (r(m.predict(x_val).ravel(), y_val)[0],
                             r(m.predict(x_tst).ravel(), y_tst)[0])

        R[t + "_tr"] = res[:, 0]
        R[t + "_tst"] = res[:, 1]

    report = pd.DataFrame(R).to_csv(float_format='%.3f', index=False)
    print(report)
    logging.info(report)