Beispiel #1
0
    def predict(self, X):
        """
        Predict a label for every row of X from this classifier's weights.

        Inputs:
        - X: N x D array of data points, one D-dimensional point per row.

        Returns:
        - y_pred: 1-dimensional array of length N; each element is the
          predicted class label (0 or 1) of the corresponding row of X
        """
        # Zero-initialised result buffer, one slot per data point.
        y_pred = np.zeros(X.shape[0])

        # Linear score of every point under the stored weight vector.
        scores = np.dot(X, self.theta)

        # utils.bin_features maps the scores to labels (presumably a threshold
        # at zero -- confirm in utils); the in-place add keeps y_pred's shape
        # and float dtype.
        y_pred += utils.bin_features(scores)
        return y_pred
Beispiel #2
0
    df_an_ref.to_csv(df_an_ref_path, index=False)

#read files
else:
    print('read preprocessed files')
    df_an = pd.read_csv(df_an_path)
    df_an_ref = pd.read_csv(df_an_ref_path)

# Missing values
# DataFrame.fillna returns a new frame by default (inplace=False), so the
# result has to be re-bound; a bare call leaves the NaNs in the original frame.
df_lb1 = df_lb1.fillna(0)
df_lb2 = df_lb2.fillna(0)
df_lb3 = df_lb3.fillna(0)
df_an = df_an.fillna(0)

# Binary data: u.bin_features is run on a copy of each frame so the source
# frames are not modified. The arguments 0, 1 and lb_measures are passed
# through as-is; their meaning is defined by u.bin_features.
X_bins_lb1 = u.bin_features(df_lb1.copy(), 0, 1, lb_measures)
X_bins_lb2 = u.bin_features(df_lb2.copy(), 0, 1, lb_measures)
X_bins_lb3 = u.bin_features(df_lb3.copy(), 0, 1, lb_measures)
X_bins_an = u.bin_features(df_an.copy(), 0, 1, lb_measures)

# Prepare dfs: copies of the binned features that also carry the label column
# taken from the corresponding source frame.
Xy_bins_lb1 = X_bins_lb1.copy()
Xy_bins_lb2 = X_bins_lb2.copy()
Xy_bins_lb3 = X_bins_lb3.copy()
Xy_bins_an = X_bins_an.copy()

Xy_bins_lb1['label'] = df_lb1['label']
Xy_bins_lb2['label'] = df_lb2['label']
Xy_bins_lb3['label'] = df_lb3['label']
Xy_bins_an['label'] = df_an['label']
Beispiel #3
0
import utils
import numpy as np
from sklearn import linear_model

# No modifications are needed in this script.
# Complete the functions in the imported utils module first, then run the script.

# Load the spam data: train/test feature matrices and their label vectors.

Xtrain,Xtest,ytrain,ytest = utils.load_spam_data()

# Preprocess the data: build three variants of the training features
# (std_features, log_features, bin_features -- see utils for the exact
# transforms).

Xtrain_std,mu,sigma = utils.std_features(Xtrain)
Xtrain_logt = utils.log_features(Xtrain)
Xtrain_bin = utils.bin_features(Xtrain)

# Build the matching test-set variants. The standardized variant reuses the
# mu and sigma returned for the training set instead of calling
# std_features on Xtest.
Xtest_std = (Xtest - mu)/sigma
Xtest_logt = utils.log_features(Xtest)
Xtest_bin = utils.bin_features(Xtest)

# find good lambda by cross validation for these three sets

def run_dataset(X,ytrain,Xt,ytest,type,penalty):

    best_lambda = utils.select_lambda_crossval(X,ytrain,0.1,5.1,0.5,penalty)
    print "best_lambda = ", best_lambda

    # train a classifier on best_lambda and run it
    if penalty == "l2":
        lreg = linear_model.LogisticRegression(penalty=penalty,C=1.0/best_lambda, solver='lbfgs',fit_intercept=True)
        Inputs:
        - X: N x D array of training data. Each row is a D-dimensional point.

        Returns:
        - y_pred: Predicted output for the data in X. y_pred is a 1-dimensional
          array of length N, and each element is a class label 0 or 1
        """
        y_pred = np.zeros(X.shape[0])

        ###########################################################################
        # Compute the predicted outputs for X                                     #
        # TODO: 1 line of code expected                                           #
        ###########################################################################
<<<<<<< HEAD
	
	y_pred = utils.bin_features(utils.sigmoid(np.dot(self.theta,X.T))-0.5)
=======

>>>>>>> 89dd6a53aa0ff700b713b57c5d8d001424557b1d

        ###########################################################################
        #                           END OF YOUR CODE                              #
        ###########################################################################
        return y_pred



class RegLogisticRegressor:
    """Classifier object that keeps its weight vector in ``theta``."""

    def __init__(self):
        """Create a regressor with no weights yet: ``theta`` starts as None."""
        self.theta = None
else:
    df_data = pd.read_csv(conference_data_processed_path)

# Split the loaded frame into the measure columns and the label column,
# then replace missing measure values with 0.
X = df_data[cf_measures]
y = df_data['label']
X = X.fillna(0)
print(X.shape, y.shape)

# Correlation of each raw measure with the label.
df_data2 = X.assign(label=y)
print(df_data2.corr()['label'])

# (disabled) per-measure u.feature_dist call on the raw features
#for m in cf_measures:
#    u.feature_dist(X, m)

# Binned version of the features; u.bin_features receives a copy of X.
X_bins = u.bin_features(X.copy(), 0, 1, cf_measures)

# (disabled) per-measure u.feature_dist call on the binned features
#for m in cf_measures:
#    u.feature_dist(X_bins, m)

# Binned features plus the label and the 'ontologies' column of the source
# frame, followed by the same correlation report and a quick look at the data.
df_data_bins = X_bins.assign(label=y, ontologies=df_data['ontologies'])
print(df_data_bins.corr()['label'])

print(df_data_bins.head())

print(df_data_bins['label'].value_counts())


def get_conference_data(measures, ont_comb_train, df_data):
    lst_ont_comb = []