def predict(self, X):
    """
    Predict labels for data points with this classifier's trained weights.

    Inputs:
    - X: N x D array of data. Each row is a D-dimensional point.

    Returns:
    - y_pred: Predicted output for the data in X. y_pred is a 1-dimensional
      array of length N, and each element is a class label 0 or 1.
    """
    # Allocate the length-N result buffer first; the prediction is
    # accumulated into it in place below.
    n_points = X.shape[0]
    y_pred = np.zeros(n_points)

    # Linear score of every row of X under the learned parameters.
    scores = np.dot(X, self.theta)

    # utils.bin_features is defined outside this block; presumably it maps
    # each score to a 0/1 indicator -- confirm against utils.
    y_pred += utils.bin_features(scores)

    return y_pred
df_an_ref.to_csv(df_an_ref_path, index=False) #read files else: print('read preprocessed files') df_an = pd.read_csv(df_an_path) df_an_ref = pd.read_csv(df_an_ref_path) # Missing values df_lb1.fillna(0) df_lb2.fillna(0) df_lb3.fillna(0) df_an.fillna(0) #binary data X_bins_lb1 = u.bin_features(df_lb1.copy(), 0, 1, lb_measures) X_bins_lb2 = u.bin_features(df_lb2.copy(), 0, 1, lb_measures) X_bins_lb3 = u.bin_features(df_lb3.copy(), 0, 1, lb_measures) X_bins_an = u.bin_features(df_an.copy(), 0, 1, lb_measures) #prepare dfs Xy_bins_lb1 = X_bins_lb1.copy() Xy_bins_lb2 = X_bins_lb2.copy() Xy_bins_lb3 = X_bins_lb3.copy() Xy_bins_an = X_bins_an.copy() Xy_bins_lb1['label'] = df_lb1['label'] Xy_bins_lb2['label'] = df_lb2['label'] Xy_bins_lb3['label'] = df_lb3['label'] Xy_bins_an['label'] = df_an['label']
import utils import numpy as np from sklearn import linear_model # No modifications in this script # complete the functions in util.py; then run the script # load the spam data in Xtrain,Xtest,ytrain,ytest = utils.load_spam_data() # Preprocess the data Xtrain_std,mu,sigma = utils.std_features(Xtrain) Xtrain_logt = utils.log_features(Xtrain) Xtrain_bin = utils.bin_features(Xtrain) Xtest_std = (Xtest - mu)/sigma Xtest_logt = utils.log_features(Xtest) Xtest_bin = utils.bin_features(Xtest) # find good lambda by cross validation for these three sets def run_dataset(X,ytrain,Xt,ytest,type,penalty): best_lambda = utils.select_lambda_crossval(X,ytrain,0.1,5.1,0.5,penalty) print "best_lambda = ", best_lambda # train a classifier on best_lambda and run it if penalty == "l2": lreg = linear_model.LogisticRegression(penalty=penalty,C=1.0/best_lambda, solver='lbfgs',fit_intercept=True)
Inputs: - X: N x D array of training data. Each row is a D-dimensional point. Returns: - y_pred: Predicted output for the data in X. y_pred is a 1-dimensional array of length N, and each element is a class label 0 or 1 """ y_pred = np.zeros(X.shape[0]) ########################################################################### # Compute the predicted outputs for X # # TODO: 1 line of code expected # ########################################################################### <<<<<<< HEAD y_pred = utils.bin_features(utils.sigmoid(np.dot(self.theta,X.T))-0.5) ======= >>>>>>> 89dd6a53aa0ff700b713b57c5d8d001424557b1d ########################################################################### # END OF YOUR CODE # ########################################################################### return y_pred class RegLogisticRegressor: def __init__(self): self.theta = None
else: df_data = pd.read_csv(conference_data_processed_path) X, y = df_data[cf_measures], df_data['label'] #fill missing values with 0 X = X.fillna(0) print(X.shape, y.shape) df_data2 = X.copy() df_data2['label'] = y print(df_data2.corr()['label']) #for m in cf_measures: # u.feature_dist(X, m) X_bins = u.bin_features(X.copy(), 0, 1, cf_measures) #for m in cf_measures: # u.feature_dist(X_bins, m) df_data_bins = X_bins.copy() df_data_bins['label'] = y df_data_bins['ontologies'] = df_data['ontologies'] print(df_data_bins.corr()['label']) print(df_data_bins.head()) print(df_data_bins.label.value_counts()) def get_conference_data(measures, ont_comb_train, df_data): lst_ont_comb = []