def _join_execution_log():
    """Left-join the user form table with the service execution log.

    Reads "NewForm1.csv" and "serviceExecutionLog_dataset2.csv", merges them
    on ('userName', 'executionStartTime'), upper-cases the 'model' column and
    writes the result to "NewForm1WithExecutionTime.csv".

    Returns:
        The merged DataFrame.
    """
    form = pd.read_csv("NewForm1.csv")
    execution_log = pd.read_csv("serviceExecutionLog_dataset2.csv")
    merged = pd.merge(form, execution_log,
                      on=['userName', 'executionStartTime'], how='left')

    # A left join can leave NaN in 'model' for rows without a match;
    # ``.str.upper()`` skips those, whereas ``map(str.upper)`` raises
    # TypeError on the first float NaN.
    # NOTE(review): assumes 'model' holds strings -- confirm against the CSVs.
    merged['model'] = merged['model'].str.upper()

    merged.to_csv("NewForm1WithExecutionTime.csv")
    return merged


def _build_training_features():
    """Load the climate dataset table and derive the grouping columns.

    Maps 'Dataset Group' through ``datasetgrouping``, fills missing
    'userName' values with 'Unknown', derives 'Users Group' from 'userName'
    through ``usergrouping`` and writes the result to "FeaturesForTrain.csv".

    Returns:
        The prepared DataFrame.
    """
    features = pd.read_csv(
        "/Users/dennis/Documents/SVM-Tasks/Climate_Datasets.csv")

    # Encoding: grouping (``datasetgrouping`` is defined elsewhere in the file).
    features['Dataset Group'] = features['Dataset Group'].map(datasetgrouping)

    # Fill missing users first so the derived group column has no gaps.
    features['userName'] = features['userName'].fillna('Unknown')
    features['Users Group'] = features['userName'].map(usergrouping)

    features.to_csv("FeaturesForTrain.csv")
    return features


def _to_documents_and_labels(features):
    """Turn the feature table into (text documents, labels) for TF-IDF.

    ``TfidfVectorizer`` expects a 1-D sequence of strings, so every row is
    collapsed into a single whitespace-joined document. The target column
    ('userName') and the column derived from it ('Users Group') are left out
    of the documents; keeping them would leak the label into the features.

    Returns:
        A ``(documents, labels)`` pair of 1-D numpy arrays.

    Raises:
        ValueError: if no feature column remains besides the label columns.
    """
    labels = features['userName'].values  # ``as_matrix()`` is gone in pandas >= 1.0

    inputs = features.drop(['userName', 'Users Group'], axis=1)
    if inputs.shape[1] == 0:
        raise ValueError("no feature columns left besides the label columns")

    stripped = inputs.astype(str).apply(lambda column: column.str.strip())
    documents = stripped.apply(' '.join, axis=1).values
    return documents, labels


def _grid_search_and_report(pipeline, parameters,
                            X_train, X_test, y_train, y_test):
    """Fit a grid search on the training split and print test-set metrics.

    The fitted ``GridSearchCV`` object itself is used for prediction (it is
    refit on the best parameters by default); it must not be re-created
    after ``fit`` or the predictions would come from an unfitted estimator.

    Returns:
        A ``(predictions, predictions_probability)`` pair for the test split.
    """
    # Check the training data shape.
    print(X_train.shape)

    grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1,
                               scoring='accuracy')
    grid_search.fit(X_train, y_train)

    predictions = grid_search.predict(X_test)
    predictions_probability = grid_search.predict_proba(X_test)

    # Single-argument print calls behave the same on Python 2 and Python 3.
    print('Accuracy: %s' % (accuracy_score(y_test, predictions),))
    print('Confusion Matrix: %s' % (confusion_matrix(y_test, predictions),))
    print('Classification Report: %s'
          % (classification_report(y_test, predictions),))
    return predictions, predictions_probability


def main():
    """Pre-process the CSV inputs, then grid-search two text classifiers.

    Steps:
      1. Join the user form table with the execution log and save it.
      2. Build and save the training feature table.
      3. Split the data 60/40 and run a grid search for an SVM pipeline and
         for a logistic-regression pipeline, printing accuracy, confusion
         matrix and classification report for each.

    Both classifiers are evaluated on the same train/test split so their
    scores are directly comparable.
    """
    _join_execution_log()
    features = _build_training_features()

    X, y = _to_documents_and_labels(features)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    # --- Experiment 1: support vector machine -------------------------------
    # ``kernel`` must be a single string on the estimator; the candidate
    # kernels are searched through the parameter grid instead.
    # ``probability=True`` is required for ``predict_proba`` on an SVC.
    svc_pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english', lowercase=False)),
        ('clf', SVC(gamma=0.01, C=100, max_iter=100, probability=True)),
    ])
    svc_parameters = {
        'clf__kernel': ('rbf', 'linear'),
        'clf__gamma': (0.01, 0.02, 0.1, 0.3, 1),
        'clf__C': (0.1, 0.3, 1, 3, 10, 30),
    }
    _grid_search_and_report(svc_pipeline, svc_parameters,
                            X_train, X_test, y_train, y_test)

    # --- Experiment 2: logistic regression ----------------------------------
    # LogisticRegression has no ``gamma`` parameter, so only ``C`` is searched.
    logreg_pipeline = Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', LogisticRegression()),
    ])
    logreg_parameters = {
        'clf__C': (0.1, 0.3, 1, 3, 10, 30),
    }
    _grid_search_and_report(logreg_pipeline, logreg_parameters,
                            X_train, X_test, y_train, y_test)