Example #1
def SVM_Ranking_Model_Extraction_And_Encoding():

    # Read the training samples with pandas
    Training_Table_Raw = pd.read_csv("FeatureToTrainWithoutTester.csv")
    Training_Table_Raw = Training_Table_Raw.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Dataset Start Time', 'Dataset End Time', 'executionStartTime', 'Dataset Group', 'Users Group'], axis = 1)
    Training_Table = Training_Table_Raw.copy()

    # Feature Encoding
    Training_Table = transform_features(Training_Table)

    # Training/Testing DataSet Split 
    Train_Test_Split = Training_Table.copy()
    X, y = Train_Test_Split.drop('userName', axis = 1), Train_Test_Split['userName']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    # SVM configuration
    parameters={'clf__gamma':(0.01, 0.02, 0.1, 0.3, 1), 'clf__C':(0.1, 0.3, 1, 3, 10, 30), }
    pipeline = Pipeline([('clf', SVC(kernel='rbf', gamma=0.01, C=100, max_iter = 100, probability = True))])
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=2, verbose=1, scoring='accuracy')
    Grid_Fit = grid_search.fit(X_train, y_train)
    
    predictions = grid_search.predict(X_test)

    Top_N_Recommender = Accumulation(Training_Table_Raw, Training_Table, Grid_Fit)

    # Prediction results
    print('Accuracy:', accuracy_score(y_test, predictions))
    print('Confusion Matrix:', confusion_matrix(y_test, predictions))
    print('Classification Report:', classification_report(y_test, predictions))

    return Top_N_Recommender
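
transform_features is not defined anywhere in this example; a minimal sketch of what such an encoder could look like, assuming simple integer encoding of the non-numeric columns (the helper name comes from the code above, the body is illustrative only):

import pandas as pd

def transform_features(table):
    # Illustrative only: integer-encode every non-numeric column except the target
    encoded = table.copy()
    for col in encoded.select_dtypes(include='object').columns:
        if col != 'userName':
            encoded[col] = pd.factorize(encoded[col])[0]
    return encoded
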
def main():

    # Data Pre-Processing: Join the username table and service log table
    df1 = pd.read_csv("NewForm1.csv")
    df2 = pd.read_csv("serviceExecutionLog_dataset2.csv")
    df3 = pd.merge(df1, df2, on = ['userName', 'executionStartTime'], how = 'left')

    # Uppercase transformation (the .str accessor skips values left missing by the join)
    df3['model'] = df3['model'].str.upper()

    # Write out to csv file
    df3.to_csv("NewForm1WithExecutionTime.csv")

    # Data Pre-Processing: Join the Climate Dataset table to feature to train
    df4 = pd.read_csv("/Users/dennis/Documents/SVM-Tasks/Climate_Datasets.csv")

    # Encoding: Grouping    
    df4['Dataset Group'] = df4['Dataset Group'].map(datasetgrouping)

    # Duplicate & Fillna
    df4['userName'] = df4['userName'].fillna('Unknown')
    df4['Users Group'] = df4['userName']
    df4['Users Group'] = df4['Users Group'].map(usergrouping)

    # Write out to FeaturesForTrain.csv
    df4.to_csv("FeaturesForTrain.csv")

    # Training/testing data preparation: join the feature columns into one text field
    # per row for the TF-IDF vectorizer, and keep userName as the target
    X = df4.drop('userName', axis = 1).astype(str).apply(lambda row: ' '.join(v.strip() for v in row), axis = 1)
    y = df4['userName'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    # Pipeline building (SVC takes a single kernel string; probability=True enables predict_proba below)
    pipeline = Pipeline([('vect', TfidfVectorizer(stop_words = 'english', lowercase = False)), ('clf', SVC(kernel='rbf', gamma=0.01, C=100, max_iter = 100, probability = True))])

    # Check the training data shape
    print(X_train.shape)

    # parameters setting
    parameters={'clf__gamma':(0.01, 0.02, 0.1, 0.3, 1), 'clf__C':(0.1, 0.3, 1, 3, 10, 30), }

    # Build the grid search over the SVC parameters
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1, scoring='accuracy')

    # Fit the grid search on the training data
    grid_search.fit(X_train, y_train)
    
    # Predictions
    predictions = grid_search.predict(X_test)
    predictions_probability = grid_search.predict_proba(X_test)

    # Prediction results
    print('Accuracy:', accuracy_score(y_test, predictions))
    print('Confusion Matrix:', confusion_matrix(y_test, predictions))
    print('Classification Report:', classification_report(y_test, predictions))
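
The snippets in this example assume their imports from earlier cells; based on the names used above, a plausible import block would be:

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
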
Example #3
def classification(FV_N):
    " PCA reduction dimension & Random Forest Classification"

    pca = decomposition.PCA()
    RFC = RandomForestClassifier()

    estimators = [('reduce_dim', pca), ('Random_Forest', RFC)]
    pipe = Pipeline(estimators)

    # Search the best parameters for the classification
    #for i in range(100,700,100):
    #    cc=[i]+cc
    #nb_tree=[]
    #random_st=[]
    #for i in range(50,350,50):
    #    nb_tree=[i]+nb_tree
    #    random_st=[0]+random_st

    # Candidate values for the grid search
    cc = [70, 80, 90]      # PCA n_components
    nb_tree = [200]        # number of trees
    random_st = [0]        # random_state

    params = dict(reduce_dim__n_components=cc,
                  Random_Forest__n_estimators=nb_tree,
                  Random_Forest__random_state=random_st)

    grid_search = GridSearchCV(pipe, param_grid=params)

    X = FV_N
    yr = Get_true_y(Data_FRAMES)

    # Persist the label vector for later reuse
    filename_yr = projectpath + 'io/Output/yr.npy'

    # Align features and labels to the same length, then save the labels
    X = X[:yr.shape[0]]
    yr = yr[:X.shape[0]]
    np.save(filename_yr, yr)

    grid_search.fit(X, yr)

    print(grid_search.best_estimator_)

    plt.figure()
    plt.axvline(
        grid_search.best_estimator_.named_steps['reduce_dim'].n_components,
        linestyle=':',
        label='n_components chosen')
    plt.legend(prop=dict(size=12))
    plt.show()

    plt.figure()
    plt.axvline(
        grid_search.best_estimator_.named_steps['Random_Forest'].n_estimators,
        linestyle=':',
        label='n_estimators chosen')
    plt.legend(prop=dict(size=12))
    plt.show()

    n_est_rdf = grid_search.best_estimator_.named_steps[
        'Random_Forest'].n_estimators

    n_compo_pca = grid_search.best_estimator_.named_steps[
        'reduce_dim'].n_components

    pca = decomposition.PCA(n_components=n_compo_pca, svd_solver='auto')
    pca.fit(X)

    variance_Ratio = pca.explained_variance_ratio_

    plt.figure(1, figsize=(4, 3))
    plt.clf()
    plt.axes([.2, .2, .7, .7])
    plt.plot(pca.explained_variance_ratio_.cumsum(), linewidth=1)
    plt.axis('tight')
    plt.xlabel('n_components')
    plt.ylabel('Cumulative Explained variance')

    M = pca.transform(X)

    plt.figure()
    plt.plot(M[yr == 1, 0], M[yr == 1, 1], 'or')
    plt.title('Astrocytes')
    plt.figure()
    plt.plot(M[yr == 2, 0], M[yr == 2, 1], 'ob')
    plt.title('Neurons')

    # Training-set accuracy of the tuned pipeline
    print(metrics.accuracy_score(yr, grid_search.predict(X)))

    # Cross-validated predictions with the tuned number of trees
    RFC = RandomForestClassifier(n_estimators=n_est_rdf, random_state=0)
    predictedVAL = cross_val_predict(RFC, X, yr, n_jobs=-1)
    print(metrics.accuracy_score(yr, predictedVAL))

    Conf_Mat = confusion_matrix(yr, predictedVAL)

    import seaborn as sns
    sns.heatmap(Conf_Mat.T, square=True, annot=True, cbar=False)
    plt.xlabel('True label')
    plt.ylabel('Predicted label')

    return ()
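
For reference, a condensed, self-contained version of the same PCA + random-forest grid-search pattern; make_classification only stands in for the FV_N features and Get_true_y labels, which are defined elsewhere:

from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

X_demo, y_demo = make_classification(n_samples=300, n_features=100, random_state=0)
pipe_demo = Pipeline([('reduce_dim', PCA()),
                      ('Random_Forest', RandomForestClassifier(random_state=0))])
params_demo = {'reduce_dim__n_components': [10, 30, 50],
               'Random_Forest__n_estimators': [100, 200]}
search_demo = GridSearchCV(pipe_demo, param_grid=params_demo, cv=3)
search_demo.fit(X_demo, y_demo)
print(search_demo.best_params_, search_demo.best_score_)
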
def SVM_Ranking_Model_Extraction_And_Encoding():

    # Read the training samples with pandas
    df = pd.read_csv("FeatureToTrainWithoutTester.csv")
    df2 = df.copy()
    df2 = df2.drop(['Dataset Start Time', 'Dataset End Time', 'executionStartTime', 'Dataset Group', 'Users Group'], axis = 1)
    df2.head()

    # Feature encoding (both calls return new frames, so the results must be reassigned)
    df2 = transform_features(df2)
    df2 = df2.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis = 1)
    df2.head()
    
    # Encoded Features
    df = pd.read_csv("Transform_features.csv")

    # Training/Testing DataSet Split 
    df3 = df.copy()
    y = df3['userName']
    df3 = df3.drop(['userName'], axis = 1)
    X = df3
    # Note: train and test are the same data here, so the scores below are training-set scores
    X_train, X_test, y_train, y_test = X, X, y, y

    # SVM configuration
    parameters={'clf__gamma':(0.01, 0.02, 0.1, 0.3, 1), 'clf__C':(0.1, 0.3, 1, 3, 10, 30), }
    pipeline = Pipeline([('clf', SVC(kernel='rbf', gamma=0.01, C=100, max_iter = 100, probability = True))])
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=2, verbose=1, scoring='accuracy')
    result2 = grid_search.fit(X_train, y_train)

    #coef = (result.best_estimator_.get_params()['clf'].coef_)
    #coef2 = coef_sum(coef)
    #coef2
    
    index = ['DatasetName', 'Agency', 'Instrument', 'Physical variable', 'var',
       'Units', 'Grid Dimension', 'Variable Name in Web Interface', 'model']


    # Model Estimation
    model = []

    for i in index:
        # Features' distance/relevant to category prediction
        model.append(feature_training(X_train, y_train, i))


    # Training data distance to single column PCA
    weight_set = numpy.zeros((len(X_train), len(index)))

    for j in range(0, len(X_train)):

        dict_index = 0

        for i in index:

            # Features' distance/relevant to category prediction
            model_extraction = model[dict_index]
            sample = X_train[j:j+1]
            weight = feature_distance(sample, i, model_extraction)
            weight_set[j, dict_index] = weight

            dict_index = dict_index + 1

            print "[INFO] Data Points: ", j, "Columns Iteration: ", dict_index
            print "[INFO] Weight : ", weight

        if j % 100 == 0:
            weight_set_file = pd.DataFrame(weight_set.copy())
            weight_set_file.to_csv("weight_set.csv")


    # Delivery: Training data with Label 
    Training_matrix = pd.DataFrame(weight_set.copy())
    Training_matrix['Label'] = y_train


    # SVM Ranking Formatting
    SVM_Rank_Formatted_Training_data  = Training_matrix.copy()

    for j in range(0, len(X_train)):
        for i in range(0, 9):
            # .ix was removed from pandas; use positional / label indexing instead
            SVM_Rank_Formatted_Training_data.iloc[j, i] = str(i + 1) + ":" + str(SVM_Rank_Formatted_Training_data.iloc[j, i])
        SVM_Rank_Formatted_Training_data.loc[j, 'Label'] = str(int(SVM_Rank_Formatted_Training_data.iloc[j, 9]))

    # Columns Reorder
    Rank_format_columns = SVM_Rank_Formatted_Training_data.columns.tolist()
    Rank_format_columns = Rank_format_columns[-1:] + Rank_format_columns[:-1]
    SVM_Rank_Formatted_Training_data = SVM_Rank_Formatted_Training_data[Rank_format_columns]

    # Write to CSV format
    SVM_Rank_Formatted_Training_data.to_csv("SVM_Rank_Formatted_Training_data2.dat", index = False, sep = ' ', index_label = False, header = False)
    SVM_Rank_Formatted_Training_data.to_csv("SVM_Rank_Formatted_Training_data2.csv")

    predictions = grid_search.predict(X_test)

    # Prediction results (X_test is the training data here, see the note above)
    print('Accuracy:', accuracy_score(y_test, predictions))
    print('Confusion Matrix:', confusion_matrix(y_test, predictions))
    print('Classification Report:', classification_report(y_test, predictions))
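
The ranking export above produces one line per training sample: a label followed by space-separated index:value pairs. A tiny illustration of that line layout, with made-up numbers:

# label followed by index:value pairs, e.g. "3 1:0.42 2:0.10 ... 9:0.28"
weights_demo = [0.42, 0.10, 0.77, 0.05, 0.31, 0.12, 0.66, 0.09, 0.28]
print("3 " + " ".join("%d:%s" % (i + 1, w) for i, w in enumerate(weights_demo)))
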
Example #5
randomforestclassifier()


from sklearn import svm
from sklearn.model_selection import GridSearchCV


def svc_param_selection(X, y, nfolds):
    Cs = [0.001, 0.01, 0.1, 1, 1.1, 2, 3, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    #kernels = ['linear', 'rbf', 'poly']
    param_grid = {'C': Cs, 'gamma': gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search, grid_search.best_params_


grid_search, params = svc_param_selection(x_train, y_train, 10)
y_pred = grid_search.predict(x_test)
print('best param:', params)
kernals = "linear,rbf,poly"
kernals = kernals.split(',')
for kernel in kernals:
    svc = SVC(kernel=kernel, C=params['C'], gamma=params['gamma'])
    svc.fit(x_train, y_train)
    y_pred = svc.predict(x_test)
    print("kernal name:", kernel)
    print('Accuracy Score:')
    print(metrics.accuracy_score(y_test, y_pred))
    print(metrics.accuracy_score(Y_test, y_pred_class_svm),"SVM-SGD -countvectorizer")
    svm_t = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, random_state=42)
    svm_t.fit(X_train_tfidf, Y_train)
    y_pred_svm_t = svm_t.predict(X_test_tfidf)
    print(metrics.accuracy_score(Y_test, y_pred_svm_t),"SVM-SGD -tfidf")
    #grid
    print("grid")
    from sklearn import svm, grid_search
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma' : gammas, 'kernel':('poly', 'rbf')}
    grid_search = GridSearchCV(svm.SVC(), param_grid, cv=5)
    grid_search.fit(X_train_dtm, Y_train)
    print(grid_search.best_score_)
    print(grid_search.best_params_)
    y_grid_search_svm = grid_search.predict(X_test_dtm)
    print(metrics.accuracy_score(Y_test,y_grid_search_svm),"grid search- SVM")
    



    
    '''
    #X_train, X_test, y_train, y_test = train_test_split(corpus, labels, random_state=1,train_size=0.90)
    #X_train_tfidf, vectorizer = generate_features(X_train)
    #X_test_tfidf, vectorizer = generate_features(X_test)

    
    from sklearn.naive_bayes import MultinomialNB,GaussianNB
    from sklearn.metrics import accuracy_score
    clf = MultinomialNB()
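
The example breaks off right after instantiating MultinomialNB; a hedged completion, assuming the X_train_tfidf / X_test_tfidf matrices and Y_train / Y_test labels used earlier in the snippet:

clf = MultinomialNB()
clf.fit(X_train_tfidf, Y_train)
y_pred_nb = clf.predict(X_test_tfidf)
print(accuracy_score(Y_test, y_pred_nb), "MultinomialNB -tfidf")
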
Example #7
# grid_search.fit(X_train, Y_train)
# print(grid_search.cv_results_)
# print(grid_search.best_estimator_)
# Best estimator was C=0.5.

#Narrowing down by order of magnitude.
parameters = {'C':[0.45, 0.46, 0.47, 0.48, 0.49, 0.50, 0.51, 0.52, 0.53, 0.54, 0.55]}
grid_search = GridSearchCV(model_svm, parameters)
grid_search.fit(X_train, Y_train)
print(grid_search.cv_results_)
print(grid_search.best_estimator_)
#Best estimator was C=0.45. Because we already compared 0.4 to 0.5 two searches above, and 0.5 was selected, we infer that 0.45 is the optimal value without searching between 0.40 and 0.45.

#Returning model results with optimal 'C' value.
expected = Y_test
predicted = grid_search.predict(X_test)

print(classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
print(metrics.accuracy_score(expected, predicted))

#Support Vector Machine: Model fit, transform, and testing with optimized 'C' value
splits = train_test_split(X_train_tfidf, dataset.target, test_size=0.2)
X_train, X_test, Y_train, Y_test = splits

model_svm = svm.LinearSVC(C=0.45, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
model_svm.fit(X_train, Y_train)
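
The refit LinearSVC above is never evaluated in the original snippet; a short follow-up using the same metrics calls as earlier in this example:

predicted = model_svm.predict(X_test)
print(classification_report(Y_test, predicted))
print(metrics.confusion_matrix(Y_test, predicted))
print(metrics.accuracy_score(Y_test, predicted))
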
# TODO: Tune the hyper-parameters 'C' and 'kernel' (use rbf and linear).
#       Print the best params, using .best_params_, and print the best score, using .best_score_.
# Get the training and test set accuracy values after hyperparameter tuning.
# XXX

Cs = [1, 10, 100]
kernels = ['linear', 'rbf']

param_grid = {'C': Cs, 'kernel': kernels}

grid_search = GridSearchCV(SVC(), param_grid=param_grid, cv=5)
grid_search.fit(rescaledX, y_data)
print("best params ", grid_search.best_params_)
print("best score ", grid_search.best_score_)

tuningpredict = grid_search.predict(X_test)

print("Train Accuracy  ", accuracy_score(y_train,
                                         grid_search.predict(X_train)))
print("Test Accuracy ",
      accuracy_score(y_test, tuningpredict.round(), normalize=True))

svclassifier2 = SVC(kernel='linear', C=1)
svmd2 = svclassifier2.fit(X_train, y_train)
y_pred2 = svmd2.predict(X_test)
print("Train Accuracy  ", accuracy_score(y_train, svmd2.predict(X_train)))
print("Test Accuracy ", accuracy_score(y_test, y_pred2.round(),
                                       normalize=True))

# XXX
# TODO: Calculate the mean training score, mean testing score and mean fit time for the
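
The truncated TODO above asks for the mean training score, mean testing score and mean fit time; these come from cv_results_ of the fitted grid search (mean train scores are only recorded when the search is built with return_train_score=True):

results = grid_search.cv_results_
best = grid_search.best_index_
print("mean fit time   ", results['mean_fit_time'][best])
print("mean test score ", results['mean_test_score'][best])
if 'mean_train_score' in results:
    print("mean train score", results['mean_train_score'][best])
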
Example #10
    "criterion": ["gini", "entropy"],
    "max_features": [sqrtfeat],
    "max_depth": [5, 10, 25],
    "min_samples_split": [2, 5, 10, minsampsplit]
}

forest = RandomForestClassifier(oob_score=True)

print("Hyperparameter optimization using GridSearchCV...")
grid_search = model_selection.GridSearchCV(forest,
                                           grid_test1,
                                           n_jobs=-1,
                                           cv=10)

grid_search.fit(X, y)
Y_pred = grid_search.predict(X_test)

print(grid_search.score(X, y))
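
Because the forest is built with the out-of-bag score enabled, the refit best estimator also carries an OOB estimate, a useful sanity check next to the optimistic training-set score printed above:

print("OOB score:", grid_search.best_estimator_.oob_score_)
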

#
# random_forest = RandomForestClassifier(oob_score=True, n_estimators=30000,max_depth=25, n_jobs=-1)
# random_forest.fit(X,y)
#
#
# Y_pred = random_forest.predict(X_test)
# print(random_forest.score(X, y))

ResultSubmission = pd.DataFrame({
    'PassengerId':
    list(X_test_original['PassengerId']),
    'Survived':