validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

# Test options and evaluation metric
seed = 7
scoring = 'accuracy'

# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

print "--------different model accuray evaluation--------"
# evaluate each model in turn
results = []
names = []
for name, model in models:
	kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
	cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	model.fit(X_train, Y_train)
	predictions = model.predict(X_validation)
	msg = "%s: %f (%f), accuracy score: %f" % (name, cv_results.mean(), cv_results.std(), accuracy_score(Y_validation, predictions))
	print(msg)
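
# The cross-validation scores collected in `results`/`names` can also be compared
# visually; an optional sketch (assumes matplotlib is available), not part of the
# original evaluation loop:
import matplotlib.pyplot as plt

fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)        # one box per model's 10-fold CV accuracy scores
ax.set_xticklabels(names)
plt.show()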
Example no. 2
params = {'penalty':['l1', 'l2'], 'C':[1, 2, 3, 5, 10]}
lr = LogisticRegression(random_state = 0, solver = 'liblinear')  # liblinear supports both l1 and l2 penalties
clf = GridSearchCV(lr, param_grid = params, scoring = accuracy_scorer, cv = 5, n_jobs = -1)
clf.fit(X_train, y_train)
print('Best score: {}'.format(clf.best_score_))
print('Best parameters: {}'.format(clf.best_params_))

lr_best = LogisticRegression(penalty = 'l1', C = 1, random_state = 0, solver = 'liblinear')


# In[ ]:


params = {'kernel':['linear', 'rbf'], 'C':[1, 3, 5, 10], 'degree':[3, 5, 10]}
svc = SVC(probability = True, random_state = 0)
clf = GridSearchCV(svc, param_grid = params, scoring = accuracy_scorer, cv = 5, n_jobs = -1)
clf.fit(X_train, y_train)
print('Best score: {}'.format(clf.best_score_))
print('Best parameters: {}'.format(clf.best_params_))

svc_best = SVC(C = 10, degree = 3, kernel = 'linear', probability = True, random_state = 0)


# In[ ]:


voting_clf = VotingClassifier(estimators=[('rf', rf_best), ('bag', bag_best), ('gbc', gbc_best), ('lr', lr_best), ('svc', svc_best)]
                              , voting='hard')
voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)
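
# A quick hedged check of the hard-voting ensemble against the held-out labels
# (assumes the matching y_test vector exists alongside X_test in this notebook):
from sklearn.metrics import accuracy_score
print('Voting classifier accuracy: {:.4f}'.format(accuracy_score(y_test, y_pred)))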
Example no. 3
import cv2, glob, random, math, numpy as np, dlib, itertools
from sklearn.svm import SVC
__author__ = "Paul van Gent, 2016" #Please leave this line in

emotions = ["anger", "contempt", "disgust", "fear", "happiness", "neutral", "sadness", "surprise"] #Emotion list
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat") #Or set this to whatever you named the downloaded file
clf = SVC(kernel='linear', probability=True, tol=1e-3)#, verbose = True) #Set the classifier as a support vector machine with a linear kernel

def get_files(emotion): #Define function to get file list, randomly shuffle it and split 80/20
    files = glob.glob("dataset\\%s\\*" %emotion)
    random.shuffle(files)
    training = files[:int(len(files)*0.8)] #get first 80% of file list
    prediction = files[-int(len(files)*0.2):] #get last 20% of file list
    return training, prediction

def get_landmarks(image):
    detections = detector(image, 1)
    for k,d in enumerate(detections): #For all detected face instances individually
        shape = predictor(image, d) #Draw Facial Landmarks with the predictor class
        xlist = []
        ylist = []
        for i in range(1,68): #Store X and Y coordinates in two lists
            xlist.append(float(shape.part(i).x))
            ylist.append(float(shape.part(i).y))
            
        xmean = np.mean(xlist) #Get the mean of both axes to determine centre of gravity
        ymean = np.mean(ylist)
        xcentral = [(x-xmean) for x in xlist] #get distance between each point and the central point in both axes
        ycentral = [(y-ymean) for y in ylist]
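        # The snippet is truncated here; a typical continuation (a sketch only,
        # not the original author's exact code) flattens the centred coordinates
        # into a feature vector for the SVC defined above:
        landmarks_vectorised = []
        for cx, cy in zip(xcentral, ycentral):
            landmarks_vectorised.append(cx)                         # x offset from centre of gravity
            landmarks_vectorised.append(cy)                         # y offset from centre of gravity
            landmarks_vectorised.append(math.sqrt(cx**2 + cy**2))   # distance to centre of gravity
        return landmarks_vectorised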
Example no. 4
train = pd.read_csv('train.csv', header=None)
train_y = pd.read_csv('trainLabels.csv', header=None)
test = pd.read_csv('test.csv', header=None)
#test.ix[:,1:11].hist()

n_pca = 21
n_gmm = 4

pca = PCA(n_components=n_pca, whiten=True).fit(train)
train_pca = pca.transform(train)

X_train, X_val, y_train, y_val = \
        train_test_split(train_pca, train_y,test_size=0.2, random_state=0)

gmm = GMM(n_components=n_gmm, covariance_type='full').fit(X_train)
svc = SVC().fit(gmm.predict_proba(X_train), y_train)
svc.score(gmm.predict_proba(X_val), y_val)

forest = ensemble.ExtraTreesClassifier(n_estimators=400).fit(
    gmm.predict_proba(X_train), y_train)
forest.score(gmm.predict_proba(X_val), y_val)

test_pca = pca.transform(test)
gmm_all = GMM(n_components=n_gmm, covariance_type='full').fit(train_pca)

svc_all = SVC().fit(gmm_all.predict_proba(train_pca), train_y)
pred_svc = svc_all.predict(gmm_all.predict_proba(test_pca))

forest_all = ensemble.RandomForestClassifier(n_estimators=400).fit(
    gmm_all.predict_proba(train_pca), train_y)
pred_forest = forest_all.predict(gmm_all.predict_proba(test_pca))
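
# Note: the GMM class used above was deprecated in scikit-learn 0.18 and later
# removed in favour of GaussianMixture. A minimal equivalent sketch of the same
# "GMM responsibilities as features" idea for newer versions:
from sklearn.mixture import GaussianMixture

gmm_new = GaussianMixture(n_components=n_gmm, covariance_type='full').fit(X_train)
svc_new = SVC().fit(gmm_new.predict_proba(X_train), y_train.values.ravel())
print(svc_new.score(gmm_new.predict_proba(X_val), y_val.values.ravel()))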
Example no. 5
    def fit(self, train_data, train_labels, val_data, val_labels):
        """
        Fits to training data.
        
        Args:
            train_data (ndarray): Training data.
            train_labels (ndarray): Training labels.
            val_data (ndarray): Validation data.
            val_labels (ndarray): Validation labels.
        """
        split = np.append(-np.ones(train_labels.shape, dtype=np.float32),
                  np.zeros(val_labels.shape, dtype=np.float32))
        ps = PredefinedSplit(split)

        sh = train_data.shape
        train_data = np.append(train_data, val_data , axis=0)
        train_labels = np.append(train_labels , val_labels, axis=0)
        del val_data, val_labels
        
        if self.kernel == 'linear':
            if self.probability:
                clf = SVC(kernel='linear', class_weight='balanced',
                          random_state=6, decision_function_shape='ovr',
                          max_iter=1000, probability=self.probability,
                          **self.scikit_args)
            else:
                clf = LinearSVC(class_weight='balanced', dual=False,
                                random_state=6, multi_class='ovr',
                                max_iter=1000, **self.scikit_args)
        
            #Cross-validate over these parameters
            params = {'C': 2.0**np.arange(-9,16,2,dtype=float)}
        elif self.kernel == 'rbf':
            clf = SVC(random_state=6, class_weight='balanced', cache_size=16000,
                      decision_function_shape='ovr',max_iter=1000, tol=1e-4, 
                      probability=self.probability, **self.scikit_args)            
            params = {'C': 2.0**np.arange(-9,16,2,dtype=float),
                      'gamma': 2.0**np.arange(-15,4,2,dtype=float)}

        #Coarse search      
        gs = GridSearchCV(clf, params, refit=False, n_jobs=self.n_jobs,  
                          verbose=self.verbosity, cv=ps)
        gs.fit(train_data, train_labels)
        
        #Fine-Tune Search
        if self.kernel == 'linear':
            best_C = np.log2(gs.best_params_['C'])
            params = {'C': 2.0**np.linspace(best_C-2,best_C+2,10,
                                            dtype=float)}
        elif self.kernel == 'rbf':
            best_C = np.log2(gs.best_params_['C'])
            best_G = np.log2(gs.best_params_['gamma'])
            params = {'C': 2.0**np.linspace(best_C-2,best_C+2,10,
                                            dtype=float),
                      'gamma': 2.0**np.linspace(best_G-2,best_G+2,10,
                                                dtype=float)}
        
        self.gs = GridSearchCV(clf, params, refit=self.refit, n_jobs=self.n_jobs,  
                          verbose=self.verbosity, cv=ps)
        self.gs.fit(train_data, train_labels)
        
        if not self.refit:
            clf.set_params(C=gs.best_params_['C'])
            if self.kernel == 'rbf':
                clf.set_params(gamma=gs.best_params_['gamma'])
            self.gs = clf
            self.gs.fit(train_data[:sh[0]], train_labels[:sh[0]])
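
# For reference, a tiny standalone sketch of how PredefinedSplit behaves with the
# -1/0 marker scheme used in fit() above: samples marked -1 never appear in a test
# fold, so the single split trains on the training portion and scores on the
# validation portion.
import numpy as np
from sklearn.model_selection import PredefinedSplit

demo_fold = np.array([-1, -1, -1, 0, 0])          # 3 train-only samples, 2 validation samples
for train_idx, test_idx in PredefinedSplit(demo_fold).split():
    print(train_idx, test_idx)                    # -> [0 1 2] [3 4]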
Example no. 6
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

## Data without information about depth
X_train_ND = np.delete(arr=X_train, obj=[0, 4, 6], axis=1)
X_test_ND = np.delete(arr=X_test, obj=[0, 4, 6], axis=1)

####### II: Classification #######

# Define Classifiers
nb = GaussianNB()
knn = KNeighborsClassifier()
svc = SVC(probability=True)

## Fit Classifiers without depth:
fit_nb_ND = nb.fit(X_train_ND, y_train)
fit_knn_ND = knn.fit(X_train_ND, y_train)
fit_svc_ND = svc.fit(X_train_ND, y_train)

# Predict with Classifiers
## Save methods in dict to iterate over them.
methods = {"Naive Bayes": nb, "KNN": knn, "SVM": svc}

## With Depth:
accuracies = []
precisions = []

for method_name, method in methods.items():
Example no. 7
t0 = time()
x_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

print("done in %0.3fs" % (time() - t0))


###############################################################################
# Train a SVM classification model

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
#clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0, decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True,tol=0.001, verbose=False)
clf = clf.fit(x_train_pca, y_train)
#clf = cv2.createFisherFaceRecognizer()
#clf.train(x_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
#print(clf.best_estimator_)

# Save the classifier
joblib.dump(clf, "recognition_clf.pkl", compress=3)




###############################################################################
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer scikit-learn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test) # No need to fit Test Set as its already fitted to Training Set

# Fitting classifier to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel = "rbf", random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix(Classification Evaluation Metric)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix : %s " % (cm))

# Visualizing the Training Set Results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
x1Values = np.arange(start = X_set[:, 0].min()-1, stop = X_set[:, 0].max()+1, step = 0.01)
x2Values = np.arange(start = X_set[:, 1].min()-1, stop = X_set[:, 1].max()+1, step = 0.01)
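
# The visualisation above is cut off; a hedged completion using the x1Values /
# x2Values grids (same contourf pattern as the other snippets in this collection,
# and it assumes matplotlib.pyplot is imported as plt):
X1, X2 = np.meshgrid(x1Values, x2Values)
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
plt.title('SVM classifier (Training set)')
plt.show()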
def main():
    st.title("Binary Classification Web App")
    st.sidebar.title("Binary Classification Web App")
    st.markdown("Are your mushrooms edible or poisonous? 🍄")
    st.sidebar.markdown("Are your mushrooms edible or poisonous? 🍄")

    #st.cache :
    #unless the function name or its arguments change, the cached data is reused
    #on each rerun instead of being reloaded

    #Label Encoding :
    #converts labels into numeric form so they become machine-readable.
    #Machine learning algorithms can then operate on those labels more effectively.
    #It is an important pre-processing step for structured datasets in supervised learning.

    @st.cache(persist=True)
    def load_data():
        data = pd.read_csv("mushrooms.csv")
        labelencoder = LabelEncoder()
        for col in data.columns:
            data[col] = labelencoder.fit_transform(data[col])
        #st.write(data)  #to check the dataset after label encoding
        return data

    @st.cache(persist=True)
    def split(df):
        y = df.type
        x = df.drop(columns=['type'])
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.3,
                                                            random_state=0)
        return x_train, x_test, y_train, y_test

    def plot_metrics(metrics_list):
        if 'Confusion Matrix' in metrics_list:
            st.subheader("Confusion Matrix")
            plot_confusion_matrix(model,
                                  x_test,
                                  y_test,
                                  display_labels=class_names)
            st.pyplot()

        if 'ROC Curve' in metrics_list:
            st.subheader("ROC Curve")
            plot_roc_curve(model, x_test, y_test)
            st.pyplot()

        if 'Precision-Recall Curve' in metrics_list:
            st.subheader('Precision-Recall Curve')
            plot_precision_recall_curve(model, x_test, y_test)
            st.pyplot()

    df = load_data()
    class_names = ['edible', 'poisonous']  #for confusion matrix

    x_train, x_test, y_train, y_test = split(df)

    #take user input of hyperparameters
    st.sidebar.subheader("Choose Classifier")
    classifier = st.sidebar.selectbox("Classifier",
                                      ("Support Vector Machine (SVM)",
                                       "Logistic Regression", "Random Forest"))

    if classifier == 'Support Vector Machine (SVM)':
        st.sidebar.subheader("Model Hyperparameters")
        #choose parameters
        C = st.sidebar.number_input("C (Regularization parameter)",
                                    0.01,
                                    10.0,
                                    step=0.01,
                                    key='C_SVM')
        kernel = st.sidebar.radio("Kernel", ("rbf", "linear"), key='kernel')
        gamma = st.sidebar.radio("Gamma (Kernel Coefficient)",
                                 ("scale", "auto"),
                                 key='gamma')

        metrics = st.sidebar.multiselect(
            "What metrics to plot?",
            ('Confusion Matrix', 'ROC Curve', 'Precision-Recall Curve'))

        if st.sidebar.button("Classify", key='classify'):
            st.subheader("Support Vector Machine (SVM) Results")
            model = SVC(C=C, kernel=kernel, gamma=gamma)
            model.fit(x_train, y_train)
            accuracy = model.score(x_test, y_test)
            y_pred = model.predict(x_test)
            st.write("Accuracy: ", accuracy.round(2))
            st.write(
                "Precision: ",
                precision_score(y_test, y_pred, labels=class_names).round(2))
            st.write("Recall: ",
                     recall_score(y_test, y_pred, labels=class_names).round(2))
            plot_metrics(metrics)

    if classifier == 'Logistic Regression':
        st.sidebar.subheader("Model Hyperparameters")
        C = st.sidebar.number_input("C (Regularization parameter)",
                                    0.01,
                                    10.0,
                                    step=0.01,
                                    key='C_LR')
        max_iter = st.sidebar.slider("Maximum number of iterations",
                                     100,
                                     500,
                                     key='max_iter')

        metrics = st.sidebar.multiselect(
            "What metrics to plot?",
            ('Confusion Matrix', 'ROC Curve', 'Precision-Recall Curve'))

        if st.sidebar.button("Classify", key='classify'):
            st.subheader("Logistic Regression Results")
            model = LogisticRegression(C=C, penalty='l2', max_iter=max_iter)
            model.fit(x_train, y_train)
            accuracy = model.score(x_test, y_test)
            y_pred = model.predict(x_test)
            st.write("Accuracy: ", accuracy.round(2))
            st.write(
                "Precision: ",
                precision_score(y_test, y_pred, labels=class_names).round(2))
            st.write("Recall: ",
                     recall_score(y_test, y_pred, labels=class_names).round(2))
            plot_metrics(metrics)

    if classifier == 'Random Forest':
        st.sidebar.subheader("Model Hyperparameters")
        n_estimators = st.sidebar.number_input(
            "The number of trees in the forest",
            100,
            5000,
            step=10,
            key='n_estimators')
        max_depth = st.sidebar.number_input("The maximum depth of the tree",
                                            1,
                                            20,
                                            step=1,
                                            key='max_depth')
        bootstrap = st.sidebar.radio("Bootstrap samples when building trees",
                                     ('True', 'False'),
                                     key='bootstrap')
        metrics = st.sidebar.multiselect(
            "What metrics to plot?",
            ('Confusion Matrix', 'ROC Curve', 'Precision-Recall Curve'))

        if st.sidebar.button("Classify", key='classify'):
            st.subheader("Random Forest Results")
            model = RandomForestClassifier(n_estimators=n_estimators,
                                           max_depth=max_depth,
                                           bootstrap=(bootstrap == 'True'),  # the radio widget returns a string; convert to bool
                                           n_jobs=-1)
            model.fit(x_train, y_train)
            accuracy = model.score(x_test, y_test)
            y_pred = model.predict(x_test)
            st.write("Accuracy: ", accuracy.round(2))
            st.write(
                "Precision: ",
                precision_score(y_test, y_pred, labels=class_names).round(2))
            st.write("Recall: ",
                     recall_score(y_test, y_pred, labels=class_names).round(2))
            plot_metrics(metrics)

    if st.sidebar.checkbox("Show raw data", False):
        st.subheader("Mushroom Data Set (Classification)")
        st.write(df)
        st.markdown(
            "This [data set](https://archive.ics.uci.edu/ml/datasets/Mushroom) includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms "
            "in the Agaricus and Lepiota Family (pp. 500-525). Each species is identified as definitely edible, definitely poisonous, "
            "or of unknown edibility and not recommended. This latter class was combined with the poisonous one."
        )
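
# The original snippet ends without invoking main(); a conventional entry point
# (an assumption, not shown in the source) would be:
if __name__ == '__main__':
    main()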
y = dataset.iloc[:, 4].values

#Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer scikit-learn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

#Feature Scaling (Z-score standardization)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

#Fitting the classifier to the Training set
#Create the classification model
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)

#Predicting the Test set results
y_pred = classifier.predict(X_test)

#Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

#Visualizing the Training set results (use this to see test set results by changing the variable)
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01), np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape), alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
Example no. 11
nifti_masker = NiftiMasker(mask_img=mask_filename, sessions=session,
                           smoothing_fwhm=4, standardize=True,
                           memory="nilearn_cache", memory_level=1)
func_filename = haxby_dataset.func[0]
X = nifti_masker.fit_transform(func_filename)
# Restrict to non rest data
X = X[condition_mask]
session = session[condition_mask]

###########################################################################
# Build the decoder that we will use

# Define the prediction function to be used.
# Here we use a Support Vector Classification, with a linear kernel
from sklearn.svm import SVC
svc = SVC(kernel='linear')


# Define the dimension reduction to be used.
# Here we use a classical univariate feature selection based on F-test,
# namely Anova. We set the number of features to be selected to 500
from sklearn.feature_selection import SelectKBest, f_classif
feature_selection = SelectKBest(f_classif, k=500)

# We have our classifier (SVC), our feature selection (SelectKBest), and now,
# we can plug them together in a *pipeline* that performs the two operations
# successively:
from sklearn.pipeline import Pipeline
anova_svc = Pipeline([('anova', feature_selection), ('svc', svc)])
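
# Once assembled, the pipeline behaves like a single estimator. A minimal,
# hedged usage sketch; `conditions` is a placeholder for the condition labels,
# which are loaded elsewhere in the original example and not shown here:
import numpy as np
conditions = np.random.RandomState(0).choice(['face', 'house'], size=X.shape[0])  # placeholder labels
anova_svc.fit(X, conditions)
y_pred = anova_svc.predict(X)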

###########################################################################
#from sklearn import preprocessing
#le = preprocessing.LabelEncoder()
#bankdata = bankdata.apply(le.fit_transform)

droplist = ['class']
X = bankdata.drop(droplist, axis=1)
y = bankdata['class']

#The actual algorithm starts here; everything above just processes the input CSV data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)
#labels = np.unique(X); print(labels)

from sklearn.svm import SVC
clf = SVC()  #kernel='rbf'
#clf = SVC(kernel='poly',degree=4)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Example no. 13
X = data[:, 0:4]
Y = data[:, 4]

val_size = 0.2
scoring = "accuracy"

(X_train, X_val, Y_train,
 Y_val) = model_selection.train_test_split(X, Y, test_size=val_size)

models = {
    "LR": LogisticRegression(solver="lbfgs", multi_class="auto"),
    "LDA": LinearDiscriminantAnalysis(solver='lsqr'),
    "KNN": KNeighborsClassifier(),
    "DTC": DecisionTreeClassifier(),
    "NB": GaussianNB(),
    "SVC": SVC(),
    "MLP": MLPClassifier(),
}

results = []
for name, model in models.items():
    kfold = model_selection.KFold(n_splits=10)
    cross_res = model_selection.cross_val_score(model,
                                                X_train,
                                                Y_train,
                                                cv=kfold,
                                                scoring=scoring)
    results.append((name, cross_res))

for name, res in results:
    print("{:6} {:2.4} {:2.4}").format(name, res.mean(), res.std())

classification_svm_parameters = {
    # Hyperparameter search over all possible dimensions for PCA reduction
    # 'pca__n_components': np.arange(1, 17),

    # 'svm__gamma': np.arange(0.001, 0.1, 0.001)
}

svm_classification_pipeline = Pipeline(
        [
            # Apply PCA to SVM Classification
            #('pca', PCA()),

            # Apply scaling to SVM Classification
            #('scale', StandardScaler()),

            ('svm', SVC())
        ]
    )

_accuracy_grid_search(values_train, hdi_class_train,
                        svm_classification_pipeline,
                        classification_svm_parameters)


# ## u)

# In[17]:


classification_svm_parameters = {
    # Use linear kernel for SVM Classification
Example no. 15
##### splitting data into train and test set
x_train, x_test, y_train, y_test = train_test_split(data['cleaned_text'],
                                                    data['labels'],
                                                    test_size=0.2,
                                                    random_state=10)

############### fit frequency based word embeddings into our data set to turn text into wordvectors

vectorizer = TfidfVectorizer(lowercase=True, stop_words=STOPWORDS)
vectorizer.fit(x_train)
x_train_vect = vectorizer.transform(x_train)
x_test_vect = vectorizer.transform(x_test)

############# Build our classifier with Linear Support vector machine

model = SVC(C=1, kernel='linear', class_weight='balanced')
model.fit(x_train_vect, y_train)

y_pred = model.predict(x_test_vect)

cm = confusion_matrix(y_test, y_pred)  ########## confusion matrix for test set

pipeline = make_pipeline(
    vectorizer,
    model)  #### save our model with pipeline function for future analysis
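
# The comment above mentions saving the model for future analysis; a hedged
# sketch using joblib (the filename is illustrative only):
import joblib

joblib.dump(pipeline, 'text_svc_pipeline.joblib')
# later: pipeline = joblib.load('text_svc_pipeline.joblib')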


def predict(text):

    score = pipeline.predict([clean_text(text)])
# Provided to give you a starting point. Try a variety of classifiers.
# Stratified ShuffleSplit cross-validator.
# Provides train/test indices to split data in train/test sets.
# This cross-validation object is a merge of StratifiedKFold and ShuffleSplit,
# which returns stratified randomized folds. The folds are made by preserving the percentage of samples for each class.
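
# A minimal hedged sketch of the cross-validator described above; `features` and
# `labels` are placeholder arrays, not part of the original starter code:
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

features = np.arange(20).reshape(10, 2)
labels = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.4, random_state=42)
for train_idx, test_idx in sss.split(features, labels):
    # each randomized fold preserves the 50/50 class proportions of `labels`
    print("train:", train_idx, "test:", test_idx)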

# NaiveBayes
from sklearn.naive_bayes import GaussianNB

nb_clf = GaussianNB()

# SVM
from sklearn.svm import SVC

svm_clf = SVC()

# DecisionTree
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier()

# RandomForest
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=25)

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

ab_clf = AdaBoostClassifier()
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth=6, max_features=7)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)
print(confusion_matrix(y_test, pred_rfc))
print(classification_report(y_test, pred_rfc))
print(accuracy_score(y_test, pred_rfc))
rfc.fit(X_train_all, y_train_all)
pred_all_rfc = rfc.predict(X_test_all)
sub_rfc = pd.DataFrame()
sub_rfc['PassengerId'] = df_test['PassengerId']
sub_rfc['Survived'] = pred_all_rfc
#sub_rfc.to_csv('randforest.csv',index=False)

from sklearn.svm import SVC
svc = SVC(gamma = 0.01, C = 100)#, probability=True)
svc.fit(X_train_sc, y_train_sc)
pred_svc = svc.predict(X_test_sc)
print(confusion_matrix(y_test_sc, pred_svc))
print(classification_report(y_test_sc, pred_svc))
print(accuracy_score(y_test_sc, pred_svc))

svc.fit(X_train_all_sc, y_train_all_sc)
pred_all_svc = svc.predict(X_test_all_sc)

sub_svc = pd.DataFrame()
sub_svc['PassengerId'] = df_test['PassengerId']
sub_svc['Survived'] = pred_all_svc
sub_svc.to_csv('svc.csv',index=False)

# In[ ]:

from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(max_iter=100)
logmodel.fit(X_train, y_train)
ypred = logmodel.predict(X_test)
print(logmodel.score(X_train, y_train))
print(confusion_matrix(y_test, ypred))
print(classification_report(y_test, ypred))

# *4. SVM*

# In[ ]:

from sklearn.svm import SVC
modelsvc = SVC(probability=True, gamma='auto')
modelsvc.fit(X_train, y_train)
ypred = modelsvc.predict(X_test)
print(modelsvc.score(X_train, y_train))
print(confusion_matrix(y_test, ypred))
print(classification_report(y_test, ypred))

# *6. Decision Tree*

# In[ ]:

from sklearn.tree import DecisionTreeClassifier
dmodel = DecisionTreeClassifier()
dmodel.fit(X_train, y_train)
ypred = dmodel.predict(X_test)
print(dmodel.score(X_train, y_train))
Example no. 19
X = dataset.iloc[:, [2,3]].values
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer scikit-learn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# fitting classifier to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel ='rbf', random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
Example no. 20
feats=[]
humor = []
for key in dict.keys():
    value = dict[key]
    feats.append(value[0].tolist())
    humor.append(value[1].tolist())
feats = np.array(feats)
humor = np.array(humor)



if options.clf == 'GaussianProc':
    clf = GaussianProcessClassifier()
elif options.clf == "SVC":
    clf = SVC()
elif options.clf == "LinearSVC":
    clf = LinearSVC(max_iter=10000,dual=False)
elif options.clf == "DecisionTree":
    clf = DecisionTreeClassifier()
elif options.clf == "RandomForest":
    clf = RandomForestClassifier()
elif options.clf == "AdaBoost":
    clf = AdaBoostClassifier(n_estimators=100)
elif options.clf == "XGBoost":
    clf = XGBClassifier()
elif options.clf == "KNN":
    clf = KNeighborsClassifier(n_neighbors=5)
elif options.clf == "GaussianNB":
    clf = GaussianNB()
elif options.clf == "RBF":
#summary of the model prediction
print(classification_report(y_test,y_pred))
print('Confusion Matrix:\n',confusion_matrix(y_test,y_pred))

#accuracy score of the model
from sklearn.metrics import accuracy_score
print('accuracy score :',accuracy_score(y_pred,y_test))

"""### **Support Vector Machine(SVM)**"""

#Support Vector Machine(SVM)
#importing the library
from sklearn.svm import SVC
#creating local variable classifier
classifier = SVC(kernel='linear',random_state=0)
#Training the model
classifier.fit(X_train,y_train)

#predicting the value of Y
y_pred = classifier.predict(X_test)

#importing metrics for evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#summary of the model prediction
print(classification_report(y_test,y_pred))
print('Confusion Matrix:\n',confusion_matrix(y_test,y_pred))

#accuracy score of the model
Example no. 22
def get_res(x_train, y_train, x_test, y_test):

    knn = KNeighborsClassifier()
    knn.fit(x_train, y_train)

    lg = LogisticRegression(penalty='l2')
    lg.fit(x_train, y_train)

    dtc = DecisionTreeClassifier()
    dtc.fit(x_train, y_train)

    gb = GradientBoostingClassifier(n_estimators=200)
    gb.fit(x_train, y_train)

    ab = AdaBoostClassifier()
    ab.fit(x_train, y_train)

    gnb = GaussianNB()
    gnb.fit(x_train, y_train)

    svm = SVC()
    svm.fit(x_train, y_train)

    mnb = MultinomialNB(alpha=0.01)
    mnb.fit(x_train, y_train)

    bnb = BernoulliNB(alpha=1.0,
                      binarize=0.31,
                      fit_prior=True,
                      class_prior=None)
    bnb.fit(x_train, y_train)

    rtc = RandomForestClassifier(n_estimators=10,
                                 max_depth=20,
                                 random_state=47)
    rtc.fit(x_train, y_train)

    num_list = [
        knn.score(x_test, y_test),
        lg.score(x_test, y_test),
        dtc.score(x_test, y_test),
        gb.score(x_test, y_test),
        ab.score(x_test, y_test),
        gnb.score(x_test, y_test),
        svm.score(x_test, y_test),
        mnb.score(x_test, y_test),
        bnb.score(x_test, y_test),
        rtc.score(x_test, y_test)
    ]
    name_list = [
        'KNN', 'Logistic', 'DecisionTree', 'GradientBoosting', 'AdaBoost',
        'GaussianNB', 'SVC', 'MultinomialNB', 'BernoulliNB', 'RandomForest'
    ]
    plt.title('title')
    num_list = np.around(num_list, decimals=3)
    autolabel(
        plt.bar(range(len(num_list)),
                num_list,
                color='rb',
                tick_label=name_list,
                width=0.4))
    plt.show()
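
# `autolabel` is called in get_res() but not defined in this excerpt; a typical
# (hypothetical) implementation annotates each bar with its height. Define it
# before calling get_res():
def autolabel(rects):
    for rect in rects:
        height = rect.get_height()
        plt.text(rect.get_x() + rect.get_width() / 2, height, str(height),
                 ha='center', va='bottom')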
Example no. 23
forest['target_size'] = forest['size_category']
forest = forest.drop('size_category', axis=1)
forest.columns

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
train, test = train_test_split(forest, test_size=0.3)
test.head()
forest.shape
train_X = train.iloc[:, 0:45]
train_X.columns
train_y = train.iloc[:, -1]
test_X = test.iloc[:, 0:45]
test_y = test.iloc[:, -1]

model_linear = SVC(kernel="linear")
model_linear.fit(train_X, train_y)
pred_test_linear = model_linear.predict(test_X)

np.mean(pred_test_linear == test_y)  # Accuracy = 1.0

# Kernel = poly
model_poly = SVC(kernel="poly")
model_poly.fit(train_X, train_y)
pred_test_poly = model_poly.predict(test_X)

np.mean(pred_test_poly == test_y)  # Accuracy = 1.0

# kernel = rbf
model_rbf = SVC(kernel="rbf")
model_rbf.fit(train_X, train_y)
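
# Mirroring the linear and poly evaluations above, a straightforward completion
# of the truncated rbf block:
pred_test_rbf = model_rbf.predict(test_X)
np.mean(pred_test_rbf == test_y)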
Example no. 24
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=0)

# Feature scaling
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Fitting the SVM to the Training Set
from sklearn.svm import SVC

cl = SVC(kernel='linear', random_state=0)
cl.fit(X_train, Y_train)

# Predicting the test set results
y_pred = cl.predict(X_test)

# Making the confusion matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(Y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap

X_set, Y_set = X_train, Y_train
X1, X2 = np.meshgrid(
Example no. 25
import pandas as pd
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()]

def classify_test(X_train,y_train,X_test,y_test):
    for clf in classifiers:
        try:
            clf.fit(X_train, y_train)
        except Exception as e:
            print('{} failed: {}'.format(clf.__class__.__name__, e))
        else:     
            name = clf.__class__.__name__
            print("="*30)
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.svm import SVC

    print(' ')
    print('=============================')
    print('Bernoulli SVC Classifier:')
    classifierBi = SklearnClassifier(BernoulliNB()).train(train_set)
    classifierBi.classify_many(test)

    for pdist in classifierBi.prob_classify_many(test):
        print(pdist.prob('human'), pdist.prob('auto'))

    for i in range(len(classifierBi.classify_many(test))):
        print(classifierBi.classify_many(test)[i])

    classifierSVC = SklearnClassifier(SVC(), sparse=True).train(train_set)
    classifierSVC.classify_many(test)

    # svc = nltk.classify.accuracy(classifierSVC, test_set)
    # print 'accuracy is %.2f' %round(svc*100,4), '%'
    def SVC():
        classifierBi = SklearnClassifier(BernoulliNB()).train(train_set)
        return classifierSVC.classify_many(test)

    print "Performance of running Bernoulli SVC Classifier on test set: ", timeit.timeit(
        "SVC", setup="from __main__ import SVC", number=1)

    print ' '
    print '============================='
    print 'Linear SVC Classifier:'
    classifierLinSVC = SklearnClassifier(LinearSVC(),
Example no. 27
    # Success
    print ("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))
    # Return the results
    return results


# Import the three supervised learning models from sklearn

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier


# TODO: Initialize the three models
clf_A = SVC()
clf_B = DecisionTreeClassifier(min_samples_split=20)
clf_C = AdaBoostClassifier()

# Calculate the number of samples for 1%, 10%, and 100% of the training data
# HINT: samples_100 is the entire training set i.e. len(y_train)
# HINT: samples_10 is 10% of samples_100
# HINT: samples_1 is 1% of samples_100
samples_100 = len(y_train)
samples_10 = len(y_train) // 10
samples_1 = len(y_train) // 100


# Collect results on the learners
results = {}
results = train_predict(clf_A, samples_1, X_train, y_train, X_test, y_test)
Example no. 28
def train(args):
    print("train call")
    print("Loading embeddings.")
    fname = "{}/labels.csv".format(args.workDir)
    labels = pd.read_csv(fname, header=None).values[:, 1]
    labels = list(map(itemgetter(1),
                      map(os.path.split,
                          map(os.path.dirname, labels))))  # Get the directory.
    fname = "{}/reps.csv".format(args.workDir)
    embeddings = pd.read_csv(fname, header=None).values
    le = LabelEncoder().fit(labels)
    labelsNum = le.transform(labels)
    nClasses = len(le.classes_)
    print("Training for {} classes.".format(nClasses))

    if args.classifier == 'LinearSvm':
        clf = SVC(C=1, kernel='linear', probability=True)
    elif args.classifier == 'GridSearchSvm':
        print("""
        Warning: In our experiences, using a grid search over SVM hyper-parameters only
        gives marginally better performance than a linear SVM with C=1 and
        is not worth the extra computations of performing a grid search.
        """)
        param_grid = [
            {'C': [1, 10, 100, 1000],
             'kernel': ['linear']},
            {'C': [1, 10, 100, 1000],
             'gamma': [0.001, 0.0001],
             'kernel': ['rbf']}
        ]
        clf = GridSearchCV(SVC(C=1, probability=True), param_grid, cv=5)
    elif args.classifier == 'GMM':  # Doesn't work best
        clf = GMM(n_components=nClasses)

    # ref:
    # http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#example-classification-plot-classifier-comparison-py
    elif args.classifier == 'RadialSvm':  # Radial Basis Function kernel
        # works better with C = 1 and gamma = 2
        clf = SVC(C=1, kernel='rbf', probability=True, gamma=2)
    elif args.classifier == 'DecisionTree':  # Doesn't work best
        clf = DecisionTreeClassifier(max_depth=20)
    elif args.classifier == 'GaussianNB':
        clf = GaussianNB()

    # ref: https://jessesw.com/Deep-Learning/
    elif args.classifier == 'DBN':
        from nolearn.dbn import DBN
        clf = DBN([embeddings.shape[1], 500, labelsNum[-1:][0] + 1],  # i/p nodes, hidden nodes, o/p nodes
                  learn_rates=0.3,
                  # Smaller steps mean a possibly more accurate result, but the
                  # training will take longer
                  learn_rate_decays=0.9,
                  # a factor the initial learning rate will be multiplied by
                  # after each iteration of the training
                  epochs=300,  # no of iternation
                  # dropouts = 0.25, # Express the percentage of nodes that
                  # will be randomly dropped as a decimal.
                  verbose=1)

    if args.ldaDim > 0:
        clf_final = clf
        clf = Pipeline([('lda', LDA(n_components=args.ldaDim)),
                        ('clf', clf_final)])

    clf.fit(embeddings, labelsNum)

    fName = "{}/classifier.pkl".format(args.workDir)
    print("Saving classifier to '{}'".format(fName))
    with open(fName, 'wb') as f:
        pickle.dump((le, clf), f)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

classifiers = [
    KNeighborsClassifier(3),
    SVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression()]

log_cols = ["Classifier", "Accuracy"]
log = pd.DataFrame(columns=log_cols)

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

X = train[0::, 1::]
Example no. 30
second_pc = pca.components_[1]

#print var, sum(var), eigenfaces.shape, ei_mean.shape, X_train_pca.shape

###############################################################################
# Train a SVM classification model
print "Fitting the classifier to the training set"
t0 = time()
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
# for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
#Grid search finds the best C and gamma parameters to use with the rbf kernel
clf = GridSearchCV(
    SVC(kernel='rbf', class_weight='balanced', probability=True), param_grid)
clf = clf.fit(X_train_pca, y_train)
print "done in %0.3fs" % (time() - t0)
print "Best estimator found by grid search:"
print clf.best_estimator_

###############################################################################
# Quantitative evaluation of the model quality on the test set
print "Predicting the people names on the testing set"
t0 = time()
y_pred = clf.predict(X_test_pca)
y_proba = clf.predict_proba(X_test_pca)
print "done in %0.3fs" % (time() - t0)

#Save the variables of the already-trained model
with open('Clasificador.pkl', 'wb') as f: