Example #1
def example():
    """
    ================
    Ensemble methods
    ================
    """
    # Bagging
    # Uniform sampling: every sample gets equal weight, and the predictions are averaged
    from sklearn.ensemble import BaggingClassifier
    from sklearn.neighbors import KNeighborsClassifier
    bagging = BaggingClassifier(KNeighborsClassifier(),
                                max_samples=0.5,
                                max_features=0.5)
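    # A minimal usage sketch (not in the original): fit the bagging ensemble on iris;
    # the dataset choice here is an assumption made for illustration.
    from sklearn.datasets import load_iris
    X_iris, y_iris = load_iris(return_X_y=True)
    bagging.fit(X_iris, y_iris)           # each base k-NN sees 50% of the samples and 50% of the features
    print(bagging.score(X_iris, y_iris))  # training accuracy of the averaged ensemble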

    # Random forest (bagging + decision trees trained on bootstrap samples of the data)
    from sklearn.ensemble import RandomForestClassifier
    X = [[0, 0], [1, 1]]
    Y = [0, 1]
    clf = RandomForestClassifier(n_estimators=10)
    clf = clf.fit(X, Y)

    # Extremely randomized trees (each tree uses all samples, but the split threshold of each candidate feature is drawn completely at random)
    from sklearn.model_selection import cross_val_score
    from sklearn.datasets import make_blobs
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.tree import DecisionTreeClassifier

    X, y = make_blobs(n_samples=10000,
                      n_features=10,
                      centers=100,
                      random_state=0)

    clf = DecisionTreeClassifier(max_depth=None,
                                 min_samples_split=2,
                                 random_state=0)
    scores = cross_val_score(clf, X, y, cv=5)
    print(scores.mean())

    clf = RandomForestClassifier(n_estimators=10,
                                 max_depth=None,
                                 min_samples_split=2,
                                 random_state=0)
    scores = cross_val_score(clf, X, y, cv=5)
    print(scores.mean())

    clf = ExtraTreesClassifier(n_estimators=10,
                               max_depth=None,
                               min_samples_split=2,
                               random_state=0)
    scores = cross_val_score(clf, X, y, cv=5)
    print(scores.mean())

    # #######################################################
    # Boosting (usually an ensemble of weak classifiers; training cannot be parallelized)
    # Train on part of the data first; the misclassified samples plus the remaining data form the next training set, and so on. Classifiers that perform well get larger weights.

    # AdaBoost
    from sklearn.model_selection import cross_val_score
    from sklearn.datasets import load_iris
    from sklearn.ensemble import AdaBoostClassifier

    iris = load_iris()
    clf = AdaBoostClassifier(n_estimators=100)
    scores = cross_val_score(clf, iris.data, iris.target, cv=5)
    print(scores.mean())
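    # A short add-on sketch (not in the original): after fitting, estimator_weights_
    # illustrates the point above that better base classifiers receive larger weights.
    clf_ab = AdaBoostClassifier(n_estimators=100).fit(iris.data, iris.target)
    print(clf_ab.estimator_weights_[:5])  # weight of each boosting stage
    print(clf_ab.estimator_errors_[:5])   # weighted error of each boosting stage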

    # GradientBoosting: gradient boosting (again an ensemble of weak classifiers)
    # Worked example: https://www.cnblogs.com/peizhe123/p/5086128.html
    from sklearn.datasets import make_hastie_10_2
    from sklearn.ensemble import GradientBoostingClassifier

    X, y = make_hastie_10_2(random_state=0)
    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    clf = GradientBoostingClassifier(n_estimators=100,
                                     learning_rate=1.0,
                                     max_depth=1,
                                     random_state=0).fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    """
    ===========================
    Multi-label and multi-class
    ===========================
    """
    # Multi-label format
    from sklearn.preprocessing import MultiLabelBinarizer
    y = [[2, 3, 4], [2], [0, 1, 3], [0, 1, 2, 3, 4], [0, 1, 2]]
    print(MultiLabelBinarizer().fit_transform(y))

    # Multi-class classification with OvR and OvO
    from sklearn import datasets
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.multiclass import OneVsOneClassifier
    from sklearn.svm import LinearSVC
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    clf_ovr = OneVsRestClassifier(LinearSVC())
    print(clf_ovr.fit(X, y).predict(X))
    # OvR can also be used for multi-label prediction (see the sketch below)
    # OneVsRestClassifier also supports multilabel classification: feed it an indicator
    # matrix in which cell [i, j] indicates the presence of label j in sample i
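    # A minimal sketch of that multilabel use (not in the original; the toy data is an assumption):
    y_multilabel = MultiLabelBinarizer().fit_transform([[0, 1], [1, 2], [0, 2]])
    X_small = [[1., 0.], [0., 1.], [1., 1.]]
    ovr_ml = OneVsRestClassifier(LinearSVC())
    print(ovr_ml.fit(X_small, y_multilabel).predict(X_small))  # one indicator column per label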

    clf_ovo = OneVsOneClassifier(LinearSVC())
    print(clf_ovo.fit(X, y).predict(X))
    """
    =================
    Feature selection
    =================
    """

    # 1. Variance threshold: drop boolean features that take the same value in more than 80% of samples (variance below p*(1-p) with p = 0.8)
    from sklearn.feature_selection import VarianceThreshold
    X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    print(sel.fit_transform(X))

    # 2. Univariate feature selection
    #    SelectKBest
    #    SelectPercentile
    #    SelectFpr, SelectFdr, SelectFwe
    from sklearn.datasets import load_iris
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import chi2
    iris = load_iris()
    X, y = iris.data, iris.target
    print(X.shape)

    X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
    print(X_new.shape)

    # scores_ and pvalues_
    # The smaller the p-value, the stronger the evidence against the null hypothesis that the feature is unrelated to y
    skb = SelectKBest(chi2, k=2).fit(X, y)
    print(skb.scores_)
    print(skb.pvalues_)

    # Available scoring functions:
    """
    For regression: f_regression, mutual_info_regression
    For classification: chi2, f_classif, mutual_info_classif
    """

    # 3. Recursive feature elimination (RFE)
    from sklearn.svm import SVC
    from sklearn.datasets import load_digits
    from sklearn.feature_selection import RFE
    import matplotlib.pyplot as plt

    # Load the digits dataset
    digits = load_digits()
    X = digits.images.reshape((len(digits.images), -1))
    y = digits.target

    # Create the RFE object and rank each pixel
    svc = SVC(kernel="linear", C=1)
    rfe = RFE(estimator=svc, n_features_to_select=1, step=1)
    rfe.fit(X, y)
    ranking = rfe.ranking_.reshape(digits.images[0].shape)

    # Plot pixel ranking
    plt.matshow(ranking, cmap=plt.cm.Blues)
    plt.colorbar()
    plt.title("Ranking of pixels with RFE")
    plt.show()

    # 4.1 SelectFromModel
    from sklearn.svm import LinearSVC
    from sklearn.datasets import load_iris
    from sklearn.feature_selection import SelectFromModel
    iris = load_iris()
    X, y = iris.data, iris.target
    print(X.shape)

    lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
    model = SelectFromModel(lsvc, prefit=True)
    X_new = model.transform(X)
    print(X_new.shape)

    # 4.2 Tree-based feature_importances_
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.datasets import load_iris
    from sklearn.feature_selection import SelectFromModel
    iris = load_iris()
    X, y = iris.data, iris.target
    print(X.shape)

    clf = ExtraTreesClassifier(n_estimators=50)
    clf = clf.fit(X, y)
    print(clf.feature_importances_)

    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(X)
    print(X_new.shape)
    """
    ===============
    Neural networks
    ===============
    """
    # Neural network with its main parameters
    from sklearn.neural_network import MLPClassifier
    X = [[0., 0.], [1., 1.]]
    y = [0, 1]
    clf = MLPClassifier(solver='lbfgs',
                        alpha=1e-5,
                        hidden_layer_sizes=(5, 2),
                        random_state=1)

    print(clf.fit(X, y))
    print(clf.predict([[2., 2.], [-1., -2.]]))
    print(clf.coefs_)
    print(clf.intercepts_)
    print(clf.loss_)
    taxa_de_acertos = 100.0 * total_de_acertos / total_de_elementos  # Hit rate (accuracy) in percent.

    msg = "Taxa de acertos do vencedor entre os dois algoritmos no mundo real: {0}".format(taxa_de_acertos)  # Fill the hit rate into the {0} placeholder.
    print(msg)  # Print the message.
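
# fit_and_predict is called below but its definition is not part of this snippet.
# A minimal sketch of what it might look like (an assumption reconstructed from how
# it is called and from the hit-rate computation above):
def fit_and_predict(nome, modelo, treino_dados, treino_marcacoes):
    modelo.fit(treino_dados, treino_marcacoes)                 # train the model
    previsoes = modelo.predict(treino_dados)                   # predict on the training data
    acertos = sum(previsoes == treino_marcacoes)               # count correct predictions
    taxa_de_acertos = 100.0 * acertos / len(treino_marcacoes)  # hit rate in percent
    print("Taxa de acertos do algoritmo {0}: {1}".format(nome, taxa_de_acertos))
    return taxa_de_acertos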

resultados = {}  # Dictionary that maps each model's hit rate to the model.

from sklearn.multiclass import OneVsRestClassifier  # OneVsRestClassifier from scikit-learn.
from sklearn.svm import LinearSVC  # LinearSVC from scikit-learn.
modeloOneVsRest = OneVsRestClassifier(LinearSVC(random_state = 0))  # Wrap LinearSVC (fixed random_state for reproducibility) in OneVsRestClassifier.
resultadoOneVsRest = fit_and_predict("OneVsRest", modeloOneVsRest, treino_dados, treino_marcacoes)  # Hit rate returned by fit_and_predict.
resultados[resultadoOneVsRest] = modeloOneVsRest  # Store the model in the results dictionary, keyed by its hit rate.

from sklearn.multiclass import OneVsOneClassifier  # OneVsOneClassifier from scikit-learn.
modeloOneVsOne = OneVsOneClassifier(LinearSVC(random_state = 0))  # Wrap LinearSVC (fixed random_state) in OneVsOneClassifier.
resultadoOneVsOne = fit_and_predict("OneVsOne", modeloOneVsOne, treino_dados, treino_marcacoes)  # Hit rate returned by fit_and_predict.
resultados[resultadoOneVsOne] = modeloOneVsOne  # Store the model keyed by its hit rate.

from sklearn.naive_bayes import MultinomialNB  # MultinomialNB from scikit-learn.
modeloMultinomial = MultinomialNB()  # Multinomial naive Bayes model.
resultadoMultinomial = fit_and_predict("MultinomialNB", modeloMultinomial, treino_dados, treino_marcacoes)  # Hit rate returned by fit_and_predict.
resultados[resultadoMultinomial] = modeloMultinomial  # Store the model keyed by its hit rate.

from sklearn.ensemble import AdaBoostClassifier  # AdaBoostClassifier from scikit-learn.
modeloAdaBoost = AdaBoostClassifier()  # AdaBoost model with default parameters.
resultadoAdaBoost = fit_and_predict("AdaBoostClassifier", modeloAdaBoost, treino_dados, treino_marcacoes)  # Hit rate returned by fit_and_predict.
resultados[resultadoAdaBoost] = modeloAdaBoost  # Store the model keyed by its hit rate.

print(resultados)  # Print the results dictionary.
Example #3
import sklearn.metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from numpy import *
import datasets

if not datasets.Quizbowl.loaded:
    datasets.loadQuizbowl()

print('\n\nRUNNING ON EASY DATA\n')

print('training oaa')
X = datasets.QuizbowlSmall.X
Y = datasets.QuizbowlSmall.Y
oaa = OneVsOneClassifier(LinearSVC(random_state=0)).fit(X, Y)
print('predicting oaa')
oaaDevPred = oaa.predict(datasets.QuizbowlSmall.Xde)
print('error = %g' % mean(oaaDevPred != datasets.QuizbowlSmall.Yde))

print('training ava')
ava = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X, Y)
print('predicting ava')
avaDevPred = ava.predict(datasets.QuizbowlSmall.Xde)
print('error = %g' % mean(avaDevPred != datasets.QuizbowlSmall.Yde))

print('\n\nRUNNING ON HARD DATA\n')

print('training oaa')
X = datasets.QuizbowlHardSmall.X
Y = datasets.QuizbowlHardSmall.Y
Example #4
    def fit(self,
            x_train,
            y_train,
            neighbors,
            L,
            classifier=None,
            model_type='ori',
            **kwargs):
        """
        Method to initialize training data and fit the classifiers.
        
        Parameters:
        - - - - -

            x_train : training feature data, partitioned by response
            
            y_train : training response vectors, partitioned by response
            
            model_type : type of classification scheme for multi-class
                         prediction models

            kwargs : optional arguments for classifier
        """

        if not classifier:
            classifier = rfc(n_estimators=self.n_estimators,
                             max_depth=self.max_depth,
                             n_jobs=-1)

        labels = np.arange(1, L + 1)
        self.labels = labels
        self.neighbors = neighbors

        x_train = du.mergeValueArrays(x_train)
        y_train = du.mergeValueLists(y_train)

        self.input_dim = x_train.shape[1]

        labelKeys = x_train.keys()

        # get valid arguments for supplied classifier
        # get valid parameters passed by user
        # update classifier parameters
        # save base models
        classifier_params = inspect.getargspec(classifier.__init__)
        classArgs = cu.parseKwargs(classifier_params, kwargs)
        classifier.set_params(**classArgs)

        print('depth: {}'.format(classifier.max_depth))
        print('nEst: {}'.format(classifier.n_estimators))

        model_selector = {
            'oVo': OneVsOneClassifier(classifier),
            'oVr': OneVsRestClassifier(classifier),
            'ori': classifier
        }

        models = {}.fromkeys(labels)

        for i, lab in enumerate(labels):
            if lab in labelKeys and lab in neighbors.keys():

                # compute confusion set of labels
                labelNeighbors = set([lab]).union(
                    neighbors[lab]).intersection(labels)

                # copy the model (due to passing by object-reference)
                models[lab] = copy.deepcopy(model_selector[model_type])

                # extract data for confusion set, train model
                training = du.mergeValueArrays(x_train, keys=labelNeighbors)
                response = du.mergeValueLists(y_train, keys=labelNeighbors)

                models[lab].fit(training, np.squeeze(response))

        self.models = models
Example #5
# Convert string data to numerical data
# (imports added for completeness; X is assumed to be a NumPy array of strings loaded earlier in the original script)
import numpy as np
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsOneClassifier

label_encoder = []
X_encoded = np.empty(X.shape)
for i, item in enumerate(X[0]):
    if item.isdigit():
        X_encoded[:, i] = X[:, i]
    else:
        label_encoder.append(preprocessing.LabelEncoder())
        X_encoded[:, i] = label_encoder[-1].fit_transform(X[:, i])

X = X_encoded[:, :-1].astype(int)
y = X_encoded[:, -1].astype(int)

# Create SVM classifier
classifier = OneVsOneClassifier(LinearSVC(random_state=0))

# Train the classifier
classifier.fit(X, y)

# Cross validation (sklearn.cross_validation was removed; model_selection is the current home)
from sklearn.model_selection import train_test_split, cross_val_score
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=5)
classifier = OneVsOneClassifier(LinearSVC(random_state=0))
classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)

# Compute the F1 score of the SVM classifier
# (the original snippet is cut off here; the scoring and cv values below are an assumption)
f1 = cross_val_score(classifier, X, y, scoring='f1_weighted', cv=3)
from sklearn.linear_model import SGDClassifier

# multiclass classification
sgd_clf = SGDClassifier(max_iter=1000, random_state=42)
sgd_clf.fit(X_train, y_train)
# print(sgd_clf.predict([some_digit]))    # [3.]
some_digit_scores = sgd_clf.decision_function([some_digit])
# print(some_digit_scores)                # return 10 scores
# print(np.argmax(some_digit_scores))     # 3
# print(sgd_clf.classes_[np.argmax(some_digit_scores)])   # 3.0

from sklearn.multiclass import OneVsOneClassifier

# creates a multiclass classifier using the OvO strategy, based on a SGDClassifier
ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
ovo_clf.fit(X_train, y_train)
# print(ovo_clf.predict([some_digit]))

from sklearn.ensemble import RandomForestClassifier

# Training a RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_train, y_train)
# print(forest_clf.predict([some_digit]))

from sklearn.model_selection import cross_val_score

# Let's evaluate the SGDClassifier's accuracy using the cross_val_score() function
sgd_clf_acc = cross_val_score(sgd_clf,
                              X_train,
#%%
some_digit_scores = sgd_clf.decision_function([some_digit])
some_digit_scores

#%%
np.argmax(some_digit_scores)

#%%
sgd_clf.classes_

#%%
sgd_clf.classes_[5]

#%%
from sklearn.multiclass import OneVsOneClassifier
ovo_clf = OneVsOneClassifier(
    SGDClassifier(max_iter=5, tol=-np.infty, random_state=42))
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digit])

#%%
len(ovo_clf.estimators_)

#%%
forest_clf.fit(X_train, y_train)
forest_clf.predict([some_digit])

#%%
forest_clf.predict_proba([some_digit])

#%%
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")
    "auth_div_avg", "auth_div_max", "auth_hindex_avg", "auth_hindex_max",
    "auth_soc_avg", "auth_soc_max", "team"
]):
    citation_data = pd.read_csv("./workspace/SVR.txt", index_col=0)  # DataFrame.from_csv was removed; read_csv replaces it
    citation_data = citation_data.loc[citation_data['paper_year'] <= 1995]
    citation_data = citation_data.iloc[np.random.permutation(len(citation_data))]  # shuffle rows (the original discarded this result)

    X = np.array(citation_data[features].values)
    #X = preprocessing.scale(X)
    y = (citation_data["paper_cat"].values.tolist())
    return X, y


X, y = Build_Data_Set()

clf = OneVsOneClassifier(SVC(random_state=0, kernel='poly'))
clf.fit(X, y)
#Saving the classifier
import pickle

save_classifier = open("./workspace/polysvr.pickle", "wb")
pickle.dump(clf, save_classifier)
save_classifier.close()


#testing
def Build_Data_Set(features=[
    "citation2yrs", "RDI", "rcount", "auth_prod_avg", "authprod_max",
    "auth_div_avg", "auth_div_max", "auth_hindex_avg", "auth_hindex_max",
    "auth_soc_avg", "auth_soc_max", "team"
]):
Example #9
def linear(x, y, xt):
	# One-vs-one multiclass wrapper around logistic regression
	# (assumes OneVsOneClassifier and LogisticRegression are imported elsewhere in the original module)
	OVO = OneVsOneClassifier(LogisticRegression())
	OVO.fit(x, y)
	pred = OVO.predict(xt)
	return pred
Example #10
    #QDA(),
    #DecisionTreeClassifier(),
    #RandomForestClassifier(n_estimators=10, n_jobs=-1),
    #ExtraTreesClassifier(n_estimators=10, n_jobs=-1),
    AdaBoostClassifier(n_estimators=50, learning_rate=1.0),
    #NearestCentroid(),
    #KNeighborsClassifier(),
    #LinearRegression(normalize=False, n_jobs=-1),
    #LinearRegression(normalize=True, n_jobs=-1),
    #LinearRegression(n_jobs=-1),
    LogisticRegression(),
    #SVC(kernel='rbf', gamma=2, C=1), # VERY SLOW
    #SVC(kernel='linear', C=0.025), # VERY SLOW
    OneVsRestClassifier(
        LinearSVC(penalty='l1', loss='squared_hinge', dual=False, tol=1e-4)),
    OneVsOneClassifier(
        LinearSVC(penalty='l1', loss='squared_hinge', dual=False, tol=1e-4)),
    OneVsRestClassifier(SVC(kernel='rbf', gamma=2, C=1), n_jobs=-1),
    OneVsOneClassifier(SVC(kernel='rbf', gamma=2, C=1), n_jobs=-1)
]

names = [
    #'GaussianNB',
    'MultinomialNB',
    'LDA',
    #'QDA',
    #'DecisionTreeClassifier',
    #'RandomForestClassifier',
    #'ExtraTreesClassifier',
    'AdaBoostClassifier',
    #'NearestCentroid',
    #'KNeighborsClassifier',
ovr.fit(X_train[:,:2], y_train)
print("ovr.score:",ovr.score(X_test[:,:2],y_test))
#ovr.score: 0.6


################################################################################


# Logistic regression with built-in multiclass support (multi_class='multinomial', i.e. softmax)
from sklearn.linear_model import LogisticRegression
log_reg_ovo = LogisticRegression(multi_class='multinomial', solver='newton-cg')
log_reg_ovo.fit(X_train[:,:2], y_train)
print("log_reg_ovo.score:",log_reg_ovo.score(X_test[:,:2],y_test))
#log_reg_ovo.score: 0.8

# Decision boundary of the multinomial logistic regression
plot_decision_boundary(log_reg_ovo, axis=[4, 8, 1.5, 4.5])
plt.scatter(X[y==0, 0], X[y==0, 1], color='g', label='y==0')
plt.scatter(X[y==1, 0], X[y==1, 1], color='b', label='y==1')
plt.scatter(X[y==2, 0], X[y==2, 1], color='r', label='y==2')
plt.legend()
plt.show()

# OneVsOneClassifier can turn any binary classifier into a multiclass one
from sklearn.multiclass import OneVsOneClassifier
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
ovo = OneVsOneClassifier(lr)
ovo.fit(X_train[:,:2], y_train)
print("ovo.score:",ovo.score(X_test[:,:2],y_test))
#ovo.score: 0.6333333333333333
                            loss='squared_hinge',
                            dual=True,
                            tol=0.0001,
                            C=1.0,
                            multi_class='ovr',
                            fit_intercept=True,
                            intercept_scaling=1,
                            class_weight=None,
                            verbose=0,
                            random_state=None,
                            max_iter=1000)
        elif args.classifier == 'multiclass':
            clf = OneVsRestClassifier(
                SVC(C=1, kernel='linear', probability=True))
        elif args.classifier == 'multiclass2':
            clf = OneVsOneClassifier(
                SVC(C=1, kernel='linear', probability=True))
        elif args.classifier == 'sgd':
            clf = linear_model.SGDClassifier()
        else:
            print('unknown classifier', args.classifier)
            sys.exit()

        print('train svm')
        clf.fit(trainEmbeddings, labelsNum)
        print('test elvis PM')
        pmRes = testElvis(args, le, clf, testPMs)
        print('test elvis not PM')
        notPmRes = testElvisNotPm(args, le, clf)
        #print('PM found, PM not found, no face found, wrong PM found')
        print(pmRes)
        for i in range(len(testPMs)):
Example #13
def getFitness(individual, X, y):
    """
    Feature subset fitness function
    """

    if individual.count(0) != len(individual):
        # get the indices of features that are switched off (value 0) in the individual
        cols = [index for index in range(
            len(individual)) if individual[index] == 0]

        # get features subset
        X_parsed = X.drop(X.columns[cols], axis=1)
        X_subset = pd.get_dummies(X_parsed)

        # X_subset = X
        #
        # for col in cols:
        #     X_subset[col].values[:] = 0

        # apply classification algorithm
        # NOTE: each assignment below overrides the previous one, so only the last
        # classifier is actually used; several of the bare constructors (e.g. ClassifierChain,
        # OneVsOneClassifier, VotingClassifier) would also need an estimator argument to run
        clf = AdaBoostClassifier()
        clf = BaggingClassifier()
        clf = BernoulliNB()

        clf = CalibratedClassifierCV()
        clf = CategoricalNB()
        clf = ClassifierChain()
        clf = ComplementNB()

        clf = DecisionTreeClassifier()
        clf = DummyClassifier()

        clf = ExtraTreeClassifier()
        clf = ExtraTreesClassifier()

        clf = GaussianNB()
        clf = GaussianProcessClassifier()
        clf = GradientBoostingClassifier()

        # clf = HistGradientBoostingClassifier()

        clf = KNeighborsClassifier()

        clf = LabelPropagation()
        clf = LabelSpreading()
        clf = LinearDiscriminantAnalysis()
        clf = LinearSVC()
        clf = LogisticRegression()
        clf = LogisticRegressionCV()

        clf = MLPClassifier()
        clf = MultiOutputClassifier()
        clf = MultinomialNB()

        clf = NearestCentroid()
        clf = NuSVC()

        clf = OneVsOneClassifier()
        clf = OneVsRestClassifier()
        clf = OutputCodeClassifier()

        clf = PassiveAggressiveClassifier()
        clf = Perceptron()

        clf = QuadraticDiscriminantAnalysis()

        clf = RadiusNeighborsClassifier()
        clf = RandomForestClassifier()
        clf = RidgeClassifier()
        clf = RidgeClassifierCV()

        clf = SGDClassifier()
        clf = SVC()
        clf = StackingClassifier()

        clf = VotingClassifier()

        # clf.fit(X, y)
        # clf.fit(X_subset, y_train)
        clf.fit(X_subset, y)

        # y_pred_ANN = clf.predict(X_test)
        # y_pred = clf.predict(X_subset)

        # score = cross_val_score(clf, X, y, cv=5)
        #
        # print(max(score), min(score))

        return (avg(cross_val_score(clf, X_subset, y, cv=5)),)
        # return (avg(score),)
        # return accuracy_score(y, y_pred_ANN)
    else:
        return (0,)
	#print training_files.data

predict_files = sklearn.datasets.load_files("/../../../SVM/dataset_prediction")

print "Predict",predict_files.data

vectorizer = TfidfVectorizer(encoding='utf-8')
X_t = vectorizer.fit_transform((open(f).read() for f in training_files.filenames))
print("n_samples: %d, n_features: %d" % X_t.shape)
assert sp.issparse(X_t)
	
	


X_p = vectorizer.transform((open(f).read() for f in predict_files.filenames))
	
y= OneVsOneClassifier(LinearSVC(random_state=0)).fit(X_t,training_files.target).predict(X_p)	
print(y[0])
if y[0]==0:
	f1=open("/../../../SVM/out.txt",'w')
	f1.write("0")
	f1.close()
elif y[0]==1:
	f1=open("/../../../SVM/out.txt",'w')
	f1.write("1")
	f1.close()
elif y[0]==2:
	f1=open("/../../../SVM/out.txt",'w')
	f1.write("2")
	f1.close()
Example #15
    sgd_clf.fit(X_train, y_train)  # train the classifier
    sgd_clf.predict([some_digit])  # some_digit == 5
    some_digit_scores = sgd_clf.decision_function([some_digit])  # the classifier runs with the OvA strategy
    # Under OvA every class gets its own binary classifier, and all of them score the instance.

    # print(some_digit_scores)
    # The output is a list of ten scores, one per digit 0-9; the sixth entry (index 5) is the highest because some_digit is 5.
    # output: [[-211564.05865206 -219445.21022825 -461783.93374972  -16252.73324556 -288195.70441995   34930.7725491
    #           -335369.12969411 -282270.17392149 -25547.54596887 -339794.68286819]]

    some_digit_max = np.argmax(some_digit_scores)
    # print(some_digit_max)  # output: 5  The max score's index is 5.
    # print(sgd_clf.classes_)  # output: [0. 1. 2. 3. 4. 5. 6. 7. 8. 9.]

    # 2. OvO
    ovo_clf = OneVsOneClassifier(SGDClassifier(max_iter=5, random_state=42))  # force sgd into OvO
    ovo_clf.fit(X_train, y_train)
    ovo_predicted = ovo_clf.predict([some_digit])
    # print(ovo_predicted)  # output: [5.]
    # A plain binary classifier's predict would return True/False here, but OvO returns 5 because it combines the votes of 45 pairwise classifiers.

    ovo_clf_count = len(ovo_clf.estimators_)  # show the count of ovo_clf's classifiers
    # print(ovo_clf_count)  # output: 45      That is N*(N-1)/2, N=10

    # multi-classifier forest as the OvO contrast
    forest_clf.fit(X_train, y_train)  # forest_clf is capable of handling multi-classes. No need to force it into OvO
    forest_clf.predict([some_digit])  # so its predict returns [5]
    forest_probability_predicted = forest_clf.predict_proba([some_digit])
    # print(forest_probability_predicted)
    # output: [[0.1 0.  0.  0.  0.  0.9 0.  0.  0.  0. ]] a probability is assigned to each class
Example #16
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(comments)

def benchmark(classifier, data, target):
    cls_name = type(classifier).__name__
    print("Classifying using {}...".format(cls_name))
    predicted = cross_val_predict(classifier, data, target, cv=10)

    print("{} Report".format(cls_name))
    print("===================")
    print(metrics.classification_report(target, predicted))

    pp = pprint.PrettyPrinter(indent=2)
    print("")
    print("Accuracy: {}".format(metrics.accuracy_score(target,predicted)))
    print("Confusion Matrix")
    print(metrics.confusion_matrix(target,predicted))
    print("===========")
    print("")

classifiers = [
    RandomForestClassifier(n_estimators=12, max_depth=None,
                           min_samples_split=2, random_state=0),
    OneVsOneClassifier(LinearSVC(random_state=2)),
    OneVsRestClassifier(LinearSVC(random_state=2)),
]

for classifier in classifiers:
    benchmark(classifier, tfidf, categories)
#Scikit-Learn trained 10 binary classifiers, got their decision scores for the image and selected the class with the
#highest score

#Calling decision_function() shows this by returning 10 scores, one per class, instead of just one score
some_digit_scores = sgd_clf.decision_function([some_digit])
print(some_digit_scores)  #The highest score will indeed be the correct class ("5")

#The highest score from the array output above will be the correct class ("5") and is proven with the code below
print(np.argmax(some_digit_scores))
print(sgd_clf.classes_)
print(sgd_clf.classes_[5])

#Can force either OvO or OvA strategy by creating instance of these classes and passing a binary classifier to it, as
#seen in this code
ovo_clf = OneVsOneClassifier(SGDClassifier(
    random_state=42))  #Create instance of OvO class and pass SGDClassifier
ovo_clf.fit(X_train,
            y_train)  #Classification on all target classes (0 through 9)
print(ovo_clf.predict([some_digit]))
print(len(ovo_clf.estimators_))
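
#A sketch of the OvA counterpart mentioned above (not in the original); it reuses the same
#X_train, y_train, some_digit and SGDClassifier that the surrounding snippet assumes
from sklearn.multiclass import OneVsRestClassifier
ova_clf = OneVsRestClassifier(SGDClassifier(random_state=42))  #one binary classifier per class
ova_clf.fit(X_train, y_train)
print(ova_clf.predict([some_digit]))
print(len(ova_clf.estimators_))  #10 estimators, one per digit class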

#Can also train a RandomForestClassifier
forest_clf.fit(X_train, y_train)
print(forest_clf.predict([some_digit]))
#Scikit-Learn didn't need to run OvA or OvO because Random Forest classifiers can directly classify multiple classes
#We can call predict_proba() to get a list of the probabilities that the classifier assigned to each instance for each
#class
print(forest_clf.predict_proba([some_digit]))
#Model is highly confident that a "5" is indeed a "5"; the other classes get small (sometimes non-zero) probabilities
Example #18
    ings = [
        WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', w))
        for w in entry['ingredients']
    ]

    test_ingredients.append(' '.join(ings))

#used to encode labels as numbers for use with RandomForestClassifier
le = LabelEncoder()

#encode cuisines as numbers
train_cuisines = le.fit_transform(train_cuisines)

#used to create bag of ingredients vocabulary and create features for each entry
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_ingredients).toarray()
test_features = vectorizer.transform(test_ingredients).toarray()

clf = OneVsOneClassifier(LinearSVC(random_state=0)).fit(
    train_features, train_cuisines)
result = clf.predict(test_features)

output = pd.DataFrame(data={
    'id': test_ids,
    'cuisine': le.inverse_transform(result)
})

#force explicit ordering of columns
output = output[['id', 'cuisine']]
output.to_csv('ovo.csv', index=False)
Example #19
#get reduced matrix
k = 50
lsi_model = TruncatedSVD(n_components=k, random_state=42)
nmf_model = NMF(n_components=k)
train_LSI_array = lsi_model.fit_transform(train_tf)
train_NMF_array = nmf_model.fit_transform(train_tf)
test_LSI_array = lsi_model.transform(test_tf_array)
test_NMF_array = nmf_model.transform(test_tf_array)

svm_train_original = SVC(C=100, kernel='linear')

for i in range(0, 2):
    print("===========================================================")
    if (i == 0):
        svm_train = OneVsOneClassifier(svm_train_original)
        print("svm(One vs One):")
    else:
        svm_train = OneVsRestClassifier(svm_train_original)
        print("svm(One vs Rest):")
    #LSI
    svm_train.fit(train_LSI_array, train_data.target)
    test_result = svm_train.predict(test_LSI_array)
    LSI_precision = precision_score(test_data.target,
                                    test_result,
                                    average='weighted')
    LSI_recall = recall_score(test_data.target,
                              test_result,
                              average='weighted')
    LSI_confusionMatrix = confusion_matrix(test_data.target, test_result)
    LSI_accuracy = svm_train.score(test_LSI_array, test_data.target)
Example #20
def test_ovo_exceptions():
    ovo = OneVsOneClassifier(LinearSVC(random_state=0))
    with pytest.raises(NotFittedError):
        ovo.predict([])
Example #21
data = pd.read_csv("Forest_fire.csv")
data = np.array(data)

X = data[1:, 1:-1]
y = data[1:, -1]
y = y.astype('int')
X = X.astype('int')
#print(X,y)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)
log_reg = LogisticRegression()
lin_reg = LinearRegression()
svm_class = OneVsOneClassifier(SVC(random_state=0))
lin_reg.fit(X_train, y_train)

log_reg.fit(X_train, y_train)
svm_class.fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)
#print("Accuracy of")
#print('\n Linear Regession:',lin_reg.score(X_test,y_test))
print('\n Logistic Regession:', log_reg.score(X_test, y_test))
#print('\n Support Vector Regession:',svm_class.score(X_test,y_test))
#print("Error is",np.sqrt(metrics.mean_squared_error(y_test,y_pred)))
b = log_reg.predict_proba(X_test)
print(y_test, b)
ammonia = input("Enter Temperature in Celsius")
oxygen = input("Enter oxygen concentration in ppm")
humidity = input("Enter Humidity percentage")
Example #22
def test_pairwise_n_features_in():
    """Check the n_features_in_ attributes of the meta and base estimators

    When the training data is a regular design matrix, everything is intuitive.
    However, when the training data is a precomputed kernel matrix, the
    multiclass strategy can resample the kernel matrix of the underlying base
    estimator both row-wise and column-wise and this has a non-trivial impact
    on the expected value for the n_features_in_ of both the meta and the base
    estimators.
    """
    X, y = iris.data, iris.target

    # Remove the last sample to make the classes not exactly balanced and make
    # the test more interesting.
    assert y[-1] == 0
    X = X[:-1]
    y = y[:-1]

    # Fitting directly on the design matrix:
    assert X.shape == (149, 4)

    clf_notprecomputed = svm.SVC(kernel="linear").fit(X, y)
    assert clf_notprecomputed.n_features_in_ == 4

    ovr_notprecomputed = OneVsRestClassifier(clf_notprecomputed).fit(X, y)
    assert ovr_notprecomputed.n_features_in_ == 4
    for est in ovr_notprecomputed.estimators_:
        assert est.n_features_in_ == 4

    ovo_notprecomputed = OneVsOneClassifier(clf_notprecomputed).fit(X, y)
    assert ovo_notprecomputed.n_features_in_ == 4
    assert ovo_notprecomputed.n_classes_ == 3
    assert len(ovo_notprecomputed.estimators_) == 3
    for est in ovo_notprecomputed.estimators_:
        assert est.n_features_in_ == 4

    # When working with precomputed kernels we have one "feature" per training
    # sample:
    K = X @ X.T
    assert K.shape == (149, 149)

    clf_precomputed = svm.SVC(kernel="precomputed").fit(K, y)
    assert clf_precomputed.n_features_in_ == 149

    ovr_precomputed = OneVsRestClassifier(clf_precomputed).fit(K, y)
    assert ovr_precomputed.n_features_in_ == 149
    assert ovr_precomputed.n_classes_ == 3
    assert len(ovr_precomputed.estimators_) == 3
    for est in ovr_precomputed.estimators_:
        assert est.n_features_in_ == 149

    # This becomes really interesting with OvO and precomputed kernel together:
    # internally, OvO will drop the samples of the classes not part of the pair
    # of classes under consideration for a given binary classifier. Since we
    # use a precomputed kernel, it will also drop the matching columns of the
    # kernel matrix, and therefore we have fewer "features" as result.
    #
    # Since class 0 has 49 samples, and class 1 and 2 have 50 samples each, a
    # single OvO binary classifier works with a sub-kernel matrix of shape
    # either (99, 99) or (100, 100).
    ovo_precomputed = OneVsOneClassifier(clf_precomputed).fit(K, y)
    assert ovo_precomputed.n_features_in_ == 149
    assert ovo_precomputed.n_classes_ == 3
    assert len(ovo_precomputed.estimators_) == 3
    assert ovo_precomputed.estimators_[0].n_features_in_ == 99  # class 0 vs class 1
    assert ovo_precomputed.estimators_[1].n_features_in_ == 99  # class 0 vs class 2
    assert ovo_precomputed.estimators_[2].n_features_in_ == 100  # class 1 vs class 2
some_digit_scores = sgd_clf.decision_function([some_digit])
print('decision_function[some_digit]=\n{}\n\n'.format(some_digit_scores))

digit_val = np.argmax(some_digit_scores)
print('[some_digit] =\n{}\n\n'.format(digit_val))
print('Class Values of SGD =\n{}\n\n'.format(sgd_clf.classes_))
print('6th value of the class list (the position of digit 5) =\n{}\n\n'.format(
    sgd_clf.classes_[5]))

#--------------------------------------------------------
# OvO classifier using sklearn
# page=143
#--------------------------------------------------------
from sklearn.multiclass import OneVsOneClassifier
ovo_clf = OneVsOneClassifier(SGDClassifier(max_iter=5, random_state=42))
ovo_clf.fit(X_train, y_train)
ovo_some_digital_pre = ovo_clf.predict([some_digit])
print('[some_digit predict using OvO] =\n{}\n\n'.format(ovo_some_digital_pre))
print('Length of OvO_clf_estimators =\n{}\n\n'.format(len(
    ovo_clf.estimators_)))
#print('OvO_clf_estimators =\n{}\n\n'.format(ovo_clf.estimators_))

#--------------------------------------------------------
# RandomForestClassifier training using sklearn
# page=143
#--------------------------------------------------------
forest_clf.fit(X_train, y_train)
forest_some_digit = forest_clf.predict([some_digit])
print('[some_digit predicted value with the random forest] =\n{}\n\n'.format(
    forest_some_digit))
def question_i():
    logger.info("EXECUTING: QUESTION I")
    logger.info("Multi-Class Classification")

    category = [
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale',
        'soc.religion.christian'
    ]
    train, test = utility.load_dataset(category)

    logger.info("Processing Training Dataset")
    for data, pos in zip(train.data, range(len(train.data))):
        processedData = utility.preprocess_data(data)
        train.data[pos] = ' '.join(processedData)

    logger.info("Processing Testing Dataset")
    for data, pos in zip(test.data, range(len(test.data))):
        processedData = utility.preprocess_data(data)
        test.data[pos] = ' '.join(processedData)

    logger.info("Creating TFxIDF Vector Representations")

    stop_words = text.ENGLISH_STOP_WORDS  # omit stop words

    # using CountVectorizer and TFxIDF Transformer
    count_vect = CountVectorizer(stop_words=stop_words, lowercase=True)
    train_counts = count_vect.fit_transform(train.data)
    test_counts = count_vect.transform(test.data)
    tfidf_transformer = TfidfTransformer(norm='l2', sublinear_tf=True)
    train_idf = tfidf_transformer.fit_transform(train_counts)
    test_idf = tfidf_transformer.transform(test_counts)

    logger.info("Performing LSI on TFxIDF Matrices")
    # apply LSI to TDxIDF matrices
    svd = TruncatedSVD(n_components=50)
    train_lsi = svd.fit_transform(train_idf)
    test_lsi = svd.transform(test_idf)

    logger.info("TFxIDF Matrices Transformed")

    logger.info("Size of Transformed Training Dataset: {0}".format(
        train_lsi.shape))
    logger.info("Size of Transformed Testing Dataset: {0}".format(
        test_lsi.shape))

    clf_list = [
        OneVsOneClassifier(GaussianNB()),
        OneVsOneClassifier(svm.SVC(kernel='linear')),
        OneVsRestClassifier(GaussianNB()),
        OneVsRestClassifier(svm.SVC(kernel='linear'))
    ]
    clf_name = [
        'OneVsOneClassifier Naive Bayes', 'OneVsOneClassifier SVM',
        'OneVsRestClassifier Naive Bayes', 'OneVsRestClassifier SVM'
    ]

    # perform classification
    for clf, clf_n in zip(clf_list, clf_name):
        logger.info("Training {0} Classifier ".format(clf_n))
        clf.fit(train_lsi, train.target)
        logger.info("Testing {0} Classifier".format(clf_n))
        test_predicted = clf.predict(test_lsi)
        utility.calculate_statistics(test.target, test_predicted)
Example #25
                           memory_level=1)
X = nifti_masker.fit_transform(func_filename)
X = X[non_rest]
session = session[non_rest]

### Predictor #################################################################

### Define the prediction function to be used.
# Here we use a Support Vector Classification, with a linear kernel
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.pipeline import Pipeline

svc_ovo = OneVsOneClassifier(
    Pipeline([('anova', SelectKBest(f_classif, k=500)),
              ('svc', SVC(kernel='linear'))]))

svc_ova = OneVsRestClassifier(
    Pipeline([('anova', SelectKBest(f_classif, k=500)),
              ('svc', SVC(kernel='linear'))]))

### Cross-validation scores ###################################################
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in favour of model_selection

cv_scores_ovo = cross_val_score(svc_ovo, X, y, cv=5, verbose=1)

cv_scores_ova = cross_val_score(svc_ova, X, y, cv=5, verbose=1)

print(79 * "_")
print('OvO', cv_scores_ovo.mean())
Example #26
def algorithm(method_A, OneVsRest, OneVsOne, randomized):

    print("Selecting algorithm...")
    print("      ")

    if method_A == "svm":

        print("Starting with " + method_A)
        print("      ")

        parameters_svm = {
            'kernel': ('linear', 'rbf'),
            'C': [1, 3, 10, 100],
            'gamma': [0.01, 0.001]
        }
        model = svm.SVC()
        model = search_par(randomized, model, parameters_svm)

    if method_A == "random_forest":

        print("Starting with " + method_A)
        print("      ")

        parameters_random = {
            "max_depth": [2, 3, None],
            "max_features": [2, 4, 6],
            "min_samples_split": [2, 4, 6],
            "min_samples_leaf": [2, 4, 6],
            "bootstrap": [True, False],
            "criterion": ["gini", "entropy"]
        }
        model = RandomForestClassifier(n_estimators=100)
        model = search_par(randomized, model, parameters_random)

    if method_A == "logistic":

        print("Starting with " + method_A)
        print("      ")

        parameters_logistic = {'C': [100, 1000], 'tol': [0.001, 0.0001]}
        model = LogisticRegression(solver='lbfgs', multi_class='multinomial')
        model = search_par(randomized, model, parameters_logistic)

    if method_A == "neural_networks":

        print("Starting with " + method_A)
        print("      ")

        #model = MLPClassifier()

        model = Sequential()
        model.add(
            Dense(991, input_dim=179, init='normal')
        )  # number of features of the data +1 node for the bias term.
        model.add(Activation('relu'))
        model.add(Dropout(0.2))
        model.add(
            Dense(495, init='normal')
        )  # Rule of thumb: one hidden layer is usually enough, sized at roughly the mean of the input and output layer sizes.
        model.add(Activation('relu'))
        model.add(Dropout(0.5))
        model.add(
            Dense(99, init='normal')
        )  # Output layer: with softmax there is one node per class label (a single node otherwise).
        model.add(Activation('softmax'))

        sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
        model.compile(loss='categorical_crossentropy',
                      optimizer='rmsprop',  # NOTE: the sgd optimizer defined above is not actually passed here
                      metrics=['accuracy'])

        OneVsRest = False
        OneVsOne = False

    if OneVsRest:

        print("Using OneVsRest ")
        print("      ")

        return OneVsRestClassifier(model)

    if OneVsOne:

        print("Using OneVsOne")
        print("      ")

        return OneVsOneClassifier(model)

    print("Algorithm selected: " + method_A)
    print("      ")

    return model
    print(msg)
    return taxa_de_acerto


resultados = {}

from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
modeloOneVsRest = OneVsRestClassifier(LinearSVC(random_state=0))
resultadoOneVsRest = fit_and_predict_kf("OneVsRest", modeloOneVsRest,
                                        treino_dados, treino_marcacoes)
resultados[resultadoOneVsRest] = modeloOneVsRest

from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC
modeloOneVsOne = OneVsOneClassifier(LinearSVC(random_state=0))
resultadoOneVsOne = fit_and_predict_kf("OneVsOne", modeloOneVsOne,
                                       treino_dados, treino_marcacoes)
resultados[resultadoOneVsOne] = modeloOneVsOne

from sklearn.naive_bayes import MultinomialNB
modeloMultinomial = MultinomialNB()
resultadoMultinomial = fit_and_predict_kf("MultinomialNB", modeloMultinomial,
                                          treino_dados, treino_marcacoes)
resultados[resultadoMultinomial] = modeloMultinomial

from sklearn.ensemble import AdaBoostClassifier
modeloAdaBoost = AdaBoostClassifier()
resultadoAdaBoost = fit_and_predict_kf("AdaBoostClassifier", modeloAdaBoost,
                                       treino_dados, treino_marcacoes)
resultados[resultadoAdaBoost] = modeloAdaBoost
            #SVC(random_state=RS),
            #LogisticRegression(random_state=RS, n_jobs=-1),
            #RandomForestClassifier(random_state=RS),
            #GradientBoostingClassifier(random_state=RS),
            #BalancedRandomForestClassifier(random_state=RS),
            #StackingClassifier([('brf', BalancedRandomForestClassifier(random_state=RS, n_estimators=1000))]),
            #StackingClassifier([('brf', BalancedRandomForestClassifier(random_state=RS, n_estimators=1000))], GradientBoostingClassifier(random_state=RS)),
            #StackingClassifier([('brf', BalancedRandomForestClassifier(random_state=RS, n_estimators=1000))], MLPClassifier(random_state=RS, hidden_layer_sizes=[100]*5)),
            #StackingClassifier([('brf', BalancedRandomForestClassifier(random_state=RS, n_estimators=1000))], KNeighborsClassifier()),
            #RandomForestClassifier(random_state=RS, n_estimators=1000),
            #BalancedRandomForestClassifier(random_state=RS, n_estimators=1000),
            #OneVsRestClassifier(GradientBoostingClassifier(random_state=RS)),
            #OneVsOneClassifier(GradientBoostingClassifier(random_state=RS, n_estimators=1000)),
            #OneVsOneClassifier(RandomForestClassifier(random_state=RS, n_estimators=1000)),
            OneVsOneClassifier(
                BalancedRandomForestClassifier(random_state=RS,
                                               n_estimators=1000)),
            StackingClassifier(
                [('rs',
                  OneVsOneClassifier(
                      GradientBoostingClassifier(random_state=RS,
                                                 n_estimators=1000)))],
                OneVsOneClassifier(
                    BalancedRandomForestClassifier(random_state=RS,
                                                   n_estimators=1000))),
            #StackingClassifier([('rs', OneVsOneClassifier(RandomForestClassifier(random_state=RS, n_estimators=1000)))], OneVsOneClassifier(GradientBoostingClassifier(random_state=RS, n_estimators=1000)))

            #OneVsRestClassifier(MLPClassifier(hidden_layer_sizes= [100]*5, random_state=RS)),
            #OneVsOneClassifier(MLPClassifier(hidden_layer_sizes= [100]*5, random_state=RS)),
            #OneVsRestClassifier(SVC(decision_function_shape='ovr', random_state=RS)),
            #OneVsOneClassifier(SVC(decision_function_shape='ovo', random_state=RS)),
Example #29
def test_ovo_exceptions():
    ovo = OneVsOneClassifier(LinearSVC(random_state=0))
    assert_raises(ValueError, ovo.predict, [])
Example #30
from sklearn import datasets
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC
iris = datasets.load_iris()
X, y = iris.data, iris.target
print(OneVsOneClassifier(LinearSVC(random_state=0)).fit(X, y).predict(X))