def test_ecoc_float_y():
    # Test that the OCC errors on float targets
    X = iris.data
    y = iris.data[:, 0]

    ovo = OutputCodeClassifier(LinearSVC())
    assert_raise_message(ValueError, "Unknown label type", ovo.fit, X, y)
    ovo = OutputCodeClassifier(LinearSVC(), code_size=-1)
    assert_raise_message(ValueError, "code_size should be greater than 0,"
                         " got -1", ovo.fit, X, y)
Example 2
def test_ecoc_fit_predict():
    # A classifier which implements decision_function.
    ecoc = OutputCodeClassifier(LinearSVC(), code_size=2)
    ecoc.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(ecoc.estimators_), n_classes * 2)

    # A classifier which implements predict_proba.
    ecoc = OutputCodeClassifier(MultinomialNB(), code_size=2)
    ecoc.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(ecoc.estimators_), n_classes * 2)
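For reference, the count asserted above follows from the ECOC code book: OutputCodeClassifier fits int(n_classes * code_size) binary estimators. A minimal standalone sketch, assuming scikit-learn and its bundled iris data:

from sklearn.datasets import load_iris
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC

iris = load_iris()  # 3 classes
ecoc = OutputCodeClassifier(LinearSVC(), code_size=2).fit(iris.data, iris.target)
print(len(ecoc.estimators_))  # 6 == int(3 * 2)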
Example 3
def test_ecoc_float_y():
    # Test that the OCC errors on float targets
    X = iris.data
    y = iris.data[:, 0]

    ovo = OutputCodeClassifier(LinearSVC())
    msg = "Unknown label type"
    with pytest.raises(ValueError, match=msg):
        ovo.fit(X, y)

    ovo = OutputCodeClassifier(LinearSVC(), code_size=-1)
    msg = "code_size should be greater than 0, got -1"
    with pytest.raises(ValueError, match=msg):
        ovo.fit(X, y)
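Both variants of this test exercise the same validations. A minimal standalone reproduction of the code_size check, assuming scikit-learn and its bundled iris data (the exact message can vary across versions):

from sklearn.datasets import load_iris
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC

iris = load_iris()
ecoc = OutputCodeClassifier(LinearSVC(), code_size=-1)
try:
    ecoc.fit(iris.data, iris.target)
except ValueError as exc:
    print(exc)  # e.g. "code_size should be greater than 0, got -1"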
Example 4
def menu(mode, mult_clf_mode, bin_clf_mode):

    # Define the binary classifier
    if bin_clf_mode == "--svc":
        bin_clf = svm.SVC(class_weight='balanced')
        filename = "svc"
    elif bin_clf_mode == "--mlp":
        bin_clf = MLPClassifier()
        filename = "mlp"
    else:
        print("Choose the third argument as --svc or --mlp")
        exit()

    # Define the multiclass classifier
    if mult_clf_mode == "--ovr":
        mult_clf = OneVsRestClassifier(bin_clf, n_jobs=-1)
        filename = "ovr_" + filename
    elif mult_clf_mode == "--ovo":
        mult_clf = OneVsOneClassifier(bin_clf, n_jobs=-1)
        filename = "ovo_" + filename
    elif mult_clf_mode == "--eoc":
        mult_clf = OutputCodeClassifier(bin_clf, code_size=3.0, n_jobs=-1)
        filename = "eoc_" + filename
    else:
        print("Choose the second argument as --ovr, --ovo or --eoc")
        exit()

    if mode == "--train":
        training(mult_clf, filename)
    elif mode == "--test":
        test(filename)
    else:
        print("Choose the first argument as --train or --test")
        exit()
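A minimal sketch of how menu() might be driven from the command line (this entry point is an assumption; the original script's argument handling is not shown):

import sys

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: script.py --train|--test --ovr|--ovo|--eoc --svc|--mlp")
        sys.exit(1)
    menu(sys.argv[1], sys.argv[2], sys.argv[3])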
Example 5
def test_ecoc_float_y():
    # Test that the OCC errors on float targets
    X = iris.data
    y = iris.data[:, 0]

    ovo = OutputCodeClassifier(LinearSVC())
    assert_raise_message(ValueError, "Unknown label type", ovo.fit, X, y)
Example 6
def test_ecoc_gridsearch():
    ecoc = OutputCodeClassifier(LinearSVC(random_state=0), random_state=0)
    Cs = [0.1, 0.5, 0.8]
    cv = GridSearchCV(ecoc, {'estimator__C': Cs})
    cv.fit(iris.data, iris.target)
    best_C = cv.best_estimator_.estimators_[0].C
    assert_true(best_C in Cs)
Example 7
def run_test(**kwargs):
    b = fetch_sw_orl()
    tic = time.time()

    # split the data into train and test sets
    X_train, X_test, y_train, y_true = train_test_split(b.data,
                                                        b.target,
                                                        test_size=0.2,
                                                        stratify=b.target)

    hog_train = []
    for img_array in X_train:
        fd, _ = hog(img_array.reshape(b.shape),
                    orientations=8,
                    pixels_per_cell=(PPC, PPC),
                    cells_per_block=(1, 1),
                    visualize=True,
                    multichannel=False)
        hog_train.append(fd)

    clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2)
    clf.fit(hog_train, y_train)
    tok = time.time()

    hog_test = []
    for img_array in X_test:
        fd, _ = hog(img_array.reshape(b.shape),
                    orientations=8,
                    pixels_per_cell=(PPC, PPC),
                    cells_per_block=(1, 1),
                    visualize=True,
                    multichannel=False)
        hog_test.append(fd)
    y_pred = clf.predict(hog_test)
    return tok - tic, accuracy_score(y_true, y_pred)
def get_classifier_by_type(clftype, model_train_feature, model_train_label,
                           Classifier, kwargs):
    """ Get classifiers
    """
    print('Train multi-class classifiers, type = %s' % clftype)
    if clftype == 'multiclass':
        clf = Classifier(**kwargs).fit(model_train_feature, model_train_label)
    elif clftype == 'onevsrest':
        from sklearn.multiclass import OneVsRestClassifier
        clf = OneVsRestClassifier(Classifier(**kwargs)).fit(
            model_train_feature, model_train_label)
    elif clftype == 'onevsone':
        from sklearn.multiclass import OneVsOneClassifier
        clf = OneVsOneClassifier(Classifier(**kwargs),
                                 n_jobs=-1).fit(model_train_feature,
                                                model_train_label)
    elif clftype == 'occ':
        from sklearn.multiclass import OutputCodeClassifier
        clf = OutputCodeClassifier(Classifier(**kwargs),
                                   code_size=2,
                                   random_state=0).fit(model_train_feature,
                                                       model_train_label)
    else:
        import sys
        print('Unsupported clf type:', clftype)
        sys.exit(1)

    return clf
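A hypothetical invocation of the helper above, with iris standing in for the real training data and LinearSVC as the base classifier (both are assumptions, not part of the original snippet):

from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC

iris = load_iris()
clf = get_classifier_by_type('occ', iris.data, iris.target,
                             LinearSVC, {'random_state': 0})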
def af_vecAvg_MaxEnt_OutputCode(data):
    job = Job('af_vecAvg_MaxEnt_OutputCode', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("vecAvg", Word2VecTransformer(fld.get_path(fld.model_meta_data, fl_word_vectors), 
                                                              dim = 300,
                                                              all_text_data = list(data.df[data.fs_ind]))),
                               ('m', OutputCodeClassifier(LogisticRegression(),
                                                          code_size = 10))])
    parameters = dict(m__estimator__C = [0.01])
    job.run(pipeline, parameters, data)
    return None
def voting_classifier():

	# create the classifier objects
	f_sel = SVC(C=1, kernel='linear', shrinking=True, class_weight='balanced')
	classifiers = {
		'knn':Pipeline([('f_sel',SelectFromModel(f_sel)), ('clf',KNeighborsClassifier())]),
		'logistic':LogisticRegression(),
		'lda':LinearDiscriminantAnalysis(),
		'svm':Pipeline([('f_sel',SelectFromModel(f_sel)), ('clf',SVC())]),
		'tree':DecisionTreeClassifier(),
		'randomforest':RandomForestClassifier(),
		'extratrees':ExtraTreesClassifier(),
		'gradboost':GradientBoostingClassifier(),
		'adaboost':AdaBoostClassifier(),
		'mlp':MLPClassifier(),
		'ecoc':OutputCodeClassifier(SVC(C=2,kernel='linear',shrinking=True,probability=True,class_weight='balanced'), code_size=2)}
		
	# create ensemble of the classifiers
	clfs = [(name, classifiers.get(name)) for name in classifier_names]
	
	# create the voting classifier
	voting_type = classification_method[0:4]
	eclf = VotingClassifier(estimators=clfs, voting=voting_type)
	
	# specify parameters of the classifiers
	param_set = {}
	if 'knn' in classifier_names: #89.9,90.8 'n_neighbors':17, 'p':1, 'weights':'distance'
		param_set.update({'knn__clf__n_neighbors':[17], 'knn__clf__p':[1], 'knn__clf__weights':['distance'], 'knn__clf__algorithm':['auto'], 'knn__clf__n_jobs':[3]})
	if 'logistic' in classifier_names: #94.4 'C':1, 'solver':'newton-cg'
		param_set.update({'logistic__C':[2], 'logistic__solver':['lbfgs'], 'logistic__class_weight':['balanced'], 'logistic__max_iter':[100]})
	if 'lda' in classifier_names: #94.9 'solver':'lsqr'
		param_set.update({'lda__solver':['lsqr'], 'lda__shrinkage':['auto']})
	if 'svm' in classifier_names: #95.3 'C':1, 'kernel':'linear'
		param_set.update({'svm__clf__C':[2], 'svm__clf__kernel':['linear'], 'svm__clf__shrinking':[True], 'svm__clf__probability':[True], 'svm__clf__class_weight':['balanced'], 'svm__clf__decision_function_shape':['ovo']})
	if 'tree' in classifier_names: #82.3 'max_depth':15
		param_set.update({'tree__max_depth':[10,15,20], 'tree__class_weight':['balanced'], 'tree__presort':[True]})
	if 'randomforest' in classifier_names: #91.8 'n_estimators':300, 'min_samples_leaf':None, 'max_depth':25
		param_set.update({'randomforest__n_estimators':[100], 'randomforest__max_features':[10,25,50], 'randomforest__min_samples_leaf':[50] ,'randomforest__max_depth':[None], 'randomforest__bootstrap':[True], 'randomforest__class_weight':['balanced'], 'randomforest__oob_score':[True], 'randomforest__n_jobs':[3]})
	if 'extratrees' in classifier_names: #92.8 'n_estimators':500, 'max_depth':50
		param_set.update({'extratrees__n_estimators':[300], 'extratrees__max_features':['auto'], 'extratrees__min_samples_leaf':[50], 'extratrees__max_depth':[None], 'extratrees__bootstrap':[False], 'extratrees__class_weight':['balanced'], 'extratrees__oob_score':[False], 'extratrees__n_jobs':[3]})
	if 'gradboost' in classifier_names: #92.3 'n_estimators':100, 'learning_rate':0.1, 'min_samples_leaf':50
		param_set.update({'gradboost__n_estimators':[100], 'gradboost__max_features':['auto'], 'gradboost__learning_rate':[0.1], 'gradboost__min_samples_leaf':[50]})
	if 'adaboost' in classifier_names:
		param_set.update({'adaboost__n_estimators':[100], 'adaboost__learning_rate':[0.1]})
	if 'mlp' in classifier_names: # 95.0 'hidden_layer_sizes':(50,), 'alpha':10, 'solver':'lbfgs'
		param_set.update({'mlp__hidden_layer_sizes':[(50,)], 'mlp__alpha':[10], 'mlp__solver':['lbfgs']})
	
	# run grid search or randomized search
	if tuning_method=='grid':
		search = GridSearchCV(eclf, param_grid=param_set, cv=2, n_jobs=3)
	elif tuning_method=='rand':
		search = RandomizedSearchCV(eclf, param_distributions=param_set, n_iter=10, cv=2, n_jobs=3)
	
	return search
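A hypothetical driver for the returned search object (classifier_names, classification_method, and tuning_method are module globals assumed to be defined elsewhere; iris stands in for the real data):

from sklearn.datasets import load_iris

iris = load_iris()
search = voting_classifier()
search.fit(iris.data, iris.target)
print(search.best_params_)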
Example 11
def clasificar_ECOC(X, y, df, trainInputs, trainOutputs, testInputs, testOutputs, graphname):
	print("\n[" + str(graphname) + "]")
	kernelRBF = 1.0 * RBF(1.0)  # defined but not used below
	clf = OutputCodeClassifier(estimator=DecisionTreeClassifier())
	clf = clf.fit(trainInputs, trainOutputs)
	precisionTrain = clf.score(trainInputs, trainOutputs)
	precisionTest = clf.score(testInputs, testOutputs)
	print("\tCCR train = %.2f%% | CCR test = %.2f%%" % (precisionTrain*100, precisionTest*100))
	prediccion_test = clf.predict(testInputs)
	print(prediccion_test)
	print(testOutputs)
	return precisionTest
def aa_tfidf_MaxEnt_OutputCode(data):
    job = Job('aa_tfidf_MaxEnt_OutputCode', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("tfidf", TfidfVectorizer(stop_words = 'english',
                                                         max_features = 2000,
                                                         min_df = 5)),
                               ('m', OutputCodeClassifier(LogisticRegression(),
                                                          code_size = 10))])
    parameters = dict(tfidf__norm = ['l2'],
                      tfidf__ngram_range = [(1, 2)],
                      m__estimator__C = [0.01])
    job.run(pipeline, parameters, data)
    return None
def ab_tfidf_elasticnet_OutputCode(data):
    job = Job('ab_tfidf_elasticnet_OutputCode', cv = cv_n_fold)
    pipeline = Pipeline(steps=[("tfidf", TfidfVectorizer(stop_words = 'english',
                                                         min_df = 5)),
                               ('elnet', OutputCodeClassifier(
                               SGDClassifier(penalty="elasticnet"),
                               code_size = 100))])
    parameters = dict(tfidf__norm = ['l2'],
                      tfidf__ngram_range = [(1, 2)],  # alternative: [(1, 3)]
                      elnet__estimator__alpha = [0.0001],  # [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
                      elnet__estimator__l1_ratio = [0.1]) # [0.1, 0.5, 0.8, 0.9, 0.99]
    job.run(pipeline, parameters, data)
    return None
Example 14
    def train(corpus):
        time = datetime.datetime.now()
        logging.info('Static Embedding Oracle')
        Y, X_dic = EmbeddingOracle.parseCorpus(corpus.trainingSents,
                                               EmbeddingOracle)
        vec = DictVectorizer()
        X = vec.fit_transform(X_dic)
        clf = OutputCodeClassifier(LinearSVC(random_state=0),
                                   code_size=2,
                                   random_state=0)
        clf.fit(X, Y)
        logging.info('Training Time: ' +
                     str(int((datetime.datetime.now() - time).seconds / 60.)))
        return clf, vec
Example 15
def test_ecoc_delegate_sparse_base_estimator():
    # Non-regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/17218
    X, y = iris.data, iris.target
    X_sp = sp.csc_matrix(X)

    # create an estimator that does not support sparse input
    base_estimator = CheckingClassifier(
        check_X=check_array,
        check_X_params={"ensure_2d": True, "accept_sparse": False},
    )
    ecoc = OutputCodeClassifier(base_estimator, random_state=0)

    with pytest.raises(TypeError, match="A sparse matrix was passed"):
        ecoc.fit(X_sp, y)

    ecoc.fit(X, y)
    with pytest.raises(TypeError, match="A sparse matrix was passed"):
        ecoc.predict(X_sp)

    # smoke test to check when sparse input should be supported
    ecoc = OutputCodeClassifier(LinearSVC(random_state=0))
    ecoc.fit(X_sp, y).predict(X_sp)
    assert len(ecoc.estimators_) == 4  # int(3 classes * 1.5 default code_size)
Example 16
def scikit_outputcode(X, y, X_test, y_test=None):
    from sklearn.multiclass import OutputCodeClassifier
    from sklearn.svm import LinearSVC
    predictions = OutputCodeClassifier(LinearSVC(random_state=0),
                                       code_size=2,
                                       random_state=0).fit(X,
                                                           y).predict(X_test)
    correctcount = 0
    totalcount = 0
    for index, each in enumerate(predictions):
        if y_test[index] == each:
            correctcount += 1
        totalcount += 1

    print(str(correctcount) + " / " + str(totalcount) + " = " +
          str(float(correctcount) / totalcount))
def ECOC():

    print('Applying the ERROR CORRECTING OUTPUT CODES multiclass method')
    for indice in lista_datasets:

        print('Database: ' + str(indice))
        dataset = arff.loadarff('./datasets/' + str(indice))
        df = pd.DataFrame(dataset[0])
        inputs = df.iloc[:, df.columns != 'class']
        outputs = pd.factorize(df['class'])[0]
        X_train, X_test, Y_train, Y_test = train_test_split(inputs, outputs, test_size=0.25)

        clf = OutputCodeClassifier(KNeighborsClassifier(n_neighbors=5), code_size=2, random_state=0)
        clf.fit(X_train, Y_train)

        print('Percentage correctly classified, ERROR CORRECTING OUTPUT CODES')
        print(clf.score(X_test, Y_test))
    print('--------------------------')
Example 18
    def _model1(self, visDataObjects, features, labels):
        """Ted's round one.

        Find the max margin in:
          for t in vis_types:
            for x in columns:
              yield margin(x_axis | t, x)

        Repeat for y.

        Then independently pick the best axis assignment for each chart
        type.
        """
        from sklearn.multiclass import OutputCodeClassifier
        from sklearn.svm import LinearSVC
        clf = OutputCodeClassifier(LinearSVC(random_state=0),
                                   code_size=2,
                                   random_state=0)
        pass  # training is not implemented in this snippet
    def __init__(self, labels, data, load=False, save=False):
        if load:
            with open(clfData, 'rb') as input_file:
                self.classifier = pickle.load(input_file)
            with open(vecData, 'rb') as input_file:
                self.vectorizer = pickle.load(input_file)
            return
        self.vectorizer = DictVectorizer()
        featureVec = self.vectorizer.fit_transform(data)
        self.classifier = OutputCodeClassifier(LinearSVC(random_state=0),
                                               code_size=2,
                                               random_state=0)
        # self.classifier = LogisticRegression(solver='sag')
        self.classifier.fit(featureVec, labels)
        if save:
            with open(clfData, 'wb') as output:
                pickle.dump(self.classifier, output, pickle.HIGHEST_PROTOCOL)
            with open(vecData, 'wb') as output:
                pickle.dump(self.vectorizer, output, pickle.HIGHEST_PROTOCOL)
Example 20
    def evaluateOutputCode(X, Y, printReport=False):
        time = datetime.datetime.now()
        X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.2,
                                                            random_state=42)
        clf = OutputCodeClassifier(LinearSVC(random_state=0),
                                   code_size=2,
                                   random_state=0)
        clf.fit(X_train, Y_train)
        if printReport:
            print('Training time: ' + str(datetime.datetime.now() - time))
            print('Evaluation result: OutputCode: ' +
                  str(clf.score(X_test, Y_test)))
        Y_pred = clf.predict(X_test)
        if printReport:
            print('0: ' + str((Y_pred == 0).sum()))
            print('1: ' + str((Y_pred == 1).sum()))
            print('2: ' + str((Y_pred == 2).sum()))
        return [clf.score(X_test, Y_test), (Y_pred == 1).sum(), clf]
Example 21
    def _multiclass_refit(self, clf):
        """Return advanced choices of the classification method"""

        if self.args.multiclass == 'one-vs-rest':
            from sklearn.multiclass import OneVsRestClassifier
            print('[ML] Using one-vs-rest method to re-train')
            clf = OneVsRestClassifier(clf)

        elif self.args.multiclass == 'one-vs-one':
            from sklearn.multiclass import OneVsOneClassifier
            self.args.get_prob = False
            print('[ML] Using one-vs-one method to re-train')
            print('[ML] WARNING: Set get_prob to False')
            clf = OneVsOneClassifier(clf)

        elif self.args.multiclass == 'error-correcting':
            from sklearn.multiclass import OutputCodeClassifier
            print('[ML] Using error-correcting method to re-train')
            clf = OutputCodeClassifier(clf, code_size=2)

        return clf
Example 22
def OutputCodeClassifier(data, label, pred_data, pred_last):
    '''
    0.76473194506
    Number of mislabeled points out of a total 841 points : 211
    0.749108204518
    The data needs to be normalized.
    '''
    # Note: this function shadows sklearn's OutputCodeClassifier class at
    # module level; the local import below rebinds the name inside the body.
    data = np.array(data)
    pred_data = np.array(pred_data)
    label = np.array(label)
    pred_last = np.array(pred_last)
    from sklearn.multiclass import OutputCodeClassifier
    from sklearn.svm import LinearSVC
    clf = OutputCodeClassifier(LinearSVC(random_state=0),
                               code_size=2,
                               random_state=0)
    clf.fit(data, label)

    print(clf.score(data, label))
    pred_result = clf.predict(pred_data)
    print("Number of mislabeled points out of a total %d points : %d" %
          (pred_data.shape[0], (pred_last != pred_result).sum()))
    print(clf.score(pred_data, pred_last))
    return pred_result
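A hypothetical call to the function above, with interleaved iris slices standing in for data, label, pred_data and pred_last (assumes numpy is imported as np):

from sklearn.datasets import load_iris

iris = load_iris()
pred_result = OutputCodeClassifier(iris.data[::2], iris.target[::2],
                                   iris.data[1::2], iris.target[1::2])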
Example 23
def getFitness(individual, X, y):
    """
    Feature subset fitness function
    """

    if individual.count(0) != len(individual):
        # get index with value 0
        cols = [index for index in range(
            len(individual)) if individual[index] == 0]

        # get features subset
        X_parsed = X.drop(X.columns[cols], axis=1)
        X_subset = pd.get_dummies(X_parsed)

        # X_subset = X
        #
        # for col in cols:
        #     X_subset[col].values[:] = 0

        # apply a classification algorithm; each assignment below overwrites
        # the previous one, so only the last classifier is actually fit
        clf = AdaBoostClassifier()
        clf = BaggingClassifier()
        clf = BernoulliNB()

        clf = CalibratedClassifierCV()
        clf = CategoricalNB()
        clf = ClassifierChain(LogisticRegression())  # needs a base estimator (assumed)
        clf = ComplementNB()

        clf = DecisionTreeClassifier()
        clf = DummyClassifier()

        clf = ExtraTreeClassifier()
        clf = ExtraTreesClassifier()

        clf = GaussianNB()
        clf = GaussianProcessClassifier()
        clf = GradientBoostingClassifier()

        # clf = HistGradientBoostingClassifier()

        clf = KNeighborsClassifier()

        clf = LabelPropagation()
        clf = LabelSpreading()
        clf = LinearDiscriminantAnalysis()
        clf = LinearSVC()
        clf = LogisticRegression()
        clf = LogisticRegressionCV()

        clf = MLPClassifier()
        clf = MultiOutputClassifier(LogisticRegression())  # needs a base estimator (assumed)
        clf = MultinomialNB()

        clf = NearestCentroid()
        clf = NuSVC()

        clf = OneVsOneClassifier(LinearSVC())  # meta-estimators need a base estimator (LinearSVC assumed)
        clf = OneVsRestClassifier(LinearSVC())
        clf = OutputCodeClassifier(LinearSVC())

        clf = PassiveAggressiveClassifier()
        clf = Perceptron()

        clf = QuadraticDiscriminantAnalysis()

        clf = RadiusNeighborsClassifier()
        clf = RandomForestClassifier()
        clf = RidgeClassifier()
        clf = RidgeClassifierCV()

        clf = SGDClassifier()
        clf = SVC()
        clf = StackingClassifier(estimators=[('lr', LogisticRegression()), ('svc', LinearSVC())])  # estimators list assumed

        clf = VotingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier())])  # estimators list assumed

        # clf.fit(X, y)
        # clf.fit(X_subset, y_train)
        clf.fit(X_subset, y)

        # y_pred_ANN = clf.predict(X_test)
        # y_pred = clf.predict(X_subset)

        # score = cross_val_score(clf, X, y, cv=5)
        #
        # print(max(score), min(score))

        return (avg(cross_val_score(clf, X_subset, y, cv=5)),)  # avg() is assumed to be a mean helper defined elsewhere
        # return (avg(score),)
        # return accuracy_score(y, y_pred_ANN)
    else:
        return (0,)
Example 24
def test_ecoc_exceptions():
    ecoc = OutputCodeClassifier(LinearSVC(random_state=0))
    assert_raises(ValueError, ecoc.predict, [])
Example 25
    def __init__(self, X, y, people, df_features, feature_names, conf_dict):

        self.X = X
        self.y = np.array(y)
        self.feature_names = feature_names

        self.people = people.reset_index(drop=True)
        self.X_df = df_features
        self.y_df = y.reset_index(drop=True)

        self.app_list = conf_dict["app_list"]
        self.labels_numeric = {name: i for i, name in enumerate(self.app_list)}
        self.n_classes = len(self.labels_numeric)

        self.clf_name = conf_dict["classifier"]

        #        self.feature_selection = config_dict["feature_selection"] #True/False
        #        self.num_features = config_dict["num_features"]
        #        self.one_vs_all_type = config_dict["one_vs_all_type"]
        self.feature_selection = conf_dict["feature_selection"]  #True/False
        self.num_features = conf_dict["num_features"]
        self.one_vs_all_type = conf_dict["one_vs_all_type"]

        self.chosen_feature_names = None
        self.chosen_features_all_folds = []

        self.clf_dict = {}
        #self.clf_dict["one_vs_all"] = OneVsRestClassifier(SVC(kernel='rbf', C=1000, gamma=0.001))

        self.clf_dict["output_code"] = OutputCodeClassifier(SVC(kernel='rbf',
                                                                C=1000,
                                                                gamma=0.001),
                                                            code_size=2,
                                                            random_state=0)

        params_rf = {
            'n_estimators': 100,
            'max_depth': 20,
            'max_features': 'sqrt',
            'min_samples_leaf': 1,
            'min_samples_split': 10,
            'random_state': 0
        }
        #params_rf = {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 20, 'random_state': 0}
        self.clf_dict["rf"] = RandomForestClassifier(**params_rf)
        #        self.clf_dict["svm"] = SVC(kernel='rbf', C=1000, gamma=0.001)
        #        self.clf_dict["svm"] = SVC(kernel='linear', C=1, gamma=0.001)
        params_svm = {
            'C': 10,
            'degree': 2,
            'gamma': 'scale',
            'kernel': 'sigmoid'
        }
        self.clf_dict["svm"] = SVC(**params_svm)

        # MultinomialNB is the Naive Bayes variant that models each feature
        # with a multinomial (count) distribution.
        self.clf_dict["nb"] = MultinomialNB(alpha=0.00001)
        self.clf_dict["gnb"] = GaussianNB(var_smoothing=0.05)

        self.clf_dict["knn"] = KNeighborsClassifier(n_neighbors=8)

        params_dt = {
            'criterion': 'gini',
            'max_depth': 20,
            'max_features': 'auto',
            'min_samples_leaf': 2,
            'min_samples_split': 2,
            'random_state': 42,
            'splitter': 'best'
        }
        self.clf_dict["dt"] = DecisionTreeClassifier(**params_dt)

        self.clf_dict["one_vs_all"] = OneVsRestClassifier(
            self.clf_dict[conf_dict["one_vs_all_type"]])

        self.fs_dict = {}
        self.fs_dict["selectKbest_chi2"] = SelectKBest(chi2,
                                                       k=self.num_features)
        self.fs_dict["selectKbest_fclassif"] = SelectKBest(f_classif,
                                                           k=self.num_features)
Example 26
    ings = [
        WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', w))
        for w in entry['ingredients']
    ]

    test_ingredients.append(' '.join(ings))

#used to encode labels as numbers for use with RandomForestClassifier
le = LabelEncoder()

#encode cuisines as numbers
train_cuisines = le.fit_transform(train_cuisines)

#used to create bag of ingredients vocabulary and create features for each entry
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_ingredients).toarray()
test_features = vectorizer.transform(test_ingredients).toarray()

clf = OutputCodeClassifier(LinearSVC(random_state=0),
                           code_size=2,
                           random_state=2)
result = clf.fit(train_features, train_cuisines).predict(test_features)

output = pd.DataFrame(data={
    'id': test_ids,
    'cuisine': le.inverse_transform(result)
})

#force explicit ordering of columns
output = output[['id', 'cuisine']]
output.to_csv('ecoc.csv', index=False)
Example 27
def main():

    filenameLB = 'mfcc_lb.csv'
    allsongcat = pickle.load(open('mfcc_fv.p', 'rb'))
    hcdf = pickle.load(open('hcdf_fv.p', 'rb'))

    with open(filenameLB) as f:
        reader = csv.reader(f)
        for row in reader:
            labels = row

    # select training and test sets
    '''
    TEidx = np.array(random.sample(range(0,1000), 100))
    
    training = []
    test = []
    
    trainingLB = []
    testLB = []

    # make numpy arrays
    for i in range(1000):
        if i in TEidx:
            test.append(featureDict[i])
            testLB.append(int(labels[i]))
        else:
            training.append(featureDict[i])
            trainingLB.append(int(labels[i]))
        
    # fit with classifier and predict
    X = np.array(training)
    Y = np.array(trainingLB)

    '''
    l = [allsongcat, hcdf]
    all_feats = combineFeatures(l)
    feats_shuf = []
    labels_shuf = []
    index_shuf = list(range(len(labels)))
    shuffle(index_shuf)
    for i in index_shuf:
        feats_shuf.append(all_feats[i])
        labels_shuf.append(labels[i])

    X = np.array(feats_shuf)
    Y = np.array(labels_shuf)

    kf = KFold(n_splits=10)
    #rf = RandomForestClassifier(n_estimators=50, max_features = 'log2')
    sgd = SGDClassifier(loss="hinge", penalty="l2")
    #svc = svm.SVC(kernel='linear')
    dtree = DecisionTreeClassifier(max_depth=3)
    lsvc = LinearSVC(random_state=0)
    cla = OutputCodeClassifier(sgd, code_size=128, random_state=0)  # int(10 classes * 128) = 1280 code bits

    cm_all = np.zeros((10, 10), dtype=int)

    cb = np.zeros((10, 20))
    losses = []

    with open('ECOC_sgd_error.csv', 'w') as f1:
        wrtest = csv.writer(f1,
                            quoting=csv.QUOTE_NONNUMERIC,
                            lineterminator='\n')
        scores = 0.0
        for train, test in kf.split(X):
            X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[
                test]
            cla.fit(X_train, y_train)
            predictions = cla.predict(X_test)
            loss = zero_one_loss(predictions, y_test)
            losses.append(loss)
            scores += loss
            # print y_test
            # print predictions

            cb = cla.code_book_

            np.savetxt('codebook.csv', cb, delimiter=',')

            # Compute confusion matrix
            cm = confusion_matrix(
                y_test,
                predictions,
                labels=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
            np.set_printoptions(precision=2)
            #print(cm_all)
            cm_all = np.add(cm_all, cm)

        # make ECOC coding matrix 0-1 binary
        cb[cb <= 0] = 0
        wrtest.writerow(losses)
    print(cb)

    print(scores / 10)
def single_classifier(clf_name):

	# create the classifier objects
	classifiers = {
		'knn':KNeighborsClassifier(),
		'logistic':LogisticRegression(),
		'lda':LinearDiscriminantAnalysis(),
		'svm':SVC(),
		'tree':DecisionTreeClassifier(),
		'randomforest':RandomForestClassifier(),
		'extratrees':ExtraTreesClassifier(),
		'gradboost':GradientBoostingClassifier(),
		'adaboost':AdaBoostClassifier(),
		'mlp':MLPClassifier(),
		'ecoc':OutputCodeClassifier(SVC(C=2,kernel='linear',shrinking=True,class_weight='balanced'), code_size=2)}

	# feature selection using a pipeline
	if f_sel_method=='none':
		pipe = Pipeline([('clf',classifiers[clf_name])])
		param_set = {}
	elif f_sel_method=='anova':
		pipe = Pipeline([('f_sel',SelectPercentile(score_func=f_classif)), ('clf',classifiers[clf_name])])
		param_set = {'f_sel__percentile':[25,50,75,100]}
	elif f_sel_method=='mutualinfo':
		pipe = Pipeline([('f_sel',SelectPercentile(score_func=mutual_info_classif)), ('clf',classifiers[clf_name])])
		param_set = {'f_sel__percentile':[25,50,75,100]}
	elif f_sel_method=='recursivesvm':
		f_sel = SVC(C=1, kernel='linear', shrinking=True, class_weight='balanced')
		pipe = Pipeline([('f_sel',RFECV(estimator=f_sel)), ('clf',classifiers[clf_name])])
		param_set = {'f_sel__step':[10], 'f_sel__cv':[2], 'f_sel__scoring':['accuracy']}
	elif f_sel_method=='frommodelsvm':
		f_sel = SVC(C=1, kernel='linear', shrinking=True, class_weight='balanced')
		pipe = Pipeline([('f_sel',SelectFromModel(f_sel)), ('clf',classifiers[clf_name])])
		param_set = {}
	elif f_sel_method=='frommodeltree':
		f_sel = ExtraTreesClassifier(n_estimators=100, class_weight='balanced')
		pipe = Pipeline([('f_sel',SelectFromModel(f_sel)), ('clf',classifiers[clf_name])])
		param_set = {}

	# specify parameters of the classifiers
	if clf_name=='knn': #89.9,90.8 'n_neighbors':17, 'p':1, 'weights':'distance'
		param_set.update({'clf__n_neighbors':[1,9,13,17,25,50], 'clf__p':[1,2,3,5], 'clf__weights':['distance'], 'clf__algorithm':['auto'], 'clf__n_jobs':[3]})
	elif clf_name=='logistic': #94.4 'C':1, 'solver':'newton-cg'
		param_set.update({'clf__C':[1,2,3,4], 'clf__solver':['newton-cg'], 'clf__class_weight':['balanced'], 'clf__max_iter':[100]})
	elif clf_name=='lda': #94.9 'solver':'lsqr'
		param_set.update({'clf__solver':['lsqr','eigen'], 'clf__shrinkage':['auto']})
	elif clf_name=='svm': #95.3 'C':1, 'kernel':'linear'
		param_set.update({'clf__C':[0.75,1,1.25,1.5,2], 'clf__kernel':['linear'], 'clf__shrinking':[True], 'clf__probability':[False], 'clf__class_weight':['balanced'], 'clf__decision_function_shape':['ovr']})
	elif clf_name=='tree': #82.3 'max_depth':15
		param_set.update({'clf__min_samples_leaf':[10,50,75,100], 'clf__class_weight':['balanced'], 'clf__presort':[True]})
	elif clf_name=='randomforest': #91.8 'n_estimators':300, 'min_samples_leaf':None, 'max_depth':25
		param_set.update({'clf__n_estimators':[500,1000], 'clf__max_features':[5,10,25], 'clf__min_samples_leaf':[1,10,25] ,'clf__max_depth':[None], 'clf__bootstrap':[True], 'clf__class_weight':['balanced'], 'clf__oob_score':[False], 'clf__n_jobs':[3]})
	elif clf_name=='extratrees': #92.8 'n_estimators':500, 'max_depth':50
		param_set.update({'clf__n_estimators':[100,500,1000], 'clf__max_features':[5,10,20,25,50,100,150], 'clf__min_samples_leaf':[1,10,25,50,100], 'clf__max_depth':[None], 'clf__bootstrap':[False], 'clf__class_weight':['balanced'], 'clf__oob_score':[False], 'clf__n_jobs':[3]})
	elif clf_name=='gradboost': #92.3 'n_estimators':100, 'learning_rate':0.1, 'min_samples_leaf':50
		param_set.update({'clf__n_estimators':[100], 'clf__max_features':['auto'], 'clf__learning_rate':[0.1], 'clf__min_samples_leaf':[50]})
	elif clf_name=='adaboost': #57.9 'n_estimators':100, 'learning_rate':0.1
		param_set.update({'clf__n_estimators':[100,500], 'clf__learning_rate':[0.01,0.1]})
	elif clf_name=='mlp': #95.0 'hidden_layer_sizes':(50,), 'alpha':10, 'solver':'lbfgs'
		param_set.update({'clf__hidden_layer_sizes':[(50,),(60,),(100,)], 'clf__alpha':[0.5,1,2,5,7], 'clf__solver':['adam']})
	elif clf_name=='ecoc':
		param_set.update({})
		
	# run grid search or randomized search
	if tuning_method=='grid':
		search = GridSearchCV(pipe, param_grid=param_set, cv=2, n_jobs=3)
	elif tuning_method=='rand':
		search = RandomizedSearchCV(pipe, param_distributions=param_set, n_iter=10, cv=2, n_jobs=3)
					
	return search
Example 29
# predict
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OutputCodeClassifier

classifier = OutputCodeClassifier(GradientBoostingClassifier(max_depth=5,
                                                             n_estimators=14),
                                  code_size=2,
                                  random_state=0)

classifier.fit(X_train, y_train)

# evaluate on the held-out validation set
predictions = classifier.predict(valid_X)
accuracy_score(valid_label, predictions)

# evaluate on the test set
predictions = classifier.predict(X_test)
accuracy_score(y_test, predictions)

# creating a confusion matrix
cm = confusion_matrix(y_test, predictions)

### test data
test['age_bin'] = test['age'].apply(lambda x: age_bin(x))

test = test[~test['image_name'].isin(wrong_im_test)]

encode_columns_test = test[['age_bin', 'gender', 'view_position']]
Example 30
def test_ecoc_exceptions():
    ecoc = OutputCodeClassifier(LinearSVC(random_state=0))
    with pytest.raises(NotFittedError):
        ecoc.predict([])