def test_ecoc_fit_predict():
    """Fit/predict smoke test for OutputCodeClassifier on the iris data.

    Covers one base estimator exposing decision_function (LinearSVC) and
    one exposing predict_proba (MultinomialNB); in both cases the number
    of fitted binary estimators must equal n_classes * code_size.
    """
    for base_estimator in (LinearSVC(), MultinomialNB()):
        ecoc = OutputCodeClassifier(base_estimator, code_size=2)
        ecoc.fit(iris.data, iris.target).predict(iris.data)
        assert_equal(len(ecoc.estimators_), n_classes * 2)
def train_svm(labels,array, num_folds, num_jobs, params = 2):
	"""Randomized-search SVM hyperparameters, then train and return the best
	classifier fitted on (array, labels).

	labels    : training labels; with more than two entries an
	            OutputCodeClassifier over an SVC is used, otherwise a plain SVC
	num_folds : number of CV folds for the randomized search
	num_jobs  : parallel jobs for the search
	params    : number of sampled parameter settings (n_iter)
	"""
	# NOTE(review): this tests the number of label entries, not the number of
	# distinct classes — confirm whether len(set(labels)) was intended.
	if len(labels) > 2:
		print("outputcodeclassifier")
		param_grid = {'estimator__C': [0.001, 0.005],
			'estimator__kernel': ['linear','rbf'], 
			'estimator__gamma': [0.0005,0.001],
			'estimator__degree': [1]}
		model = OutputCodeClassifier(svm.SVC(probability=True))
	else:
		print("svc model")
		param_grid = {'C': [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000],
			'kernel': ['linear','rbf','poly'], 
			'gamma': [0.0005, 0.002, 0.008, 0.032, 0.128, 0.512, 1.024, 2.048],
			'degree': [1,2,3,4]}
		model = svm.SVC(probability=True)

	paramsearch = RandomizedSearchCV(model, param_grid, cv=num_folds, verbose=2,n_iter = params,n_jobs=num_jobs)
	print("Grid search...")
	paramsearch.fit(array,numpy.asarray(labels))
	print("Prediction...")
	parameters = paramsearch.best_params_

	# report the winning settings and score
	for parameter in parameters.keys():
		print(parameter + ": " + str(parameters[parameter]) + "\n")
	print("best score: " + str(paramsearch.best_score_) + "\n\n")

	# train an svm outputcode classifier using the best parameters
	if len(labels) > 2:
		test = svm.SVC(probability=True, C=parameters['estimator__C'],
			kernel=parameters['estimator__kernel'],gamma=parameters['estimator__gamma'],
			degree=parameters['estimator__degree'])
		out_test = OutputCodeClassifier(test,n_jobs=1)
		out_test.fit(array,labels)
		# BUG FIX: return the fitted OutputCodeClassifier; the original
		# returned the unfitted inner SVC instead.
		return out_test
	test = svm.SVC(probability=True, C=parameters['C'],
		kernel=parameters['kernel'],gamma=parameters['gamma'],
		degree=parameters['degree'])
	# BUG FIX: actually fit the binary classifier before returning it;
	# the fit call was commented out in the original, so callers received
	# an unfitted model.
	test.fit(array,labels)
	return test
Example #3
0
def test_ecoc_fit_predict():
    """ECOC fit/predict smoke test with fixed random seeds.

    Exercises a decision_function-based base estimator (LinearSVC) and a
    predict_proba-based one (MultinomialNB); each fit must produce
    n_classes * code_size binary estimators.
    """
    for base_estimator in (LinearSVC(random_state=0), MultinomialNB()):
        ecoc = OutputCodeClassifier(base_estimator,
                                    code_size=2,
                                    random_state=0)
        ecoc.fit(iris.data, iris.target).predict(iris.data)
        assert_equal(len(ecoc.estimators_), n_classes * 2)
 def __init__(self, labels, data, load=False, save=False):
     """Build (or load from disk) a DictVectorizer + ECOC LinearSVC model.

     With load=True, unpickle a previously saved classifier/vectorizer
     from the module-level paths clfData/vecData and return immediately.
     Otherwise vectorize ``data``, train the classifier on ``labels``,
     and optionally pickle both when save=True.
     """
     if load:
         with open(clfData, 'rb') as fh:
             self.classifier = pickle.load(fh)
         with open(vecData, 'rb') as fh:
             self.verctorizer = pickle.load(fh)  # (sic) attribute name kept
         return
     self.verctorizer = DictVectorizer()
     feature_matrix = self.verctorizer.fit_transform(data)
     self.classifier = OutputCodeClassifier(LinearSVC(random_state=0),
                                            code_size=2,
                                            random_state=0)
     self.classifier.fit(feature_matrix, labels)
     if save:
         with open(clfData, 'wb') as fh:
             pickle.dump(self.classifier, fh, pickle.HIGHEST_PROTOCOL)
         with open(vecData, 'wb') as fh:
             pickle.dump(self.verctorizer, fh, pickle.HIGHEST_PROTOCOL)
def af_vecAvg_MaxEnt_OutputCode(data):
    """Run the word2vec-average + logistic-regression ECOC pipeline job."""
    averager = Word2VecTransformer(fld.get_path(fld.model_meta_data, fl_word_vectors),
                                   dim = 300,
                                   all_text_data = list(data.df[data.fs_ind]))
    model = OutputCodeClassifier(LogisticRegression(), code_size = 10)
    pipeline = Pipeline(steps=[("vecAvg", averager), ('m', model)])
    job = Job('af_vecAvg_MaxEnt_OutputCode', cv = cv_n_fold)
    job.run(pipeline, dict(m__estimator__C = [0.01]), data)
    return None
def voting_classifier():
	"""Build a VotingClassifier ensemble and wrap it in a parameter search.

	Relies on module-level globals not visible in this chunk:
	``classifier_names`` (which members to include), ``classification_method``
	(first four chars select the voting type, presumably 'hard'/'soft' —
	TODO confirm) and ``tuning_method`` ('grid' or 'rand').
	Returns the unfitted GridSearchCV/RandomizedSearchCV object.
	"""

	# create the classifier objects
	f_sel = SVC(C=1, kernel='linear', shrinking=True, class_weight='balanced')
	classifiers = {
		'knn':Pipeline([('f_sel',SelectFromModel(f_sel)), ('clf',KNeighborsClassifier())]),
		'logistic':LogisticRegression(),
		'lda':LinearDiscriminantAnalysis(),
		'svm':Pipeline([('f_sel',SelectFromModel(f_sel)), ('clf',SVC())]),
		'tree':DecisionTreeClassifier(),
		'randomforest':RandomForestClassifier(),
		'extratrees':ExtraTreesClassifier(),
		'gradboost':GradientBoostingClassifier(),
		'adaboost':AdaBoostClassifier(),
		'mlp':MLPClassifier(),
		'ecoc':OutputCodeClassifier(SVC(C=2,kernel='linear',shrinking=True,probability=True,class_weight='balanced'), code_size=2)}
		
	# create ensemble of the classifiers (comprehension used for side effect)
	clfs = []
	[clfs.append((name,classifiers.get(name))) for name in classifier_names]
	
	# create the voting classifier
	voting_type = classification_method[0:4]
	eclf = VotingClassifier(estimators=clfs, voting=voting_type)
	
	# specify parameters of the classifiers (keys are VotingClassifier paths)
	param_set = {}
	if 'knn' in classifier_names: #89.9,90.8 'n_neighbors':17, 'p':1, 'weights':'distance'
		param_set.update({'knn__clf__n_neighbors':[17], 'knn__clf__p':[1], 'knn__clf__weights':['distance'], 'knn__clf__algorithm':['auto'], 'knn__clf__n_jobs':[3]})
	if 'logistic' in classifier_names: #94.4 'C':1, 'solver':'newton-cg'
		param_set.update({'logistic__C':[2], 'logistic__solver':['lbfgs'], 'logistic__class_weight':['balanced'], 'logistic__max_iter':[100]})
	if 'lda' in classifier_names: #94.9 'solver':'lsqr'
		param_set.update({'lda__solver':['lsqr'], 'lda__shrinkage':['auto']})
	if 'svm' in classifier_names: #95.3 'C':1, 'kernel':'linear'
		param_set.update({'svm__clf__C':[2], 'svm__clf__kernel':['linear'], 'svm__clf__shrinking':[True], 'svm__clf__probability':[True], 'svm__clf__class_weight':['balanced'], 'svm__clf__decision_function_shape':['ovo']})
	if 'tree' in classifier_names: #82.3 'max_depth':15
		param_set.update({'tree__max_depth':[10,15,20], 'tree__class_weight':['balanced'], 'tree__presort':[True]})
	if 'randomforest' in classifier_names: #91.8 'n_estimators':300, 'min_samples_leaf':None, 'max_depth':25
		param_set.update({'randomforest__n_estimators':[100], 'randomforest__max_features':[10,25,50], 'randomforest__min_samples_leaf':[50] ,'randomforest__max_depth':[None], 'randomforest__bootstrap':[True], 'randomforest__class_weight':['balanced'], 'randomforest__oob_score':[True], 'randomforest__n_jobs':[3]})
	if 'extratrees' in classifier_names: #92.8 'n_estimators':500, 'max_depth':50
		param_set.update({'extratrees__n_estimators':[300], 'extratrees__max_features':['auto'], 'extratrees__min_samples_leaf':[50], 'extratrees__max_depth':[None], 'extratrees__bootstrap':[False], 'extratrees__class_weight':['balanced'], 'extratrees__oob_score':[False], 'extratrees__n_jobs':[3]})
	if 'gradboost' in classifier_names: #92.3 'n_estimators':100, 'learning_rate':0.1, 'min_samples_leaf':50
		param_set.update({'gradboost__n_estimators':[100], 'gradboost__max_features':['auto'], 'gradboost__learning_rate':[0.1], 'gradboost__min_samples_leaf':[50]})
	if 'adaboost' in classifier_names:
		param_set.update({'adaboost__n_estimators':[100], 'adaboost__learning_rate':[0.1]})
	if 'mlp' in classifier_names: # 95.0 'hidden_layer_sizes':(50,), 'alpha':10, 'solver':'lbfgs'
		param_set.update({'mlp__hidden_layer_sizes':[(50,)], 'mlp__alpha':[10], 'mlp__solver':['lbfgs']})
	
	# run grid search or randomized search
	# NOTE(review): if tuning_method is neither 'grid' nor 'rand', 'search'
	# is never bound and the return raises NameError — confirm inputs.
	if tuning_method=='grid':
		search = GridSearchCV(eclf, param_grid=param_set, cv=2, n_jobs=3)
	elif tuning_method=='rand':
		search = RandomizedSearchCV(eclf, param_distributions=param_set, n_iter=10, cv=2, n_jobs=3)
	
	return search
def aa_tfidf_MaxEnt_OutputCode(data):
    """Run the TF-IDF + logistic-regression ECOC pipeline job."""
    vectorizer = TfidfVectorizer(stop_words = 'english',
                                 max_features = 2000,
                                 min_df = 5)
    model = OutputCodeClassifier(LogisticRegression(), code_size = 10)
    pipeline = Pipeline(steps=[("tfidf", vectorizer), ('m', model)])
    grid = dict(tfidf__norm = ['l2'],
                tfidf__ngram_range = [(1, 2)],
                m__estimator__C = [0.01])
    job = Job('aa_tfidf_MaxEnt_OutputCode', cv = cv_n_fold)
    job.run(pipeline, grid, data)
    return None
Example #8
0
def test_ecoc_float_y():
    """The OCC must reject continuous targets and a non-positive code_size."""
    X = iris.data
    y = iris.data[:, 0]

    # Continuous y is not a valid classification target.
    occ = OutputCodeClassifier(LinearSVC())
    with pytest.raises(ValueError, match="Unknown label type"):
        occ.fit(X, y)

    # code_size must be strictly positive.
    occ = OutputCodeClassifier(LinearSVC(), code_size=-1)
    with pytest.raises(ValueError,
                       match="code_size should be greater than 0, got -1"):
        occ.fit(X, y)
def ab_tfidf_elasticnet_OutputCode(data):
    """Run the TF-IDF + elastic-net SGD ECOC pipeline job."""
    vectorizer = TfidfVectorizer(stop_words = 'english',
                                 min_df = 5)
    model = OutputCodeClassifier(SGDClassifier(penalty="elasticnet"),
                                 code_size = 100)
    pipeline = Pipeline(steps=[("tfidf", vectorizer), ('elnet', model)])
    grid = dict(tfidf__norm = ['l2'],
                tfidf__ngram_range = [(1, 2)],      # [(1, 3)]
                elnet__estimator__alpha = [0.0001],  # [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
                elnet__estimator__l1_ratio = [0.1])  # [0.1, 0.5, 0.8, 0.9, 0.99]
    job = Job('ab_tfidf_elasticnet_OutputCode', cv = cv_n_fold)
    job.run(pipeline, grid, data)
    return None
Example #10
0
def scikit_outputcode(X, y, X_test, y_test=None):
    """Fit an ECOC LinearSVC on (X, y) and print accuracy on X_test.

    Prints "correct / total = accuracy" using y_test as ground truth.
    NOTE(review): despite the None default, a y_test sequence is required
    (indexing it below raises otherwise) — confirm intended default.
    """
    from sklearn.multiclass import OutputCodeClassifier
    from sklearn.svm import LinearSVC
    predictions = OutputCodeClassifier(LinearSVC(random_state=0),
                                       code_size=2,
                                       random_state=0).fit(X,
                                                           y).predict(X_test)
    correctcount = 0
    totalcount = 0
    for index, each in enumerate(predictions):
        if y_test[index] == each:
            correctcount += 1
        totalcount += 1

    # BUG FIX: the original used a Python 2 print statement, a syntax
    # error under Python 3 (which this codebase otherwise uses).
    print(str(correctcount) + " / " + str(totalcount) + " = " +
          str(float(correctcount) / totalcount))
Example #11
0
def clasificar_ECOC(X, y, df, trainInputs, trainOutputs, testInputs, testOutputs, graphname):
	"""Train an ECOC classifier over decision trees and print its CCR.

	Fits on (trainInputs, trainOutputs), prints train/test accuracy as
	percentages and the test predictions, and returns the test accuracy.
	X, y and df are unused here (kept for interface compatibility with
	sibling clasificar_* functions).
	"""
	print("\n[" + str(graphname) + "]")
	# Removed unused local: kernelRBF = 1.0*RBF(1.0) was never referenced.
	clf=OutputCodeClassifier(estimator = DecisionTreeClassifier())
	clf=clf.fit(trainInputs, trainOutputs)
	precisionTrain = clf.score(trainInputs, trainOutputs)
	precisionTest = clf.score(testInputs, testOutputs)
	print("\tCCR train = %.2f%% | CCR test = %.2f%%" % (precisionTrain*100, precisionTest*100))
	prediccion_test = clf.predict(testInputs)
	print(prediccion_test)
	print(testOutputs)
	return precisionTest
Example #12
0
def OutputCodeClassifier(data, label, pred_data, pred_last):
    '''
    Train an ECOC classifier (LinearSVC base) on (data, label), print the
    training score, then predict pred_data and compare against pred_last.

    Historical results:
    0.76473194506
    Number of mislabeled points out of a total 841 points : 211
    0.749108204518
    Note: the features need to be normalized first.
    '''
    data = np.array(data)
    pred_data = np.array(pred_data)
    label = np.array(label)
    pred_last = np.array(pred_last)
    # Local import intentionally shadows this function's name within its body.
    from sklearn.multiclass import OutputCodeClassifier
    from sklearn.svm import LinearSVC
    clf = OutputCodeClassifier(LinearSVC(random_state=0),
                               code_size=2,
                               random_state=0)
    clf.fit(data, label)

    # BUG FIX: converted Python 2 print statements to Python 3 calls.
    print(clf.score(data, label))
    pred_result = clf.predict(pred_data)
    print("Number of mislabeled points out of a total %d points : %d" %
          (pred_data.shape[0], (pred_last != pred_result).sum()))
    print(clf.score(pred_data, pred_last))
    return pred_result
Example #13
0
    def _model1(self, visDataObjects, features, labels):
        """Ted's round one (stub — not implemented).

    Find max margin in:
      for t in vis_types:
        for x in columns:
          yield margin(x_axis | t, x)

    Repeat for y.

    Then we basis so (independently) pick the best axis assignment for a chart
    type.
    """
        from sklearn.multiclass import OutputCodeClassifier
        from sklearn.svm import LinearSVC
        # NOTE(review): the classifier is constructed but never fitted,
        # used, or returned — this method is currently a no-op placeholder.
        clf = OutputCodeClassifier(LinearSVC(random_state=0),
                                   code_size=2,
                                   random_state=0)
        pass
Example #14
0
    def _multiclass_refit(self, clf):
        """Wrap ``clf`` in the multiclass strategy named by self.args.multiclass.

        Known strategies: 'one-vs-rest', 'one-vs-one' (also forces
        self.args.get_prob to False) and 'error-correcting'; any other
        value returns ``clf`` unchanged.
        """
        strategy = self.args.multiclass

        if strategy == 'one-vs-rest':
            from sklearn.multiclass import OneVsRestClassifier
            print('[ML] Using one-vs-rest method to re-train')
            return OneVsRestClassifier(clf)

        if strategy == 'one-vs-one':
            from sklearn.multiclass import OneVsOneClassifier
            self.args.get_prob = False
            print('[ML] Using one-vs-one method to re-train')
            print('[ML] WARNING: Set get_prob to False')
            return OneVsOneClassifier(clf)

        if strategy == 'error-correcting':
            from sklearn.multiclass import OutputCodeClassifier
            print('[ML] Using error-correcting method to re-train')
            return OutputCodeClassifier(clf, code_size=2)

        # Unrecognized strategy: hand the classifier back untouched.
        return clf
Example #15
0
 def evaluateOutputCode(X, Y, printReport=False):
     """Train an ECOC LinearSVC on an 80/20 split and report its accuracy.

     Returns [test accuracy, count of predictions equal to 1, fitted clf].
     """
     start = datetime.datetime.now()  # renamed from 'time' (shadowed module name)
     X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                         Y,
                                                         test_size=0.2,
                                                         random_state=42)
     clf = OutputCodeClassifier(LinearSVC(random_state=0),
                                code_size=2,
                                random_state=0)
     clf.fit(X_train, Y_train)
     if printReport:
         # BUG FIX: Python 2 print statements converted to Python 3 calls.
         print('Training time:' + str(datetime.datetime.now() - start))
         print('Evaluation result: OneVsOne: ' + str(
             clf.score(X_test, Y_test)))
     # BUG FIX: the original overwrote Y_test with the predictions, so the
     # returned clf.score(X_test, Y_test) compared predictions against
     # themselves and was always 1.0. Keep predictions separate instead.
     Y_pred = clf.predict(X_test)
     if printReport:
         print('0: ' + str((Y_pred == 0).sum()))
         print('1: ' + str((Y_pred == 1).sum()))
         print('2: ' + str((Y_pred == 2).sum()))
     return [clf.score(X_test, Y_test), (Y_pred == 1).sum(), clf]
Example #16
0
 def train_svm(self,params = 10):
     """Randomized-search SVM hyperparameters, then fit self.clf on
     self.training_csr / self.trainlabels with the best settings.

     params: number of sampled parameter settings (n_iter).
     Also records the best parameters and score in self.outstring.
     """
     #obtain the best parameter settings for an svm outputcode classifier
     # NOTE(review): this tests the number of label entries, not the number
     # of distinct classes — confirm len(set(self.labels)) was not intended.
     if len(self.labels) > 2:
         print("outputcodeclassifier")
         param_grid = {'estimator__C': [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000],
             'estimator__kernel': ['linear','rbf','poly'], 
             'estimator__gamma': [0.0005, 0.002, 0.008, 0.032, 0.128, 0.512, 1.024, 2.048],
             'estimator__degree': [1,2,3,4]}
         model = OutputCodeClassifier(svm.SVC(probability=True))
     else:
         print("svc model")
         param_grid = {'C': [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000],
             'kernel': ['linear','rbf','poly'], 
             'gamma': [0.0005, 0.002, 0.008, 0.032, 0.128, 0.512, 1.024, 2.048],
             'degree': [1,2,3,4]}
         model = svm.SVC(probability=True)
     paramsearch = RandomizedSearchCV(model, param_grid, cv=5, verbose=2,n_iter = params,n_jobs=self.jobs) 
     print("Grid search...")
     paramsearch.fit(self.training_csr,numpy.asarray(self.trainlabels))
     print("Prediction...")
     #print the best parameters to the file
     parameters = paramsearch.best_params_
     self.outstring = "best parameter settings:\n"
     for parameter in parameters.keys():
         self.outstring += (parameter + ": " + str(parameters[parameter]) + "\n")
     self.outstring += ("best score: " + str(paramsearch.best_score_) + "\n\n")
     #train an svm outputcode classifier using the best parameters
     if len(self.labels) > 2:
         clf = svm.SVC(probability=True, C=parameters['estimator__C'],
             kernel=parameters['estimator__kernel'],gamma=parameters['estimator__gamma'],
             degree=parameters['estimator__degree'])
         self.clf = OutputCodeClassifier(clf,n_jobs=self.jobs)
         self.clf.fit(self.training_csr,self.trainlabels)
     else:
         self.clf = svm.SVC(probability=True, C=parameters['C'],
             kernel=parameters['kernel'],gamma=parameters['gamma'],
             degree=parameters['degree'])
         self.clf.fit(self.training_csr,self.trainlabels)
Example #17
0
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC
from sklearn.datasets import load_svmlight_file
import numpy as np
import sklearn

TEST_SPLIT = .2

X, Y = load_svmlight_file("ablated_features.txt")

# Shuffle all instances, then hold out the trailing TEST_SPLIT fraction.
num_instances = len(Y)
num_test = int((1 - TEST_SPLIT) * num_instances)  # index of the train/test cut
indices = np.arange(num_instances)
np.random.shuffle(indices)

X = X[indices]
Y = Y[indices]

X_train = X[:num_test]
Y_train = Y[:num_test]
X_test = X[num_test:]
Y_test = Y[num_test:]

# print X_train.shape[0], X_test.shape[0]

clf = OutputCodeClassifier(LinearSVC(random_state=0),
                           code_size=20,
                           random_state=0)
preds = clf.fit(X_train, Y_train).predict(X_test)
# BUG FIX: Python 2 print statement converted to a Python 3 call.
print(sklearn.metrics.accuracy_score(Y_test, preds))
Example #18
0
# predict
# NOTE(review): 'classifier', 'valid_X' and 'valid_label' come from an
# earlier part of this notebook-style script (not visible here).
predictions = classifier.predict(valid_X)
accuracy_score(valid_label, predictions)  # NOTE(review): result discarded

from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)  # NOTE(review): result discarded

##
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)

from sklearn.multiclass import OutputCodeClassifier

# ECOC over a small gradient-boosting base estimator, fixed seed.
classifier = OutputCodeClassifier(GradientBoostingClassifier(max_depth=5,
                                                             n_estimators=14),
                                  code_size=2,
                                  random_state=0)

classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
accuracy_score(y_test, predictions)  # NOTE(review): result discarded

# creating a confusion matrix
# NOTE(review): uses dtree_predictions, presumably from an earlier cell —
# confirm it is defined before this point.
cm = confusion_matrix(y_test, dtree_predictions)

### test data
test['age_bin'] = test['age'].apply(lambda x: age_bin(x))

test = test[~test['image_name'].isin(wrong_im_test)]

encode_columns_test = test[['age_bin', 'gender', 'view_position']]
print('NB for BOW KF1',NB_BOW_KF1)
print('NB for TF-IDF KF1',NB_TFIDF_KF1)

# NOTE(review): KF2 accuracies predict on X4/X4_tf but divide by
# X2.shape[0] — verify the denominator is intended.
NB_BOW_KF2=np.sum(clf.predict(X4)==Y4.values.tolist())/X2.shape[0];
NB_TFIDF_KF2=np.sum(clf2.predict(X4_tf)==Y4.values.tolist())/X2.shape[0];
print('NB for BOW KF2',NB_BOW_KF2)
print('NB for TF-IDF KF2',NB_TFIDF_KF2)


# In[66]:


#SVM
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC
# ECOC over LinearSVC; accuracy of fold-1 bag-of-words features.
clf = OutputCodeClassifier(LinearSVC(random_state=0),code_size=2, random_state=0)
SVM_BOW_KF1=np.sum(clf.fit(X_train, y_train).predict(X2)==Y2.values.tolist())/X2.shape[0]
# save the model to disk
filename = 'finalized_model3.sav'
pickle.dump(clf, open(filename, 'wb'))

clf1 = pickle.load(open(filename, 'rb'))

# Refit the same estimator object on the TF-IDF features.
SVM_TFIDF_KF1=np.sum(clf.fit(tfidf_train, y_train).predict(X2_tf)==Y2.values.tolist())/X2.shape[0]

# save the model to disk
filename = 'finalized_model4.sav'
pickle.dump(clf, open(filename, 'wb'))

clf2 = pickle.load(open(filename, 'rb'))
Example #20
0
#check details
print(f'The size of the data is {breast.data.shape}')
print(f'There are {breast.target_names} classifiers')

# hold out 20% of the samples for testing
x_train, x_test, y_train, y_test = train_test_split(breast.data,
                                                    breast.target,
                                                    test_size=0.2)

# two classifiers: a small MLP and an ECOC-wrapped linear SVM
mlp_model = MLPClassifier(solver='lbfgs',
                          alpha=1e-5,
                          hidden_layer_sizes=(5, 2),
                          random_state=42)
ecoc_model = OutputCodeClassifier(LinearSVC(random_state=0),
                                  code_size=2,
                                  random_state=42)

# fit both on the training split
mlp_model.fit(x_train, y_train)
ecoc_model.fit(x_train, y_train)

# predictions on the held-out data
mlp_pred = mlp_model.predict(x_test)
ecoc_pred = ecoc_model.predict(x_test)

# report accuracy of each model against the held-out labels
print(f'accuracy {accuracy_score(y_test, mlp_pred)}')
print(f'accuracy {accuracy_score(y_test, ecoc_pred)}')

# use classification_report function to print more information
Example #21
0
    train_ingredients.append(' '.join(ings))

#construct test_ingredients
for entry in test_set:
    ings = [WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', w)) for w in entry['ingredients']]

    test_ingredients.append(' '.join(ings))

#used to encode labels as numbers for use with RandomForestClassifier
le = LabelEncoder()

#encode cuisines as numbers
train_cuisines = le.fit_transform(train_cuisines)

#used to create bag of ingredients vocabulary and create features for each entry
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_ingredients).toarray()
test_features = vectorizer.transform(test_ingredients).toarray()

clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=2, random_state=2)
result = clf.fit(train_features, train_cuisines).predict(test_features)

output = pd.DataFrame(data={'id':test_ids, 'cuisine':le.inverse_transform(result)})

#force explicit ordering of columns
output = output[['id', 'cuisine']]
output.to_csv('ecoc.csv', index=False)


Example #22
0
def main():
    """Run 10-fold ECOC/SGD classification over combined MFCC+HCDF features.

    Writes per-fold zero-one losses to ECOC_sgd_error.csv, the (binarized)
    ECOC code book to codebook.csv, and prints the code book and mean loss.
    """

    filenameLB = 'mfcc_lb.csv'
    allsongcat = pickle.load(open('mfcc_fv.p', 'rb'))
    hcdf = pickle.load(open('hcdf_fv.p', 'rb'))

    with open('mfcc_lb.csv') as f:
        reader = csv.reader(f)
        for row in reader:
            labels = row  # keeps the last CSV row as the label list

    # select training and test sets
    '''
    TEidx = np.array(random.sample(range(0,1000), 100))
    
    training = []
    test = []
    
    trainingLB = []
    testLB = []

    # make numpy arrays
    for i in range(1000):
        if i in TEidx:
            test.append(featureDict[i])
            testLB.append(int(labels[i]))
        else:
            training.append(featureDict[i])
            trainingLB.append(int(labels[i]))
        
    # fit with classifier and predict
    X = np.array(training)
    Y = np.array(trainingLB)

    '''
    l = [allsongcat, hcdf]
    all_feats = combineFeatures(l)
    feats_shuf = []
    labels_shuf = []
    # BUG FIX: range() is not shuffleable in Python 3 — materialize a list.
    index_shuf = list(range(len(labels)))
    shuffle(index_shuf)
    for i in index_shuf:
        feats_shuf.append(all_feats[i])
        labels_shuf.append(labels[i])

    X = np.array(feats_shuf)
    Y = np.array(labels_shuf)

    # NOTE(review): KFold(1000, n_folds=10) is the pre-0.18 sklearn API.
    kf = KFold(1000, n_folds=10)
    #rf = RandomForestClassifier(n_estimators=50, max_features = 'log2')
    sgd = SGDClassifier(loss="hinge", penalty="l2")
    #svc = svm.SVC(kernel='linear')
    dtree = DecisionTreeClassifier(max_depth=3)
    lsvc = LinearSVC(random_state=0)
    cla = OutputCodeClassifier(sgd, code_size=128, random_state=0)

    cm_all = np.zeros((10, 10), dtype=np.int)

    cb = np.zeros((10, 20))
    losses = []

    with open('ECOC_sgd_error.csv', 'w') as f1:
        wrtest = csv.writer(f1,
                            quoting=csv.QUOTE_NONNUMERIC,
                            lineterminator='\n')
        scores = 0.0
        for train, test in kf:
            X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[
                test]
            cla.fit(X_train, y_train)
            predictions = cla.predict(X_test)
            loss = zero_one_loss(predictions, y_test)
            losses.append(loss)
            scores += loss

            cb = cla.code_book_

            np.savetxt('codebook.csv', cb, delimiter=',')

            # Compute confusion matrix
            cm = confusion_matrix(
                y_test,
                predictions,
                labels=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
            np.set_printoptions(precision=2)
            #print(cm_all)
            cm_all = np.add(cm_all, cm)

        # make ECOC coding matrix 0-1 binary
        cb[cb <= 0] = 0
        wrtest.writerow(losses)
    # BUG FIX: Python 2 print statements converted to Python 3 calls.
    print(cb)

    print(scores / 10)
Example #23
0
def oc_classify(X,Y):
	"""Fit and return an ECOC LinearSVC whose code size equals the count of
	nonzero distinct labels in Y."""
	n_codes = np.count_nonzero(sp.unique(Y))
	model = OutputCodeClassifier(LinearSVC(), code_size=n_codes)
	model.fit(X, Y)
	return model
print "Running Feature Extraction.."
vectorizer = CountVectorizer()  #initialise Bag of words
train_count = vectorizer.fit_transform(train.Phrase)
print "Bag of words Counts: ", train_count.shape

#Tf-Idf Transformer
print "Running Tf-Idf Transformer"
tf_idf = TfidfTransformer()  #initialise Tf-Idf Transformer
train_tf_idf = tf_idf.fit_transform(train_count)
print "Tf-Idf : ", train_tf_idf.shape

#Process the test set
print "Processing Test set.. \n"
test_count = vectorizer.transform(test.Phrase)
test_tf_idf = tf_idf.transform(test_count)

print "Training the Model and predicting on the Test data.."
predicted1 = OutputCodeClassifier(LinearSVC(random_state=0),
                                  code_size=2,
                                  random_state=0).fit(
                                      train_tf_idf,
                                      train.Sentiment).predict(test_tf_idf)

print "Writing the output in a csv file..."
output = pd.DataFrame(data={
    "PhraseId": test.PhraseId,
    "Sentiment": predicted1
})
output.to_csv("Sentiment Analysis on Movie Reviews -- OutputCode",
              index=False,
              quoting=3)
def single_classifier(clf_name):
	"""Build a feature-selection + classifier pipeline for ``clf_name`` and
	wrap it in a hyper-parameter search.

	Relies on module-level globals not visible in this chunk:
	``f_sel_method`` (feature-selection strategy) and ``tuning_method``
	('grid' or 'rand'). Returns the unfitted search object.
	"""

	# create the classifier objects
	classifiers = {
		'knn':KNeighborsClassifier(),
		'logistic':LogisticRegression(),
		'lda':LinearDiscriminantAnalysis(),
		'svm':SVC(),
		'tree':DecisionTreeClassifier(),
		'randomforest':RandomForestClassifier(),
		'extratrees':ExtraTreesClassifier(),
		'gradboost':GradientBoostingClassifier(),
		'adaboost':AdaBoostClassifier(),
		'mlp':MLPClassifier(),
		'ecoc':OutputCodeClassifier(SVC(C=2,kernel='linear',shrinking=True,class_weight='balanced'), code_size=2)}

	# feature selection using a pipeline
	# NOTE(review): an unrecognized f_sel_method leaves 'pipe'/'param_set'
	# unbound and the code below raises NameError — confirm inputs.
	if f_sel_method=='none':
		pipe = Pipeline([('clf',classifiers[clf_name])])
		param_set = {}
	elif f_sel_method=='anova':
		pipe = Pipeline([('f_sel',SelectPercentile(score_func=f_classif)), ('clf',classifiers[clf_name])])
		param_set = {'f_sel__percentile':[25,50,75,100]}
	elif f_sel_method=='mutualinfo':
		pipe = Pipeline([('f_sel',SelectPercentile(score_func=mutual_info_classif)), ('clf',classifiers[clf_name])])
		param_set = {'f_sel__percentile':[25,50,75,100]}
	elif f_sel_method=='recursivesvm':
		f_sel = SVC(C=1, kernel='linear', shrinking=True, class_weight='balanced')
		pipe = Pipeline([('f_sel',RFECV(estimator=f_sel)), ('clf',classifiers[clf_name])])
		param_set = {'f_sel__step':[10], 'f_sel__cv':[2], 'f_sel__scoring':['accuracy']}
	elif f_sel_method=='frommodelsvm':
		f_sel = SVC(C=1, kernel='linear', shrinking=True, class_weight='balanced')
		pipe = Pipeline([('f_sel',SelectFromModel(f_sel)), ('clf',classifiers[clf_name])])
		param_set = {}
	elif f_sel_method=='frommodeltree':
		f_sel = ExtraTreesClassifier(n_estimators=100, class_weight='balanced')
		pipe = Pipeline([('f_sel',SelectFromModel(f_sel)), ('clf',classifiers[clf_name])])
		param_set = {}

	# specify parameters of the classifiers (keys are pipeline paths)
	if clf_name=='knn': #89.9,90.8 'n_neighbors':17, 'p':1, 'weights':'distance'
		param_set.update({'clf__n_neighbors':[1,9,13,17,25,50], 'clf__p':[1,2,3,5], 'clf__weights':['distance'], 'clf__algorithm':['auto'], 'clf__n_jobs':[3]})
	elif clf_name=='logistic': #94.4 'C':1, 'solver':'newton-cg'
		param_set.update({'clf__C':[1,2,3,4], 'clf__solver':['newton-cg'], 'clf__class_weight':['balanced'], 'clf__max_iter':[100]})
	elif clf_name=='lda': #94.9 'solver':'lsqr'
		param_set.update({'clf__solver':['lsqr','eigen'], 'clf__shrinkage':['auto']})
	elif clf_name=='svm': #95.3 'C':1, 'kernel':'linear'
		param_set.update({'clf__C':[0.75,1,1.25,1.5,2], 'clf__kernel':['linear'], 'clf__shrinking':[True], 'clf__probability':[False], 'clf__class_weight':['balanced'], 'clf__decision_function_shape':['ovr']})
	elif clf_name=='tree': #82.3 'max_depth':15
		param_set.update({'clf__min_samples_leaf':[10,50,75,100], 'clf__class_weight':['balanced'], 'clf__presort':[True]})
	elif clf_name=='randomforest': #91.8 'n_estimators':300, 'min_samples_leaf':None, 'max_depth':25
		param_set.update({'clf__n_estimators':[500,1000], 'clf__max_features':[5,10,25], 'clf__min_samples_leaf':[1,10,25] ,'clf__max_depth':[None], 'clf__bootstrap':[True], 'clf__class_weight':['balanced'], 'clf__oob_score':[False], 'clf__n_jobs':[3]})
	elif clf_name=='extratrees': #92.8 'n_estimators':500, 'max_depth':50
		param_set.update({'clf__n_estimators':[100,500,1000], 'clf__max_features':[5,10,20,25,50,100,150], 'clf__min_samples_leaf':[1,10,25,50,100], 'clf__max_depth':[None], 'clf__bootstrap':[False], 'clf__class_weight':['balanced'], 'clf__oob_score':[False], 'clf__n_jobs':[3]})
	elif clf_name=='gradboost': #92.3 'n_estimators':100, 'learning_rate':0.1, 'min_samples_leaf':50
		param_set.update({'clf__n_estimators':[100], 'clf__max_features':['auto'], 'clf__learning_rate':[0.1], 'clf__min_samples_leaf':[50]})
	elif clf_name=='adaboost': #57.9 'n_estimators':100, 'learning_rate':0.1
		param_set.update({'clf__n_estimators':[100,500], 'clf__learning_rate':[0.01,0.1]})
	elif clf_name=='mlp': #95.0 'hidden_layer_sizes':(50,), 'alpha':10, 'solver':'lbfgs'
		param_set.update({'clf__hidden_layer_sizes':[(50,),(60,),(100,)], 'clf__alpha':[0.5,1,2,5,7], 'clf__solver':['adam']})
	elif clf_name=='ecoc':
		param_set.update({})
		
	# run grid search or randomized search
	# NOTE(review): an unrecognized tuning_method leaves 'search' unbound.
	if tuning_method=='grid':
		search = GridSearchCV(pipe, param_grid=param_set, cv=2, n_jobs=3)
	elif tuning_method=='rand':
		search = RandomizedSearchCV(pipe, param_distributions=param_set, n_iter=10, cv=2, n_jobs=3)
					
	return search
Example #26
0
      knn.fit(train_ft, train_label).score(test_ft, test_label))
print('LogisticRegression score: %f' %
      logistic.fit(train_ft, train_label).score(test_ft, test_label))

# SVM
list_of_acc = list()

accur = 0
# for c in np.logspace(-2, 10, 5):
c = 1000
# for c in np.logspace(-2, 10, 5):
#     for c in np.logspace(-2, 10, 5):
for c in [100, 1000, 10000, 100000]:
    for g in np.logspace(-9, 3, 13):

        # ECOC over an RBF SVC; grid over C and gamma, fixed seed.
        clf = OutputCodeClassifier(svm.SVC(random_state=0, gamma=g, C=c),
                                   code_size=10,
                                   random_state=0)

        accur_temp = clf.fit(svmtrain,
                             svmtrainlabel).score(svmtest, svmtestlabel)

        # keep the best accuracy seen so far
        # NOTE(review): only gamma is recorded for the best run; the
        # corresponding C is not saved — confirm whether that is intended.
        if accur < accur_temp:
            accur = accur_temp
            gamma = g

        print(c, g, accur)

list_of_acc.append(accur)
print(np.mean(list_of_acc))
Example #27
0
def getFitness(individual, X, y):
    """Feature-subset fitness function for a genetic feature search.

    Parameters
    ----------
    individual : sequence of 0/1 flags, one per column of ``X``; a 0
        drops the corresponding column before scoring.
    X : pandas.DataFrame of candidate features.
    y : target labels aligned with ``X``.

    Returns
    -------
    One-element tuple holding the mean 5-fold cross-validation accuracy
    on the selected subset, or ``(0,)`` when every feature is dropped.
    """
    # An all-zero individual selects no features: fitness is zero.
    if individual.count(0) == len(individual):
        return (0,)

    # Indices of the columns flagged 0 (i.e. excluded from the subset).
    cols = [index for index in range(
        len(individual)) if individual[index] == 0]

    # Drop the rejected columns, then one-hot encode categoricals.
    X_parsed = X.drop(X.columns[cols], axis=1)
    X_subset = pd.get_dummies(X_parsed)

    # The original code reassigned ``clf`` through ~30 classifiers in a
    # row; only the last assignment (``VotingClassifier()``, which
    # raises because its required ``estimators`` argument is missing)
    # ever took effect.  Use a single parameter-free classifier instead.
    clf = DecisionTreeClassifier()

    # ``cross_val_score`` clones and fits per fold, so the original
    # up-front ``clf.fit(X_subset, y)`` was redundant and is dropped.
    # ``avg`` was undefined in the original; use the ndarray mean.
    return (cross_val_score(clf, X_subset, y, cv=5).mean(),)
Example #28
0
# -*- coding: utf-8 -*-
"""
Created on Fri May 24 20:38:46 2019

@author: pathouli
"""

import pandas as pd
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC

# Local directory holding the Kaggle Allstate training data.
the_path = 'C:/Users/pathouli/myStuff/academia/torhea/projects/groupC/'

allstate_data = pd.read_csv(the_path + 'train.csv', sep=",")
# ECOC wrapper around a linear SVM; code_size=2 trains 2 * n_classes
# binary sub-problems.
clf = OutputCodeClassifier(LinearSVC(random_state=0),
                           code_size=2,
                           random_state=0)

# Columns A..G are the purchase options (labels); everything else is a feature.
label_cols = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
X_cols = allstate_data.columns.difference(label_cols)
# NOTE(review): slice [1:10000] skips row 0 — confirm [:10000] wasn't meant.
X = allstate_data[X_cols][1:10000]
y = allstate_data[label_cols][1:10000]  #small sample to test

# NOTE(review): y has seven columns, but OutputCodeClassifier expects a
# 1-D target — verify this fit call behaves as intended.
clf.fit(X, y).predict(X)

# https://www.kaggle.com/c/allstate-purchase-prediction-challenge/data
Example #29
0
     X_train = training2
     y_train = labels[100:172,i]
     X_test = sample2
     y_test = labels[272:,i]
 else:
     X_train = training
     y_train = labels[:172,i]
     X_test = sampletest
     y_test = labels[172:,i]
 
 box = np.zeros([6,6])
 accuracy = np.zeros(100)
 for m in range(0,100):
     posterior = np.empty([100,72,6])
     gbc = GradientBoostingClassifier(n_estimators=60, max_depth=3)
     occ = OutputCodeClassifier(gbc)
     y_pred = occ.fit(X_train, y_train).predict(X_test)
     
     n=0
     for i in range(0,len(y_pred)):
         if y_pred[i] == y_test[i]:
             #print i, y_pred[i], y_test[i]
             n = n+1
             accuracy[m] = accuracy[m]+1
         box[y_test[i]-1,y_pred[i]-1] = box[y_test[i]-1,y_pred[i]-1] + 1
             #posterior[m] =  knc.predict_proba(X_test)
 print np.mean(accuracy)/0.72, np.std(accuracy)/0.72
 #print sum(accuracy[0:8])/8.0, sum(accuracy[8:18])/10.0, sum(accuracy[18:30])/12.0, sum(accuracy[56:72])/16.0, sum(accuracy[30:43])/13.0, sum(accuracy[43:56])/13.0, sum(accuracy)/72.0
 '''
 means = np.empty([72,6])
 stds = np.empty([72,6])
Example #30
0
    def __init__(self, X, y, people, df_features, feature_names, conf_dict):
        """Initialise data holders and the registries of candidate models.

        Parameters
        ----------
        X : array-like feature matrix.
        y : pandas Series/array of labels (kept both as ndarray and as a
            reset-index pandas copy).
        people : pandas object identifying subjects; index is reset.
        df_features : DataFrame view of the features.
        feature_names : list of feature-column names.
        conf_dict : configuration mapping; must provide "app_list",
            "classifier", "feature_selection", "num_features" and
            "one_vs_all_type".
        """
        self.X = X
        self.y = np.array(y)
        self.feature_names = feature_names

        self.people = people.reset_index(drop=True)
        self.X_df = df_features
        self.y_df = y.reset_index(drop=True)

        # Fixed: these lookups previously read a (presumably global)
        # ``config_dict`` while every other lookup already used the
        # ``conf_dict`` parameter (the old commented-out lines showed the
        # same migration) — read the parameter consistently.
        self.app_list = conf_dict["app_list"]
        self.labels_numeric = {name: i for i, name in enumerate(self.app_list)}
        self.n_classes = len(self.labels_numeric)

        self.clf_name = conf_dict["classifier"]

        self.feature_selection = conf_dict["feature_selection"]  #True/False
        self.num_features = conf_dict["num_features"]
        self.one_vs_all_type = conf_dict["one_vs_all_type"]

        self.chosen_feature_names = None
        self.chosen_features_all_folds = []

        # Registry of ready-to-fit classifiers keyed by short name.
        self.clf_dict = {}
        #self.clf_dict["one_vs_all"] = OneVsRestClassifier(SVC(kernel='rbf', C=1000, gamma=0.001))

        self.clf_dict["output_code"] = OutputCodeClassifier(SVC(kernel='rbf',
                                                                C=1000,
                                                                gamma=0.001),
                                                            code_size=2,
                                                            random_state=0)

        params_rf = {
            'n_estimators': 100,
            'max_depth': 20,
            'max_features': 'sqrt',
            'min_samples_leaf': 1,
            'min_samples_split': 10,
            'random_state': 0
        }
        self.clf_dict["rf"] = RandomForestClassifier(**params_rf)
        params_svm = {
            'C': 10,
            'degree': 2,
            'gamma': 'scale',
            'kernel': 'sigmoid'
        }
        self.clf_dict["svm"] = SVC(**params_svm)

        # MultinomialNB assumes count-like features; GaussianNB assumes
        # continuous ones.
        self.clf_dict["nb"] = MultinomialNB(alpha=0.00001)
        self.clf_dict["gnb"] = GaussianNB(var_smoothing=0.05)

        self.clf_dict["knn"] = KNeighborsClassifier(n_neighbors=8)

        params_dt = {
            'criterion': 'gini',
            'max_depth': 20,
            'max_features': 'auto',
            'min_samples_leaf': 2,
            'min_samples_split': 2,
            'random_state': 42,
            'splitter': 'best'
        }
        self.clf_dict["dt"] = DecisionTreeClassifier(**params_dt)

        # One-vs-rest wraps whichever base estimator the config selects.
        self.clf_dict["one_vs_all"] = OneVsRestClassifier(
            self.clf_dict[conf_dict["one_vs_all_type"]])

        # Feature-selection strategies, each keeping num_features columns.
        self.fs_dict = {}
        self.fs_dict["selectKbest_chi2"] = SelectKBest(chi2,
                                                       k=self.num_features)
        self.fs_dict["selectKbest_fclassif"] = SelectKBest(f_classif,
                                                           k=self.num_features)
    for ind, im in enumerate(images):
        row = []
        for (top_left, bottom_right) in rectangles:
            row += get_haar_features(im, top_left, bottom_right)

        train_ecoc_table[ind] = row

    test_ecoc_table = np.zeros(shape=(np.shape(test_images)[0], 200))
    for ind, im in enumerate(test_images):
        row = []
        for (top_left, bottom_right) in rectangles:
            row += get_haar_features(im, top_left, bottom_right)

        test_ecoc_table[ind] = row

    clf = OutputCodeClassifier(AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200), code_size=5, random_state=0)
    clf.fit(train_ecoc_table, labels)

    train_pred = np.array(clf.predict(train_ecoc_table))
    print "Digits Training Accuracy: %f" % (np.sum(train_pred == np.array(labels)).astype(np.float)/np.shape(train_pred)[0])

    test_pred = np.array(clf.predict(test_ecoc_table))
    print "Digits Testing Accuracy: %f" % (np.sum(test_pred == np.array(test_labels)).astype(np.float)/np.shape(test_pred)[0])

    # ecoc_table = []
    # for im in images:
    #
    #     im_preprocess = np.matrix([[np.sum(im[:i,:j]) for i in range(1, 29)] for j in range(1, 29)])
    #
    #     def get_black_rectangle(top_left, bottom_right):
    #         x1, y1 = top_left
Example #32
0
def test_ecoc_exceptions():
    """An OutputCodeClassifier must refuse to predict before being fitted."""
    unfitted = OutputCodeClassifier(LinearSVC(random_state=0))
    with pytest.raises(NotFittedError):
        unfitted.predict([])
Example #33
0
def test_ecoc_delegate_sparse_base_estimator():
    # Non-regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/17218
    X, y = iris.data, iris.target
    X_sparse = sp.csc_matrix(X)

    # A mock estimator that rejects sparse input outright.
    dense_only = CheckingClassifier(
        check_X=check_array,
        check_X_params={"ensure_2d": True, "accept_sparse": False},
    )
    ecoc = OutputCodeClassifier(dense_only, random_state=0)

    # The underlying estimator's rejection must surface through fit...
    with pytest.raises(TypeError, match="A sparse matrix was passed"):
        ecoc.fit(X_sparse, y)

    # ...and through predict after a dense fit.
    ecoc.fit(X, y)
    with pytest.raises(TypeError, match="A sparse matrix was passed"):
        ecoc.predict(X_sparse)

    # Conversely, a sparse-capable base estimator must work end to end.
    sparse_ok = OutputCodeClassifier(LinearSVC(random_state=0))
    sparse_ok.fit(X_sparse, y).predict(X_sparse)
    assert len(sparse_ok.estimators_) == 4
Example #34
0
            C=penalty,
            random_state=109))
elif class_type == 'ovo':
    clf = SVC(
        kernel=kernel,
        gamma=1,
        coef0=coef,
        degree=degree,
        max_iter=max_iter,
        C=penalty,
        random_state=109)  # SVC is ovo by default, contrary to documentation
elif class_type == 'ecoc':
    clf = OutputCodeClassifier(SVC(kernel=kernel,
                                   gamma=1,
                                   coef0=coef,
                                   degree=degree,
                                   max_iter=max_iter,
                                   C=penalty,
                                   random_state=109),
                               random_state=109)

# remove convergence warning printouts from SVM training
if silence == 1: warnings.filterwarnings("ignore")

# Train the model using the training sets
if silence == 0: print("Training SVM using %s classification" % class_type)
clf.fit(X_train, y_train)

#########################################
# Generate the configuration matrix for the SVM
#########################################
# this will be used for manual classification
Example #35
0
        WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', w))
        for w in entry['ingredients']
    ]

    test_ingredients.append(' '.join(ings))

#used to encode labels as numbers for use with RandomForestClassifier
le = LabelEncoder()

#encode cuisines as numbers
train_cuisines = le.fit_transform(train_cuisines)

#used to create bag of ingredients vocabulary and create features for each entry
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_ingredients).toarray()
test_features = vectorizer.transform(test_ingredients).toarray()

# ECOC ensemble over linear SVMs; code_size=2 trains 2 * n_classes
# binary classifiers.
clf = OutputCodeClassifier(LinearSVC(random_state=0),
                           code_size=2,
                           random_state=2)
result = clf.fit(train_features, train_cuisines).predict(test_features)

# Map predicted label ids back to cuisine names for the submission file.
output = pd.DataFrame(data={
    'id': test_ids,
    'cuisine': le.inverse_transform(result)
})

#force explicit ordering of columns
output = output[['id', 'cuisine']]
output.to_csv('ecoc.csv', index=False)
Example #36
0
class Classifier():
    """Bag-of-ngrams text classifier wrapper around scikit-learn.

    Holds one training set and a list of test sets, each a list of
    instance dicts (keys used here: "ngrams", "features", "sparse",
    "label", "meta", "append").  Provides feature counting/pruning,
    binary/log/tfidf weighting, SVM / naive Bayes / decision-tree
    training, prediction, stacking via ten-fold meta-features, and
    file output helpers.
    """

    def __init__(self,trainlist,testlist,scaling = "binary",jobs=16,directory=False,
            features = False, feature_info = False):
        # scaling selects feature weighting in vectorize():
        # "binary", "log" or "tfidf".
        self.training = trainlist
        self.test = testlist #self.test should be a list with multiple lists for each testset
        self.scaling = scaling
        self.jobs = jobs
        self.directory = directory
        self.feature_status = {}
        self.outstring = False
        self.features = features
        self.feature_info = feature_info

    def count_feature_frequency(self):
        """Count ngram frequencies over the training set in parallel and
        store them, sorted by descending frequency, in self.features."""

        def ff(instances,queue):
            # Worker: count ngrams for one chunk and push the partial dict.
            feature_frequency = defaultdict(int)
            for i,instance in enumerate(instances):
                for feature in instance["ngrams"]:
                    feature_frequency[feature] += 1
            queue.put(feature_frequency)
        
        print(len(self.training))

        q = multiprocessing.Queue()
        chunks = gen_functions.make_chunks(self.training,self.jobs)
        for chunk in chunks:
            p = multiprocessing.Process(target=ff,args=[chunk,q])
            p.start()

        # Collect one partial count per chunk.  NOTE(review): workers are
        # never join()ed; the collected-count is the only completion signal.
        ds = []
        while True:
            l = q.get()
            ds.append(l)
            if len(ds) == len(chunks):
                break
        
        # Merge the partial counts into a single frequency table.
        self.feature_frequency = defaultdict(int)
        for d in ds:
            for k in d:
                self.feature_frequency[k] += d[k]
        self.features = sorted(self.feature_frequency, key=self.feature_frequency.get, 
            reverse=True)

    def make_feature_labellist(self):
        """Map each ngram to the list of (integer) labels of the training
        instances it occurs in."""
        feature_labellist = defaultdict(list)
        for instance in self.training:
            try:
                label = int(instance["label"])       
                for feature in instance["ngrams"]:
                    feature_labellist[feature].append(label)
            except:
                # NOTE(review): bare except silently skips instances whose
                # label is not an int — confirm that is intended.
                continue
        self.feature_labellist = feature_labellist

    def prune_features(self):
        """Drop from every training instance the ngrams whose
        self.feature_status entry is falsy or missing."""
        for instance in self.training:
            new_features = []
            #print feature_status
            for f in instance["ngrams"]:
                try:
                    if self.feature_status[f]:
                        new_features.append(f)
                except:
                    continue
            instance["ngrams"] = new_features
            # queue.put(instance)

    def convert_features(self,convert_list):
        """Rewrite training-instance ngrams in place using the
        old-name -> new-name mapping convert_list."""
        for instance in self.training:
            new_features = []
            #print feature_status
            #print instance["features"]
            for i,f in enumerate(instance["ngrams"]):
                if f in convert_list.keys():
                     instance["ngrams"][i] = convert_list[f]
            #print instance["features"]

    def filter_stdev(self,threshold,prop):
        """For features matching prefix ``prop``: discard those whose label
        standard deviation exceeds ``threshold`` (or with <= 2 occurrences),
        and rename the survivors to '<abs(median)>_days' buckets; all other
        features are kept unchanged."""
        self.make_feature_labellist()
        feature_convert = {}
        new_features = []
        for feature in self.feature_labellist.keys():
            if re.search(r"^" + prop,feature):
                if gen_functions.return_standard_deviation(self.feature_labellist[feature]) > threshold or len(self.feature_labellist[feature]) <= 2:
                    self.feature_status[feature] = False
                else:
                    new_feature = str(abs(int(numpy.median(self.feature_labellist[feature])))) + "_days"
                    feature_convert[feature] = new_feature
                    new_features.append(new_feature)
                    self.feature_status[new_feature] = True
            else:
                self.feature_status[feature] = True
                new_features.append(feature)
        self.convert_features(feature_convert)
        self.prune_features()
        self.features = list(set(new_features))

    def prune_features_topfrequency(self,n):
        """Keep only the n most frequent features (self.features must
        already be sorted by descending frequency)."""
        #generate feature_frequency dict
        for f in self.features[:n]:
            self.feature_status[f] = True 
        for f in self.features[n:]:
            self.feature_status[f] = False
        self.features = self.features[:n]
        self.prune_features()

    def balance_data(self):
        """For multi-class (> 2 labels) training data, resample every class
        to the median class size (up- or down-sampling as needed)."""
        label_instances = defaultdict(list)
        new_training = []
        for instance in self.training:     
            label = instance["label"]
            label_instances[label].append(instance)
        if len(label_instances.keys()) > 2:
            median = int(numpy.median(numpy.array([len(label_instances[x]) for \
                x in label_instances.keys()])))
            for label in label_instances.keys():
                if len(label_instances[label]) == median:
                    new_training.extend(label_instances[label])
                else:
                    instances = lineconverter.Lineconverter(label_instances[label])
                    if len(instances.lines) < median:
                        instances.sample(median-len(instances.lines),sample_type="up")
                    else:
                        instances.sample(len(instances.lines)-median)
                    new_training.extend(instances.lines)
            self.training = new_training

    def index_features(self,ind = 0):
        """Assign each feature an integer column index (starting at ``ind``)
        and attach a sparse index->count dict to every train/test instance."""
        feature_frequency=defaultdict(int)
        self.feature_info={}
        #print self.features      
        for i,feature in enumerate(self.features):
            self.feature_info[feature]=i+ind
        
        def sparsify(instances,writelist):
            # Build the "sparse" column-index -> count dict per instance.
            for instance in instances:
                sparse_features = defaultdict(int)
                for feature in instance["ngrams"]:
                    try:
                        sparse_features[self.feature_info[feature]] += 1
                    except:
                        continue
                instance["sparse"] = sparse_features
                writelist.append(instance)         
        new_instances = []
        sparsify(self.training,new_instances)
        self.training = new_instances

        for tset in self.test:
            for instance in tset["instances"]:
                sparse_features = defaultdict(int)
                for feature in instance["ngrams"]:
                    try:
                        sparse_features[self.feature_info[feature]] += 1
                    except:
                        continue
                instance["sparse"] = sparse_features

    def vectorize(self,instances):
        """Turn instances into dense per-instance feature vectors using the
        configured scaling; extra per-instance "features" values are
        appended after the ngram columns."""
        zerolist = [float(0)] * len(self.feature_info.keys())
        matrix = []
        for instance in instances:
            featurev = zerolist[:]
            for feature in instance["sparse"].keys():
                if self.scaling == "binary":
                    featurev[feature] = float(1)
                elif self.scaling == "log": 
                    featurev[feature] = math.log(instance["sparse"][feature],10)
                elif self.scaling == "tfidf":
                    featurev[feature] = instance["sparse"][feature] * self.idf[feature]
            for feat in instance["features"]:
                featurev.append(feat)
            matrix.append(featurev)
        return matrix

    def model_necessities(self):
        """Precompute everything training needs: label<->int mappings, idf
        weights (tfidf only), the training matrix and its CSR form."""
        #generate scipy libsvm input
        self.trainlabels_raw = [x["label"] for x in self.training]
        self.labels = set(self.trainlabels_raw)
        labeldict = dict(zip(self.labels,range(len(self.labels))))
        self.labeldict_back = dict(zip(range(len(self.labels)),self.labels))
        if self.scaling == "tfidf":
            self.idf = weight_features.return_idf(self.training)
        self.trainingvectors = self.vectorize(self.training)
        self.training_csr = csr_matrix(self.trainingvectors)
        self.trainlabels = [labeldict[x["label"]] for x in self.training]

    def predict(self,ts):
        """Classify each instance in ts with self.clf; return rows of
        [meta-or-text, "gold predicted", space-joined probabilities]."""
        testvectors = self.vectorize(ts)
        predictions = []
        for i,t in enumerate(testvectors):
            # NOTE(review): t is a single 1-D feature vector; recent
            # scikit-learn versions require a 2-D array here — confirm the
            # sklearn version in use accepts this.
            classification = self.clf.predict(t)
            proba = self.clf.predict_proba(t)
            classification_label = self.labeldict_back[classification[0]]
            if len(ts[0]["meta"]) == 6:
                predictions.append([ts[i]["meta"][5], ts[i]["label"] + " " + classification_label, \
                    " ".join([str(round(x,2)) for x in proba.tolist()[0]])])
            else:
                predictions.append([" ".join([x for x in ts[i]["ngrams"] if not re.search("_",x)]), ts[i]["label"] + " " + classification_label, \
                    " ".join([str(round(x,2)) for x in proba.tolist()[0]])])
        return predictions

    def train_svm(self,params = 10):
        """Randomized-search SVM hyperparameters (wrapped in an
        OutputCodeClassifier for > 2 labels), then refit self.clf with the
        best settings; appends a parameter report to self.outstring."""
        #obtain the best parameter settings for an svm outputcode classifier
        if len(self.labels) > 2:
            print("outputcodeclassifier")
            param_grid = {'estimator__C': [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000],
                'estimator__kernel': ['linear','rbf','poly'], 
                'estimator__gamma': [0.0005, 0.002, 0.008, 0.032, 0.128, 0.512, 1.024, 2.048],
                'estimator__degree': [1,2,3,4]}
            model = OutputCodeClassifier(svm.SVC(probability=True))
        else:
            print("svc model")
            param_grid = {'C': [0.001, 0.005, 0.01, 0.5, 1, 5, 10, 50, 100, 500, 1000],
                'kernel': ['linear','rbf','poly'], 
                'gamma': [0.0005, 0.002, 0.008, 0.032, 0.128, 0.512, 1.024, 2.048],
                'degree': [1,2,3,4]}
            model = svm.SVC(probability=True)
        paramsearch = RandomizedSearchCV(model, param_grid, cv=5, verbose=2,n_iter = params,n_jobs=self.jobs) 
        print("Grid search...")
        paramsearch.fit(self.training_csr,numpy.asarray(self.trainlabels))
        print("Prediction...")
        #print the best parameters to the file
        parameters = paramsearch.best_params_
        self.outstring = "best parameter settings:\n"
        for parameter in parameters.keys():
            self.outstring += (parameter + ": " + str(parameters[parameter]) + "\n")
        self.outstring += ("best score: " + str(paramsearch.best_score_) + "\n\n")
        #train an svm outputcode classifier using the best parameters
        if len(self.labels) > 2:
            clf = svm.SVC(probability=True, C=parameters['estimator__C'],
                kernel=parameters['estimator__kernel'],gamma=parameters['estimator__gamma'],
                degree=parameters['estimator__degree'])
            self.clf = OutputCodeClassifier(clf,n_jobs=self.jobs)
            self.clf.fit(self.training_csr,self.trainlabels)
        else:
            self.clf = svm.SVC(probability=True, C=parameters['C'],
                kernel=parameters['kernel'],gamma=parameters['gamma'],
                degree=parameters['degree'])
            self.clf.fit(self.training_csr,self.trainlabels)

    def train_nb(self):
        """Fit a multinomial naive Bayes model on the training matrix."""
        self.clf = naive_bayes.MultinomialNB()
        self.clf.fit(self.training_csr,self.trainlabels)

    def train_decisiontree(self):
        """Fit a decision tree (needs a dense matrix) on the training data."""
        self.clf = tree.DecisionTreeClassifier()
        self.clf.fit(self.training_csr.toarray(),self.trainlabels)

    def tenfold_train(self,voting,classifiers = [],p = 10):
        """Generate stacking meta-features: ten-fold cross-predict with each
        named classifier ("svm", "nb", "dt") and write the fold predictions
        back into the training instances as ___<name> features."""
        kf = cross_validation.KFold(len(self.training), n_folds=10)
        training = deepcopy(self.training)
        feat = deepcopy(self.features)
        fi = deepcopy(self.feature_info)
        if voting == "weighted":
            # Weighted voting replaces the original features entirely with
            # the classifier-prediction features added below.
            self.feature_info = {}
            self.features = []
            for instance in self.training:
                instance["sparse"] = defaultdict(int)
                instance["ngrams"] = []
        len_features = len(self.features)
        for i,fn in enumerate(classifiers):
            featurename = "___" + fn
            self.feature_info[featurename] = len_features + i
            self.features.append(featurename)
        for train_index, test_index in kf:
            train = deepcopy([training[x] for x in train_index])
            test = deepcopy([training[y] for y in test_index])
            cl = Classifier(train,test,features = feat,feature_info = fi)
            cl.model_necessities()
            if "svm" in classifiers:
                cl.train_svm(params = p)
                predictions = cl.predict(test)
                for i,j in enumerate(test_index):
                    prediction = int(float(predictions[i][1].split()[1]))
                    self.training[j]["sparse"][self.feature_info["___svm"]] = prediction
                    if prediction == 1:
                        self.training[j]["ngrams"].append("___svm")
            if "nb" in classifiers:
                cl.train_nb()
                predictions = cl.predict(test)
                for i,j in enumerate(test_index):
                    prediction = int(float(predictions[i][1].split()[1]))
                    self.training[j]["sparse"][self.feature_info["___nb"]] = prediction
                    if prediction == 1:
                        self.training[j]["ngrams"].append("___nb")
            if "dt" in classifiers:
                cl.train_decisiontree()
                predictions = cl.predict(test)
                for i,j in enumerate(test_index):
                    prediction = int(float(predictions[i][1].split()[1]))
                    self.training[j]["sparse"][self.feature_info["___dt"]] = prediction
                    if prediction == 1:
                        self.training[j]["ngrams"].append("___dt")               
            
    def return_classification_features(self):
        """Predict every test set and return, per set, the list of integer
        predicted labels (used as downstream stacking features)."""
        prediction_features_testset = []
        for tset in self.test:
            prediction_features = []
            predictions = self.predict(tset["instances"])
            for i,prediction in enumerate(predictions):
                prediction_features.append(int(float(predictions[i][1].split()[1])))
            prediction_features_testset.append(prediction_features)
        return prediction_features_testset    

    def add_classification_features(self,featuredict,featurenames,voter):
        """Inject precomputed classifier-prediction features into the test
        instances; "majority" voting resets the feature index first,
        "arbiter" keeps the instances' existing features."""
        if voter == "majority":
            self.feature_info = {}
            len_features = len(self.feature_info.keys())
            for i,fn in enumerate(featurenames):
                self.feature_info[fn] = len_features + i
                self.features.append(fn)
        for i,tset in enumerate(self.test):
            for j,instance in enumerate(tset["instances"]):
                if voter != "arbiter":
                    tset["instances"][j]["sparse"] = defaultdict(int)
                    tset["instances"][j]["ngrams"] = []
                for fn in featurenames:
                    tset["instances"][j]["sparse"][self.feature_info[fn]] = featuredict[i][j][fn]
                    tset["instances"][j]["ngrams"].append(fn)

    def append_classifier_labelings(self):
        """Add each instance's precomputed "append" label as an extra
        ___append feature on both training and test instances."""
        len_features = len(self.feature_info.keys())
        self.feature_info["___append"] = len_features
        self.features.append("___append")
        for instance in self.training:
            instance["sparse"][self.feature_info["___append"]] = instance["append"]
            if instance["append"] == 1:
                instance["features"].append("___append")
        for tset in self.test:
            for instance in tset["instances"]:
                instance["sparse"][self.feature_info["___append"]] = instance["append"]
                if instance["append"] == 1:
                    instance["features"].append("___append")

    def output_data(self):
        """Write the feature index, training instances and test instances to
        <out>features.txt / train.txt / test.txt next to the first test
        set's output path."""
        if re.search(".txt",self.test[0]["out"]):
            outdir = self.test[0]["out"][:-4] + "_"
        else:
            outdir = self.test[0]["out"]
        #output features
        #featureout = codecs.open(outdir + "features.txt","w","utf-8")
        featureout = open(outdir + "features.txt", "w", encoding = "utf-8")
        for feature in sorted(self.feature_info, key=self.feature_info.get):
            featureout.write(feature + "\t" + str(self.feature_info[feature]) + "\n")
        featureout.close()
        #output trainfile
        #trainout = codecs.open(outdir + "train.txt","w","utf-8")
        trainout = open(outdir + "train.txt", "w", encoding = "utf-8")
        for instance in self.training:
            trainout.write(instance["label"] + " " + ",".join(instance["ngrams"]) + " " + 
                ",".join([str(x) for x in instance["sparse"].keys()]) + "\n")
        trainout.close()
        #output testfile
        #testout = codecs.open(outdir + "test.txt","w","utf-8")
        # NOTE(review): all test sets share one file handle and it is never
        # closed — confirm intended.
        testout = open(outdir + "test.txt", "w", encoding = "utf-8")
        for i,tset in enumerate(self.test):
            #testout = codecs.open(outdir + "test" + str(i) + ".txt","w","utf-8")
            for instance in tset["instances"]:
                testout.write(instance["label"] + " " + ",".join(instance["ngrams"]) + " " + 
                    ",".join([str(x) for x in instance["sparse"].keys()]) + "\n")

    def test_model(self):
        """Predict each test set and write its rows (preceded by the
        parameter report, if any) to <out>_predictions.txt."""
        for tset in self.test:
            testresults = self.predict(tset["instances"])
            #outfile = codecs.open(tset["out"] + "predictions.txt","w","utf-8")
            if re.search(".txt",tset["out"]):
                outstring = tset["out"][:-4] + "_predictions.txt"
            else:
                outstring = tset["out"] + "predictions.txt"
#            outfile = codecs.open(outstring,"w","utf-8")
            outfile = open(outstring, "w", encoding = "utf-8")
            if self.outstring:
                outfile.write(self.outstring)
            for instance in testresults:
                outfile.write("\t".join(instance) + "\n") 
            outfile.close()

    def save_model(self):
        """Pickle self.clf and dump the vocabulary and idf weights next to
        each test set's output path."""
        for tset in self.test:
            outfile = tset["out"][:-4] + "_model.joblib.pkl"
            #with open(outfile, 'wb') as fid:
            #    cPickle.dump(self.clf, fid)    
            with open(outfile, 'wb') as fid:
                pickle.dump(self.clf, fid)    
            #_ = joblib.dump(, outfile, compress=9)
            #outvocabulary = codecs.open(tset["out"] + "vocabulary.txt","w","utf-8")
            outstring = tset["out"][:-4] + "_vocabulary.txt"
            #outvocabulary = codecs.open(outstring,"w","utf-8")
            outvocabulary = open(outstring, "w", encoding = "utf-8")
            for feature in self.features:
                outvocabulary.write(feature + "\n")
            outvocabulary.close() 
            #outidf = codecs.open(tset["out"][:-4] + "_idfs.txt","w","utf-8")
            # NOTE(review): self.idf only exists when scaling == "tfidf"
            # (set in model_necessities) — this raises otherwise; confirm.
            outidf = open(tset["out"][:-4] + "_idfs.txt", "w", encoding = "utf-8")
            for key in self.idf.keys():
                outidf.write(str(key) + "\t" + str(self.idf[key]) + "\n")
            outidf.close()
Example #37
0
def test_ecoc_exceptions():
    """Predicting with a never-fitted OutputCodeClassifier must raise."""
    clf = OutputCodeClassifier(LinearSVC(random_state=0))
    assert_raises(ValueError, clf.predict, [])
Example #38
0
    print("Accuracy using MLPClassifier and Random Seed:", s, ":", str(acc))
    print(confusion_matrix(label_test, prediction))
print("Mean Accuracy using MLPClassifier Classifier: ",
      np.array(acc_array).mean())
#----------------------------------------------------------------
# Init the Models for Comparison
#----------------------------------------------------------------
# Default-parameter instances of each classifier to compare; kept in
# lockstep with model_names below.
models = [
    BaggingClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    KNeighborsClassifier(),
    GaussianNB(),
    tree.DecisionTreeClassifier(),
    svm.SVC(kernel='linear', C=1),
    OutputCodeClassifier(BaggingClassifier()),
    OneVsRestClassifier(svm.SVC(kernel='linear'))
]

# NOTE(review): the eighth entry is labelled "OutputCodeClassifier with
# Linear SVM" but the corresponding model above wraps a
# BaggingClassifier — one of the two should be corrected.
model_names = [
    "Bagging with DT", "Random Forest", "AdaBoost", "KNN", "Naive Bayes",
    "Decision Tree", "Linear SVM", "OutputCodeClassifier with Linear SVM",
    "OneVsRestClassifier with Linear SVM"
]
#----------------------------------------------------------------
# Run Each Model
#----------------------------------------------------------------
for model, name in zip(models, model_names):
    model.fit(data_train, label_train)
    # Display the relative importance of each attribute
    if name == "Random Forest":