def __init__(self, dataset, in_features=6714, method='lsvc_ovr'):
    """Initialise the solver and pick the SVM variant named by ``method``.

    Args:
        dataset: passed through unchanged to the parent initializer.
        in_features: passed through unchanged to the parent initializer.
        method: key selecting the classifier; one of ``'lsvc'``,
            ``'lsvc_ovr'``, ``'svc'`` or ``'svc_ovr'``.

    Raises:
        KeyError: if ``method`` is not one of the four keys above.
    """
    super(SVCSolver, self).__init__(dataset, in_features)
    self.method = method
    self.timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    self.model_name = f'{self.timestamp}_{self.method}'

    # Base estimators; the one-vs-rest entries below wrap these same objects.
    linear_svc = svm.LinearSVC(
        verbose=0,
        dual=False,
        # loss='hinge',
        penalty='l2',
        C=0.6)
    rbf_svc = svm.SVC(
        C=100,
        kernel='rbf',
        gamma='scale',
        shrinking=True,
        probability=False,
        tol=0.001,
        cache_size=200,
        class_weight=None,
        verbose=True,
        max_iter=-1)

    # All four candidates are built up front; only the requested one is kept.
    candidates = {
        'lsvc': linear_svc,
        'lsvc_ovr': multiclass.OneVsRestClassifier(linear_svc, n_jobs=-1),
        'svc': rbf_svc,
        'svc_ovr': multiclass.OneVsRestClassifier(rbf_svc, n_jobs=-1),
    }
    self.clf = candidates[self.method]
def svm_classify(train_image_feats, train_labels, test_image_feats):
    '''
    Fit a one-vs-rest linear SVM (C=20.0) and predict the test set.

    Parameters
    ----------
        train_image_feats:  N x d matrix of training features, where d is
                            the dimensionality of the feature representation.
        train_labels:       ground-truth labels for the N training rows
                            (described by the original author as an N x l
                            array of one-hot vectors).
        test_image_feats:   M x d matrix of test features, same d as above.

    Returns
    -------
        Whatever ``predict`` of the fitted one-vs-rest classifier returns
        for ``test_image_feats``: one prediction per test row.

    Useful reference:
        https://scikit-learn.org/stable/modules/svm.html
    '''
    base_svm = svm.LinearSVC(C=20.0)
    one_vs_rest = multiclass.OneVsRestClassifier(base_svm)
    # fit() hands back the fitted estimator, so predict can be chained on it.
    return one_vs_rest.fit(train_image_feats, train_labels).predict(
        test_image_feats)
def sklearn_multiclass_prediction(mode, X_train, y_train, X_test):
    """Fit a one-vs-rest SVC and return its predictions on the training set.

    Only ``mode == 'ovr'`` is implemented. In that mode the fitted
    ``OneVsRestClassifier`` is also pickled to ``ovr_model.pkl`` (a relative
    path, so it lands in the current working directory).

    Arguments:
        mode: strategy name; only 'ovr' has an effect. Any other value
            ('ovo', 'crammer', ...) trains nothing and yields ``None``.
        X_train: training features.
        y_train: labels of the training data.
        X_test: test features. Currently unused: test-set prediction is
            disabled, the parameter is kept for interface compatibility.

    Returns:
        The predicted labels for ``X_train`` when ``mode == 'ovr'``,
        otherwise ``None``.
    """
    y_pred_train = None

    # A LinearSVC(random_state=12345) was used previously for reproducible
    # results; the current base estimator is a default-kernel SVC.
    svm_model = SVC(verbose=1)

    if mode == 'ovr':
        ovr_model = multiclass.OneVsRestClassifier(svm_model)
        ovr_model.fit(X_train, y_train)
        y_pred_train = ovr_model.predict(X_train)

        # Use a context manager so the file is flushed and closed even if
        # pickling raises; the bare open() used before leaked the handle.
        with open('ovr_model.pkl', 'wb') as model_file:
            pickle.dump(ovr_model, model_file)

    return y_pred_train
def count_vectors(self, features):
    """Evaluate every model in ``self.models`` on bag-of-words counts.

    A ``CountVectorizer`` limited to ``features`` terms is fitted on
    ``self.trainDF['cleaned_sentence']`` and used to transform the train,
    test and cross-validation inputs. Each model is wrapped in a
    one-vs-rest classifier, fitted on the training matrix, and then passed
    to ``self.check_model`` once per split ('training', 'test', 'cross').

    Args:
        features: value forwarded as ``max_features`` to the vectorizer and
            passed on to ``self.check_model``.
    """
    vectorizer = CountVectorizer(
        analyzer='word',
        token_pattern=r'\w{1,}',
        max_df=1.0,
        max_features=features)
    vectorizer.fit(self.trainDF['cleaned_sentence'])

    train_counts = vectorizer.transform(self.X_train)
    test_counts = vectorizer.transform(self.X_test)
    cross_counts = vectorizer.transform(self.X_cross)

    for model_name, model in self.models.items():
        fitted = multiclass.OneVsRestClassifier(model).fit(
            train_counts, self.y_train)

        # Same order as before: training, then test, then cross validation.
        evaluations = (
            (train_counts, self.y_train, 'training'),
            (test_counts, self.y_test, 'test'),
            (cross_counts, self.y_cross, 'cross'),
        )
        for matrix, labels, split in evaluations:
            self.check_model(fitted, matrix, labels, model_name, features,
                             'count_vectors', split)
def tfidf_ngram(self, features):
    """Evaluate every model in ``self.models`` on word n-gram TF-IDF features.

    A ``TfidfVectorizer`` over word n-grams (``ngram_range=(2, 5)``) limited
    to ``features`` terms is fitted on ``self.trainDF['cleaned_sentence']``
    and used to transform the train, test and cross-validation inputs. Each
    model is wrapped in a one-vs-rest classifier, fitted on the training
    matrix, and then passed to ``self.check_model`` once per split
    ('training', 'test', 'cross').

    Args:
        features: value forwarded as ``max_features`` to the vectorizer and
            passed on to ``self.check_model``.
    """
    vectorizer = TfidfVectorizer(
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(2, 5),
        max_features=features)
    vectorizer.fit(self.trainDF['cleaned_sentence'])

    train_tfidf = vectorizer.transform(self.X_train)
    test_tfidf = vectorizer.transform(self.X_test)
    cross_tfidf = vectorizer.transform(self.X_cross)

    for model_name, model in self.models.items():
        fitted = multiclass.OneVsRestClassifier(model).fit(
            train_tfidf, self.y_train)

        # Same order as before: training, then test, then cross validation.
        evaluations = (
            (train_tfidf, self.y_train, 'training'),
            (test_tfidf, self.y_test, 'test'),
            (cross_tfidf, self.y_cross, 'cross'),
        )
        for matrix, labels, split in evaluations:
            self.check_model(fitted, matrix, labels, model_name, features,
                             'tfidf_ngram', split)
def clf_default(y=None, **svm_params):
    '''Build the default one-vs-rest SVC classifier.

    @param y optional labels; when given and containing no ``-1`` entry
           (the original author's marker for "foreground data"),
           ``class_weight`` is forced to ``"balanced"``, overriding any
           value passed in ``svm_params``
    @param svm_params keyword arguments forwarded to ``svm.SVC``
    @return a ``multiclass.OneVsRestClassifier`` wrapping the configured SVC
    '''
    labels_given = y is not None
    if labels_given and -1 not in y:
        svm_params['class_weight'] = "balanced"
    base_svc = svm.SVC(**svm_params)
    return multiclass.OneVsRestClassifier(base_svc)
def trainfunctionclassifier(trees, sents, numproc):
    """Train a classifier to predict function tags in trees.

    Features come from ``functionfeatures(node, sent)`` and targets from
    ``functions(node)`` for every selected non-root node of every tree. An
    SGD classifier with hinge loss is tuned over a fixed list of ``alpha``
    values with ``GridSearchCV``.

    :param trees: sequence of trees; each must offer ``subtrees()``.
    :param sents: sentences, paired with ``trees`` via ``zip``.
    :param numproc: number of jobs for the one-vs-rest wrapper in the
        multi-label case; a falsy value is replaced by -1.
    :returns: a tuple ``((classifier, vectorizer, encoder, posfunc, multi),
        msg)`` where ``classifier`` is the fitted grid search and ``msg``
        is a human-readable summary of the results.

    NOTE(review): ``n_iter``, ``grid_scores_`` and
    ``jaccard_similarity_score`` appear to have been removed from later
    scikit-learn releases -- confirm against the version this project pins.
    NOTE(review): ``len(trees)`` is used as a divisor, so an empty ``trees``
    raises ZeroDivisionError.
    """
    # Imported lazily so that scikit-learn is only needed when training.
    from sklearn import linear_model, multiclass, pipeline
    from sklearn import preprocessing, feature_extraction
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import make_scorer, jaccard_similarity_score
    # Feature dicts -> sparse matrix -> scaled in place without centering
    # (with_mean=False).
    vectorizer = pipeline.Pipeline([
            ('vectorizer', feature_extraction.DictVectorizer(sparse=True)),
            ('scaler', preprocessing.StandardScaler(
                copy=False, with_mean=False))])
    # PTB has no function tags on preterminals, Negra/Tiger/Lassy do.
    # posfunc is True as soon as any node whose first child is an int
    # has a non-empty functions() result.
    posfunc = any(functions(node) for tree in trees
            for node in tree.subtrees()
            if node and isinstance(node[0], int))
    # One target per non-root, non-empty node; nodes whose first child is
    # not a Tree are only included when posfunc is set.
    target = [functions(node) for tree in trees
            for node in tree.subtrees()
            if tree is not node and node
                and (posfunc or isinstance(node[0], Tree))]
    # PTB may have multiple tags (or 0) per node.
    # Negra/Tiger/Lassy have exactly 1 tag for every node.
    multi = any(len(a) > 1 for a in target)
    if multi:
        encoder = preprocessing.MultiLabelBinarizer()
    else:
        encoder = preprocessing.LabelEncoder()
        # Single-label case: keep the first tag, '--' stands in for "no tag".
        target = [a[0] if a else '--' for a in target]
    # binarize features (output is a sparse array)
    # The node filter below must stay identical to the one used for
    # `target` so that features and targets line up row by row.
    trainfeats = vectorizer.fit_transform(functionfeatures(node, sent)
            for tree, sent in zip(trees, sents)
                for node in tree.subtrees()
                if tree is not node and node
                    and (posfunc or isinstance(node[0], Tree)))
    trainfuncs = encoder.fit_transform(target)
    # The iteration count is scaled inversely with the number of trees.
    classifier = linear_model.SGDClassifier(
            loss='hinge',
            penalty='elasticnet',
            n_iter=int(10 ** 6 / len(trees)))
    alphas = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6]
    if multi:
        classifier = multiclass.OneVsRestClassifier(
                classifier, n_jobs=numproc or -1)
        # The SGD classifier is nested inside the wrapper, hence the
        # `estimator__` prefix on the grid parameter.
        param_grid = dict(
                estimator__alpha=alphas)
    else:
        param_grid = dict(alpha=alphas)
    classifier = GridSearchCV(estimator=classifier, param_grid=param_grid,
            scoring=make_scorer(jaccard_similarity_score))
    # train classifier
    classifier.fit(trainfeats, trainfuncs)
    # NOTE(review): the message says "training set", while best_score_ is
    # the grid search's cross-validated score -- confirm intended wording.
    msg = ('trained classifier; grid search results:\n%s\n'
            'multi=%r, posfunc=%r; best score on training set: %g %%\n'
            'parameters: %r\nfunction tags: %s' % (
            '\n'.join(str(a) for a in classifier.grid_scores_),
            multi, posfunc, 100.0 * classifier.best_score_,
            classifier.best_estimator_,
            ' '.join(str(a) for a in encoder.classes_)))
    return (classifier, vectorizer, encoder, posfunc, multi), msg
def calcImportances(self):
    """Compute per-cluster feature importances into ``self.importances``.

    A 100-tree random forest is wrapped in a one-vs-rest classifier and
    fitted on ``self.df`` against ``self.cluster``; the
    ``feature_importances_`` of each fitted sub-estimator are collected
    into a list. When ``self.cluster`` is ``None`` a message is printed and
    nothing is computed.
    """
    if self.cluster is None:
        print(
            "Data must be clustered before importances can be calculated")
    else:
        forest = ensemble.RandomForestClassifier(n_estimators=100)
        one_vs_rest = multiclass.OneVsRestClassifier(forest)
        one_vs_rest.fit(self.df, self.cluster)
        # One importance vector per fitted binary sub-estimator.
        self.importances = [
            fitted.feature_importances_
            for fitted in one_vs_rest.estimators_
        ]
def train():
    """Validate, then fit on all data, a one-vs-rest XGBoost classifier.

    The data from ``load_train_data()`` is first used for a local
    validation run via ``validate``; afterwards the train and validation
    parts are concatenated and the classifier is fitted on the whole set.

    About the XGBoost parameters (author's notes): the label distribution
    is not uniform, so an individual classifier may reach outstanding
    accuracy and overfit its label. ``gamma`` is increased to penalise
    that. More information about the parameters:
    https://github.com/dmlc/xgboost/wiki/Parameters

    With the experimentation datasets these parameters scored ``0.2529``
    on local validation and ``0.2547`` on the leaderboard:

        - 'max_depth': 6
        - 'num_round': 512
        - 'gamma': 1.0
        - 'min_child_weight': 4
        - 'eta': 0.025
        - 'objective': 'binary:logistic'
        - 'eval_metric': 'logloss'
        - 'nthread': 4

    Returns:
        The one-vs-rest classifier fitted on the concatenated data.
    """
    X_train, X_valid, y_train, y_valid = load_train_data()

    model = "xgboost gbt"
    params = dict(
        max_depth=6,
        num_round=512,
        gamma=1.0,
        min_child_weight=4,
        eta=0.025,
        objective='binary:logistic',
        eval_metric='logloss',
        nthread=4,
    )

    # Multilabel: one binary XGBoost model per label.
    clf = multiclass.OneVsRestClassifier(XGBoostClassifier(**params), n_jobs=1)

    # Local validation.
    validate(clf, model, X_train, X_valid, y_train, y_valid)

    # Train on the whole set for submission.
    all_features = np.concatenate((X_train, X_valid))
    all_labels = np.concatenate((y_train, y_valid))
    start_message = " --- Start training {} Classifier on whole set.".format(model)
    print(start_message)
    clf.fit(all_features, all_labels)
    print(" --- Finished training on whole set.")
    print(" -- Finished training.")
    return clf
def train_model(features, labels):
    """Fit a one-vs-rest linear-kernel SVC on a split of the data and print scores.

    ``features``/``labels`` are split with ``train_test_split``; the model is
    fitted on the training part, and the accuracy and classification report
    for the held-out part are printed. Nothing is returned in the code
    visible here.

    NOTE(review): the ``print`` statement below is Python 2 syntax.
    NOTE(review): ``test_size=0.7`` presumably holds out 70% of the rows,
    leaving 30% for fitting -- confirm this is intended.
    """
    # Set up SVM model. Try linear kernel first.
    estimator = svm.SVC(kernel='linear')
    #estimator = linear_model.LogisticRegression()
    # Set up multi-class classifier
    multi_label_classifier = multiclass.OneVsRestClassifier(estimator)
    # multi_label_classifier = multiclass.OneVsOneClassifier(estimator)
    # Fixed random_state so the split is the same on every run.
    X_train, X_test, y_train, y_test = \
        cross_validation.train_test_split(features, labels, test_size=0.7, random_state=0)
    predictions = multi_label_classifier.fit(X_train, y_train).predict(X_test)
    score = metrics.accuracy_score(y_test, predictions)
    # NOTE(review): the message says "training data", but `score` is computed
    # from y_test and predictions on X_test, i.e. the held-out split.
    print 'Classification accuracy on training data: %f' % score
    print(metrics.classification_report(y_test, predictions))
    # NOTE(review): the triple quote below opens a string literal whose
    # closing delimiter is not visible in this excerpt.
    """
def get_predict_probs(self):
    """Score every known node with a one-vs-rest logistic regression.

    The classifier is trained on the rows of ``self.y`` belonging to the
    seed nodes (keys of ``self.seed_nodes_to_cluster``, in sorted order)
    with their assigned clusters as targets. It then produces class
    probabilities for the rows of every node in ``self.nodes_to_features``
    (also in sorted order).

    Returns:
        A list of ``(score, node_id, class_to_predict)`` tuples, one per
        node, where ``score`` is the highest probability in that node's
        row and ``class_to_predict`` is the position of that probability.
    """
    seed_ids = sorted(self.seed_nodes_to_cluster.keys())
    train_feature_vectors = [self.y[node_id, :] for node_id in seed_ids]
    train_classes = [self.seed_nodes_to_cluster[node_id]
                     for node_id in seed_ids]

    one_vs_rest = multiclass.OneVsRestClassifier(
        linear_model.LogisticRegression())
    one_vs_rest.fit(train_feature_vectors, train_classes)

    node_ids = sorted(self.nodes_to_features.keys())
    test_features = [self.y[node_id, :] for node_id in node_ids]
    predict_proba = one_vs_rest.predict_proba(test_features)

    # NOTE(review): argmax yields a column index into predict_proba; it only
    # equals the cluster label if the fitted classes are exactly 0..k-1 --
    # confirm, or map through one_vs_rest.classes_.
    return [
        (np.max(predict_proba[i]), node_id, np.argmax(predict_proba[i]))
        for i, node_id in enumerate(node_ids)
    ]