Beispiel #1
0
    def __init__(self, dataset, in_features=6714, method='lsvc_ovr'):
        """Initialise the solver and build the classifier named by ``method``.

        Args:
            dataset: Forwarded unchanged to the parent initialiser.
            in_features: Forwarded unchanged to the parent initialiser.
            method: Name of the classifier to use. One of ``'lsvc'``
                (``LinearSVC``), ``'lsvc_ovr'`` (``LinearSVC`` wrapped in a
                one-vs-rest classifier), ``'svc'`` (RBF-kernel ``SVC``) or
                ``'svc_ovr'`` (that ``SVC`` wrapped in one-vs-rest).

        Raises:
            KeyError: If ``method`` is not one of the supported names.
        """
        super(SVCSolver, self).__init__(dataset, in_features)

        self.method = method
        self.timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        self.model_name = f'{self.timestamp}_{self.method}'

        def make_lsvc():
            # Linear SVM solved in the primal (dual=False) with an L2 penalty.
            return svm.LinearSVC(verbose=0, dual=False, penalty='l2', C=0.6)

        def make_svc():
            # RBF-kernel SVM; verbose output is switched on.
            return svm.SVC(C=100,
                           kernel='rbf',
                           gamma='scale',
                           shrinking=True,
                           probability=False,
                           tol=0.001,
                           cache_size=200,
                           class_weight=None,
                           verbose=True,
                           max_iter=-1)

        def make_ovr(make_base):
            # n_jobs=-1 is passed through to the one-vs-rest wrapper.
            return multiclass.OneVsRestClassifier(make_base(), n_jobs=-1)

        # Factories rather than instances: only the requested estimator is
        # ever constructed.
        factories = {
            'lsvc': make_lsvc,
            'lsvc_ovr': lambda: make_ovr(make_lsvc),
            'svc': make_svc,
            'svc_ovr': lambda: make_ovr(make_svc),
        }

        if self.method not in factories:
            # Same exception type as a plain dict lookup, but self-explanatory.
            raise KeyError(
                f'Unknown method {self.method!r}; '
                f'expected one of {sorted(factories)}')

        self.clf = factories[self.method]()
Beispiel #2
0
def svm_classify(train_image_feats, train_labels, test_image_feats):
    """Predict test-image categories with a one-vs-rest linear SVM.

    A ``LinearSVC`` with ``C=20.0`` is wrapped in a ``OneVsRestClassifier``,
    fitted on the training data and applied to the test features.

    Parameters
    ----------
    train_image_feats:
        Training features; described by the original author as an N x d
        matrix, where d is the dimensionality of the feature representation.
    train_labels:
        Ground-truth labels for the N training images, passed to ``fit``
        as-is.
    test_image_feats:
        Test features; described by the original author as an M x d matrix.

    Returns
    -------
    The output of ``predict`` on ``test_image_feats``: one predicted label
    per test image.

    Reference: https://scikit-learn.org/stable/modules/svm.html
    """
    one_vs_rest = multiclass.OneVsRestClassifier(svm.LinearSVC(C=20.0))
    fitted = one_vs_rest.fit(train_image_feats, train_labels)
    return fitted.predict(test_image_feats)
Beispiel #3
0
def sklearn_multiclass_prediction(mode, X_train, y_train, X_test):
    """
    Fit a one-vs-rest SVC on the training data and predict its labels.

    Only ``mode == 'ovr'`` is implemented. In that mode the fitted model is
    also pickled to ``ovr_model.pkl`` in the current working directory.

    Arguments:
        mode: strategy name; only 'ovr' does any work, every other value
              (e.g. 'ovo', 'crammer') is a no-op.
        X_train: training features, used for both fitting and prediction.
        y_train: labels of the training data.
        X_test: test features; currently unused (test prediction is
                disabled).

    Returns:
        The predicted labels for ``X_train`` when ``mode == 'ovr'``,
        otherwise ``None``.
    """
    # Unsupported modes yield None, so bail out before building any model.
    if mode != 'ovr':
        return None

    ovr_model = multiclass.OneVsRestClassifier(SVC(verbose=1))
    ovr_model.fit(X_train, y_train)
    y_pred_train = ovr_model.predict(X_train)

    # Context manager guarantees the file handle is flushed and closed even
    # if pickling fails.
    with open('ovr_model.pkl', 'wb') as model_file:
        pickle.dump(ovr_model, model_file)

    return y_pred_train
    def count_vectors(self, features):
        """Evaluate every model in ``self.models`` on word-count features.

        A ``CountVectorizer`` limited to ``features`` terms is fitted on
        ``self.trainDF['cleaned_sentence']`` and used to transform the
        training, test and cross-validation inputs. Each model is wrapped in
        a ``OneVsRestClassifier``, fitted on the training matrix and handed
        to ``self.check_model`` once per split.

        Args:
            features: Value passed as ``max_features`` to the vectorizer and
                forwarded to ``check_model``.
        """
        vectorizer = CountVectorizer(analyzer='word',
                                     token_pattern=r'\w{1,}',
                                     max_df=1.0,
                                     max_features=features)
        vectorizer.fit(self.trainDF['cleaned_sentence'])

        train_matrix = vectorizer.transform(self.X_train)
        test_matrix = vectorizer.transform(self.X_test)
        cross_matrix = vectorizer.transform(self.X_cross)

        for name, estimator in self.models.items():
            one_vs_rest = multiclass.OneVsRestClassifier(estimator)
            fitted = one_vs_rest.fit(train_matrix, self.y_train)

            # Reported in this order: training, test, cross validation.
            splits = (
                (train_matrix, self.y_train, 'training'),
                (test_matrix, self.y_test, 'test'),
                (cross_matrix, self.y_cross, 'cross'),
            )
            for matrix, labels, split_name in splits:
                self.check_model(fitted, matrix, labels, name, features,
                                 'count_vectors', split_name)
    def tfidf_ngram(self, features):
        """Evaluate every model in ``self.models`` on word n-gram TF-IDF features.

        A ``TfidfVectorizer`` over word 2- to 5-grams, limited to ``features``
        terms, is fitted on ``self.trainDF['cleaned_sentence']`` and used to
        transform the training, test and cross-validation inputs. Each model
        is wrapped in a ``OneVsRestClassifier``, fitted on the training matrix
        and handed to ``self.check_model`` once per split.

        Args:
            features: Value passed as ``max_features`` to the vectorizer and
                forwarded to ``check_model``.
        """
        vectorizer = TfidfVectorizer(analyzer='word',
                                     token_pattern=r'\w{1,}',
                                     ngram_range=(2, 5),
                                     max_features=features)
        vectorizer.fit(self.trainDF['cleaned_sentence'])

        train_matrix = vectorizer.transform(self.X_train)
        test_matrix = vectorizer.transform(self.X_test)
        cross_matrix = vectorizer.transform(self.X_cross)

        for name, estimator in self.models.items():
            one_vs_rest = multiclass.OneVsRestClassifier(estimator)
            fitted = one_vs_rest.fit(train_matrix, self.y_train)

            # Reported in this order: training, test, cross validation.
            splits = (
                (train_matrix, self.y_train, 'training'),
                (test_matrix, self.y_test, 'test'),
                (cross_matrix, self.y_cross, 'cross'),
            )
            for matrix, labels, split_name in splits:
                self.check_model(fitted, matrix, labels, name, features,
                                 'tfidf_ngram', split_name)
Beispiel #6
0
def clf_default(y=None, **svm_params):
    """Build the default classifier: an ``SVC`` wrapped in one-vs-rest.

    When ``y`` is given and contains no ``-1`` entry (which the original
    author describes as "foreground data"), ``class_weight`` is forced to
    ``"balanced"``, overriding any value supplied by the caller.

    @param y: optional labels, only inspected for the presence of ``-1``
    @param svm_params: extra keyword arguments forwarded to ``svm.SVC``
    @return: an unfitted ``multiclass.OneVsRestClassifier``
    """
    foreground_only = y is not None and -1 not in y
    if foreground_only:
        svm_params['class_weight'] = "balanced"
    base_estimator = svm.SVC(**svm_params)
    return multiclass.OneVsRestClassifier(base_estimator)
def trainfunctionclassifier(trees, sents, numproc):
	"""Train a classifier that predicts function tags for tree nodes.

	Per-node feature dicts are vectorized and scaled into a sparse matrix,
	the tags are encoded, and an SGD classifier (hinge loss, elastic-net
	penalty) is tuned over ``alpha`` with a grid search.

	:param trees: sequence of trees; each non-root, non-empty node is a
		training instance (preterminals only if any of them carries a tag).
	:param sents: sentences, paired with ``trees`` through ``zip``.
	:param numproc: ``n_jobs`` for the one-vs-rest wrapper used in the
		multi-label case; a falsy value means ``-1``.
	:returns: ``((classifier, vectorizer, encoder, posfunc, multi), msg)``
		where ``classifier`` is the fitted grid search and ``msg`` is a
		human-readable summary of the search.

	NOTE(review): ``n_iter``, ``grid_scores_`` and
	``jaccard_similarity_score`` appear to be unavailable in newer
	scikit-learn releases -- confirm the pinned version.
	"""
	from sklearn import linear_model, multiclass, pipeline
	from sklearn import preprocessing, feature_extraction
	from sklearn.model_selection import GridSearchCV
	from sklearn.metrics import make_scorer, jaccard_similarity_score

	featurepipeline = pipeline.Pipeline([
			('vectorizer', feature_extraction.DictVectorizer(sparse=True)),
			('scaler', preprocessing.StandardScaler(
				copy=False, with_mean=False))])

	# PTB has no function tags on preterminals, Negra/Tiger/Lassy do.
	posfunc = any(functions(node)
			for tree in trees
			for node in tree.subtrees()
			if node and isinstance(node[0], int))

	def istraininginstance(tree, node):
		"""Select non-root, non-empty nodes; preterminals only if tagged."""
		return (tree is not node and node
				and (posfunc or isinstance(node[0], Tree)))

	target = [functions(node)
			for tree in trees
			for node in tree.subtrees()
			if istraininginstance(tree, node)]

	# PTB may have multiple tags (or 0) per node.
	# Negra/Tiger/Lassy have exactly 1 tag for every node.
	multi = any(len(tags) > 1 for tags in target)
	if not multi:
		encoder = preprocessing.LabelEncoder()
		# single-label case: '--' stands in for "no tag".
		target = [tags[0] if tags else '--' for tags in target]
	else:
		encoder = preprocessing.MultiLabelBinarizer()

	# binarize features (output is a sparse array)
	trainfeats = featurepipeline.fit_transform(
			functionfeatures(node, sent)
			for tree, sent in zip(trees, sents)
			for node in tree.subtrees()
			if istraininginstance(tree, node))
	trainfuncs = encoder.fit_transform(target)

	sgd = linear_model.SGDClassifier(
			loss='hinge',
			penalty='elasticnet',
			n_iter=int(10 ** 6 / len(trees)))
	alphas = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6]
	if multi:
		# the wrapped estimator's parameters get an 'estimator__' prefix.
		estimator = multiclass.OneVsRestClassifier(
				sgd, n_jobs=numproc or -1)
		param_grid = {'estimator__alpha': alphas}
	else:
		estimator = sgd
		param_grid = {'alpha': alphas}
	search = GridSearchCV(estimator=estimator, param_grid=param_grid,
			scoring=make_scorer(jaccard_similarity_score))

	# train classifier
	search.fit(trainfeats, trainfuncs)

	gridreport = '\n'.join(str(a) for a in search.grid_scores_)
	tagnames = ' '.join(str(a) for a in encoder.classes_)
	msg = ('trained classifier; grid search results:\n%s\n'
			'multi=%r, posfunc=%r; best score on training set: %g %%\n'
			'parameters: %r\nfunction tags: %s' % (
			gridreport,
			multi, posfunc, 100.0 * search.best_score_,
			search.best_estimator_,
			tagnames))
	return (search, featurepipeline, encoder, posfunc, multi), msg
Beispiel #8
0
 def calcImportances(self):
     """Store per-class random-forest feature importances in ``self.importances``.

     A 100-tree ``RandomForestClassifier`` is wrapped in a
     ``OneVsRestClassifier`` and fitted on ``self.df`` against
     ``self.cluster``; the ``feature_importances_`` of each fitted
     sub-estimator are collected into a list.

     If ``self.cluster`` is ``None`` a message is printed and nothing else
     happens.
     """
     if self.cluster is None:
         print(
             "Data must be clustered before importances can be calculated")
         return

     forest = ensemble.RandomForestClassifier(n_estimators=100)
     one_vs_rest = multiclass.OneVsRestClassifier(forest)
     one_vs_rest.fit(self.df, self.cluster)

     # One importance vector per fitted one-vs-rest sub-estimator.
     per_estimator = []
     for fitted in one_vs_rest.estimators_:
         per_estimator.append(fitted.feature_importances_)
     self.importances = per_estimator
Beispiel #9
0
def train():
    """Validate a one-vs-rest XGBoost classifier, then fit it on all data.

    The data come from ``load_train_data()``. The classifier is first passed
    to ``validate`` with the train/validation split, then fitted on the
    concatenation of both parts and returned.

    Notes on the XGBoost parameters (from the original author):
    the label distribution is not uniform, so an individual classifier may
    reach outstanding accuracy that leads to overfitting; ``gamma`` is
    increased to penalise that.

    More information about xgboost parameters:
    https://github.com/dmlc/xgboost/wiki/Parameters

    The author reports that these parameters gave a score of `0.2529` on
    local validation and `0.2547` on the leaderboard, using experimentation
    datasets:
      - 'max_depth': 6
      - 'num_round': 512
      - 'gamma': 1.0
      - 'min_child_weight': 4
      - 'eta': 0.025
      - 'objective': 'binary:logistic'
      - 'eval_metric': 'logloss'
      - 'nthread': 4

    Returns:
        The ``OneVsRestClassifier`` fitted on the whole set.
    """
    X_train, X_valid, y_train, y_valid = load_train_data()

    model = "xgboost gbt"
    booster = XGBoostClassifier(
        max_depth=6,
        num_round=512,
        gamma=1.0,
        min_child_weight=4,
        eta=0.025,
        objective='binary:logistic',
        eval_metric='logloss',
        nthread=4,
    )

    # Multilabel: one binary booster per label.
    clf = multiclass.OneVsRestClassifier(booster, n_jobs=1)

    # Local validation on the held-out part.
    validate(clf, model, X_train, X_valid, y_train, y_valid)

    # Refit on train + validation for the submission.
    all_X = np.concatenate((X_train, X_valid))
    all_y = np.concatenate((y_train, y_valid))

    print(" --- Start training {} Classifier on whole set.".format(model))
    clf.fit(all_X, all_y)
    print(" --- Finished training on whole set.")

    print(" -- Finished training.")
    return clf
Beispiel #10
0
def train_model(features, labels):
    """Fit a one-vs-rest linear-kernel SVM and report held-out performance.

    The data are split with ``random_state=0``; the classifier is fitted on
    the training part and scored on the held-out part. The accuracy and a
    classification report are printed.

    Args:
        features: Feature matrix passed to ``train_test_split``.
        labels: Labels passed to ``train_test_split``.
    """
    # Set up SVM model with a linear kernel, wrapped for multi-class use.
    estimator = svm.SVC(kernel='linear')
    multi_label_classifier = multiclass.OneVsRestClassifier(estimator)

    # NOTE(review): test_size=0.7 holds out 70% of the data and trains on
    # only 30% -- confirm this is intended and not an inverted ratio.
    # NOTE(review): `cross_validation` is the pre-0.20 scikit-learn module
    # name (later `model_selection`) -- confirm against the file's imports.
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        features, labels, test_size=0.7, random_state=0)

    predictions = multi_label_classifier.fit(X_train, y_train).predict(X_test)
    score = metrics.accuracy_score(y_test, predictions)
    # The score is computed on the held-out split, so label it as such.
    # print() with one parenthesised argument behaves the same on Python 2
    # and Python 3, unlike the print statement.
    print('Classification accuracy on test data: %f' % score)

    print(metrics.classification_report(y_test, predictions))
    """
 def get_predict_probs(self):
     """Score every node with a one-vs-rest logistic regression.

     The model is fitted on the rows of ``self.y`` belonging to the seed
     nodes (labels taken from ``self.seed_nodes_to_cluster``) and then
     applied to the rows of every node in ``self.nodes_to_features``. Node
     ids are processed in sorted order in both steps.

     Returns:
         A list of ``(score, node_id, class_to_predict)`` tuples, one per
         node, where ``score`` is the highest predicted probability and
         ``class_to_predict`` is the position of that probability in the
         node's probability row.

     NOTE(review): ``class_to_predict`` is an argmax column index, which
     presumably equals the cluster label only when labels are 0..k-1 --
     verify against the caller.
     """
     seed_ids = sorted(self.seed_nodes_to_cluster)
     seed_vectors = [self.y[node_id, :] for node_id in seed_ids]
     seed_labels = [self.seed_nodes_to_cluster[node_id]
                    for node_id in seed_ids]

     model = multiclass.OneVsRestClassifier(
         linear_model.LogisticRegression())
     model.fit(seed_vectors, seed_labels)

     node_ids = sorted(self.nodes_to_features)
     probabilities = model.predict_proba(
         [self.y[node_id, :] for node_id in node_ids])

     return [(np.max(row), node_id, np.argmax(row))
             for node_id, row in zip(node_ids, probabilities)]