def __init__(self, data, regularization_term=1.0, gamma=0.0):
    """Build an SVC-backed classifier and split *data* for training.

    Args:
        data: tuple of (features, weights, labels).
        regularization_term: penalty parameter ``C`` of the SVC.
        gamma: kernel coefficient forwarded unchanged to ``svm.SVC``.
    """
    features, weights, labels = data
    self.gamma = gamma
    self.c = regularization_term
    self.clf = svm.SVC(C=self.c, gamma=self.gamma)
    # Populated later by the train/predict steps.
    self.predictions = None
    self.trnaccuracy = None
    self.tstaccuracy = None
    self.dataset = split_dataset(features, weights, labels)
def grid_search_best_parameter(data):
    """Cross-validated grid search over SVC kernel/gamma/C per scoring metric.

    Args:
        data: tuple of (features, weights, labels); labels are raw
            's'/'b' strings and are binarized here ('b' -> 1, else 0).

    Returns:
        dict mapping each scoring name ('precision', 'recall') to the
        classification report computed on the held-out test split.
    """
    features, weights, labels = data
    labels = np.array([1 if l == 'b' else 0 for l in labels])
    (trnfeatures, tstfeatures,
     trnweights, tstweights,
     trnlabels, tstlabels) = split_dataset(features, weights, labels)
    # Set the parameters by cross-validation
    tuned_parameters = [{'kernel': ['rbf'],
                         'gamma': [1e-3, 1e-4],
                         'C': [1, 10, 100, 1000]}]
    scores = ['precision', 'recall']
    reports = {}
    for score in scores:
        LOGGER.info("# Tuning hyper-parameters for %s" % score)
        clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5,
                           scoring=score)
        clf.fit(trnfeatures, trnlabels)
        LOGGER.info("Best parameters set found on development set:")
        LOGGER.info(clf.best_estimator_)
        LOGGER.info("Grid scores on development set:")
        # BUGFIX: the loop variable was named ``scores``, shadowing the
        # outer list of scoring metrics; renamed to ``cv_scores``.
        # NOTE(review): ``grid_scores_`` was removed in scikit-learn 0.20
        # — migrate to ``cv_results_`` if the project upgrades.
        for params, mean_score, cv_scores in clf.grid_scores_:
            LOGGER.info("%0.3f (+/-%0.03f) for %r"
                        % (mean_score, cv_scores.std() / 2, params))
        LOGGER.info("Detailed classification report:")
        LOGGER.info("The model is trained on the full development set.")
        LOGGER.info("The scores are computed on the full evaluation set.")
        y_true, y_pred = tstlabels, clf.predict(tstfeatures)
        reports[score] = classification_report(y_true, y_pred)
    return reports
def _prepare_data(self):
    """Split the raw data and build PyBrain classification datasets.

    Populates ``self.dataset`` (train/test split), ``self.trndata`` and
    ``self.tstdata``, converting labels to one-of-many encoding.
    """
    self.dataset = split_dataset(self.features, self.weights, self.labels)
    classes = set(self.labels)

    def _build(split):
        # One ClassificationDataSet per split; input width comes from
        # the split's own feature matrix.
        feats = self.dataset[split]['features']
        labs = self.dataset[split]['labels']
        ds = ClassificationDataSet(feats.shape[1], 1,
                                   nb_classes=len(classes))
        for i in range(feats.shape[0]):
            ds.addSample(feats[i], labs[i])
        return ds

    self.trndata = _build('training')
    # BUGFIX: the test dataset previously sized its input layer from
    # ``self.features`` instead of the test split's features; both
    # splits now size themselves consistently (same column count, but
    # no longer dependent on the unsplit matrix).
    self.tstdata = _build('test')
    self.tstdata._convertToOneOfMany()
    self.trndata._convertToOneOfMany()
def grid_search_tradeoff_estimators_learning_rate(raw_data):
    """Grid-search the AdaBoost n_estimators / learning_rate trade-off.

    Args:
        raw_data: tuple of (features, weights, labels).

    Returns:
        The report produced by ``grid_search_best_parameter``.
    """
    features, weights, labels = raw_data
    dataset = split_dataset(features, weights, labels)
    # Binarize labels in both splits: signal ('s') -> 1, background -> 0.
    for split in ('training', 'test'):
        dataset[split]['labels'] = [
            1 if label == 's' else 0 for label in dataset[split]['labels']
        ]
    tuning_parameters = {
        'n_estimators': np.arange(50, 100, 5),
        'learning_rate': np.arange(0.2, 2.2, .2),
    }
    return grid_search_best_parameter(dataset, AdaBoostClassifier,
                                      tuning_parameters,
                                      scores=['precision', 'recall'])
def grid_search_tradeoff_estimators_learning_rate(raw_data):
    """Explore AdaBoost n_estimators vs learning_rate via grid search.

    Args:
        raw_data: tuple of (features, weights, labels).

    Returns:
        Report dict from ``grid_search_best_parameter``.
    """
    features, weights, labels = raw_data
    dataset = split_dataset(features, weights, labels)
    # Recode string labels to binary: 's' (signal) -> 1, anything else -> 0.
    dataset['training']['labels'] = [
        int(label == 's') for label in dataset['training']['labels']
    ]
    dataset['test']['labels'] = [
        int(label == 's') for label in dataset['test']['labels']
    ]
    param_grid = {
        'n_estimators': np.arange(50, 100, 5),
        'learning_rate': np.arange(0.2, 2.2, .2),
    }
    metric_names = ['precision', 'recall']
    report = grid_search_best_parameter(dataset, AdaBoostClassifier,
                                        param_grid, scores=metric_names)
    return report
def _prepare_data(self):
    """Split the raw data and build PyBrain train/test datasets.

    Populates ``self.dataset``, ``self.trndata`` and ``self.tstdata``,
    with labels converted to one-of-many encoding.
    """
    self.dataset = split_dataset(self.features, self.weights, self.labels)
    classes = set(self.labels)

    def _make_dataset(split):
        # Build one ClassificationDataSet from the given split, sizing
        # the input layer from that split's own feature matrix.
        feats = self.dataset[split]['features']
        labs = self.dataset[split]['labels']
        ds = ClassificationDataSet(feats.shape[1], 1,
                                   nb_classes=len(classes))
        for row in range(feats.shape[0]):
            ds.addSample(feats[row], labs[row])
        return ds

    self.trndata = _make_dataset('training')
    # BUGFIX: the test dataset used ``self.features.shape[1]`` while the
    # training dataset used its split's features — now consistent.
    self.tstdata = _make_dataset('test')
    self.tstdata._convertToOneOfMany()
    self.trndata._convertToOneOfMany()
def estimate_components(data, iterations=10):
    """Score a LinearSVC on sparse random projections of the features,
    for every target dimensionality from 1 to n_features - 1.

    Args:
        data: tuple of (features, weights, labels).
        iterations: repetitions per component count. NOTE: the loop runs
            ``iterations - 1`` times (``range(1, iterations)``) —
            preserved from the original implementation.

    Returns:
        pandas DataFrame with columns ['components', 'iteration',
        'classification_accuracy', 'time']; iteration-0 rows carry the
        un-projected baseline accuracy.
    """
    features, weights, labels = data
    n_components = features.shape[1]
    baseline_estimator = LinearSVC()
    dataset = split_dataset(features, weights, labels)
    train_features = dataset['training']['features']
    test_features = dataset['test']['features']

    start = time()
    baseline_estimator.fit(train_features, dataset['training']['labels'])
    elapsed = time() - start
    baseline_accuracy = baseline_estimator.score(
        test_features,
        dataset['test']['labels'],
        sample_weight=dataset['test']['weights'])

    # Seed results with one baseline row per component count.
    # (Was a side-effect list comprehension; a plain comprehension into
    # the list is the idiom.)
    scores = [[n, 0, baseline_accuracy, elapsed]
              for n in range(1, n_components)]

    for component in range(1, n_components):
        estimator = random_projection.SparseRandomProjection()
        estimator.n_components = component
        start = time()
        # NOTE: ``start`` is not reset inside the loop, so recorded times
        # are cumulative across repetitions — preserved from the original.
        # Loop variable renamed from ``iter`` (shadowed the builtin).
        for rep in range(1, iterations):
            transformed_train_features = estimator.fit_transform(
                train_features)
            transformed_test_features = estimator.transform(test_features)
            baseline_estimator.fit(transformed_train_features,
                                   dataset['training']['labels'])
            accuracy = baseline_estimator.score(
                transformed_test_features,
                dataset['test']['labels'],
                sample_weight=dataset['test']['weights'])
            scores.append([component, rep, accuracy, time() - start])

    df = pd.DataFrame.from_records(
        scores,
        columns=['components', 'iteration',
                 'classification_accuracy', 'time'])
    return df
def __init__(self, data, n_estimators=50, learning_rate=1.0):
    """Build an AdaBoost classifier and split *data* for training.

    Args:
        data: tuple of (features, weights, labels).
        n_estimators: maximum number of boosting stages.
        learning_rate: weight applied to each classifier per stage.
    """
    features, weights, labels = data
    self.clf = AdaBoostClassifier(n_estimators=n_estimators,
                                  learning_rate=learning_rate)
    # Populated later by the train/predict steps.
    self.predictions = None
    self.trnaccuracy = None
    self.tstaccuracy = None
    self.dataset = split_dataset(features, weights, labels)
def __init__(self, data, n_neighbours=3, power_parameter=2):
    """Build a k-nearest-neighbours classifier and split *data*.

    Args:
        data: tuple of (features, weights, labels).
        n_neighbours: number of neighbours to use.
        power_parameter: Minkowski metric power ``p`` (2 = Euclidean).
    """
    features, weights, labels = data
    self.clf = KNeighborsClassifier(n_neighbors=n_neighbours,
                                    p=power_parameter)
    # Populated later by the train/predict steps.
    self.predictions = None
    self.trnaccuracy = None
    self.tstaccuracy = None
    self.dataset = split_dataset(features, weights, labels)
def __init__(self, data, criterion='gini', min_samples_split=60):
    """Build a decision-tree classifier and split *data* for training.

    Args:
        data: tuple of (features, weights, labels).
        criterion: split-quality measure passed to the tree.
        min_samples_split: minimum samples required to split a node.
    """
    features, weights, labels = data
    self.clf = tree.DecisionTreeClassifier(
        criterion=criterion,
        min_samples_split=min_samples_split)
    # Populated later by the train/predict steps.
    self.predictions = None
    self.trnaccuracy = None
    self.tstaccuracy = None
    self.dataset = split_dataset(features, weights, labels)