Example #1
def __init__(self, data, regularization_term=1.0, gamma=0.0):
    features, weights, labels = data
    self.gamma = gamma
    self.c = regularization_term
    # gamma=0.0 meant "auto" (1 / n_features) in older scikit-learn;
    # recent releases require 'scale', 'auto', or a positive float
    self.clf = svm.SVC(C=self.c, gamma=self.gamma)
    self.predictions, self.trnaccuracy, self.tstaccuracy = None, None, None
    self.dataset = split_dataset(features, weights, labels)
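Every example on this page leans on a split_dataset helper that the page itself never shows. Below is a minimal sketch of what it plausibly does, assuming it wraps scikit-learn's train_test_split; the 'training'/'test' dict layout and the 'features'/'weights'/'labels' keys are taken from how the later examples index the result. (Example #2 instead unpacks a flat six-tuple, so the helper's return type evidently varied between versions of this codebase.)

from sklearn.model_selection import train_test_split

def split_dataset(features, weights, labels, test_size=0.3, random_state=0):
    # hypothetical reconstruction, not the original implementation
    (trn_f, tst_f,
     trn_w, tst_w,
     trn_l, tst_l) = train_test_split(features, weights, labels,
                                      test_size=test_size,
                                      random_state=random_state)
    return {
        'training': {'features': trn_f, 'weights': trn_w, 'labels': trn_l},
        'test': {'features': tst_f, 'weights': tst_w, 'labels': tst_l},
    }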
Example #2
def grid_search_best_parameter(data):
    features, weights, labels = data
    labels = np.array([1 if l == 'b' else 0 for l in labels])
    trnfeatures, tstfeatures, trnweights, tstweights, trnlabels, tstlabels = split_dataset(features, weights, labels)
    # Set the parameters by cross-validation
    tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}]

    scores = ['precision', 'recall']
    reports = {}
    for score in scores:
        LOGGER.info("# Tuning hyper-parameters for %s" % score)
        clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5, scoring=score)
        clf.fit(trnfeatures, trnlabels)

        LOGGER.info("Best parameters set found on development set:")
        LOGGER.info(clf.best_estimator_)
        LOGGER.info("Grid scores on development set:")
        # cv_results_ supersedes grid_scores_, which was removed in scikit-learn 0.20
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            LOGGER.info("%0.3f (+/-%0.03f) for %r" % (mean, std / 2, params))

        LOGGER.info("Detailed classification report:")
        LOGGER.info("The model is trained on the full development set.")
        LOGGER.info("The scores are computed on the full evaluation set.")
        y_true, y_pred = tstlabels, clf.predict(tstfeatures)
        reports[score] = classification_report(y_true, y_pred)
    return reports
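A quick usage sketch: grid_search_best_parameter only assumes data is a (features, weights, labels) tuple, as unpacked above (load_data is a hypothetical loader, not part of the source):

features, weights, labels = load_data()  # hypothetical loader
reports = grid_search_best_parameter((features, weights, labels))
for metric, report in reports.items():
    print("== tuned for %s ==" % metric)
    print(report)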
Example #3
    def _prepare_data(self):
        self.dataset = split_dataset(self.features, self.weights, self.labels)
        classes = set(self.labels)

        def training_set():
            ds = ClassificationDataSet(
                self.dataset['training']['features'].shape[1],
                1,
                nb_classes=len(classes))
            for i in range(self.dataset['training']['features'].shape[0]):
                ds.addSample(self.dataset['training']['features'][i],
                             self.dataset['training']['labels'][i])
            return ds

        def test_set():
            # use the test split's feature matrix (not the full one) for the input dimension
            ds = ClassificationDataSet(
                self.dataset['test']['features'].shape[1],
                1,
                nb_classes=len(classes))
            for i in range(self.dataset['test']['features'].shape[0]):
                ds.addSample(self.dataset['test']['features'][i],
                             self.dataset['test']['labels'][i])
            return ds

        self.trndata = training_set()
        self.tstdata = test_set()
        self.tstdata._convertToOneOfMany()
        self.trndata._convertToOneOfMany()
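_convertToOneOfMany turns the single integer class column into a one-hot target, which PyBrain's softmax output layers expect. Once trndata and tstdata are built, training would typically proceed along these lines; a sketch using the standard PyBrain shortcuts, with an arbitrary hidden-layer size and epoch count:

from pybrain.tools.shortcuts import buildNetwork
from pybrain.structure import SoftmaxLayer
from pybrain.supervised.trainers import BackpropTrainer

# one input per feature column, one output unit per class
net = buildNetwork(trndata.indim, 16, trndata.outdim, outclass=SoftmaxLayer)
trainer = BackpropTrainer(net, dataset=trndata)
for _ in range(10):   # arbitrary epoch count
    trainer.train()   # one pass over the training set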
Example #4
def grid_search_tradeoff_estimators_learning_rate(raw_data):
    features, weights, labels = raw_data
    dataset = split_dataset(features, weights, labels)
    dataset['training']['labels'] = [1 if l == 's' else 0 for l in dataset['training']['labels']]
    dataset['test']['labels'] = [1 if l == 's' else 0 for l in dataset['test']['labels']]
    tuning_parameters = {'n_estimators': np.arange(50, 100, 5),
                         'learning_rate': np.arange(0.2, 2.2, .2)}
    scores = ['precision', 'recall']
    report = grid_search_best_parameter(dataset, AdaBoostClassifier, tuning_parameters, scores=scores)
    return report
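The trade-off being swept here is the usual boosting one: a smaller learning_rate shrinks each stage's contribution, so more estimators are generally needed to reach the same fit. For reference, fitting a single point from this grid directly would look like this (reusing the dataset dict built above):

from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=75, learning_rate=0.4)
clf.fit(dataset['training']['features'], dataset['training']['labels'])
print(clf.score(dataset['test']['features'], dataset['test']['labels']))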
Example #7
def estimate_components(data, iterations=10):
    features, weights, labels = data
    n_components = features.shape[1]
    baseline_estimator = LinearSVC()
    dataset = split_dataset(features, weights, labels)
    train_features = dataset['training']['features']
    test_features = dataset['test']['features']
    start = time()
    baseline_estimator.fit(train_features, dataset['training']['labels'])
    elapsed = time() - start
    baseline_accuracy = baseline_estimator.score(
        test_features,
        dataset['test']['labels'],
        sample_weight=dataset['test']['weights'])
    scores = []
    baseline_record = [[n, 0, baseline_accuracy, elapsed]
                       for n in range(1, n_components)]
    scores.extend(baseline_record)
    for component in range(1, n_components):
        estimator = random_projection.SparseRandomProjection(
            n_components=component)
        start = time()
        # start is set once per component, so the recorded times
        # accumulate across the inner iterations
        for iteration in range(1, iterations):
            transformed_train_features = estimator.fit_transform(
                train_features)
            transformed_test_features = estimator.transform(test_features)
            baseline_estimator.fit(transformed_train_features,
                                   dataset['training']['labels'])
            accuracy = baseline_estimator.score(
                transformed_test_features,
                dataset['test']['labels'],
                sample_weight=dataset['test']['weights'])
            scores.append([component, iteration, accuracy, time() - start])
    df = pd.DataFrame.from_records(
        scores,
        columns=['components', 'iteration', 'classification_accuracy', 'time'])
    return df
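The returned frame makes it easy to see where the random projection stops costing accuracy relative to the untransformed baseline. A plotting sketch, assuming matplotlib is available (the column names and the iteration == 0 convention for baseline rows come from the code above):

import matplotlib.pyplot as plt

df = estimate_components((features, weights, labels))
proj = df[df['iteration'] > 0].groupby('components')['classification_accuracy'].mean()
base = df[df['iteration'] == 0].groupby('components')['classification_accuracy'].mean()
ax = proj.plot(label='sparse random projection')
base.plot(ax=ax, label='baseline LinearSVC')
ax.set_xlabel('n_components')
ax.set_ylabel('mean classification accuracy')
ax.legend()
plt.show()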
Example #8
def __init__(self, data, n_estimators=50, learning_rate=1.0):
    features, weights, labels = data
    self.clf = AdaBoostClassifier(n_estimators=n_estimators,
                                  learning_rate=learning_rate)
    self.predictions, self.trnaccuracy, self.tstaccuracy = None, None, None
    self.dataset = split_dataset(features, weights, labels)
Example #9
def __init__(self, data, n_neighbours=3, power_parameter=2):
    features, weights, labels = data
    # p=2 selects the Euclidean metric in KNeighborsClassifier
    self.clf = KNeighborsClassifier(n_neighbors=n_neighbours, p=power_parameter)
    self.predictions, self.trnaccuracy, self.tstaccuracy = None, None, None
    self.dataset = split_dataset(features, weights, labels)
Example #10
def __init__(self, data, criterion='gini', min_samples_split=60):
    features, weights, labels = data
    self.clf = tree.DecisionTreeClassifier(criterion=criterion,
                                           min_samples_split=min_samples_split)
    self.predictions, self.trnaccuracy, self.tstaccuracy = None, None, None
    self.dataset = split_dataset(features, weights, labels)
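Examples #1 and #8 to #10 are __init__ methods of thin wrapper classes that all leave predictions, trnaccuracy, and tstaccuracy as None, so presumably a companion method fills them in after fitting. A hedged sketch of what that method might look like (the name train and the use of score's sample_weight argument are assumptions, not shown in the source):

def train(self):
    # hypothetical companion method: fit, then fill the fields
    # that __init__ initialised to None
    trn, tst = self.dataset['training'], self.dataset['test']
    self.clf.fit(trn['features'], trn['labels'])
    self.predictions = self.clf.predict(tst['features'])
    self.trnaccuracy = self.clf.score(trn['features'], trn['labels'],
                                      sample_weight=trn['weights'])
    self.tstaccuracy = self.clf.score(tst['features'], tst['labels'],
                                      sample_weight=tst['weights'])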