Example #1
def test(X, y, coef, intercept):
    clf = SGDClassifier(fit_intercept=True)
    clf.coef_ = coef.todense()
    clf.intercept_ = intercept.todense()
    clf.classes_ = np.unique(y)

    return clf.score(X, y)
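
The same pattern inline, with plain ndarrays (a sketch with synthetic data; the .todense() calls above suggest that in the original the stored weights arrive as scipy.sparse matrices):

import numpy as np
from sklearn.linear_model import SGDClassifier

X = np.random.RandomState(0).rand(20, 3)
y = np.array([0, 1] * 10)

trained = SGDClassifier(fit_intercept=True).fit(X, y)

# rebuild a scoring-only classifier from the stored weights, no fit() needed
clf = SGDClassifier(fit_intercept=True)
clf.coef_ = trained.coef_.copy()
clf.intercept_ = trained.intercept_.copy()
clf.classes_ = np.unique(y)
print(clf.score(X, y))  # same accuracy as trained.score(X, y)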
Example #2
def _generate_pfa_classifier(result, indep_vars, featurizer, classes):
    # Create a mock SGDClassifier for sklearn_to_pfa
    estimator = SGDClassifier()
    estimator.classes_ = classes
    estimator.intercept_ = [result[cat]['intercept'] for cat in classes]
    # NOTE: linearly dependent columns will be assigned 0
    estimator.coef_ = [[
        result[cat].get(c, {'coef': 0.})['coef'] for c in featurizer.columns
        if c != 'intercept'
    ] for cat in classes]

    types = [(var['name'], var['type']['name']) for var in indep_vars]
    return sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())
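
A toy illustration of the coef_ layout built above (the result and column names are hypothetical): one row per class, one column per non-intercept featurizer column, with missing (linearly dependent) columns filled with 0.

result = {
    'a': {'intercept': 0.1, 'x1': {'coef': 2.0}},  # 'x2' dropped -> 0
    'b': {'intercept': -0.2, 'x1': {'coef': 1.0}, 'x2': {'coef': 3.0}},
}
columns = ['intercept', 'x1', 'x2']
coef = [[result[cat].get(c, {'coef': 0.})['coef']
         for c in columns if c != 'intercept'] for cat in ['a', 'b']]
# coef == [[2.0, 0.0], [1.0, 3.0]]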
Example #3
def client_update(init_weights, epochs, batch_size, features, labels,
                  all_classes, rand_seed):
    """
    Given the previous weights from the server, trains the model on this
    client's data and returns the new set of weights

    init_weights: weights to initialize the training with
        ex: [weights of size num_classes*num_features, intercepts of size num_classes]
    epochs: number of epochs to run the training for
    batch_size: the size of each batch of data while training
    features: a 2D array containing features for each sample
        ex: [[feature1, feature2], [feature1, feature2], ...]
    labels: an array containing the labels for the corresponding sample in "features"
        ex: [label1, label2, ...]
    all_classes: an array containing the unique labels across the entire dataset (`labels` may not contain all of these)
    rand_seed: a seed to use with any random number generation in order to get consistent results between runs
    """

    # split the data into batches by given batch_size
    # TODO: need to ensure that a batch doesn't just contain 1 label
    batches_features = []
    batches_labels = []

    for i in range(0, len(features), batch_size):
        batches_features.append(features[i:i + batch_size])
        batches_labels.append(labels[i:i + batch_size])

    coef = list(init_weights[0])
    intercept = list(init_weights[1])

    classifier = SGDClassifier(loss="log", random_state=rand_seed)
    classifier.coef_ = np.array(coef)
    classifier.intercept_ = np.array(intercept)

    for epoch in range(epochs):
        for i in range(len(batches_features)):
            classifier.partial_fit(
                batches_features[i],
                batches_labels[i],
                # full set of classes across the dataset; a single batch's
                # labels may not cover every class
                classes=all_classes,
            )

            # capture the latest weights to return after the final batch
            coef = classifier.coef_
            intercept = classifier.intercept_

    weights = [coef, intercept]

    return weights
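
A minimal way to exercise client_update (synthetic three-class data so the coef matrix has shape (3, 2); all values illustrative, and an sklearn version that still accepts loss="log" is assumed):

import numpy as np

features = np.random.RandomState(0).rand(12, 2)
labels = np.array([0, 1, 2] * 4)

init_weights = [np.zeros((3, 2)), np.zeros(3)]
coef, intercept = client_update(init_weights, epochs=5, batch_size=4,
                                features=features, labels=labels,
                                all_classes=np.unique(labels), rand_seed=0)

Note that partial_fit keeps pre-assigned coef_/intercept_ rather than reallocating them, which appears to be what lets each client resume from the server's weights.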
Example #4
    def train_clf(self, X, idxss, rs, tries=0):
        clf = SGDClassifier(loss=self.loss, penalty='l1', n_iter=self.n_epochs + tries,
                alpha=self.alpha, fit_intercept=self.bias, class_weight='balanced',
                random_state=rs)

        X_train, y_train = self.build_XY(X, idxss)

        clf.fit(X_train, y_train)

        # Halves the memory requirement
        clf.coef_ = sparsify(clf.coef_)
        if self.bias:
            clf.intercept_ = clf.intercept_.astype('float32')

        return clf, CLF(clf.coef_, clf.intercept_)
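
The sparsify helper is not shown in this snippet; a plausible stand-in (purely an assumption, consistent with the memory comment) converts the L1-sparse coefficient matrix to float32 CSR:

import scipy.sparse as sp

def sparsify(coef):
    # L1-penalized coefficients are mostly zero, so CSR float32
    # storage is far smaller than the dense float64 original
    return sp.csr_matrix(coef.astype('float32'))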
Example #5
    def from_path(cls, path, **shared):
        """Loads a :class:`LogRegIntentClassifier` instance from a path

        The data at the given path must have been generated using
        :func:`~LogRegIntentClassifier.persist`
        """
        import numpy as np
        from sklearn.linear_model import SGDClassifier

        path = Path(path)
        model_path = path / "intent_classifier.json"
        if not model_path.exists():
            raise LoadingError("Missing intent classifier model file: %s"
                               % model_path.name)

        with model_path.open(encoding="utf8") as f:
            model_dict = json.load(f)

        # Create the classifier
        config = LogRegIntentClassifierConfig.from_dict(model_dict["config"])
        intent_classifier = cls(config=config, **shared)
        intent_classifier.intent_list = model_dict['intent_list']

        # Create the underlying SGD classifier
        sgd_classifier = None
        coeffs = model_dict['coeffs']
        intercept = model_dict['intercept']
        t_ = model_dict["t_"]
        if coeffs is not None and intercept is not None:
            sgd_classifier = SGDClassifier(**LOG_REG_ARGS)
            sgd_classifier.coef_ = np.array(coeffs)
            sgd_classifier.intercept_ = np.array(intercept)
            sgd_classifier.t_ = t_
        intent_classifier.classifier = sgd_classifier

        # Add the featurizer
        featurizer = model_dict['featurizer']
        if featurizer is not None:
            featurizer_path = path / featurizer
            intent_classifier.featurizer = Featurizer.from_path(
                featurizer_path, **shared)

        return intent_classifier
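
An illustrative (not authoritative) shape for the loaded intent_classifier.json, inferred from the keys read above; all values are dummies:

model_dict = {
    "config": {...},                       # LogRegIntentClassifierConfig as a dict
    "intent_list": ["intent_a", "intent_b", None],
    "coeffs": [[0.1, 0.2], [0.3, 0.4]],    # -> sgd_classifier.coef_
    "intercept": [0.5, -0.5],              # -> sgd_classifier.intercept_
    "t_": 701,
    "featurizer": "featurizer",            # subdirectory name, or None
}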
Example #6
    def from_dict(cls, unit_dict):
        """Creates a :class:`LogRegIntentClassifier` instance from a dict

        The dict must have been generated with
        :func:`~LogRegIntentClassifier.to_dict`
        """
        config = LogRegIntentClassifierConfig.from_dict(unit_dict["config"])
        intent_classifier = cls(config=config)
        sgd_classifier = None
        coeffs = unit_dict['coeffs']
        intercept = unit_dict['intercept']
        t_ = unit_dict["t_"]
        if coeffs is not None and intercept is not None:
            sgd_classifier = SGDClassifier(**LOG_REG_ARGS)
            sgd_classifier.coef_ = np.array(coeffs)
            sgd_classifier.intercept_ = np.array(intercept)
            sgd_classifier.t_ = t_
        intent_classifier.classifier = sgd_classifier
        intent_classifier.intent_list = unit_dict['intent_list']
        featurizer = unit_dict['featurizer']
        if featurizer is not None:
            intent_classifier.featurizer = Featurizer.from_dict(featurizer)
        return intent_classifier
Example #7
 def _gen_sgdcls(self):
     coef_path = "%s/%s" % (CSV_DIR, self.coef_csv)
     intercept_path = "%s/%s" % (CSV_DIR, self.coef_intercept)
     
     coef = None
     if os.path.exists(coef_path):
         coef = np.array(f.csv2arr(coef_path))
         
     intercept = None
     if os.path.exists(intercept_path):
         intercept = np.array(f.csv2arr(intercept_path))

     sgdcls = SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
             eta0=0.0, fit_intercept=True, l1_ratio=0.15,
             learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=N_JOBS,
             penalty='l2', power_t=0.5, random_state=None, shuffle=True,
             verbose=0, warm_start=False)
     
     if coef is not None:
         sgdcls.coef_ = coef
         sgdcls.intercept_ = intercept
     self.sgdcls = sgdcls
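
The f.csv2arr helper comes from a module not shown here; a plausible stand-in (purely an assumption) reads a CSV file into a list of float rows that np.array can consume:

import csv

def csv2arr(path):
    # hypothetical replacement for f.csv2arr
    with open(path) as fh:
        return [[float(v) for v in row] for row in csv.reader(fh)]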
Example #8
    y_set = [0] * (all_starPlayersCount) + [1] * (all_starPlayersCount)

    # Start to learn
    clf.fit(X_set, y_set)
    coefficients.append(clf.coef_)
    intercept.append(clf.intercept_)
    # print clf.get_params()

# Average the collected coefficients and intercepts
coefficients = np.array(coefficients)
averagedCoef = np.mean(coefficients, axis=0)
# print intercept
averagedIntercept = np.mean(intercept, axis=0)
# print averagedIntercept

# Note: fit() re-estimates coef_/intercept_ from scratch unless the classifier
# was created with warm_start=True, so the averaged weights only seed the
# final training run in that case.
clf.coef_ = averagedCoef
clf.intercept_ = averagedIntercept
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# print y_pred
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix= " + str(cm))
print("Recall/Sensitivity = " + str(recall_score(y_test, y_pred)))
print("Precision = " + str(precision_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))

# getting top 3/2 players only
results = []
for a in range(len(X_test)):
    results.append((a, (np.sum(np.multiply(X_test[a, :], averagedCoef)))))
Example #9
def server_update(
    init_weight,
    client_fraction,
    num_rounds,
    features,
    labels,
    epoch,
    batch_size,
    display_weight_per_round,
):
    """
    Calls clientUpdate to get the updated weights from clients, and applies Federated
    Averaging Algorithm to update the weight on server side
    
    init_weights: weights to initialize the training with 
        ex: [weights of size num_classes*num_features, intercepts of size num_classes]
    client_fraction: fraction of clients to use per round 
    num_rounds: number of rounds used to update the weight
    features: a 3D array containing features for each sample 
        ex: [[[feature1, feature2], [feature1, feature2], ...]]
    label: an array containing the labels for the corresponding sample in "features"
        ex: [label1, label2, ...]
    epoch: number of epochs to run the training for
    batch_size: the size of each batch of data while training 
    display_weight_per_round: a boolean value used to toggle the display of weight value per round
    
    """

    # initialize the weights
    coef = list(init_weight[0])
    intercept = list(init_weight[1])

    # number of clients
    client_num = len(features)
    # fraction of clients
    C = client_fraction

    # each round, sample a fraction of the clients; their sample counts n_k sum to n
    for i in range(num_rounds):
        # calculate the number of clients used in this round
        m = max(int(client_num * C), 1)
        # random set of m client's index
        S = np.array(random.sample(range(client_num), m))

        num_samples = []

        # collect the updated weights from the selected clients
        client_coefs = []
        client_intercepts = []

        for client_idx in S:
            client_feature = features[client_idx]
            client_label = labels[client_idx]

            coefs, intercept = client_update([coef, intercept], epoch,
                                             batch_size, client_feature,
                                             client_label)

            client_coefs.append(coefs)
            client_intercepts.append(intercept)
            num_samples.append(len(client_feature))

        # calculate the new server weights based on new weights coming from client
        new_coefs = np.zeros(init_weight[0].shape, dtype=np.float64, order="C")
        new_intercept = np.zeros(init_weight[1].shape,
                                 dtype=np.float64,
                                 order="C")

        for i in range(len(client_coefs)):
            client_coef = client_coefs[i]
            client_intercept = client_intercepts[i]

            # weight each client's update by its share of the total samples
            n_k = num_samples[i]
            added_coef = [
                value * n_k / sum(num_samples) for value in client_coef
            ]
            added_intercept = [
                value * n_k / sum(num_samples) for value in client_intercept
            ]

            new_coefs = np.add(new_coefs, added_coef)
            new_intercept = np.add(new_intercept, added_intercept)

        # update the server weights to newly calculated weights
        coef = new_coefs
        intercept = new_intercept

        if display_weight_per_round:
            print("Updated Weights: ", coef, intercept)

    # load coefficients and intercept into the classifier
    clf = SGDClassifier(loss="hinge", penalty="l2")

    clf.coef_ = new_coefs
    clf.intercept_ = new_intercept
    clf.classes_ = np.unique(
        list(labels))  # the unique labels are the classes for the classifier

    return clf
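
The weighting loop above implements the FedAvg rule: each client's weights are scaled by its share of the total samples, new_w = sum_k (n_k / n) * w_k. A toy check with synthetic numbers:

import numpy as np

# two clients with 3 and 1 samples -> weights 0.75 and 0.25
client_coefs = [np.array([[2.0, 4.0]]), np.array([[6.0, 0.0]])]
num_samples = [3, 1]
total = sum(num_samples)

avg = sum(c * n / total for c, n in zip(client_coefs, num_samples))
print(avg)  # [[3. 3.]] == 0.75 * [2, 4] + 0.25 * [6, 0]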
Example #10
def binary_linear_classifier_diagnostics(
        ds,
        train_size_by2=5,
        mask_pattern=re.compile('~document~.*'),
        train_idx=None,
        test_idx=None):
    ds.mask_col(pattern=mask_pattern)
    try:
        train_set = ds.get_train_set(train_size_by2, train_idx=train_idx)
    except DatasetTooSmall:
        print 'Train Set too Big'
        return
    cls = SGDClassifier(loss='hinge',
                        penalty='l2',
                        learning_rate='optimal',
                        average=10,
                        warm_start=False,
                        class_weight='balanced').fit(*train_set)
    # cls = SGDClassifier().fit(*train_set)
    test_feat, test_label = ds.get_test_set(test_idx=test_idx)
    for coef_type, coef_ in [
        ('clipped', np.array([[e if e > 0 else 0
                               for e in cls.coef_.squeeze()]])),
        ('original', cls.coef_)
    ]:
        cls.coef_ = coef_
        if VERBOSE:
            tmp = ds.get_test_set_names(test_idx=test_idx)
            print zip(*tmp)
            for e in zip(tmp[0], tmp[2],
                         [zip(*ds.get_row_features(e)) for e in tmp[1]]):
                print e
        tmp = zip(test_label, cls.decision_function(test_feat))
        random.shuffle(tmp)
        _y_pred, _ = zip(*sorted(tmp, key=lambda x: x[1], reverse=True))
        _y_pred_randombaseline = zip(*tmp)[0]
        y_pred = [(1 if e > 0 else 0) for e in _y_pred]
        y_pred_randombaseline = [(1 if e > 0 else 0)
                                 for e in _y_pred_randombaseline]
        # 'average=', average, \
        print 'Criteria=%s' % ds.idx2feat_map[ds.perma_mask[0]], \
            'train_rows=%d' % train_set[0].shape[0], \
            'mask_pattern.pattern=%s' % mask_pattern.pattern, \
            'train_col=%d' % train_set[0].shape[1], \
            'coef_type=%s' % coef_type,

        for k in itertools.takewhile(lambda x: x <= 2 * ds.test_size_by2,
                                     [10, 100]):
            print 'P@%d=%.4f' % (
                k, rasengan.rank_metrics.precision_at_k(y_pred, k)),
            print 'BASE-P@%d=%.4f' % (k,
                                      rasengan.rank_metrics.precision_at_k(
                                          y_pred_randombaseline, k)),
        print 'AUPR=%.3f' % rasengan.rank_metrics.average_precision(y_pred),
        print 'BASE-AUPR=%.3f' % rasengan.rank_metrics.average_precision(
            y_pred_randombaseline)
        # print ds.max_coef(cls.coef_.squeeze())
        # print ds.min_coef(cls.coef_.squeeze())
        if VERBOSE:
            for e in sorted(ds.interpret_coef(cls.coef_.squeeze()),
                            key=lambda x: x[1],
                            reverse=True)[:10]:
                print '\t', e
            tmp = ds.get_train_set_names(train_size_by2, train_idx=train_idx)
            print zip(*tmp)
            for e in zip(tmp[0], tmp[2],
                         [zip(*ds.get_row_features(e)) for e in tmp[1]]):
                print e
    # import ipdb as pdb
    # pdb.set_trace()
    # print FTI.sort(descending=True, compose=None)[:3]
    # print FTI.sort(descending=False, compose=None)[:3]
    # print FTI.sort(coef=np.asarray(ds.test_set[0][0].todense()).squeeze(),
    #                compose=None)[:10]
    return
Example #11
 def f(coef, intercept, classes):
     s = SGDClassifier()
     s.coef_ = coef
     s.intercept_ = intercept
     s.classes_ = classes
     return s
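
A hypothetical use of this factory: rebuild a predict-only classifier from stored arrays, no fit() call required (all numbers illustrative):

import numpy as np

s = f(coef=np.array([[0.5, -0.25]]),
      intercept=np.array([0.1]),
      classes=np.array([0, 1]))
print(s.predict(np.array([[1.0, 2.0]])))  # decision 0.1 > 0 -> [1]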
Example #12
sgd_clf = SGDClassifier(loss="hinge",
                        learning_rate="constant",
                        eta0=0.001,
                        alpha=alpha,
                        max_iter=100000,
                        tol=-np.inf,
                        random_state=42)

sgd_clf.fit(x, y)

b = sgd_clf.decision_function([-scaler.mean_ / scaler.scale_])
w = sgd_clf.coef_[0] / scaler.scale_
sgd_clf.intercept_ = np.array([b])
sgd_clf.coef_ = np.array([w])

# Find support vectors (LinearSVC does not do this automatically)
t = y * 2 - 1
support_vectors_idx1 = (t * (x.dot(w) + b) < 1).ravel()
sgd_clf.support_vectors_ = x[support_vectors_idx1]

plt.figure(figsize=(10, 5))
plot_svc_decision_boundary(sgd_clf, 0, 5.5)
plt.plot(x[:, 0][y == 0], x[:, 1][y == 0], "bo", label="Iris-Setosa")
plt.plot(x[:, 0][y == 1], x[:, 1][y == 1], "ro", label="Iris-Versicolor")
plt.xlabel("Petal length", fontsize=12)
plt.ylabel("Petal width", fontsize=12)
plt.legend(loc="upper left", fontsize=12)
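
The two assignments above move the model from standardized inputs back to the original feature scale: with z = (x - mean) / scale, the trained decision function w.z + b0 equals (w/scale).x + f(-mean/scale), which is exactly the coef_/intercept_ being set. A quick synthetic check (all numbers illustrative):

import numpy as np

mean, scale = np.array([2.0, 1.0]), np.array([0.5, 2.0])
w_scaled, b_scaled = np.array([1.0, -1.0]), 0.3

x0 = np.array([1.2, 0.4])
z0 = (x0 - mean) / scale
before = w_scaled.dot(z0) + b_scaled
after = (w_scaled / scale).dot(x0) + (w_scaled.dot(-mean / scale) + b_scaled)
assert np.isclose(before, after)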
Example #13
def server_update(
    init_weight,
    client_fraction,
    num_rounds,
    features,
    labels,
    epoch,
    batch_size,
    display_weight_per_round,
    rand_seed,
):
    """
    Calls client_update to get the updated weights from clients, and applies Federated
    Averaging Algorithm to update the weight on server side

    init_weights: weights to initialize the training with
        ex: [weights of size num_classes*num_features, intercepts of size num_classes]
    client_fraction: fraction of clients to use per round
    num_rounds: number of rounds used to update the weight
    features: a 3D array containing features for each sample
        ex: [[[feature1, feature2], [feature1, feature2], ...]]
    labels: an array containing the labels for the corresponding sample in "features"
        ex: [label1, label2, ...]
    epoch: number of epochs to run the training for
    batch_size: the size of each batch of data while training
    display_weight_per_round: a boolean value used to toggle the display of weight value per round
    rand_seed: a seed to use with any random number generation in order to get consistent results between runs

    """
    # initialize the weights
    coef = init_weight[0]
    intercept = init_weight[1]

    # unique classes in the dataset
    all_classes = np.unique(labels)

    # number of clients
    client_num = len(features)
    # fraction of clients
    C = client_fraction

    # reseed the rng each run
    random.seed(rand_seed)

    serv = server.ServerFacade(coef, intercept)

    # use to generate n_k so that the sum of n_k equals to n
    for i in range(num_rounds):
        # calculate the number of clients used in this round
        m = max(int(client_num * C), 1)
        # random set of m client's index
        user_ids = np.array(random.sample(range(client_num), m))

        for user_id in user_ids:
            client_features = features[user_id]
            num_samples = len(client_features)
            client_labels = labels[user_id]
            coefs, intercept = client_update(
                [coef, intercept],
                epoch,
                batch_size,
                client_features,
                client_labels,
                all_classes,
                rand_seed,
            )

            # this will get moved to the end of Client.update_and_submit_weights
            payload = {
                "coefs": coefs.tolist(),
                "intercept": intercept.tolist(),
                "num_samples": num_samples,
            }
            serv.ingest_client_data(json.dumps(payload))

    coef, intercept = serv.compute_new_weights()

    # TODO: extract down to end of function so that we can construct
    # a new SGD using new coef+intercept data.

    # Reconstruct a new classifier so that we can test the accuracy
    # using new coef and intercept

    # load coefficients and intercept into the classifier
    clf = SGDClassifier(loss="log", random_state=rand_seed)

    clf.coef_ = coef
    clf.intercept_ = intercept
    clf.classes_ = np.unique(
        list(labels))  # the unique labels are the classes for the classifier

    return clf
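
The server.ServerFacade used above is not shown; a minimal sketch consistent with its calls (ingest_client_data taking a JSON payload with coefs/intercept/num_samples, compute_new_weights returning FedAvg-averaged arrays) could look like this, though the real class may differ:

import json
import numpy as np

class ServerFacade:
    """Hypothetical minimal stand-in for the server used above."""

    def __init__(self, coef, intercept):
        self.coef_shape = np.asarray(coef).shape
        self.intercept_shape = np.asarray(intercept).shape
        self.payloads = []

    def ingest_client_data(self, payload_json):
        self.payloads.append(json.loads(payload_json))

    def compute_new_weights(self):
        # FedAvg: weight each client's update by its share of the samples
        total = sum(p["num_samples"] for p in self.payloads)
        coef = np.zeros(self.coef_shape)
        intercept = np.zeros(self.intercept_shape)
        for p in self.payloads:
            frac = p["num_samples"] / total
            coef += frac * np.asarray(p["coefs"])
            intercept += frac * np.asarray(p["intercept"])
        return coef, intercept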