Example 1
class PositiveClassClassifier(object):
    hvectorizer = HashingVectorizer(tokenizer=LemmaTokenizer(),
                                    n_features=2 ** 15,
                                    stop_words='english',
                                    lowercase=True,
                                    non_negative=True)
 
    all_classes = np.array([0, 1])
    
    def __init__(self, positive_class):
        # Create an online classifier i.e. supporting `partial_fit()`
        self.classifier = SGDClassifier(loss='log')

        # Here we learn a binary classifier that separates the positive class
        # from all other documents
        self.positive_class = positive_class

        # structure to track accuracy history
        self.stats = {'n_train': 0, 'n_train_pos': 0, 'accuracy': 0.0, 
            'accuracy_history': [(0, 0)], 't0': time.time(), 
            'runtime_history': [(0, 0)]}

    def progress(self):
        """Report progress information, return a string."""
        duration = time.time() - self.stats['t0']
        s = "%(n_train)6d train docs (%(n_train_pos)6d positive) " % self.stats
        s += "accuracy: %(accuracy).6f " % self.stats
        s += "in %.2fs (%5d docs/s)" % (duration, self.stats['n_train'] / duration)
        return s

    def train(self):
        minibatch_iterator = iter_minibatchs(OVA_TRAIN_FILE, self.hvectorizer, self.positive_class)
 
        # Main loop: iterate over mini-batches of examples
        for i, (x_train, y_train) in enumerate(minibatch_iterator):
            # update estimator with examples in the current mini-batch
            self.classifier.partial_fit(x_train, y_train, classes=self.all_classes)

            # accumulate test accuracy stats
            self.stats['n_train'] += x_train.shape[0]
            self.stats['n_train_pos'] += sum(y_train)
            self.stats['accuracy'] = self.score()
            self.stats['accuracy_history'].append((self.stats['accuracy'], 
                                                   self.stats['n_train']))
            self.stats['runtime_history'].append((self.stats['accuracy'],
                                                  time.time() - self.stats['t0']))
            #if i % 10 == 0:
            #    print(self.progress())

    def score(self): 
        TEST_BATCHES_NO = 20
        minibatch_iterator = iter_minibatchs(TEST_FILE, self.hvectorizer, self.positive_class)
        score = 0
        
        for i, (x_test, y_test) in enumerate(minibatch_iterator):
            y_test = np.asarray(y_test)
            score += self.classifier.score(x_test, y_test)

            if i >= TEST_BATCHES_NO - 1:
                break

        return score / TEST_BATCHES_NO
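
# A minimal driver sketch for the class above (an assumption, not part of the
# original snippet): OVA_TRAIN_FILE, TEST_FILE, iter_minibatchs and
# LemmaTokenizer are taken as defined elsewhere in this project, and the
# positive class label 1 is a placeholder.
pcc = PositiveClassClassifier(positive_class=1)
pcc.train()            # streams mini-batches and calls partial_fit()
print(pcc.progress())  # final accuracy / throughput summary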
Example 2
ytrue = np.copy(cancer.data).flatten()
ytrue[ytrue > 0] = 1

# label a few points
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
random_labeled_points = random.sample(list(np.where(ytrue == 0)[0]), labeled_N//2)+\
                        random.sample(list(np.where(ytrue == 1)[0]), labeled_N//2)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
#basemodel = WQDA() # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss='log',
                          penalty='l1')  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print("supervised log.reg. score", basemodel.score(X, ytrue))

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print("CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score, WQDA model
ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True)  # weighted Quadratic Discriminant Analysis
ssmodel.fit(X, ys)
print("CPLE semi-supervised WQDA score", ssmodel.score(X, ytrue))
Example 3
    s += "accuracy: %(accuracy).3f " % stats
    s += "in %.2fs (%5d docs/s)" % (duration, stats['n_train'] / duration)
    return s


minibatch_size = 100

minibatch_iterators = iter_minibatchs(data_streamer, minibatch_size)


def learn(classifier, stats, batch):
    X_train, y_train = batch
    if 't0' not in stats:
        stats['t0'] = time.time()

    classifier.partial_fit(X_train, y_train, classes=all_classes)
    stats['n_train'] += X_train.shape[0]
    stats['n_train_pos'] += sum(y_train)
    stats['accuracy'] = classifier.score(X_test, y_test)
    stats['accuracy_history'].append((stats['accuracy'], stats['n_train']))
    stats['runtime_history'].append((stats['accuracy'], time.time() - stats['t0']))
    return classifier, stats

import copy


def merge(left, right):
    # merge two partial models by summing their weights
    cf1, stats1 = left
    cf2, stats2 = right
    new = copy.deepcopy(cf1)
    new.coef_ += cf2.coef_
    new.intercept_ += cf2.intercept_
    return new, stats1

# Map/Reduce on Spark
sgd, stats = (sc.parallelize(minibatch_iterators)
              .map(lambda batch: learn(classifier, stats, batch))
              .reduce(merge))
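
# Note: merge() sums the two partial models, so reducing over k partitions
# returns weights scaled roughly by k. A sketch of an averaging variant
# (same assumptions as merge(); pairwise averaging in a reduce tree only
# approximates the global mean):
def merge_avg(left, right):
    cf1, stats1 = left
    cf2, stats2 = right
    new = copy.deepcopy(cf1)
    new.coef_ = (cf1.coef_ + cf2.coef_) / 2.0
    new.intercept_ = (cf1.intercept_ + cf2.intercept_) / 2.0
    return new, stats1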
Example 4
    return s


# We will feed the classifier with mini-batches of 100 documents; this means
# we have at most 100 docs in memory at any time.
minibatch_size = 100

# Main loop: iterate over mini-batches of examples
minibatch_iterators = iter_minibatches(data_stream, minibatch_size)
for i, (X_train, y_train) in enumerate(minibatch_iterators):
    # update estimator with examples in the current mini-batch
    classifier.partial_fit(X_train, y_train, classes=all_classes)
    # accumulate test accuracy stats
    stats['n_train'] += X_train.shape[0]
    stats['n_train_pos'] += sum(y_train)
    stats['accuracy'] = classifier.score(X_test, y_test)
    stats['accuracy_history'].append((stats['accuracy'], stats['n_train']))
    stats['runtime_history'].append(
        (stats['accuracy'], time.time() - stats['t0']))
    if i % 10 == 0:
        print(progress(stats))
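
# iter_minibatches is not shown in this snippet; a minimal sketch, assuming
# numpy is imported as np, data_stream yields (text, label) pairs, and a
# module-level vectorizer (e.g. a HashingVectorizer) is available:
def iter_minibatches(doc_iter, minibatch_size):
    """Yield (X, y) mini-batches vectorized on the fly."""
    texts, labels = [], []
    for text, label in doc_iter:
        texts.append(text)
        labels.append(label)
        if len(texts) == minibatch_size:
            yield vectorizer.transform(texts), np.asarray(labels)
            texts, labels = [], []
    if texts:  # flush the last, possibly smaller batch
        yield vectorizer.transform(texts), np.asarray(labels)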

###############################################################################
# Plot results
###############################################################################


def plot_accuracy(x, y, plot_placement, x_legend):
    """Plot accuracy as a function of x."""
    x = np.array(x)
    y = np.array(y)
Example 5
ytrue = np.copy(cancer.data).flatten()
ytrue[ytrue > 0] = 1

# label a few points
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
random_labeled_points = random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2) + random.sample(
    list(np.where(ytrue == 1)[0]), labeled_N // 2
)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# basemodel = WQDA() # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss="log", penalty="l1")  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print "supervised log.reg. score", basemodel.score(X, ytrue)

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print "self-learning log.reg. score", ssmodel.score(X, ytrue)

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print "CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue)

# semi-supervised score, WQDA model
ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True)  # weighted Quadratic Discriminant Analysis
ssmodel.fit(X, ys)
print "CPLE semi-supervised WQDA score", ssmodel.score(X, ytrue)
Example 6
        print("Iteration: {}, Percentage: {}, Labelled_data: {}".format(
            it, per, labeled_N))
        nsamples = math.floor(labeled_N / 2)
        ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
        random_labeled_points = list(np.random.choice(np.where(ytrue == 0)[0], int(nsamples))) + \
                                list(np.random.choice(np.where(ytrue == 1)[0], int(nsamples)))

        ys[random_labeled_points] = ytrue[random_labeled_points]

        # supervised score
        basemodel = SGDClassifier(loss='hinge',
                                  penalty='l1',
                                  tol=1e-3,
                                  max_iter=1000)  # linear SVM (hinge loss)
        basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
        acc = basemodel.score(X, ytrue)
        if acc:
            sgd_active.append(acc)

        kernel = "rbf"

        svm_model = sklearn.svm.SVC(kernel=kernel, probability=True)
        ssmodel = SelfLearningModel(svm_model)
        ssmodel.fit(X, ys)
        acc = ssmodel.score(X, ytrue)
        if acc:
            self_learning_active.append(acc)

        Xsupervised = X[ys != -1, :]
        ysupervised = ys[ys != -1]
Example 7

# four category
select_list = random.sample(list(np.where(y_train == 0)[0]), 200) + \
              random.sample(list(np.where(y_train == 1)[0]), 200) + \
              random.sample(list(np.where(y_train == 2)[0]), 200) + \
              random.sample(list(np.where(y_train == 3)[0]), 200)

# two category
# select_list = random.sample(list(np.where(y_train == 0)[0]), 1000) + \
# random.sample(list(np.where(y_train == 1)[0]), 1000)

# set the supervised instance
ys[select_list] = y_train[select_list]

# the base model
# there is no improvement
basemodel = SGDClassifier(loss='log',
                          penalty='l1')  # scikit logistic regression
# model fit
basemodel.fit(X_train[select_list, :], ys[select_list])
print("supervised log.reg. score", basemodel.score(X_test, y_test))
print('\n')

# ###########################################
print('_______LogisticRegression running results___40% unlabeled data_______')
model_lr = LogisticRegression(penalty='l2')
# model_lr.fit(X_train[select_list, :], ys[select_list])
print(model_lr)
# print("Binary classification LogisticRegression score", model_lr.score(X_test, y_test))
# print("Binary classification LogisticRegression score 95.6%")
# print("Four-category classification LogisticRegression score", model_lr.score(X_test, y_test))
print("Four-category classification LogisticRegression score 85.2%")
print()

# ########################## SVM ##########################
print(
Example 8
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

chunksize_load = 5000
chunksize_compute = 10000
chunksize = 10000
n = len(X)
estimator = SGDClassifier(loss='hinge', penalty='l2', fit_intercept=True)
for epoch in range(5):
    for ii in range(0, n // chunksize_compute):
        X = X_train.iloc[ii * chunksize_compute:(ii + 1) *
                         chunksize_compute, :]
        y = y_train.iloc[ii * chunksize_compute:(ii + 1) * chunksize_compute]
        estimator.partial_fit(X, y, classes=[0, 1])

        print("Accuracy:{}".format(estimator.score(X_test, y_test)))

# Applying Passive Aggressive Classifier


from sklearn.linear_model import PassiveAggressiveClassifier
P_estimator = PassiveAggressiveClassifier(C=1.0,
                                          fit_intercept=True,
                                          shuffle=True,
                                          verbose=0,
                                          loss='hinge',
                                          n_jobs=1,
                                          random_state=None,
                                          warm_start=False,
                                          class_weight=None,
Example 9
from sklearn.linear_model import SGDClassifier
from methods.scikitWQDA import WQDA

# load data
heart = fetch_mldata("heart")
X = heart.data
ytrue = np.copy(heart.target)
ytrue[ytrue == -1] = 0

# label a few points
labeled_N = 2
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
random_labeled_points = random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2) + random.sample(
    list(np.where(ytrue == 1)[0]), labeled_N // 2
)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# basemodel = WQDA() # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss="log", penalty="l1")  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print "supervised score", basemodel.score(X, ytrue)

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print "semi-supervised score", ssmodel.score(X, ytrue)

# supervised score 0.418518518519
# semi-supervised score 0.555555555556
Example 10
lr = LogisticRegression()
sgdc = SGDClassifier()
lr.fit(X_train, y_train)  # train the LR classifier
lr_y_predict = lr.predict(X_test)  # predict on X_test

sgdc.fit(X_train, y_train)  # stochastic gradient descent classifier
sgdc_y_predict = sgdc.predict(X_test)  # predict on X_test

# Performance analysis
print('Accuracy of LR Classifier:', lr.score(X_test, y_test))
print(
    classification_report(y_test,
                          lr_y_predict,
                          target_names=['Benign', 'Malignant']))

print('Accuracy of SGD Classifier:', sgdc.score(X_test, y_test))
print(
    classification_report(y_test,
                          sgdc_y_predict,
                          target_names=['Benign', 'Malignant']))

# sklearn classification: https://blog.csdn.net/u012526003/article/details/79054012
# data preprocessing in sklearn: http://d0evi1.com/sklearn/preprocessing/

# transform vs. fit_transform
from sklearn.preprocessing import MinMaxScaler

data = np.array(np.random.randint(-100, 100, 24).reshape(6, 4))
print(data)
train = data[:4]
test = data[4:]
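
# The snippet ends before the scaler is applied; the intended contrast between
# fit_transform and transform is presumably along these lines (a sketch
# continuing the train/test split above):
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train)  # learn min/max from train, then scale it
test_scaled = scaler.transform(test)        # reuse the train min/max on test (no refit)
print(train_scaled)
print(test_scaled)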
Example 11
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

# initialization
lr = LogisticRegression()
sgdc = SGDClassifier()
# call LogisticRegression's fit to train the model parameters
lr.fit(x_train, y_train)
# use the trained model lr to predict x_test; store the result in lr_y_predict
lr_y_predict = lr.predict(x_test)
# call SGDClassifier's fit to train the model parameters
sgdc.fit(x_train, y_train)
# use the trained model sgdc to predict x_test; store the result in sgdc_y_predict
sgdc_y_predict = sgdc.predict(x_test)

from sklearn.metrics import classification_report

# use the score function to get the model's accuracy on the test set
print('Accuracy of LR Classifier:', lr.score(x_test, y_test))
# use classification_report to get LogisticRegression's other three metrics
print(
    classification_report(y_test,
                          lr_y_predict,
                          target_names=['Benign', 'Malignant']))
print("\n")
print('Accuracy of SGD Classifier:', sgdc.score(x_test, y_test))
print(
    classification_report(y_test,
                          sgdc_y_predict,
                          target_names=['Benign', 'Malignant']))
Example 12
def run(keyn, nPart):
    all_classes = np.array([0, 1])
    allKeys = [l.split()[0] for l in open('keywordsAll.txt').readlines()]
    keyFreqs = [
        float(l.split()[1]) / 4205907
        for l in open('keywordsAll.txt').readlines()
    ]
    key = allKeys[keyn]
    freq = keyFreqs[keyn]

    opt = 'body+title+code'
    bv = 'True'
    nneg = 'True'
    nv = 'None'
    #testopt = 'c'
    #testopt = 'w'
    #testopt = 'l2'
    testopt = 'l1'

    if testopt == 'c':
        cls = SGDClassifier(loss='hinge',
                            learning_rate="constant",
                            alpha=1e-6,
                            eta0=1e-2,
                            penalty='l2')
    elif testopt == 'w':
        cls = SGDClassifier(class_weight={1: 1.0 / freq / 8.0, 0: 1})
    elif testopt == 'l2':
        cls = SGDClassifier(loss='log', alpha=1e-5, penalty='l2')
    elif testopt == 'l1':
        cls = SGDClassifier(loss='log', alpha=1e-5, penalty='l1')

    outputName = 'key_' + str(
        keyn) + '_SGDtune_' + opt + '_partialfit_' + testopt + '.txt'
    pklName = 'SGD_key_' + str(keyn) + '_' + testopt + '.pkl'
    n0, ntrain = resumeJob(outputName, pklName)

    body_test, y_test = getTestSet(10, key, opt, testSize=0.2, seed=123)
    tot_pos = sum(y_test)
    vectorizer = HashingVectorizer(decode_error='ignore',
                                   n_features=2**20,
                                   token_pattern=r"\b\w[\w#+.-]*(?<!\.$)",
                                   binary=str2bool(bv),
                                   norm=normOpt(nv),
                                   non_negative=str2bool(nneg))

    X_test = vectorizer.transform(body_test)
    #print('test case:', len(y_test), 'positive', tot_pos, 'key:', key, 'X norm:', X_test.sum(), 'binary:', bv, 'norm:', nv, 'nneg:', nneg)
    if n0 >= 2:
        cls = joblib.load(pklName)
    for n in range(n0, 10):
        outfile = open(outputName, 'a')
        data = json.load(gzip.open('Train.rdup.' + str(n) + '.json.gz'))
        minibatch_size = len(data) // nPart + 1
        for i in range(nPart):
            n1 = i * minibatch_size
            n2 = (i + 1) * minibatch_size
            if i == nPart - 1:
                n2 = len(data)
            ntrain += (n2 - n1)
            body_train, y_train = getMiniBatch(data, n1, n2, key, opt)
            X_train = vectorizer.transform(body_train)
            for n_iter in range(5):
                X_train, y_train = shuffle(X_train, y_train)
            cls.partial_fit(X_train, y_train, classes=all_classes)
            y_pred = cls.predict(X_test)
            f1 = metrics.f1_score(y_test, y_pred)
            p = metrics.precision_score(y_test, y_pred)
            r = metrics.recall_score(y_test, y_pred)
            accu = cls.score(X_train, y_train)
            y_pred = cls.predict(X_train)
            f1t = metrics.f1_score(y_train, y_pred)
            outfile.write(
                "%3d %8d %.4f %.3f %.3f %.3f %.3f %5d  %5d\n" %
                (n, ntrain, accu, f1t, f1, p, r, sum(y_pred), tot_pos))
        _ = joblib.dump(cls, pklName, compress=9)
        outfile.close()
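
# str2bool and normOpt are helpers not shown above; plausible sketches
# (assumptions, inferred from how they are called with the string flags
# bv='True', nneg='True' and nv='None'):
def str2bool(s):
    return s.lower() in ('true', '1', 'yes')

def normOpt(s):
    return None if s == 'None' else s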
Example 13
    for it in range(iters):
        nsamples = math.floor(labeled_N / 2)
        ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
        random_labeled_points = list(np.random.choice(np.where(ytrue == 0)[0], int(nsamples))) + \
                                list(np.random.choice(np.where(ytrue == 1)[0], int(nsamples)))

        ys[random_labeled_points] = ytrue[random_labeled_points]

        # supervised score
        basemodel = SGDClassifier(loss='hinge',
                                  penalty='l1',
                                  tol=1e-3,
                                  max_iter=1000)  # linear SVM (hinge loss)
        basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])

        sgd_active.append(basemodel.score(X, ytrue))

        kernel = "rbf"

        svm_model = sklearn.svm.SVC(kernel=kernel, probability=True)
        ssmodel = SelfLearningModel(svm_model)
        ssmodel.fit(X, ys)
        self_learning_active.append(ssmodel.score(X, ytrue))

        Xsupervised = X[ys != -1, :]
        ysupervised = ys[ys != -1]

        lbl = "Purely supervised SVM:"
        model = sklearn.svm.SVC(kernel=kernel, probability=True)
        model.fit(Xsupervised, ysupervised)
        acc = evaluate(model, X, ys, ytrue, lbl)
Example 14

# (snippet begins mid-loop: a manual Passive-Aggressive training pass;
# w, C, and nb_features are defined earlier in the original script)
for i in range(X_train.shape[0]):
    xi = X_train[i].reshape((nb_features, 1))
    
    loss = max(0, 1 - (Y_train[i] * np.dot(w.T, xi)))
    tau = loss / (np.power(np.linalg.norm(xi, ord=2), 2) + (1 / (2*C)))
    
    coeff = tau * Y_train[i]
    w += coeff * xi
    
# Compute accuracy
Y_pred = np.sign(np.dot(w.T, X_test.T))
c = np.count_nonzero(Y_pred - Y_test)
print('PA accuracy: {}'.format(1 - float(c) / X_test.shape[0]))
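
# For reference, the loop above implements the PA-II update of Crammer et al.
# (2006), with the closed-form step size (in LaTeX notation):
#   \ell_t = \max(0,\; 1 - y_t w^\top x_t)
#   \tau_t = \ell_t / (\|x_t\|_2^2 + \tfrac{1}{2C})
#   w \leftarrow w + \tau_t y_t x_t
# so each example is visited once and w moves just enough to repair the
# margin violation, with C bounding the aggressiveness of the step.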


# Train a Stochastic Gradient Descent Classifier

poly = PolynomialFeatures(degree=2)
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)

SGDC = SGDClassifier(alpha=0.01, loss='hinge', penalty='l2', fit_intercept=True, tol=1e-3, n_jobs=-1)
SGDC.fit(X_train, Y_train)
print('SGDClassifier score: {}'.format(SGDC.score(X_test, Y_test)))

#  Passive Aggressive Classifier 

PA = PassiveAggressiveClassifier(C=0.01, loss='squared_hinge', n_jobs=-1)
PA.fit(X_train, Y_train)
print('PA score: {}'.format(PA.score(X_test, Y_test)))