Example #1
def test_ovr_partial_fit_exceptions():
    ovr = OneVsRestClassifier(MultinomialNB())
    X = np.abs(np.random.randn(14, 2))
    y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]
    ovr.partial_fit(X[:7], y[:7], np.unique(y))
    # If a new class that was not in the first call of partial_fit is seen,
    # it should raise a ValueError
    y1 = [5] + y[7:-1]
    msg = r"Mini-batch contains \[.+\] while classes must be subset of \[.+\]"
    with pytest.raises(ValueError, match=msg):
        ovr.partial_fit(X=X[7:], y=y1)
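The test above is self-contained apart from its imports. A minimal sketch of what it assumes, inferred from the names it uses (the actual scikit-learn test module may organize these differently):

# Imports assumed by the test above (a sketch, not the exact test-file header)
import numpy as np
import pytest
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB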
Example #2
def test_ovr_partial_fit_exceptions():
    ovr = OneVsRestClassifier(MultinomialNB())
    X = np.abs(np.random.randn(14, 2))
    y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]
    ovr.partial_fit(X[:7], y[:7], np.unique(y))
    # A new class value that was not in the first call of partial_fit
    # should raise a ValueError
    y1 = [5] + y[7:-1]
    assert_raises_regexp(ValueError, r"Mini-batch contains \[.+\] while "
                                     r"classes must be subset of \[.+\]",
                         ovr.partial_fit, X=X[7:], y=y1)
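assert_raises_regexp comes from scikit-learn's since-deprecated sklearn.utils.testing helpers; the pytest.raises form shown in Example #1 is the modern equivalent.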
Example #3
def test_ovr_partial_fit():
    # Test that partial_fit works as intended
    X, y = shuffle(iris.data, iris.target, random_state=0)
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(X[:100], y[:100], np.unique(y))
    ovr.partial_fit(X[100:], y[100:])
    pred = ovr.predict(X)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(X, y).predict(X)

    assert_almost_equal(pred, pred2)
    assert_equal(len(ovr.estimators_), len(np.unique(y)))
    assert_greater(np.mean(y == pred), 0.65)

    # Test when mini-batches don't contain all classes,
    # with SGDClassifier
    X = np.abs(np.random.randn(14, 2))
    y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]

    ovr = OneVsRestClassifier(
        SGDClassifier(max_iter=1, tol=None, shuffle=False, random_state=0))
    ovr.partial_fit(X[:7], y[:7], np.unique(y))
    ovr.partial_fit(X[7:], y[7:])
    pred = ovr.predict(X)
    ovr1 = OneVsRestClassifier(
        SGDClassifier(max_iter=1, tol=None, shuffle=False, random_state=0))
    pred1 = ovr1.fit(X, y).predict(X)
    assert_equal(np.mean(pred == y), np.mean(pred1 == y))

    # test partial_fit only exists if estimator has it:
    ovr = OneVsRestClassifier(SVC())
    assert_false(hasattr(ovr, "partial_fit"))
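These tests exercise the core incremental-learning contract: the full set of classes is declared on the first partial_fit call, and later mini-batches may omit it, even when they lack some classes entirely. A hedged sketch of that pattern with illustrative data, not taken from the test suite:

# Sketch of the partial_fit pattern exercised above; data and names are
# illustrative assumptions.
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB

X = np.abs(np.random.randn(20, 4))  # non-negative features, as MultinomialNB requires
y = np.arange(20) % 3               # three classes: 0, 1, 2

clf = OneVsRestClassifier(MultinomialNB())
clf.partial_fit(X[:10], y[:10], classes=np.unique(y))  # first call declares every class
clf.partial_fit(X[10:], y[10:])                        # later calls omit classes
print(clf.predict(X[:5]))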
Example #4
def test_ovr_partial_fit():
    # Test that partial_fit works as intended
    X, y = shuffle(iris.data, iris.target, random_state=0)
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(X[:100], y[:100], np.unique(y))
    ovr.partial_fit(X[100:], y[100:])
    pred = ovr.predict(X)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(X, y).predict(X)

    assert_almost_equal(pred, pred2)
    assert_equal(len(ovr.estimators_), len(np.unique(y)))
    assert_greater(np.mean(y == pred), 0.65)

    # Test when mini-batches don't contain all classes
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(iris.data[:60], iris.target[:60], np.unique(iris.target))
    ovr.partial_fit(iris.data[60:], iris.target[60:])
    pred = ovr.predict(iris.data)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(iris.data, iris.target).predict(iris.data)
    
    assert_almost_equal(pred, pred2)
    assert_equal(len(ovr.estimators_), len(np.unique(iris.target)))
    assert_greater(np.mean(iris.target == pred), 0.65)
Example #5
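# Streaming training from custom batch generators. myGeneratorTrain,
# myGeneratorVal, num_batches, splits, Y, trainIdx and testIdx are defined
# earlier in the source script and are not shown in this excerpt.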
G = myGeneratorTrain()
G1 = myGeneratorVal()
clf = OneVsRestClassifier(SGDClassifier(loss='log',
                                        penalty='l1',
                                        max_iter=1000,
                                        n_jobs=-1),
                          n_jobs=-2)
for i in range(5):
    print("i=", i)
    for j in range(int(num_batches * splits)):
        K = next(G)
        yt = K[1]
        xt = np.concatenate((K[0][0], K[0][1], K[0][2], K[0][3], K[0][4],
                             K[0][5], K[0][6], K[0][7]),
                            axis=1)
        clf.partial_fit(xt, yt, classes=np.unique(Y) - 1)
    K1 = next(G1)
    yv = K1[1]
    xv = np.concatenate((K1[0][0], K1[0][1], K1[0][2], K1[0][3], K1[0][4],
                         K1[0][5], K1[0][6], K1[0][7]),
                        axis=1)
    print(clf.score(xv, yv))

from sklearn.externals import joblib
joblib.dump(clf, 'sgdClassifier.pkl')

np.savez('nnIndices2.npz', trainIdx=trainIdx, testIdx=testIdx)

#model.evaluate([X_test[:,0:1000],X_test[:,1000:2000],X_test[:,2000:3000]],np_utils.to_categorical(Y[testIdx]-1,1000))
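This snippet imports joblib through sklearn.externals, a shim that was deprecated in scikit-learn 0.21 and removed in 0.23. On current versions the standalone joblib package is the documented replacement:

# Model persistence on scikit-learn >= 0.23: use the standalone joblib package
import joblib

joblib.dump(clf, 'sgdClassifier.pkl')
clf = joblib.load('sgdClassifier.pkl')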
Example #6
class MultilabelTraining:

    X_COLUMN_NAME = "page_text_extract"

    DEFAULT_TARGET_THEMES = [
        5,
        6,
        26,
        33,
        139,
        163,
        232,
        313,
        339,
        350,
        406,
        409,
        555,
        589,
        597,
        634,
        660,
        695,
        729,
        766,
        773,
        793,
        800,
        810,
        852,
        895,
        951,
        975,
    ]

    OTHER_THEMES_VALUE = 4242

    def __init__(
        self,
        df=pd.DataFrame(),
        x_column_name=X_COLUMN_NAME,
        group_processes=True,
        classifier=XGBClassifier(max_depth=15, random_state=42, n_jobs=-1),
        vectorizer=HashingVectorizer(n_features=2 ** 14),
        target_themes=DEFAULT_TARGET_THEMES,
        other_themes_value=OTHER_THEMES_VALUE,
        remove_processes_without_theme=True,
        is_incremental_training=False,
        vocab_path="",
    ):
        self.is_incremental_training = is_incremental_training
        self.vocab_path = vocab_path
        self.remove_processes_without_theme = remove_processes_without_theme
        self.mo_classifier = OneVsRestClassifier(classifier)
        self.classifier = classifier
        self.vectorizer = vectorizer
        self.target_themes = target_themes
        self.other_themes_value = other_themes_value
        self.group_processes = group_processes
        self.x_column_name = x_column_name
        self._initialize_dataframe(df)

    def _initialize_dataframe(self, df):
        if not df.empty:
            self.dp = DataframePreprocessing(
                df.copy(),
                group_processes=self.group_processes,
                x_column_name=self.x_column_name,
                target_themes=self.target_themes,
                other_themes_value=self.other_themes_value,
                is_incremental_training=self.is_incremental_training,
                remove_processes_without_theme=self.remove_processes_without_theme,
                vocab_path=self.vocab_path,
            )
            self.y_columns_names = self.dp.distinct_themes
            self.df = self.dp.processed_df
        else:
            self.df = df

    def _split(self, X, y):
        print("Splitting dataset...")
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, stratify=y, test_size=0.2, random_state=42
        )

    def _vectorize(self, X_train):
        print("Vectorizing...")
        return self.vectorizer.fit_transform(X_train)

    def train(self, split_df=False):
        print("Training...")
        self.X_train, self.y_train = (
            self.df[self.x_column_name],
            self.df[self.y_columns_names],
        )
        if split_df:
            self._split(self.X_train, self.y_train)
        vector = self._vectorize(self.X_train)
        self.mo_classifier.fit(vector, self.y_train)
        if split_df:
            vector_test = self._vectorize(self.X_test)
            self.y_pred = self.mo_classifier.predict(vector_test)
            metrics = get_multilabel_metrics(self.y_test, self.y_pred)
            return metrics
        return None

    def _update_dataframe(
        self, df, is_incremental_training=True, is_parquet=False, labels_freq=None
    ):
        # Use None instead of a mutable default argument.
        labels_freq = labels_freq if labels_freq is not None else {}
        self.dp = DataframePreprocessing(
            df.copy(),
            x_column_name=self.x_column_name,
            group_processes=self.group_processes,
            target_themes=self.target_themes,
            other_themes_value=self.other_themes_value,
            is_incremental_training=is_incremental_training,
            remove_processes_without_theme=self.remove_processes_without_theme,
            is_parquet=is_parquet,
            vocab_path=self.vocab_path,
            labels_freq=labels_freq,
        )
        self.df = self.dp.processed_df

    def incremental_train(self, df_path, nrows=5000):
        print("Training incrementally...")
        columns_names = pd.read_csv(df_path, nrows=1).columns.tolist()
        skiprows = 1
        classes, labels_freq = DataframePreprocessing(
            target_themes=self.target_themes
        ).get_unique_binarized_labels(df_path, "tema")
        while True:
            df = pd.read_csv(
                df_path,
                nrows=nrows,
                skiprows=skiprows,
                header=None,
                names=columns_names,
            )
            if df.empty:
                break
            self._update_dataframe(df, labels_freq=labels_freq)
            X_train, y_train = (
                self.df[self.x_column_name],
                self.df[self.target_themes + [self.other_themes_value]],
            )
            vector = self._vectorize(X_train)
            self.mo_classifier.partial_fit(vector, y_train, classes=classes)
            skiprows += nrows
            print("{} rows already trained\n".format(skiprows - 1))

    def predict(self):
        return self.mo_classifier.predict(self._vectorize(self.X_test).todense())

    def set_X_test(self, X):
        self.X_test = X

    def set_y_test(self, y):
        self.y_test = y

    def get_pickle(self):
        return pickle.dumps(self.mo_classifier)
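A hypothetical driver for this class. The input file, its column layout, and the project-specific DataframePreprocessing and get_multilabel_metrics helpers are assumptions based on the constructor defaults, not anything shown in the snippet:

# Hypothetical usage sketch; the file name and its columns are assumptions.
import pandas as pd

df = pd.read_csv("processes.csv")  # assumed to include a "page_text_extract" column
trainer = MultilabelTraining(df)
metrics = trainer.train(split_df=True)  # fits OneVsRestClassifier(XGBClassifier(...))
print(metrics)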
Example #7
class OneVsRestSGDClassifier(LabelClassifier):
    def __init__(self,
                 f_dim=100,
                 ft_iters=20,
                 update_iters=100,
                 label_dict_path='data/labels.txt'):

        LabelClassifier.__init__(self, label_dict_path)
        self.f_dim = f_dim  # dimension of word feature vector
        self.ft_iters = ft_iters
        self.update_iters = update_iters

        self.ft_model = FastText(min_count=1, size=self.f_dim)
        self.clf = OneVsRestClassifier(
            SGDClassifier(loss='modified_huber',
                          class_weight={
                              0: 0.4,
                              1: 0.6
                          },
                          penalty='l2',
                          warm_start=False,
                          random_state=1))

    def init_fasttext(self, model_path=None, train_data=None):
        """
        if train_data is provided, train a new fasttext model;
        otherwise, load it from the given path

        --------
        Parameter:

            model_path: fasttext model prefix

            train_data: a list of tokenized sentences. if not provided,
                will try to load existing model from model_path

        """

        if not train_data and model_path and os.path.isfile(model_path):
            #=== load existing model ====
            print('loading fasttext model from', model_path)
            self.ft_model = FastText.load(model_path)

        elif train_data:
            #=== train fastText model ====
            # if train_data is not a list of lists, split each sentence
            # into a list of words
            print('training fasttext model from scratch...')
            train_data = [re.split(',| ', r) if not isinstance(r, list) else r
                          for r in train_data]

            self.ft_model.build_vocab(train_data)
            self.ft_model.train(train_data,
                                total_examples=len(train_data),
                                epochs=self.ft_iters)
            if model_path:
                self.ft_model.save(model_path, separately=[])
        else:
            #=== no train data and no model path provided
            raise TrainDataException(
                'Error building fasttext model. No data/model provided.')

    def div_norm(self, x):
        norm_value = np.sqrt(np.sum(x**2))  #l2norm
        if norm_value > 0:
            return x * (1.0 / norm_value)
        else:
            return x

    def sentence_to_vec(self, words):
        """ generating embedding by summing up normalized
        word embeddings

        --------
        Parameter:
            words: a list of words or a string representation of a sentence
            (seperated by space or ',' )

        Return:
            sentence embedding matrix of size len(words) x f_dim

        """
        if not isinstance(words, list):
            words = re.split(',| ', words)

        vecs = np.zeros((len(words), self.f_dim))
        for i, word in enumerate(words):
            v = self.ft_model.wv.get_vector(word)
            vecs[i] = self.div_norm(v)
        return np.mean(vecs, axis=0)

    def to_vec(self, data):
        """ batch computation of sentence embeddings """
        vec = np.zeros((len(data), self.f_dim))
        for i, sentence in enumerate(data):
            vec[i] = self.sentence_to_vec(sentence)

        return vec

    def train(self, train_data, train_label):
        """
        offline training of the SGD classifier

        --------
        Parameters:

            train_data: a list of tokenized sentences. Each sentence is either
                a string delimited by comma or space, or a list of words.

            train_label: a list of labels. Each label is a string delimited
                by comma or space.
        Return:

            X: sentence embedding matrix of size len(train_data) x f_dim
            Y: binary label matrix of size len(train_data) x #_classes
        """
        print('training multilabel classifier on %d samples...' %
              len(train_data))
        Y = np.zeros((len(train_label), len(self.labeldict)))
        for i, labels in enumerate(train_label):
            label_list = re.split(',| ', labels)

            for l in label_list:
                if l:
                    Y[i, self.labeldictR[l]] = 1

        # add dummy sample to classes that do not have samples
        indices = np.where(np.sum(Y, axis=0) == 0)[0]
        Y_new = np.zeros((len(indices), Y.shape[1]))
        for i, id in enumerate(indices):
            train_data.append([self.labeldict[id]])
            Y_new[i, id] = 1
        Y = np.vstack((Y, Y_new))

        X = self.to_vec(train_data)
        self.clf.fit(X, Y)
        return X, Y

    def train_update(self, train_data, train_label):
        """
        online training of the SGD classifier

        --------
        Parameters: see train()

        """
        Y = np.zeros((len(train_label), len(self.labeldict)))
        X = self.to_vec(train_data)
        for i, labels in enumerate(train_label):
            label_list = re.split(',| ', labels)
            for l in label_list:
                if l:
                    Y[i, self.labeldictR[l]] = 1
        for i in range(self.update_iters):
            self.clf.partial_fit(X, Y)
        return X, Y

    def classify(self, string):
        """
        predict the labels of a tokenized sentence

        --------
        Parameters:
            string: string delimited by comma or space, or a list of words

        Return:
            labels: a list of predicted labels

        """
        X = self.to_vec([string])
        Y = self.clf.predict(X)
        #print('class probability',self.clf.predict_proba(X) )
        labels = [self.labeldict[id] for id in np.nonzero(Y[0])[0]]

        return labels

    def save_clf(self, filename):
        print('writing classification model to', filename, '...')
        with open(filename, 'wb') as f:
            pickle.dump(self.clf, f)

    def load_clf(self, filename):
        print('loading classification model from', filename, '...')
        with open(filename, 'rb') as f:
            self.clf = pickle.load(f)
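A hypothetical end-to-end use of this class. The corpus, labels, and paths are illustrative, and the LabelClassifier base class plus data/labels.txt are assumed to define the label dictionary:

# Hypothetical usage sketch; corpus, labels and paths are assumptions.
clf = OneVsRestSGDClassifier(f_dim=100, label_dict_path='data/labels.txt')
corpus = ['red square box', 'blue round ball', 'green tall tree']  # toy sentences
labels = ['color,shape', 'color,shape', 'color,size']  # labels assumed present in labels.txt
clf.init_fasttext(model_path='data/ft.model', train_data=corpus)
clf.train(corpus, labels)
print(clf.classify('red ball'))
clf.save_clf('data/ovr_sgd_clf.pkl')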
Example #8
classifier = SGDClassifier(loss='log', penalty='l1', alpha=0.001)  # logistic regression, L1 penalty
classifier.partial_fit(samples_train, target_train, classes=target)  # training dataset
target_pred = classifier.predict(samples_test)  # testing dataset
accuracy = accuracy_score(target_test, target_pred)  # accuracy rate
print('the accuracy score is', accuracy, '\n')

classifier_l2 = SGDClassifier(loss='log', penalty='l2', alpha=0.001)  # logistic regression, L2 penalty
classifier_l2.partial_fit(samples_train, target_train, classes=target)  # training dataset
target_pred_l2 = classifier_l2.predict(samples_test)  # testing dataset
accuracy_l2 = accuracy_score(target_test, target_pred_l2)  # accuracy rate
print('the accuracy score is', accuracy_l2, '\n')


from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
classifier_l1_onevsrest = OneVsRestClassifier(SGDClassifier(loss='log', penalty='l1', alpha=0.001))  # logistic
classifier_l1_onevsrest.partial_fit(samples_train, target_train, classes=target)  # training dataset
target_pred_l1_onevsrest = classifier_l1_onevsrest.predict(samples_test)  # testing dataset
accuracy_l1_onevsrest = accuracy_score(target_test, target_pred_l1_onevsrest)  # accuracy rate
print('the accuracy score is', accuracy_l1_onevsrest, '\n')

from sklearn.preprocessing import PolynomialFeatures
samples_poly = PolynomialFeatures(5)
poly = samples_poly.fit_transform(samples)
samples_train, samples_test, target_train, target_test = train_test_split(poly, target, test_size=0.2, random_state=0)
classifier_poly = SGDClassifier(loss='log', penalty='l1', alpha=0.001)  # logistic regression on degree-5 polynomial features
classifier_poly.partial_fit(samples_train, target_train, classes=target)  # training dataset
target_pred_poly = classifier_poly.predict(samples_test)  # testing dataset
accuracy_poly = accuracy_score(target_test, target_pred_poly)  # accuracy rate
print('the accuracy score is', accuracy_poly, '\n')
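One detail worth noting: each partial_fit call above passes the full label vector as classes. This appears to work because scikit-learn reduces the classes argument to its unique labels internally, but the idiomatic first call passes the unique labels explicitly:

# Idiomatic form of the first partial_fit call (assumes numpy imported as np)
classifier = SGDClassifier(loss='log', penalty='l1', alpha=0.001)
classifier.partial_fit(samples_train, target_train, classes=np.unique(target))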