Example #1
0
    def retrain(self, labeled_tweets: list):
        """Rebuild the category vocabulary and refit all three classifiers.

        Only tweets that have a non-empty ``"labels"`` mapping take part.
        Every label name that is neither a sentiment name
        (``self.labels_sent``) nor a relevance name
        (``self.labels_relevance``) becomes a category column of the
        classifier chain.

        Args:
            labeled_tweets: Dicts with a ``"tweet"`` text and, optionally, a
                ``"labels"`` mapping of label name to a truthy/falsy value.
        """
        annotated = [tweet for tweet in labeled_tweets
                     if "labels" in tweet and len(tweet["labels"]) > 0]

        # Sorted so that the column order of the chain's target matrix is
        # reproducible: the iteration order of a set of strings depends on
        # the interpreter's hash seed and may differ between runs.
        self.labels = sorted({
            name
            for tweet in annotated
            for name in tweet["labels"]
            if not (name in self.labels_sent or name in self.labels_relevance)
        })
        assert "Irrelevant" not in self.labels, "Something went wrong"

        # Start from fresh, untrained models.
        self.lcc = ClassifierChain(SGDClassifier(max_iter=100, loss='log', random_state=1))
        self.clrel = KNNClassifier()
        self.clsent = KNNClassifier()

        neutral = self.labels_sent["NEUTRAL"]
        relevance_key = self.labels_relevance[0]
        X, y, ys, yr = [], [], [], []
        for tweet in annotated:
            tweet_labels = tweet["labels"]
            X.append(tweet["tweet"])
            y.append(self._labels2array(tweet_labels))
            # Exactly one active sentiment is taken as-is; none or several
            # active sentiments fall back to NEUTRAL.
            active = [name for name, flag in tweet_labels.items()
                      if name in self.labels_sent and flag]
            ys.append(self.labels_sent[active[0]] if len(active) == 1 else neutral)
            yr.append(1 if tweet_labels.get(relevance_key) else 0)

        X = np.array(self.tokenizer.transform(X).todense())
        y = np.array(y)
        ys = np.array(ys)
        yr = np.array(yr)
        self.clsent.fit(X, ys)
        print("Trained Sentiment Classifier")
        self.clrel.fit(X, yr)
        print("Trained Relevance Classifier")

        # The chain is trained only on tweets with at least one category set.
        has_category = np.array([row.sum() > 0 for row in y], dtype=bool)
        self.lcc.fit(X[has_category], y[has_category])
        print("Trained Catecorical Classifier")
Example #2
0
    def __init__(self, texts: list):
        """Fit the TF-IDF vocabulary and create the untrained classifiers.

        Args:
            texts: Documents passed to ``TfidfVectorizer.fit``.
        """
        self.tokenizer = TfidfVectorizer()
        self.tokenizer.fit(texts)

        # Sentiment name -> integer class id used as the sentiment target.
        self.labels_sent = {"POSITIVE": 0, "NEUTRAL": 1, "NEGATIVE": 2}
        # Class id -> {sentiment name: bool} with exactly one True entry,
        # derived from the mapping above so the two cannot drift apart.
        self.reverse_sent = {
            class_id: {name: name == sentiment for name in self.labels_sent}
            for sentiment, class_id in self.labels_sent.items()
        }

        self.labels_relevance = ["Irrelevant"]
        # Category label names; empty until they are collected from data.
        self.labels = []
        self.lcc = ClassifierChain(SGDClassifier(max_iter=100, loss='log', random_state=1))
        self.clrel = KNNClassifier()
        self.clsent = KNNClassifier()
def test_classifier_chains_all():
    """Check that all four chain variants reproduce the same expected labels.

    Each learner is trained with ``partial_fit`` on the ``make_logical`` data
    and then asked to predict the very same samples.
    """
    seed = 1
    X, Y = make_logical(random_state=seed)
    y_expected = [[1, 0, 1], [1, 1, 0], [0, 0, 0], [1, 1, 0]]

    def make_base():
        # A fresh, identically configured base estimator for every chain.
        return SGDClassifier(max_iter=100, tol=1e-3, loss='log', random_state=seed)

    # CC
    cc = ClassifierChain(make_base())
    cc.partial_fit(X, Y)
    # np.array_equal replaces np.alltrue, which NumPy 2.0 no longer provides.
    assert np.array_equal(cc.predict(X), y_expected)
    assert isinstance(cc.predict_proba(X), np.ndarray)

    # RCC
    rcc = ClassifierChain(make_base(), order='random', random_state=seed)
    rcc.partial_fit(X, Y)
    assert np.array_equal(rcc.predict(X), y_expected)

    # MCC
    mcc = MonteCarloClassifierChain(make_base(), M=1000)
    mcc.partial_fit(X, Y)
    assert np.array_equal(mcc.predict(X), y_expected)

    # PCC
    pcc = ProbabilisticClassifierChain(make_base())
    pcc.partial_fit(X, Y)
    assert np.array_equal(pcc.predict(X), y_expected)
def test_classifier_chains():
    """Train a ClassifierChain incrementally and compare against recorded output.

    The learner is warmed up on 150 samples, then fed one sample at a time
    for 5000 iterations. Every 100th sample (except the first) is predicted
    before it is learned. The recorded expectations depend on whether
    ``sklearn_version`` starts with ``"0.21"``.
    """
    seed = 112
    stream = MultilabelGenerator(random_state=seed, n_targets=3, n_samples=5150)
    stream.prepare_for_use()
    estimator = SGDClassifier(random_state=seed, tol=1e-3, max_iter=10)
    learner = ClassifierChain(base_estimator=estimator, random_state=seed)
    X, y = stream.next_sample(150)
    learner.partial_fit(X, y)

    max_samples = 5000
    wait_samples = 100
    predictions = []
    correct_predictions = 0

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if np.array_equal(y[0], predictions[-1]):
                correct_predictions += 1

        learner.partial_fit(X, y)

    if not sklearn_version.startswith("0.21"):
        expected_predictions = [[0.0, 0.0, 1.0], [1.0, 0.0, 0.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0], [1.0, 0.0, 1.0],
                                [1.0, 0.0, 0.0], [1.0, 0.0, 1.0], [1.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 0.0],
                                [1.0, 0.0, 0.0], [0.0, 1.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0],
                                [0.0, 0.0, 1.0], [1.0, 1.0, 1.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0], [0.0, 0.0, 0.0],
                                [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 1.0, 1.0], [0.0, 1.0, 1.0],
                                [1.0, 1.0, 1.0], [0.0, 0.0, 0.0], [0.0, 1.0, 0.0], [1.0, 1.0, 1.0], [1.0, 1.0, 0.0],
                                [0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0],
                                [1.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 1.0],
                                [0.0, 0.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 1.0, 0.0],
                                [0.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.0, 0.0, 0.0], [1.0, 1.0, 1.0]]
        expected_correct_predictions = 21
        expected_info = "ClassifierChain(base_estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n" \
                        "       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n" \
                        "       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=10,\n" \
                        "       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',\n" \
                        "       power_t=0.5, random_state=112, shuffle=True, tol=0.001,\n" \
                        "       validation_fraction=0.1, verbose=0, warm_start=False),\n" \
                        "                order=None, random_state=112)"
    else:
        expected_predictions = [[0.0, 0.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 1.0], [1.0, 0.0, 1.0], [0.0, 0.0, 1.0],
                                [1.0, 0.0, 0.0], [1.0, 0.0, 1.0], [1.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 0.0],
                                [1.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0],
                                [0.0, 0.0, 1.0], [1.0, 0.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 1.0], [0.0, 0.0, 0.0],
                                [0.0, 1.0, 1.0], [0.0, 1.0, 1.0], [0.0, 0.0, 1.0], [0.0, 1.0, 1.0], [0.0, 1.0, 1.0],
                                [0.0, 1.0, 1.0], [0.0, 1.0, 0.0], [0.0, 1.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 0.0],
                                [0.0, 1.0, 1.0], [1.0, 0.0, 1.0], [0.0, 1.0, 1.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0],
                                [1.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 1.0],
                                [0.0, 0.0, 1.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0], [0.0, 1.0, 0.0],
                                [0.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.0, 0.0, 0.0], [1.0, 1.0, 1.0]]
        expected_correct_predictions = 26
        expected_info = "ClassifierChain(base_estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n" \
                        "              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n" \
                        "              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=10,\n" \
                        "              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,\n" \
                        "              random_state=112, shuffle=True, tol=0.001,\n" \
                        "              validation_fraction=0.1, verbose=0, warm_start=False),\n" \
                        "                order=None, random_state=112)"

    # np.array_equal already returns a single bool; the former np.alltrue
    # wrapper was redundant and np.alltrue is gone in NumPy 2.0.
    assert np.array_equal(predictions, expected_predictions)
    assert correct_predictions == expected_correct_predictions
    assert learner.get_info() == expected_info

    assert isinstance(learner.predict(X), np.ndarray)
def test_classifier_chains():
    """Train a ClassifierChain incrementally and compare against recorded output.

    The learner is warmed up on 150 samples, then fed one sample at a time
    for 5000 iterations. Every 100th sample (except the first) is predicted
    before it is learned. The recorded expectations are the same for every
    scikit-learn version, so no version switch is needed.
    """
    seed = 112
    stream = MultilabelGenerator(random_state=seed,
                                 n_targets=3,
                                 n_samples=5150)

    estimator = SGDClassifier(random_state=seed, max_iter=10)
    learner = ClassifierChain(base_estimator=estimator, random_state=seed)
    X, y = get_next_n_samples(stream, 150)
    learner.partial_fit(X, y)

    max_samples = 5000
    wait_samples = 100
    predictions = []
    correct_predictions = 0

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if np.array_equal(y[0], predictions[-1]):
                correct_predictions += 1

        learner.partial_fit(X, y)

    expected_predictions = [[0., 0., 1.], [0., 0., 0.], [1., 0., 1.],
                            [1., 0., 1.], [0., 0., 1.], [1., 0., 0.],
                            [1., 0., 1.], [1., 0., 1.], [0., 0., 1.],
                            [0., 0., 0.], [1., 0., 1.], [0., 0., 1.],
                            [0., 0., 1.], [0., 0., 1.], [0., 0., 1.],
                            [0., 0., 1.], [1., 0., 1.], [0., 0., 0.],
                            [1., 0., 1.], [0., 0., 0.], [0., 1., 1.],
                            [0., 1., 1.], [0., 0., 1.], [0., 1., 1.],
                            [0., 1., 1.], [0., 1., 1.], [0., 1., 0.],
                            [0., 1., 0.], [1., 1., 1.], [0., 1., 0.],
                            [0., 1., 1.], [1., 0., 1.], [0., 1., 1.],
                            [0., 0., 0.], [0., 0., 0.], [1., 0., 0.],
                            [1., 1., 1.], [0., 1., 1.], [0., 0., 0.],
                            [1., 0., 1.], [0., 0., 1.], [0., 0., 0.],
                            [0., 0., 0.], [0., 0., 1.], [0., 1., 0.],
                            [0., 0., 0.], [1., 1., 1.], [0., 0., 0.],
                            [1., 1., 1.]]
    # np.array_equal already returns a single bool; np.alltrue is gone in
    # NumPy 2.0 and was redundant here.
    assert np.array_equal(predictions, expected_predictions)

    expected_correct_predictions = 26
    assert correct_predictions == expected_correct_predictions

    expected_info = "ClassifierChain(base_estimator=SGDClassifier(max_iter=10, " \
                    "random_state=112), order=None, random_state=112)"
    # Collapse every run of whitespace (including newlines) to one space so
    # the comparison does not depend on how the repr is wrapped.
    info = " ".join(learner.get_info().split())
    assert info == expected_info

    assert isinstance(learner.predict(X), np.ndarray)
         n_targets=data_stream.n_targets
     ),
     "ensemble": False
 },
 "br_nb": {
     "name": "Binary Relevance - Naive Bayes",
     "model": lambda data_stream: MultiOutputLearner(
         NaiveBayes(),
         n_targets=data_stream.n_targets
     ),
     "ensemble": False
 },
 "cc": {
     "name": "Classifier Chain - Perceptron",
     "model": lambda data_stream: ClassifierChain(
         Perceptron(),
         n_targets=data_stream.n_targets
     ),
     "ensemble": False
 },
 "cc_ht": {
     "name": "Binary Relevance - HoeffdingTreeClassifier",
     "model": lambda data_stream: ClassifierChain(
         PerceptronMask(),
         n_targets=data_stream.n_targets
     ),
     "ensemble": False
 },
 "cc_nb": {
     "name": "Classifier Chain - Naive Bayes",
     "model": lambda data_stream: ClassifierChain(
         NaiveBayes(),
Example #7
0
class LabelPredict:
    """Label tweets with categories, a sentiment and a relevance flag.

    Three models share one TF-IDF representation of the tweet text:

    * ``lcc``    -- ``ClassifierChain`` over the category names in ``labels``;
    * ``clsent`` -- ``KNNClassifier`` over the sentiment ids in ``labels_sent``;
    * ``clrel``  -- ``KNNClassifier`` for the binary ``"Irrelevant"`` flag.
    """

    def __init__(self, texts: list):
        """Fit the TF-IDF vocabulary and create the untrained classifiers.

        Args:
            texts: Documents passed to ``TfidfVectorizer.fit``.
        """
        self.tokenizer = TfidfVectorizer()
        self.tokenizer.fit(texts)

        # Sentiment name -> integer class id used as the sentiment target.
        self.labels_sent = {"POSITIVE": 0, "NEUTRAL": 1, "NEGATIVE": 2}
        # Class id -> {sentiment name: bool} with exactly one True entry.
        self.reverse_sent = {
            class_id: {name: name == sentiment for name in self.labels_sent}
            for sentiment, class_id in self.labels_sent.items()
        }

        self.labels_relevance = ["Irrelevant"]
        # Category label names; filled in by retrain().
        self.labels = []
        self.lcc = ClassifierChain(SGDClassifier(max_iter=100, loss='log', random_state=1))
        self.clrel = KNNClassifier()
        self.clsent = KNNClassifier()

    def _vectorize(self, texts: list):
        """Return the dense 2-D TF-IDF matrix (one row per text)."""
        return np.array(self.tokenizer.transform(texts).todense())

    def _sentiment_target(self, labeldict: dict):
        """Return the sentiment class id encoded in ``labeldict``.

        Exactly one active sentiment label is used as-is; none or several
        active sentiment labels fall back to NEUTRAL.
        """
        active = [name for name, flag in labeldict.items()
                  if name in self.labels_sent and flag]
        if len(active) == 1:
            return self.labels_sent[active[0]]
        return self.labels_sent["NEUTRAL"]

    def _relevance_target(self, labeldict: dict):
        """Return 1 if the relevance label is present and truthy, else 0."""
        return 1 if labeldict.get(self.labels_relevance[0]) else 0

    def _labels2array(self, labeldict: dict):
        """Encode ``labeldict`` as a 0/1 vector aligned with ``self.labels``.

        A position is 1 only when the label is present and its value compares
        equal to ``True``; missing labels and any other value give 0.
        """
        return np.array([
            int(labeldict.get(label) == True)  # noqa: E712 - equality, not truthiness
            for label in self.labels
        ])

    def retrain(self, labeled_tweets: list):
        """Rebuild the category vocabulary and refit all three classifiers.

        Only tweets that have a non-empty ``"labels"`` mapping take part.

        Args:
            labeled_tweets: Dicts with a ``"tweet"`` text and, optionally, a
                ``"labels"`` mapping of label name to a truthy/falsy value.
        """
        annotated = [tweet for tweet in labeled_tweets
                     if "labels" in tweet and len(tweet["labels"]) > 0]

        # Sorted so that the column order of the chain's target matrix is
        # reproducible: the iteration order of a set of strings depends on
        # the interpreter's hash seed and may differ between runs.
        self.labels = sorted({
            name
            for tweet in annotated
            for name in tweet["labels"]
            if not (name in self.labels_sent or name in self.labels_relevance)
        })
        assert "Irrelevant" not in self.labels, "Something went wrong"

        # Start from fresh, untrained models.
        self.lcc = ClassifierChain(SGDClassifier(max_iter=100, loss='log', random_state=1))
        self.clrel = KNNClassifier()
        self.clsent = KNNClassifier()

        X = self._vectorize([tweet["tweet"] for tweet in annotated])
        y = np.array([self._labels2array(tweet["labels"]) for tweet in annotated])
        ys = np.array([self._sentiment_target(tweet["labels"]) for tweet in annotated])
        yr = np.array([self._relevance_target(tweet["labels"]) for tweet in annotated])

        self.clsent.fit(X, ys)
        print("Trained Sentiment Classifier")
        self.clrel.fit(X, yr)
        print("Trained Relevance Classifier")

        # The chain is trained only on tweets with at least one category set.
        has_category = np.array([row.sum() > 0 for row in y], dtype=bool)
        self.lcc.fit(X[has_category], y[has_category])
        print("Trained Catecorical Classifier")

    def predict(self, text: str):
        """Predict every known label for ``text``.

        Returns:
            A dict mapping each category name, each sentiment name and the
            relevance name to a bool.
        """
        X = self._vectorize([text])
        predicted = self.lcc.predict(X)
        labels_add = {label: bool(value) for label, value in zip(self.labels, predicted.flatten())}

        sent_pred = self.clsent.predict(X)
        labels_add.update(self.reverse_sent[int(sent_pred.flatten()[0])])

        assert "POSITIVE" in labels_add, "Klassifikation nicht eindeutig"

        relevance_pred = np.asarray(self.clrel.predict(X)).flatten()[0]
        labels_add[self.labels_relevance[0]] = bool(relevance_pred == 1)
        return labels_add

    def train_item(self, tweet):
        """Incrementally update the classifiers with one labelled tweet.

        Returns:
            ``False`` (without training anything) when the tweet carries a
            label name unknown to this instance, which means ``retrain`` is
            needed; ``True`` otherwise.
        """
        text = tweet["tweet"]
        labeldict = tweet["labels"]
        for name in labeldict:
            if (name not in self.labels
                    and name not in self.labels_relevance
                    and name not in self.labels_sent):
                print("RETRAIN!")
                return False

        y = self._labels2array(labeldict).reshape((1, -1))
        X = self._vectorize([text])
        ys = np.array([self._sentiment_target(labeldict)])
        yr = np.array([self._relevance_target(labeldict)])

        # As in retrain(), the chain only sees tweets with a category set.
        if y.sum() > 0:
            self.lcc.partial_fit(X, y)
        # Sentiment and relevance are trained on every tweet, as in
        # retrain(). Gating on the target's sum would drop class id 0, i.e.
        # every POSITIVE tweet and every tweet not marked "Irrelevant".
        self.clrel.partial_fit(X, yr)
        self.clsent.partial_fit(X, ys)
        return True