def test_classifier_chains_all():
    """Fit every classifier-chain variant on the logical toy data.

    Each variant (plain, random-order, Monte Carlo and probabilistic chain)
    is trained with ``partial_fit`` on the output of ``make_logical`` and
    must reproduce the same expected label matrix when predicting on the
    data it was trained on.
    """
    seed = 1
    X, Y = make_logical(random_state=seed)
    # All four chain variants are expected to produce the same predictions.
    y_expected = [[1, 0, 1], [1, 1, 0], [0, 0, 0], [1, 1, 0]]

    def new_base_estimator():
        # A fresh, identically configured estimator for every chain so that
        # no fitted state is shared between the variants.
        return SGDClassifier(max_iter=100, tol=1e-3, loss='log', random_state=seed)

    def fit_and_check(chain):
        # Train incrementally, then compare predictions on the training data.
        chain.partial_fit(X, Y)
        y_predicted = chain.predict(X)
        # np.all replaces np.alltrue, which was removed in NumPy 2.0.
        assert np.all(y_predicted == y_expected)
        return chain

    # CC
    cc = fit_and_check(ClassifierChain(new_base_estimator()))
    assert isinstance(cc.predict_proba(X), np.ndarray)

    # RCC
    fit_and_check(ClassifierChain(new_base_estimator(), order='random', random_state=seed))

    # MCC
    fit_and_check(MonteCarloClassifierChain(new_base_estimator(), M=1000))

    # PCC
    fit_and_check(ProbabilisticClassifierChain(new_base_estimator()))
def test_classifier_chains():
    """Prequential regression test for ``ClassifierChain`` on a multilabel stream.

    The learner is pre-trained on 150 samples, then 5000 samples are
    processed one at a time. Every 100th sample (except the very first) is
    predicted before being used for training, and the collected predictions,
    the number of exact matches and the learner's info string are compared
    against recorded reference values.
    """
    seed = 112
    stream = MultilabelGenerator(random_state=seed, n_targets=3, n_samples=5150)
    estimator = SGDClassifier(random_state=seed, max_iter=10)
    learner = ClassifierChain(base_estimator=estimator, random_state=seed)

    # Warm-up batch.
    X, y = get_next_n_samples(stream, 150)
    learner.partial_fit(X, y)

    max_samples = 5000
    wait_samples = 100
    predictions = []
    correct_predictions = 0

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (test-then-train; sample 0 is skipped).
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            if np.array_equal(y[0], predictions[-1]):
                correct_predictions += 1
        learner.partial_fit(X, y)

    # The reference values used to be duplicated in two identical branches
    # keyed on the scikit-learn version; they are the same for both, so a
    # single copy is kept.
    expected_predictions = [
        [0.0, 0.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 1.0], [1.0, 0.0, 1.0],
        [0.0, 0.0, 1.0], [1.0, 0.0, 0.0], [1.0, 0.0, 1.0],
        [1.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 1.0],
        [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0],
        [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [1.0, 0.0, 1.0], [0.0, 0.0, 0.0],
        [1.0, 0.0, 1.0], [0.0, 0.0, 0.0], [0.0, 1.0, 1.0],
        [0.0, 1.0, 1.0], [0.0, 0.0, 1.0], [0.0, 1.0, 1.0], [0.0, 1.0, 1.0],
        [0.0, 1.0, 1.0], [0.0, 1.0, 0.0], [0.0, 1.0, 0.0],
        [1.0, 1.0, 1.0], [0.0, 1.0, 0.0], [0.0, 1.0, 1.0], [1.0, 0.0, 1.0],
        [0.0, 1.0, 1.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0],
        [1.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0], [0.0, 0.0, 0.0],
        [1.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0], [0.0, 0.0, 1.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0],
        [1.0, 1.0, 1.0], [0.0, 0.0, 0.0], [1.0, 1.0, 1.0],
    ]
    # np.array_equal already returns a single bool; the former np.alltrue
    # wrapper was redundant and np.alltrue was removed in NumPy 2.0.
    assert np.array_equal(predictions, expected_predictions)

    expected_correct_predictions = 26
    assert correct_predictions == expected_correct_predictions

    expected_info = (
        "ClassifierChain(base_estimator=SGDClassifier(max_iter=10, "
        "random_state=112), order=None, random_state=112)"
    )
    # Collapse all whitespace so line wrapping in get_info() is irrelevant.
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info

    assert isinstance(learner.predict(X), np.ndarray)
def test_classifier_chains():
    """Prequential regression test for ``ClassifierChain`` on a multilabel stream.

    The learner is pre-trained on 150 samples, then 5000 samples are
    processed one at a time. Every 100th sample (except the very first) is
    predicted before being used for training. The reference predictions,
    the number of exact matches and the info string depend on whether the
    installed scikit-learn version string starts with ``"0.21"``.
    """
    seed = 112
    stream = MultilabelGenerator(random_state=seed, n_targets=3, n_samples=5150)
    stream.prepare_for_use()
    estimator = SGDClassifier(random_state=seed, tol=1e-3, max_iter=10)
    learner = ClassifierChain(base_estimator=estimator, random_state=seed)

    # Warm-up batch.
    X, y = stream.next_sample(150)
    learner.partial_fit(X, y)

    max_samples = 5000
    wait_samples = 100
    predictions = []
    correct_predictions = 0

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (test-then-train; sample 0 is skipped).
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            if np.array_equal(y[0], predictions[-1]):
                correct_predictions += 1
        learner.partial_fit(X, y)

    # NOTE(review): the whitespace inside the expected_info literals below is
    # reproduced exactly as it appears in this file; confirm it against the
    # real get_info() output if the comparison fails on indentation.
    if not sklearn_version.startswith("0.21"):
        expected_predictions = [
            [0.0, 0.0, 1.0], [1.0, 0.0, 0.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0],
            [1.0, 0.0, 1.0], [1.0, 0.0, 0.0], [1.0, 0.0, 1.0],
            [1.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 0.0],
            [0.0, 1.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0],
            [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [1.0, 1.0, 1.0], [1.0, 0.0, 1.0],
            [1.0, 1.0, 1.0], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0],
            [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 1.0, 1.0], [0.0, 1.0, 1.0],
            [1.0, 1.0, 1.0], [0.0, 0.0, 0.0], [0.0, 1.0, 0.0],
            [1.0, 1.0, 1.0], [1.0, 1.0, 0.0], [0.0, 1.0, 0.0], [1.0, 0.0, 1.0],
            [1.0, 1.0, 1.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0],
            [1.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0], [0.0, 0.0, 0.0],
            [1.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 0.0],
            [1.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0],
            [1.0, 1.0, 1.0], [0.0, 0.0, 0.0], [1.0, 1.0, 1.0],
        ]
        expected_correct_predictions = 21
        expected_info = (
            "ClassifierChain(base_estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n"
            " early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n"
            " l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=10,\n"
            " n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',\n"
            " power_t=0.5, random_state=112, shuffle=True, tol=0.001,\n"
            " validation_fraction=0.1, verbose=0, warm_start=False),\n"
            " order=None, random_state=112)"
        )
    else:
        expected_predictions = [
            [0.0, 0.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 1.0], [1.0, 0.0, 1.0],
            [0.0, 0.0, 1.0], [1.0, 0.0, 0.0], [1.0, 0.0, 1.0],
            [1.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 1.0],
            [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0],
            [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [1.0, 0.0, 1.0], [0.0, 0.0, 0.0],
            [1.0, 0.0, 1.0], [0.0, 0.0, 0.0], [0.0, 1.0, 1.0],
            [0.0, 1.0, 1.0], [0.0, 0.0, 1.0], [0.0, 1.0, 1.0], [0.0, 1.0, 1.0],
            [0.0, 1.0, 1.0], [0.0, 1.0, 0.0], [0.0, 1.0, 0.0],
            [1.0, 1.0, 1.0], [0.0, 1.0, 0.0], [0.0, 1.0, 1.0], [1.0, 0.0, 1.0],
            [0.0, 1.0, 1.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0],
            [1.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0], [0.0, 0.0, 0.0],
            [1.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0], [0.0, 0.0, 1.0], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0],
            [1.0, 1.0, 1.0], [0.0, 0.0, 0.0], [1.0, 1.0, 1.0],
        ]
        expected_correct_predictions = 26
        expected_info = (
            "ClassifierChain(base_estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n"
            " early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n"
            " l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=10,\n"
            " n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,\n"
            " random_state=112, shuffle=True, tol=0.001,\n"
            " validation_fraction=0.1, verbose=0, warm_start=False),\n"
            " order=None, random_state=112)"
        )

    # np.array_equal already returns a single bool; the former np.alltrue
    # wrapper was redundant and np.alltrue was removed in NumPy 2.0.
    assert np.array_equal(predictions, expected_predictions)
    assert correct_predictions == expected_correct_predictions
    assert learner.get_info() == expected_info

    assert isinstance(learner.predict(X), np.ndarray)
class LabelPredict:
    """Tweet labeller built from three separately trained classifiers.

    * ``lcc``    - a ``ClassifierChain`` over ``SGDClassifier`` predicting the
      free-form labels listed in ``self.labels`` (multi-label target).
    * ``clsent`` - a ``KNNClassifier`` predicting one sentiment class id
      (see ``self.labels_sent``).
    * ``clrel``  - a ``KNNClassifier`` predicting the binary "Irrelevant" flag.

    Texts are converted to dense TF-IDF vectors by ``self.tokenizer``, whose
    vocabulary is fitted once in the constructor.
    """

    def __init__(self, texts: list):
        """Fit the TF-IDF vocabulary on ``texts`` and create untrained models."""
        self.tokenizer = TfidfVectorizer()
        self.tokenizer.fit(texts)
        # Sentiment label name -> integer class id used as target of clsent.
        self.labels_sent = {"POSITIVE": 0, "NEUTRAL": 1, "NEGATIVE": 2}
        # Sentiment class id -> flag dict merged into the prediction result.
        self.reverse_sent = {
            0: {"POSITIVE": True, "NEUTRAL": False, "NEGATIVE": False},
            1: {"POSITIVE": False, "NEUTRAL": True, "NEGATIVE": False},
            2: {"POSITIVE": False, "NEUTRAL": False, "NEGATIVE": True},
        }
        self.labels_relevance = ["Irrelevant"]
        # Free-form label names; filled in by retrain().
        self.labels = []
        self._reset_classifiers()

    def _reset_classifiers(self) -> None:
        """Replace all three models with fresh, untrained instances."""
        self.lcc = ClassifierChain(SGDClassifier(max_iter=100, loss='log', random_state=1))
        self.clrel = KNNClassifier()
        self.clsent = KNNClassifier()

    def _vectorize(self, texts: list) -> np.ndarray:
        """Return the dense TF-IDF matrix for ``texts`` as a plain ndarray."""
        return np.array(self.tokenizer.transform(texts).todense())

    def _sentiment_target(self, labeldict: dict) -> int:
        """Return the sentiment class id encoded in ``labeldict``.

        Exactly one active sentiment label selects that class; zero or
        several active sentiment labels fall back to NEUTRAL.
        """
        active = [name for name, value in labeldict.items()
                  if name in self.labels_sent and value]
        if len(active) == 1:
            return self.labels_sent[active[0]]
        return self.labels_sent["NEUTRAL"]

    def _relevance_target(self, labeldict: dict) -> int:
        """Return 1 if the relevance label is present and truthy, else 0."""
        return 1 if labeldict.get(self.labels_relevance[0]) else 0

    def _labels2array(self, labeldict: dict) -> np.ndarray:
        """Encode ``labeldict`` as a 0/1 vector ordered like ``self.labels``."""
        # `== True` is kept on purpose: only values equal to True count as
        # set, exactly as before (truthy non-boolean values do not).
        return np.array([
            1 if labeldict.get(label) == True else 0  # noqa: E712
            for label in self.labels
        ])

    def retrain(self, labeled_tweets: list):
        """Rebuild the label set and train all three models from scratch.

        Parameters
        ----------
        labeled_tweets : list
            Dicts with a ``"tweet"`` text and a ``"labels"`` dict mapping
            label names to flags. Tweets without (or with empty) labels are
            ignored.
        """
        # Tweets that actually carry labels; a missing, empty or None
        # "labels" entry is skipped.
        labeled = [tweet for tweet in labeled_tweets if tweet.get("labels")]

        # Every label name that is neither a sentiment nor the relevance
        # label becomes a column of the multi-label target.
        names = set()
        for tweet in labeled:
            names.update(
                name for name in tweet["labels"]
                if not (name in self.labels_sent or name in self.labels_relevance)
            )
        self.labels = list(names)
        assert "Irrelevant" not in self.labels, "Something went wrong"

        self._reset_classifiers()

        texts = [tweet["tweet"] for tweet in labeled]
        X = self._vectorize(texts)
        y = np.array([self._labels2array(tweet["labels"]) for tweet in labeled])
        ys = np.array([self._sentiment_target(tweet["labels"]) for tweet in labeled])
        yr = np.array([self._relevance_target(tweet["labels"]) for tweet in labeled])

        self.clsent.fit(X, ys)
        print("Trained Sentiment Classifier")
        self.clrel.fit(X, yr)
        print("Trained Relevance Classifier")

        # The chain is only trained on tweets with at least one free-form label.
        with_labels = [(row, target) for row, target in zip(X, y) if target.sum() > 0]
        X = np.array([row for row, _ in with_labels])
        y = np.array([target for _, target in with_labels])
        self.lcc.fit(X, y)
        print("Trained Catecorical Classifier")

    def predict(self, text: str):
        """Return a dict mapping every known label name to a bool for ``text``.

        The result contains the free-form labels, the three sentiment flags
        and the relevance flag.
        """
        X = self._vectorize([text]).reshape((1, -1))
        predicted = self.lcc.predict(X)
        labels_add = {label: bool(value)
                      for label, value in zip(self.labels, predicted.flatten())}

        sent_pred = self.clsent.predict(X)
        labels_add.update(self.reverse_sent[sent_pred.flatten()[0]])
        assert "POSITIVE" in labels_add, "Klassifikation nicht eindeutig"

        # Truth value of the (single-element) comparison, as before.
        labels_add[self.labels_relevance[0]] = bool(self.clrel.predict(X) == np.array([1]))
        return labels_add

    def train_item(self, tweet):
        """Incrementally train the models on one labelled tweet.

        Returns
        -------
        bool
            ``False`` (after printing ``RETRAIN!``) when the tweet carries a
            label name unknown to the current models, meaning a full
            ``retrain`` is required; ``True`` after the update otherwise.
        """
        text = tweet["tweet"]
        labeldict = tweet["labels"]
        for name in labeldict:
            known = (name in self.labels
                     or name in self.labels_relevance
                     or name in self.labels_sent)
            if not known:
                print("RETRAIN!")
                return False

        y = self._labels2array(labeldict).reshape((1, -1))
        X = self._vectorize([text]).reshape((1, -1))
        ys = np.array([self._sentiment_target(labeldict)])
        yr = np.array([self._relevance_target(labeldict)])

        # Only update the chain when at least one free-form label is set
        # (mirrors the filtering done in retrain()).
        if y.sum() > 0:
            self.lcc.partial_fit(X, y)
        # NOTE(review): this guard skips samples whose relevance target is 0,
        # while retrain() fits on both classes - confirm this is intended.
        if yr.sum() > 0:
            self.clrel.partial_fit(X, yr)
        # Always update the sentiment model: class id 0 (POSITIVE) is a valid
        # target, so the former `ys.sum() > 0` guard silently dropped every
        # positive sample.
        self.clsent.partial_fit(X, ys)
        return True