def retrain(self, raw_df):
    """
    Retrains this Level1MLModule on the given data.

    - Preprocesses the result_full_descriptions and labels in the given DataFrame
    - Fits the vectorizer to the given result_full_descriptions
    - Selects the best classifier by using a 5-fold cross-validation process
    - Trains the selected classifier on the given data

    The freshly trained vectorizer, classifier and scale are only stored on
    this module after every step has succeeded, so a failure part-way through
    leaves the previously trained state untouched instead of pairing a new
    vectorizer with an old classifier.

    :param raw_df: a DataFrame containing the raw training data extracted from the database
        - required columns: {"result_full_description", "level_1"}
    :return: None
    :raises ValueError: if the given DataFrame is empty
    """
    if raw_df.empty:
        raise ValueError("Cannot retrain Level1MLModule on empty set.")

    print("Level1MLModule: Started retraining")

    df = preprocess(raw_df)

    # Build everything in locals first; nothing on self changes until the end.
    vectorizer = self._get_vectorizer(df)
    # best_classifier returns a callable, which is invoked to get a new estimator.
    classifier_factory = best_classifier(
        df, "level_1", self._get_vectorizer, self._get_candidate_classifiers()
    )
    classifier = classifier_factory()

    X = vectorizer.transform(df["result_full_description"])
    y = df["level_1"]
    classifier.fit(X, y)

    # Only LinearSVC gets a new scale: the largest confidence seen on the
    # training data when get_confidences is called with scale=1.
    # NOTE(review): for any other classifier self.scale keeps its previous
    # value, exactly as before -- confirm that is intended.
    new_scale = None
    if isinstance(classifier, LinearSVC):
        confidences, _ = get_confidences(classifier, X, scale=1)
        new_scale = np.max(confidences)

    # Commit the new state in one go now that training has fully succeeded.
    self.vectorizer = vectorizer
    self.classifier = classifier
    if new_scale is not None:
        self.scale = new_scale

    print("Level1MLModule: Finished retraining")
def classify(self, raw_df, observations=False):
    """
    Classifies the given data.

    If the given DataFrame has no rows, an empty DataFrame with the result
    columns is returned and no prediction is attempted.

    :param raw_df: a DataFrame containing the raw test data extracted from the database
        - required columns: {"test_key", "result_key", "obs_seq_nbr" (if observations is True),
                             "result_full_description", "candidates" (if self.organisms is True)}
    :param observations: True if the data is given at the observation level,
        False if the data is given at the test level
    :return: a DataFrame containing the classification results
        - columns: {"test_key", "result_key", "obs_seq_nbr" (if observations is True),
                    "test_outcome_pred", "test_outcome_classifier",
                    "test_outcome_confidence", "test_outcome_confidence_type"}
    :raises ValueError: if this TestOutcomeModule has not been trained
    """
    if not self._is_trained():
        raise ValueError("TestOutcomeModule is not trained.")

    keys = get_keys(observations)
    if raw_df.shape[0] == 0:
        return pd.DataFrame(columns=keys + [
            "test_outcome_pred",
            "test_outcome_classifier",
            "test_outcome_confidence",
            "test_outcome_confidence_type",
        ])

    df = preprocess(raw_df, organisms=self.organisms)
    X = self.vectorizer.transform(df["result_full_description"])
    y_pred = self.classifier.predict(X)

    # Take an explicit copy of the key columns: adding columns to a bare
    # df.loc[...] selection can trigger pandas' SettingWithCopyWarning and
    # leaves it unclear whether df itself is being modified.
    result = df.loc[:, keys].copy()
    result["test_outcome_pred"] = y_pred
    # A single JSON description of the classifier, repeated on every row.
    result["test_outcome_classifier"] = json.dumps({
        "type": self.classifier.__class__.__name__,
        "params": self.classifier.get_params(),
    })

    confidence, confidence_type = get_confidences(self.classifier, X, self.scale)
    result["test_outcome_confidence"] = confidence
    result["test_outcome_confidence_type"] = confidence_type

    return result
from util.plot import plot_confusion_matrix
from util.preprocessor import preprocess
from util.save_util import save_classifier

save_path_prefix = "./resources/"

# --- Data preparation ---
data_path = "./data/train_data.csv"
data_headers = ["polarity", "id", "date", "query", "user", "text"]
train_size = 10000
# NOTE(review): this evaluates to the float 2000.0, not an int -- confirm
# that get_data accepts a non-integer size.
test_size = train_size * 0.2

x_test, y_test, x_train, y_train = get_data(data_path, train_size, test_size, data_headers)
x_test, y_test = preprocess(x_test, y_test)
x_train, y_train = preprocess(x_train, y_train)
train_labels, test_labels = update_labels(y_train, y_test)

# --- TF-IDF ---
tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 1))
vectorizer = tfidf.fit(x_train["text"])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
# Resolved once and reused as the column labels of both feature frames.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# toarray() yields a plain ndarray, whereas todense() yields a legacy np.matrix.
features_train = pd.DataFrame(
    vectorizer.transform(x_train["text"]).toarray(),
    columns=feature_names,
)
features_test = pd.DataFrame(
    vectorizer.transform(x_test["text"]).toarray(),
    columns=feature_names,
)

# --- Support Vector Machine classifier ---
clf = SVC(kernel='linear').fit(features_train.values, train_labels)
predicted = clf.predict(features_test.values)

# --- Metrics ---