Ejemplo n.º 1
0
    def retrain(self, raw_df):
        """
        Rebuilds the vectorizer and the classifier of this Level1MLModule
        from the given training data.

        The steps run in this order:
        - An empty DataFrame is rejected with a ValueError
        - The raw data is passed through preprocess()
        - self.vectorizer is replaced by the result of self._get_vectorizer
          on the preprocessed data
        - best_classifier() receives the preprocessed data, the "level_1"
          label column name, the vectorizer factory and the candidates from
          self._get_candidate_classifiers(); whatever it returns is called
          with no arguments and stored as self.classifier (the original
          documentation describes this selection as 5-fold cross-validation)
        - The classifier is fitted on the vectorized
          "result_full_description" column against the "level_1" column
        - Only when the classifier is a LinearSVC, self.scale is set to the
          largest confidence get_confidences() reports on the training
          features with scale=1

        A start and a finish message are printed to stdout.

        :param raw_df: a DataFrame containing the raw training data
        - required columns: {"result_full_description", "level_1"}
        :return: None
        :raises ValueError: if raw_df is empty
        """
        if raw_df.empty:
            raise ValueError("Cannot retrain Level1MLModule on empty set.")

        print("Level1MLModule: Started retraining")

        prepared = preprocess(raw_df)
        self.vectorizer = self._get_vectorizer(prepared)

        # Selection and instantiation are kept as two steps: the value
        # returned by best_classifier is itself called to obtain the
        # estimator that is fitted below.
        make_classifier = best_classifier(
            prepared,
            "level_1",
            self._get_vectorizer,
            self._get_candidate_classifiers(),
        )
        self.classifier = make_classifier()

        descriptions = prepared["result_full_description"]
        features = self.vectorizer.transform(descriptions)
        labels = prepared["level_1"]
        self.classifier.fit(features, labels)

        if isinstance(self.classifier, LinearSVC):
            # Only the confidences are needed; the second element of the
            # returned pair is discarded.
            training_confidences, _ignored = get_confidences(
                self.classifier, features, scale=1)
            self.scale = np.max(training_confidences)

        print("Level1MLModule: Finished retraining")
Ejemplo n.º 2
0
    def classify(self, raw_df, observations=False):
        """
        Classifies the given data. Raises a ValueError if this TestOutcomeModule
        has not been trained.

        An input with zero rows is answered immediately with an empty
        DataFrame that carries the result columns. Otherwise the data is
        preprocessed, vectorized and passed to the classifier.
        :param raw_df: a DataFrame containing the raw test data extracted from
        the database
        - required columns: {"test_key", "result_key", "obs_seq_nbr" (if
          observations is True), "result_full_description", "candidates" (if
          self.organisms is True)}
        :param observations: True if the data is given at the observation level,
        False if the data is given at the test level
        :return: a DataFrame containing the classification results
        - columns: {"test_key", "result_key", "obs_seq_nbr" (if observations is
          True), "test_outcome_pred", "test_outcome_classifier",
          "test_outcome_confidence", "test_outcome_confidence_type"}
        :raises ValueError: if self._is_trained() reports that no model is
        available
        """
        if not self._is_trained():
            raise ValueError("TestOutcomeModule is not trained.")

        keys = get_keys(observations)

        # Named once so the empty and the non-empty paths cannot drift apart.
        output_columns = [
            "test_outcome_pred", "test_outcome_classifier",
            "test_outcome_confidence", "test_outcome_confidence_type"
        ]

        if raw_df.shape[0] == 0:
            return pd.DataFrame(columns=keys + output_columns)

        df = preprocess(raw_df, organisms=self.organisms)

        X = self.vectorizer.transform(df["result_full_description"])
        y_pred = self.classifier.predict(X)

        # Explicit copy: the key columns are a selection from df, and the
        # assignments below add columns to it. Without the copy, pandas may
        # treat that as setting values on a slice (SettingWithCopyWarning).
        result = df.loc[:, keys].copy()
        result["test_outcome_pred"] = y_pred

        # A single JSON string describing the classifier; assigning the
        # scalar stores the same value in every row.
        result["test_outcome_classifier"] = json.dumps({
            "type":
            self.classifier.__class__.__name__,
            "params":
            self.classifier.get_params()
        })

        confidence, confidence_type\
            = get_confidences(self.classifier, X, self.scale)
        result["test_outcome_confidence"] = confidence
        result["test_outcome_confidence_type"] = confidence_type

        return result
Ejemplo n.º 3
0
from util.plot import plot_confusion_matrix
from util.preprocessor import preprocess
from util.save_util import save_classifier

save_path_prefix = "./resources/"

# --- Data preparation ---
data_path = "./data/train_data.csv"
data_headers = [
    "polarity",
    "id",
    "date",
    "query",
    "user",
    "text",
]

train_size = 10000
# Plain multiplication by 0.2, so test_size is the float 2000.0.
test_size = train_size * 0.2

x_test, y_test, x_train, y_train = get_data(
    data_path, train_size, test_size, data_headers
)

# The test split is preprocessed before the training split.
x_test, y_test = preprocess(x_test, y_test)
x_train, y_train = preprocess(x_train, y_train)

train_labels, test_labels = update_labels(y_train, y_test)

# --- TF-IDF ---
# The vectorizer is fitted on the training text only and then applied to
# both splits.
tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 1))
vectorizer = tfidf.fit(x_train["text"])

features_train = pd.DataFrame(
    vectorizer.transform(x_train["text"]).todense(),
    columns=tfidf.get_feature_names(),
)
features_test = pd.DataFrame(
    vectorizer.transform(x_test["text"]).todense(),
    columns=tfidf.get_feature_names(),
)

# --- Support Vector Machine classifier ---
clf = SVC(kernel="linear")
clf = clf.fit(features_train.values, train_labels)
predicted = clf.predict(features_test.values)

# --- Metrics ---