def fit(self,
            X,
            y=None,
            n_estimators=9,
            max_features="auto",
            sample_weight=False,
            random_state=None):
        """Train a Random Forest on the features extracted from ``X``.

        Features and labels come from ``self.feature_extractor.transform``;
        ``y`` is never read. When no features are extracted, an error is
        logged and the method returns before any parameter is checked or
        any model is created.

        Args:
            X: Input handed to ``self.feature_extractor.transform``.
            y: Unused.
            n_estimators: Stored in ``self.config`` and forwarded to
                ``RandomForestClassifier``.
            max_features: Either an integer, or one of the strings
                ``"auto"``, ``"log"`` or ``"sqrt"``. Strings are resolved
                through ``self._get_max_features`` and the resolved value
                must itself be an integer.
            sample_weight: Flag. When truthy, sample weights are computed
                with ``to_weights(labels)``; otherwise no weights are used.
            random_state: Stored in ``self.config`` and forwarded to
                ``RandomForestClassifier``.

        Returns:
            ``self``.

        Raises:
            ValueError: If ``max_features`` is a string other than
                ``"auto"``, ``"log"`` or ``"sqrt"``.
            TypeError: If ``max_features`` (after resolving a string) is
                not an integer. Booleans are rejected as well.
        """
        # Imported locally so the block does not depend on a file-level import.
        from numbers import Integral

        # get features and labels
        features, labels = self.feature_extractor.transform(
            X, relation_labels=self.relation_labels)
        if is_empty(features):
            log.error("No examples to train, quiting...")
            return self

        log.info("Checking parameters...")
        if isinstance(max_features, str):
            if max_features == "auto":
                max_features = self._get_max_features(features)
            elif max_features == "log":
                max_features = self._get_max_features(features, method=log2)
            elif max_features == "sqrt":
                max_features = self._get_max_features(features, method=sqrt)
            else:
                raise ValueError(
                    "Unknown method '{}' for feature selection in Random Forest"
                    .format(max_features))
        # Accept any integral type (e.g. numpy integers), not only the
        # builtin ``int``. ``bool`` is an ``int`` subclass, so it is excluded
        # explicitly to keep rejecting True/False.
        is_integer = (isinstance(max_features, Integral)
                      and not isinstance(max_features, bool))
        if not is_integer:
            raise TypeError(
                "The parameter 'max_features' must be either a string or integer."
            )

        self.config.set_parameters({
            "n_estimators": n_estimators,
            "max_features": max_features,
            "random_state": random_state
        })

        # create a model from the values held by the config
        self.rf = RandomForestClassifier(
            n_estimators=self.config.get_parameter("n_estimators"),
            max_features=self.config.get_parameter("max_features"),
            random_state=self.config.get_parameter("random_state"))

        log.info("Training Random Forest...")
        weights = to_weights(labels) if sample_weight else None
        self.rf.fit(features, labels, sample_weight=weights)
        return self
Exemple #2
0
    def fit(self, X, y=None, sample_weight=False):
        """Train a multinomial Naive Bayes model on features from ``X``.

        The config is validated and a new ``MultinomialNB`` is assigned to
        ``self.nb`` before features are extracted. If the extractor yields
        no features, an error is logged and the model is left unfitted.

        Args:
            X: Input handed to ``self.feature_extractor.transform``.
            y: Unused.
            sample_weight: Flag. When truthy, sample weights are computed
                with ``to_weights`` from the extracted labels.

        Returns:
            ``self``.
        """
        log.info("Checking parameters...")
        self.config.validate()

        # fresh, unfitted classifier
        self.nb = MultinomialNB()

        # extract the training matrix and its targets
        extracted = self.feature_extractor.transform(
            X, relation_labels=self.relation_labels)
        train_features, train_labels = extracted
        if is_empty(train_features):
            log.error("No examples to train, quiting...")
            return self

        log.info("Training Naive Bayes...")
        if sample_weight:
            row_weights = to_weights(train_labels)
        else:
            row_weights = None
        self.nb.fit(train_features, train_labels, sample_weight=row_weights)
        return self
    def fit(self, X, y=None, max_iterations=100, C=1, sample_weight=False):
        """Train a calibrated linear SVM on features extracted from ``X``.

        ``max_iterations`` and ``C`` are stored in ``self.config``, the
        config is validated, and a ``CalibratedClassifierCV`` wrapping a
        ``LinearSVC`` is assigned to ``self.svm``. If the extractor yields
        no features, an error is logged and the model is left unfitted.

        Args:
            X: Input handed to ``self.feature_extractor.transform``.
            y: Unused.
            max_iterations: Forwarded (via the config) as ``max_iter`` of
                ``LinearSVC``.
            C: Forwarded (via the config) as ``C`` of ``LinearSVC``.
            sample_weight: Flag. When truthy, sample weights are computed
                with ``to_weights`` from the extracted labels.

        Returns:
            ``self``.
        """
        log.info("Checking parameters...")
        hyperparams = {"max_iterations": max_iterations, "C": C}
        self.config.set_parameters(hyperparams)
        self.config.validate()

        # build the base estimator from the config, then wrap it
        base_svc = LinearSVC(
            max_iter=self.config.get_parameter("max_iterations"),
            C=self.config.get_parameter("C"))
        self.svm = CalibratedClassifierCV(base_svc)

        # extract the training matrix and its targets
        extracted = self.feature_extractor.transform(
            X, relation_labels=self.relation_labels)
        train_features, train_labels = extracted
        if is_empty(train_features):
            log.error("No examples to train, quiting...")
            return self

        log.info("Training SVM...")
        if sample_weight:
            row_weights = to_weights(train_labels)
        else:
            row_weights = None
        self.svm.fit(train_features, train_labels, sample_weight=row_weights)
        return self
    def transform(self, X, y=None):
        """Predict the most probable label for every extracted example.

        Features are extracted from ``X`` (the labels returned by the
        extractor are ignored). For each row returned by
        ``self.svm.predict_proba`` the label with the highest probability
        is selected; on ties the first such label wins.

        Args:
            X: Input handed to ``self.feature_extractor.transform``; its
                length is reported in the log message.
            y: Unused.

        Returns:
            ``X`` unchanged when no features are extracted; otherwise the
            result of
            ``self.feature_extractor.predictions_to_annotated_documents``
            called with a list of single-entry ``{label: score}`` dicts,
            one per probability row.
        """
        # get features (labels are ignored)
        features, _ = self.feature_extractor.transform(
            X, relation_labels=self.relation_labels)
        if is_empty(features):
            return X

        # make predictions
        log.info("Predicting relations in {} documents with SVM...".format(
            len(X)))
        probs = self.svm.predict_proba(features)
        # the order of labels corresponds to the order of probabilities
        labels = self.svm.classes_
        predicted_labels = []
        for prob in probs:
            # argmax returns the first index of the maximum in one pass,
            # avoiding a separate max followed by a float-equality search.
            best = numpy.argmax(prob)
            predicted_labels.append({labels[best]: prob[best]})

        # make annotated documents
        return self.feature_extractor.predictions_to_annotated_documents(
            predicted_labels)