Beispiel #1
0
 async def accuracy(self, sources: SourcesContext) -> Accuracy:
     """
     Score the stored regression line against the records in ``sources``.

     The ``(m, b, accuracy)`` tuple saved under ``"regression_line"`` in
     ``self.storage`` is loaded, ``m * x + b`` is computed for every record
     that has both the configured feature and the configured prediction
     target, and the actual and computed values are handed to
     ``coeff_of_deter``. The result replaces the stored accuracy and is
     returned wrapped in ``Accuracy``.

     Raises ``ModelNotTrained`` when no regression line has been stored.
     """
     saved_line = self.storage.get("regression_line", None)
     # Without a stored line there is nothing to assess
     if saved_line is None:
         raise ModelNotTrained("Train model before assessing for accuracy")
     # The stored accuracy is discarded; it is recomputed below
     slope, intercept, _ = saved_line
     feature_name = self.config.feature.name
     predict_name = self.config.predict.name
     inputs = []
     targets = []
     # Only records carrying both the input feature and the target are used
     async for record in sources.with_features([feature_name, predict_name]):
         inputs.append(record.feature(feature_name))
         targets.append(record.feature(predict_name))
     self.logger.debug("Number of test records: %d", len(inputs))
     # Values the stored line yields for each test input
     line_values = [slope * value + intercept for value in inputs]
     score = coeff_of_deter(targets, line_values)
     # Persist the line together with the freshly computed score
     self.storage["regression_line"] = slope, intercept, score
     return Accuracy(score)
Beispiel #2
0
    async def train(self, sources: SourcesContext) -> None:
        """
        Fit the Gaussian anomaly model and store its parameters.

        Every record that has all of ``self.features`` plus the configured
        prediction target contributes one row of feature values (non-scalar
        feature values are flattened into the row) and one label. The rows
        and labels are divided by ``split`` using ``self.parent.config.k``
        into a training part and a validation part. ``estimateGaussian`` is
        run on the training part, and ``selectThreshold`` chooses the
        threshold from the validation labels and the validation densities.

        The tuple ``(epsilon, F1val, mu, sigma2)`` is saved under
        ``"anomalies"`` in ``self.storage``, with ``mu`` and ``sigma2``
        converted to plain lists.
        """
        # Number of features
        nof = len(self.features)
        predict_name = self.parent.config.predict.name
        # X and Y data
        X = []
        Y = []
        # Go through all records that have the features we're training on and
        # the feature we want to predict.
        async for record in sources.with_features(
            self.features + [predict_name]
        ):
            record_data = []
            for feature in record.features(self.features).values():
                record_data.extend(
                    [feature] if np.isscalar(feature) else feature
                )
            X.append(record_data)
            Y.append(record.feature(predict_name))

        # First part is used for fitting, second part for validation
        k = self.parent.config.k
        X, Xval = split(X, k)
        Y, Yval = split(Y, k)

        # NOTE(review): the column count is the number of feature names, so
        # this reshape presumably fails if a non-scalar feature was flattened
        # into more than one value above; confirm against callers.
        X = np.reshape(X, (len(X), nof))
        Xval = np.reshape(Xval, (len(Xval), nof))
        Y = np.reshape(Y, (len(Y), 1))
        Yval = np.reshape(Yval, (len(Yval), 1))

        # Use self.logger to report how many records are being used for training
        self.logger.debug("Number of training records: %d", len(X))
        self.logger.debug("Number of records in validation set: %d", len(Xval))

        mu, sigma2 = estimateGaussian(X)

        # Only the validation densities are needed to pick the threshold; the
        # training-set densities and outlier indices were computed but never
        # used, so that work has been dropped.
        pval = multivariateGaussian(Xval, mu, sigma2)
        F1val, epsilon = selectThreshold(Yval, pval)

        # Save epsilon, F1 score and the fitted Gaussian parameters
        self.storage["anomalies"] = (
            epsilon,
            F1val,
            mu.tolist(),
            sigma2.tolist(),
        )
Beispiel #3
0
 async def train(self, sources: SourcesContext) -> None:
     """
     Fit a line to the records in ``sources`` and store the result.

     The configured feature is collected as the x values and the configured
     prediction target as the y values, from every record that has both.
     Whatever ``best_fit_line`` returns for them is saved under
     ``"regression_line"`` in ``self.storage``.
     """
     feature_name = self.config.feature.name
     predict_name = self.config.predict.name
     inputs, targets = [], []
     # Only records carrying both the input feature and the target are used
     async for record in sources.with_features([feature_name, predict_name]):
         inputs.append(record.feature(feature_name))
         targets.append(record.feature(predict_name))
     self.logger.debug("Number of training records: %d", len(inputs))
     # Stored value is later unpacked as (m, b, accuracy)
     self.storage["regression_line"] = best_fit_line(inputs, targets)
Beispiel #4
0
    async def accuracy(self, sources: SourcesContext) -> Accuracy:
        """
        Compute the F1 score of the stored anomaly model on ``sources``.

        The ``(epsilon, F1, mu, sigma2)`` tuple saved under ``"anomalies"``
        in ``self.storage`` is loaded. Each record that has all of
        ``self.features`` plus the configured prediction target contributes
        one row of feature values and one label. A row is predicted as ``1``
        when its density from ``multivariateGaussian`` is below ``epsilon``
        and ``0`` otherwise; ``getF1`` compares the labels with these
        predictions.

        The stored F1 value is replaced by the new score, which is returned
        wrapped in ``Accuracy``.

        Raises ``ModelNotTrained`` when nothing has been stored yet.
        """
        # Load saved anomalies
        anomalies = self.storage.get("anomalies", None)
        # Ensure the model has been trained before we try to assess it
        if anomalies is None:
            raise ModelNotTrained("Train model before assessing for accuracy.")

        # The F1 score from training is discarded; it is recomputed below
        epsilon, _F1val, mu, sigma2 = anomalies

        predict_name = self.parent.config.predict.name
        X = []
        Y = []
        # Go through all records that have the features we're testing on and
        # the feature we want to predict.
        async for record in sources.with_features(
            self.features + [predict_name]
        ):
            record_data = []
            for feature in record.features(self.features).values():
                record_data.extend(
                    [feature] if np.isscalar(feature) else feature
                )
            X.append(record_data)
            Y.append(record.feature(predict_name))

        self.logger.debug("Number of test records: %d", len(X))

        # Number of features
        nof = len(self.features)

        X = np.reshape(X, (len(X), nof))
        Y = np.reshape(Y, (len(Y), 1))

        # Parameters are stored as plain lists; turn them back into arrays
        mu = np.array(mu)
        sigma2 = np.array(sigma2)
        p = multivariateGaussian(X, mu, sigma2)

        # Densities below the threshold are flagged as anomalies (1)
        pred = (p < epsilon).astype(int)
        F1 = getF1(Y, pred)

        # Update the stored score to the one measured on the test data
        self.storage["anomalies"] = epsilon, F1, mu.tolist(), sigma2.tolist()
        return Accuracy(F1)
Beispiel #5
0
 async def predict(
         self, sources: SourcesContext
 ) -> AsyncIterator[Record]:
     """
     Attach a prediction to each record in ``sources`` and yield it.

     For every record that has the features named by
     ``self.parent.config.features.names()``, the value of the first entry
     of ``self.features`` is passed to ``self.predict_input``. The result
     is stored on the record for the configured prediction target, with
     ``self.separating_line[2]`` as the confidence, and the record itself
     is yielded.

     Raises ``ModelNotTrained`` when ``self.separating_line`` is ``None``.
     """
     # Ensure the model has been trained before we try to make a prediction
     if self.separating_line is None:
         raise ModelNotTrained("Train model before prediction.")
     target = self.config.predict.name
     async for record in sources.with_features(
             self.parent.config.features.names()):
         feature_data = record.features(self.features)
         record.predicted(
             target,
             # Only the first feature is fed to the model
             self.predict_input(feature_data[self.features[0]]),
             self.separating_line[2],
         )
         # Yield the updated record (not a tuple) to the caller
         yield record
Beispiel #6
0
 async def predict(
     self, sources: SourcesContext
 ) -> AsyncIterator[Record]:
     """
     Attach a prediction to each record in ``sources`` and yield it.

     For every record that has the features named by
     ``self.parent.config.features.names()``, the values of
     ``self.features`` are placed in a one-row DataFrame and passed to
     ``self.lm_predictor.compute`` together with ``self.lm_trained``. The
     ``prediction`` attribute of the result is stored on the record for
     the configured prediction target with a confidence of NaN.

     Raises ``ModelNotTrained`` when ``self.lm_trained`` is ``None``.
     """
     # Ensure the model has been trained before we try to make a prediction
     if self.lm_trained is None:
         raise ModelNotTrained("Train model before prediction.")
     # The target name does not change per record, so look it up once
     target = self.parent.config.predict.name
     # Iterate through each record that needs a prediction
     async for record in sources.with_features(
         self.parent.config.features.names()
     ):
         feature_data = record.features(self.features)
         # Single-row frame built from this record's feature values
         predict = self.pd.DataFrame(feature_data, index=[0])
         preds = self.lm_predictor.compute(predict, self.lm_trained)
         # No confidence value is available here, so NaN is recorded
         record.predicted(target, preds.prediction, float("nan"))
         # Yield the updated record (not a tuple) to the caller
         yield record
Beispiel #7
0
 async def predict(self, sources: SourcesContext) -> AsyncIterator[Record]:
     """
     Apply the stored regression line to each record and yield it.

     The ``(m, b, accuracy)`` tuple saved under ``"regression_line"`` in
     ``self.storage`` is loaded. For every record that has the configured
     feature, ``m * x + b`` is stored as the prediction for the configured
     target, with the stored accuracy as the confidence.

     Raises ``ModelNotTrained`` when no regression line has been stored.
     """
     stored_line = self.storage.get("regression_line", None)
     # Nothing to predict with until a line has been saved
     if stored_line is None:
         raise ModelNotTrained("Train model before prediction")
     slope, intercept, confidence = stored_line
     feature_name = self.config.feature.name
     async for record in sources.with_features([feature_name]):
         # Evaluate the line at this record's x value
         estimate = slope * record.feature(feature_name) + intercept
         record.predicted(self.config.predict.name, estimate, confidence)
         # Hand the updated record back to the caller
         yield record
Beispiel #8
0
 async def get_test_records(self, sources: SourcesContext):
     """
     Collect into a list every record in ``sources`` that has all of
     ``self.features``, in the order the source yields them.
     """
     return [
         record async for record in sources.with_features(self.features)
     ]
Beispiel #9
0
 async def predict(self, sources: SourcesContext) -> AsyncIterator[Record]:
     """
     Predict ``"Salary"`` as ten times the ``"by_ten"`` feature.

     Every record that has a ``"by_ten"`` feature gets that prediction,
     with the record's key converted to ``float`` as the confidence, and
     is then yielded.
     """
     async for record in sources.with_features(["by_ten"]):
         salary = record.feature("by_ten") * 10
         confidence = float(record.key)
         record.predicted("Salary", salary, confidence)
         yield record
Beispiel #10
0
 async def get_input_data(self, sources: SourcesContext) -> list:
     """
     Collect into a list every record in ``sources`` that has all of the
     features named by ``self.config.features.names()``, in the order the
     source yields them.
     """
     feature_names = self.config.features.names()
     return [record async for record in sources.with_features(feature_names)]