async def accuracy(self, sources: SourcesContext) -> Accuracy:
    # Load saved regression line
    regression_line = self.storage.get("regression_line", None)
    # Ensure the model has been trained before we try to make a prediction
    if regression_line is None:
        raise ModelNotTrained("Train model before assessing for accuracy")
    # Split regression line tuple into variables, ignore accuracy from
    # training data since we'll be re-calculating it for the test data
    m, b, _accuracy = regression_line
    # X and Y data
    x = []
    y = []
    # Go through all records that have the feature we're testing on and the
    # feature we want to predict.
    async for record in sources.with_features(
        [self.config.feature.name, self.config.predict.name]
    ):
        x.append(record.feature(self.config.feature.name))
        y.append(record.feature(self.config.predict.name))
    # Use self.logger to report how many records are being used for testing
    self.logger.debug("Number of test records: %d", len(x))
    # Calculate the regression line for test data and accuracy of line
    regression_line = [m * x_element + b for x_element in x]
    accuracy = coeff_of_deter(y, regression_line)
    # Update the accuracy to be the accuracy when assessed on the test data
    self.storage["regression_line"] = m, b, accuracy
    return Accuracy(accuracy)
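# NOTE: coeff_of_deter is called above but not defined in this excerpt. The
# sketch below is a minimal, assumed implementation of the coefficient of
# determination (R^2); the helper shipped with the tutorial may differ.
def coeff_of_deter(y, regression_line):
    # R^2 = 1 - SS_res / SS_tot, where SS_res is the squared error of the
    # regression line and SS_tot is the squared error of the mean of y
    mean_y = sum(y) / len(y)
    ss_res = sum((yi - fi) ** 2 for yi, fi in zip(y, regression_line))
    ss_tot = sum((yi - mean_y) ** 2 for yi in y)
    return 1 - (ss_res / ss_tot)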
async def predict(
    self, sources: SourcesContext
) -> AsyncIterator[Tuple[Record, Any, float]]:
    # Ensure the model has been trained before we try to make a prediction
    if not self.is_trained:
        raise ModelNotTrained("Train model before prediction.")
    async for record in sources.records():
        # Run the trained spaCy pipeline over the record's sentence
        doc = self.parent.nlp(record.feature("sentence"))
        # Collect the named entities found in the sentence
        prediction = [(ent.text, ent.label_) for ent in doc.ents]
        # No per-prediction confidence is available, so report NaN
        record.predicted("Tag", prediction, float("nan"))
        # Yield the record to the caller
        yield record
async def train(self, sources: SourcesContext) -> None:
    # Number of features
    nof = len(self.features)
    # X and Y data
    X = []
    Y = []
    # Go through all records that have the features we're training on and the
    # feature we want to predict.
    async for record in sources.with_features(
        self.features + [self.parent.config.predict.name]
    ):
        record_data = []
        for feature in record.features(self.features).values():
            record_data.extend([feature] if np.isscalar(feature) else feature)
        X.append(record_data)
        Y.append(record.feature(self.parent.config.predict.name))

    # Split the data into training and validation sets
    k = self.parent.config.k
    X1, Xval = split(X, k)
    X = X1
    Y1, Yval = split(Y, k)
    Y = Y1

    X = np.reshape(X, (len(X), nof))
    Xval = np.reshape(Xval, (len(Xval), nof))
    Y = np.reshape(Y, (len(Y), 1))
    Yval = np.reshape(Yval, (len(Yval), 1))

    # Use self.logger to report how many records are being used for training
    self.logger.debug("Number of training records: %d", len(X))
    self.logger.debug("Number of records in validation set: %d", len(Xval))

    # Fit a Gaussian to the training data, then pick the threshold epsilon
    # that maximizes the F1 score on the validation set
    mu, sigma2 = estimateGaussian(X)
    p = multivariateGaussian(X, mu, sigma2)
    pval = multivariateGaussian(Xval, mu, sigma2)
    F1val, epsilon = selectThreshold(Yval, pval)

    # Outliers in training set
    outliers = p < epsilon
    listOfOl = findIndices(outliers)

    # Save epsilon, the F1 score, and the fitted Gaussian parameters
    self.storage["anomalies"] = (
        epsilon,
        F1val,
        mu.tolist(),
        sigma2.tolist(),
    )
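# NOTE: estimateGaussian and multivariateGaussian are called above but not
# defined in this excerpt. The sketches below are assumed implementations of
# the standard Gaussian anomaly detection steps (per-feature mean/variance,
# then the product of independent Gaussian densities); they assume numpy is
# imported as np, and the helpers shipped with the model may differ.
def estimateGaussian(X):
    # Per-feature mean and variance of the training data
    mu = np.mean(X, axis=0)
    sigma2 = np.var(X, axis=0)
    return mu, sigma2


def multivariateGaussian(X, mu, sigma2):
    # Probability of each row under independent Gaussians (diagonal covariance)
    norm = 1.0 / np.sqrt(2.0 * np.pi * sigma2)
    exponent = np.exp(-((X - mu) ** 2) / (2.0 * sigma2))
    return np.prod(norm * exponent, axis=1)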
async def predict(
    self, sources: SourcesContext
) -> AsyncIterator[Tuple[Record, Any, float]]:
    # Ensure the model has been trained before we try to make a prediction
    if not os.path.isdir(
        os.path.join(self.parent.config.output_dir, "ner")
    ):
        raise ModelNotTrained("Train model before prediction.")
    # Load the trained spaCy pipeline from the output directory
    self.nlp = spacy.load(self.parent.config.output_dir)
    async for record in sources.records():
        doc = self.nlp(record.feature("sentence"))
        # Collect the named entities found in the sentence
        prediction = [(ent.text, ent.label_) for ent in doc.ents]
        # No per-prediction confidence is available, so report NaN
        record.predicted("Tag", prediction, float("nan"))
        # Yield the record to the caller
        yield record
async def train(self, sources: SourcesContext) -> None:
    # X and Y data
    x = []
    y = []
    # Go through all records that have the feature we're training on and the
    # feature we want to predict.
    async for record in sources.with_features(
        [self.config.feature.name, self.config.predict.name]
    ):
        x.append(record.feature(self.config.feature.name))
        y.append(record.feature(self.config.predict.name))
    # Use self.logger to report how many records are being used for training
    self.logger.debug("Number of training records: %d", len(x))
    # Save m, b, and accuracy
    self.storage["regression_line"] = best_fit_line(x, y)
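# NOTE: best_fit_line is called above but not defined in this excerpt. The
# sketch below is an assumed ordinary least squares fit returning the
# (m, b, accuracy) tuple that accuracy() and predict() unpack; the helper
# shipped with the tutorial may differ.
def best_fit_line(x, y):
    mean_x = sum(x) / len(x)
    mean_y = sum(y) / len(y)
    # Slope of the least squares line
    m = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y)) / sum(
        (xi - mean_x) ** 2 for xi in x
    )
    # Intercept
    b = mean_y - m * mean_x
    # Accuracy of the line on the training data (coefficient of determination)
    accuracy = coeff_of_deter(y, [m * xi + b for xi in x])
    return m, b, accuracy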
async def accuracy(self, sources: SourcesContext) -> Accuracy:
    # Load saved anomalies
    anomalies = self.storage.get("anomalies", None)
    # Ensure the model has been trained before we try to assess accuracy
    if anomalies is None:
        raise ModelNotTrained("Train model before assessing for accuracy.")
    epsilon, _F1val, mu, sigma2 = anomalies
    X = []
    Y = []
    # Go through all records that have the features we're testing on and the
    # feature we want to predict.
    async for record in sources.with_features(
        self.features + [self.parent.config.predict.name]
    ):
        record_data = []
        for feature in record.features(self.features).values():
            record_data.extend([feature] if np.isscalar(feature) else feature)
        X.append(record_data)
        Y.append(record.feature(self.parent.config.predict.name))

    # Use self.logger to report how many records are being used for testing
    self.logger.debug("Number of test records: %d", len(X))

    # Number of features
    nof = len(self.features)
    X = np.reshape(X, (len(X), nof))
    Y = np.reshape(Y, (len(Y), 1))
    mu = np.array(mu)
    sigma2 = np.array(sigma2)

    # Flag records whose probability under the fitted Gaussian falls below epsilon
    p = multivariateGaussian(X, mu, sigma2)
    pred = (p < epsilon).astype(int)
    F1 = getF1(Y, pred)
    outliers = p < epsilon
    listOfOl = findIndices(outliers)
    accuracy = F1

    # Update the accuracy
    self.storage["anomalies"] = epsilon, F1, mu.tolist(), sigma2.tolist()
    return Accuracy(accuracy)
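# NOTE: getF1 is called above but not defined in this excerpt. The sketch
# below is an assumed F1 computation for binary anomaly labels (1 = anomaly),
# using numpy imported as np; the helper shipped with the model may differ.
def getF1(Y, pred):
    Y = np.ravel(Y)
    pred = np.ravel(pred)
    tp = np.sum((pred == 1) & (Y == 1))
    fp = np.sum((pred == 1) & (Y == 0))
    fn = np.sum((pred == 0) & (Y == 1))
    if tp == 0:
        return 0.0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)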
async def predict(
    self, sources: SourcesContext
) -> AsyncIterator[Tuple[Record, Any, float]]:
    # Ensure the model has been trained before we try to make a prediction
    if self.separating_line is None:
        raise ModelNotTrained("Train model before prediction.")
    target = self.config.predict.name
    async for record in sources.with_features(
        self.parent.config.features.names()
    ):
        feature_data = record.features(self.features)
        record.predicted(
            target,
            self.predict_input(feature_data[self.features[0]]),
            self.separating_line[2],
        )
        yield record
async def predict(
    self, sources: SourcesContext
) -> AsyncIterator[Tuple[Record, Any, float]]:
    # Ensure the model has been trained before we try to make a prediction
    if self.lm_trained is None:
        raise ModelNotTrained("Train model before prediction.")
    # Iterate through each record that needs a prediction
    async for record in sources.with_features(
        self.parent.config.features.names()
    ):
        feature_data = record.features(self.features)
        predict = self.pd.DataFrame(feature_data, index=[0])
        preds = self.lm_predictor.compute(predict, self.lm_trained)
        target = self.parent.config.predict.name
        record.predicted(target, preds.prediction, float("nan"))
        # Yield the record to the caller
        yield record
async def predict(self, sources: SourcesContext) -> AsyncIterator[Record]:
    # Load saved regression line
    regression_line = self.storage.get("regression_line", None)
    # Ensure the model has been trained before we try to make a prediction
    if regression_line is None:
        raise ModelNotTrained("Train model before prediction")
    # Expand the regression_line into named variables
    m, b, accuracy = regression_line
    # Iterate through each record that needs a prediction
    async for record in sources.with_features([self.config.feature.name]):
        # Grab the x data from the record
        x = record.feature(self.config.feature.name)
        # Calculate y
        y = m * x + b
        # Set the calculated value with the estimated accuracy
        record.predicted(self.config.predict.name, y, accuracy)
        # Yield the record to the caller
        yield record
async def get_test_records(self, sources: SourcesContext):
    ret_record = []
    async for record in sources.with_features(self.features):
        ret_record.append(record)
    return ret_record
async def score(
    self, mctx: ModelContext, sources: SourcesContext, feature: Feature
):
    # Example scorer: the "accuracy" is simply the sum of the record keys
    accuracy: int = 0
    async for record in sources.records():
        accuracy += int(record.key)
    return accuracy
async def predict(self, sources: SourcesContext) -> AsyncIterator[Record]:
    async for record in sources.with_features(["by_ten"]):
        record.predicted(
            "Salary", record.feature("by_ten") * 10, float(record.key)
        )
        yield record
async def get_input_data(self, sources: SourcesContext) -> list:
    saved_records = []
    async for record in sources.with_features(
        self.config.features.names()
    ):
        saved_records.append(record)
    return saved_records