Example #1
    async def predict(
        self, sources: SourcesContext
    ) -> AsyncIterator[Tuple[Record, Any, float]]:
        """
        Uses trained data to make a prediction about the quality of a record.
        """
        if not os.path.isfile(self.model_path):
            raise ModelNotTrained("Train model before prediction.")

        self._model.eval()
        async for record in sources.with_features(self.features):
            feature_data = record.features(self.features)[self.features[0]]
            predict = await self.prediction_data_generator(feature_data)
            target = self.parent.config.predict.name

            # Disable gradient calculation for prediction
            with torch.no_grad():
                for val in predict:
                    val = val.to(self.device)
                    output = self._model(val)

                    if self.classifications:
                        prob = torch.nn.functional.softmax(output, dim=1)
                        confidence, prediction_value = prob.topk(1, dim=1)
                        record.predicted(
                            target,
                            self.cids[prediction_value.item()],
                            confidence,
                        )
                    else:
                        confidence = 1.0 - self.criterion(val, output).item()
                        record.predicted(target, output, confidence)

            yield record
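
Note that `predict` is an asynchronous generator: it yields each `Record` after attaching a prediction with `record.predicted(...)`, so callers consume it with `async for` rather than calling it once. A minimal driver sketch, assuming `mctx` is an opened model context exposing this coroutine and `sctx` is an opened `SourcesContext` (both names are illustrative):

    import asyncio

    async def show_predictions(mctx, sctx):
        # Each yielded Record already carries the prediction the model attached
        # via record.predicted(target, value, confidence).
        async for record in mctx.predict(sctx):
            print(record)

    # Run inside an event loop, e.g.:
    # asyncio.run(show_predictions(mctx, sctx))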
Example #2
    async def predict(
            self, sources: SourcesContext
    ) -> AsyncIterator[Tuple[Record, Any, float]]:
        """
        Uses trained data to make a prediction about the quality of a record.
        """
        if not os.path.isfile(
                os.path.join(self.model_dir_path, "saved_model.pb")):
            raise ModelNotTrained("Train model before prediction.")

        async for record in sources.with_features(self.features):
            feature_data = record.features(self.features)
            df = self.pd.DataFrame(feature_data, index=[0])
            predict = await self.prediction_data_generator(
                self.np.array(df)[0])
            all_prob = self._model.predict(predict)
            max_prob_idx = all_prob.argmax(axis=-1)
            target = self.parent.config.predict.name
            self.logger.debug("Predicted probability of {} for {}: {}".format(
                self.parent.config.predict.name,
                self.np.array(df)[0],
                all_prob[0],
            ))

            record.predicted(
                target,
                self.cids[max_prob_idx[0]],
                all_prob[0][max_prob_idx[0]],
            )
            yield record
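
The `argmax`/indexing pattern above is how the class label and its confidence are pulled out of the probability vector returned by `self._model.predict`. A standalone sketch of just that step, with a made-up probability row:

    import numpy as np

    # One row of class probabilities, as model.predict would return for a
    # single record (values are made up).
    all_prob = np.array([[0.10, 0.75, 0.15]])
    max_prob_idx = all_prob.argmax(axis=-1)
    print(max_prob_idx[0])               # index of the most likely class -> 1
    print(all_prob[0][max_prob_idx[0]])  # its probability, used as confidence -> 0.75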
Example #3
    async def predict(
            self, sources: SourcesContext
    ) -> AsyncIterator[Tuple[Record, Any, float]]:
        if not self._filepath.is_file():
            raise ModelNotTrained("Train model before prediction.")
        async for record in sources.with_features(self.features):
            record_data = []
            for feature in record.features(self.features).values():
                record_data.extend(
                    [feature] if self.np.isscalar(feature) else feature)
            predict = self.np.array([record_data])
            self.logger.debug("Predicted Value of {} for {}: {}".format(
                self.parent.config.predict,
                predict,
                self.clf.predict(predict),
            ))
            target = self.parent.config.predict.name
            record.predicted(
                target,
                self.parent.config.predict.dtype(self.clf.predict(predict)[0])
                if self.parent.config.predict.dtype is not str else
                self.clf.predict(predict)[0],
                self.confidence,
            )
            yield record
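
The `predict.dtype(...)` cast in the `record.predicted` call matters because scikit-learn's `predict` returns NumPy scalars rather than plain Python values; casting stores the prediction as the configured Python type. A small illustration of that behaviour (the regression data is arbitrary):

    import numpy as np
    from sklearn.linear_model import LinearRegression

    X, y = np.array([[0.0], [1.0], [2.0]]), np.array([0.0, 2.0, 4.0])
    pred = LinearRegression().fit(X, y).predict(np.array([[3.0]]))[0]
    print(type(pred))         # <class 'numpy.float64'>
    print(type(float(pred)))  # <class 'float'>, simpler to serialize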
Example #4
    async def predict(
            self, sources: SourcesContext
    ) -> AsyncIterator[Tuple[Record, Any, float]]:
        if not self._filepath.is_file():
            raise ModelNotTrained("Train model before prediction.")
        estimator_type = self.clf._estimator_type
        if estimator_type == "clusterer":
            if hasattr(self.clf, "predict"):
                # inductive clusterer
                predictor = self.clf.predict
            else:
                # transductive clusterer
                self.logger.critical(
                    "Predict found transductive clusterer, ensure data being passed is training data"
                )

                def yield_labels():
                    # The np.int alias was removed from NumPy; the builtin int is equivalent
                    for label in self.clf.labels_.astype(int):
                        yield label

                labels = yield_labels()
                predictor = lambda predict: [next(labels)]

        async for record in sources.with_features(self.features):
            feature_data = record.features(self.features)
            predict = self.np.array([list(feature_data.values())])
            prediction = predictor(predict)
            self.logger.debug("Predicted cluster for {}: {}".format(
                predict, prediction))
            target = self.parent.config.predict.name
            record.predicted(target, prediction[0], self.confidence)
            yield record
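
The inductive/transductive branch above turns on whether the scikit-learn estimator implements `predict`. A standalone sketch of that distinction using `KMeans` (inductive: it can label unseen points) and `DBSCAN` (transductive: it only exposes `labels_` for the data it was fit on); the toy data is illustrative:

    import numpy as np
    from sklearn.cluster import DBSCAN, KMeans

    X = np.array([[1.0, 2.0], [1.5, 1.8], [8.0, 8.0], [8.5, 7.5]])

    kmeans = KMeans(n_clusters=2, n_init=10).fit(X)
    print(hasattr(kmeans, "predict"))   # True: new points can be assigned a cluster
    print(kmeans.predict(np.array([[0.9, 2.1]])))

    dbscan = DBSCAN(eps=1.0, min_samples=2).fit(X)
    print(hasattr(dbscan, "predict"))   # False: only labels_ for the training data
    print(dbscan.labels_.astype(int))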
Example #5
    async def predict(
        self, sources: SourcesContext
    ) -> AsyncIterator[Tuple[Record, Any, float]]:
        if not os.path.isfile(
            os.path.join(self.parent.config.output_dir, "tf_model.h5")
        ):
            raise ModelNotTrained("Train model before prediction.")
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.parent.config.output_dir
        )

        with self.parent.config.strategy.scope():
            self.model = TFAutoModelForSequenceClassification.from_pretrained(
                self.parent.config.output_dir
            )
        trainer = TFTrainer(model=self.model, args=self.parent.config)
        async for record in sources.with_features(self.features):
            to_predict = record.features(self.features)
            eval_example = [
                InputExample(
                    0,
                    to_predict[self.features[0]],
                    None,
                    self.parent.config.label_list[0],
                )
            ]
            eval_features = glue_convert_examples_to_features(
                eval_example,
                self.tokenizer,
                self.parent.config.max_seq_length,
                self.parent.config.task_name,
                self.parent.config.label_list,
            )
            eval_dataset = await self.example_features_to_dataset(
                eval_features
            )

            all_prob = trainer.predict(eval_dataset).predictions
            max_prob_idx = all_prob.argmax(axis=-1)
            self.logger.debug(
                "Predicted probability of {} for {}: {}".format(
                    self.parent.config.predict.name, to_predict, all_prob[0],
                )
            )
            record.predicted(
                self.parent.config.predict.name,
                self.parent.config.label_list[max_prob_idx[0]],
                all_prob[0][max_prob_idx[0]],
            )
            yield record
Example #6
    async def predict(
        self, sources: SourcesContext
    ) -> AsyncIterator[Tuple[Record, Any, float]]:
        if not os.path.isfile(self._filename()):
            raise ModelNotTrained("Train model before prediction.")
        importance, tag, base, class_cost = None, None, None, None
        if self.parent.config.importance:
            importance = self.parent.config.importance.name

        if self.parent.config.tag:
            tag = self.parent.config.tag.name

        if self.parent.config.base:
            base = self.parent.config.base.name
        async for record in sources.with_features(self.features):
            feature_data = record.features(
                self.features + self.parent.config.extra_cols
            )
            data = pd.DataFrame(feature_data, index=[0])
            if not self.parent.config.noconvert:
                data = df_to_vw_format(
                    data,
                    vwcmd=self.parent.config.vwcmd,
                    target=None,
                    namespace=self.parent.config.namespace,
                    importance=importance,
                    tag=tag,
                    base=base,
                    task=self.parent.config.task,
                    use_binary_label=self.parent.config.use_binary_label,
                )
            else:
                data = (
                    data.drop(self.parent.config.extra_cols, axis=1)
                    .to_numpy()
                    .flatten()
                )
            prediction = self.clf.predict(data[0])
            self.logger.debug(
                "Predicted Value of {} for {}: {}".format(
                    self.parent.config.predict.name, data, prediction,
                )
            )
            target = self.parent.config.predict.name
            record.predicted(target, prediction, self.confidence)
            yield record
Example #7
    async def predict_input_fn(self, sources: SourcesContext, **kwargs):
        """
        Uses the numpy input function with data from record features.
        """
        x_cols: Dict[str, Any] = {feature: [] for feature in self.features}
        ret_records = []
        async for record in sources.with_features(self.features):
            ret_records.append(record)
            for feature, results in record.features(self.features).items():
                x_cols[feature].append(self.np.array(results))
        for feature in x_cols:
            x_cols[feature] = self.np.array(x_cols[feature])
        self.logger.info("------ Record Data ------")
        self.logger.info("x_cols:    %d", len(list(x_cols.values())[0]))
        self.logger.info("-----------------------")
        input_fn = self.tf.compat.v1.estimator.inputs.numpy_input_fn(
            x_cols, shuffle=False, num_epochs=1, **kwargs
        )
        return input_fn, ret_records
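
The returned `input_fn` follows the `tf.estimator` input-function contract, so the caller is expected to hand it to an estimator's `predict`. A rough sketch of that hand-off, assuming `self.model` holds a `tf.estimator.Estimator` (the method name `predict_with_estimator` and the pairing with `ret_records` are illustrative):

    async def predict_with_estimator(self, sources: SourcesContext):
        input_fn, ret_records = await self.predict_input_fn(sources)
        # Estimator.predict returns a generator with one prediction per input
        # row, in the same order the records were appended above.
        for record, prediction in zip(
            ret_records, self.model.predict(input_fn=input_fn)
        ):
            yield record, prediction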
Example #8
    async def score(
        self,
        mctx: ModelContext,
        sctx: SourcesContext,
        *features: Feature,
    ):
        if not mctx.is_trained:
            raise ModelNotTrained("Train model before assessing for accuracy.")

        if mctx.parent.clf._estimator_type not in ("classifier", "regressor"):
            raise ScorerWillNotWork(
                "SklearnModelAccuracy will not work with Clustering Models")
        is_multi = len(features) > 1
        if is_multi:
            predictions = [feature.name for feature in features]
        elif len(features) == 1:
            (features,) = features
            predictions = features.name

        xdata = []
        ydata = []

        async for record in sctx.with_features(
                list(mctx.np.hstack(mctx.features + [predictions]))):
            feature_data = []
            predict_data = []
            for feature in record.features(mctx.features).values():
                feature_data.extend(
                    [feature] if mctx.np.isscalar(feature) else feature)
            xdata.append(feature_data)
            if is_multi:
                for feature in record.features(predictions).values():
                    predict_data.extend(
                        [feature] if mctx.np.isscalar(feature) else feature)
            else:
                predict_data = record.feature(predictions)
            ydata.append(predict_data)
        xdata = mctx.np.array(xdata)
        ydata = mctx.np.array(ydata)
        mctx.logger.debug("Number of input records: {}".format(len(xdata)))
        mctx.confidence = mctx.parent.clf.score(xdata, ydata)
        return mctx.confidence
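
The value returned here comes straight from `clf.score(xdata, ydata)`, which scikit-learn defines as mean accuracy for classifiers and the R² coefficient for regressors; that is also why the scorer refuses clustering models above. A small standalone illustration of the convention (data and estimators are arbitrary):

    import numpy as np
    from sklearn.linear_model import LinearRegression, LogisticRegression

    X = np.array([[0.0], [1.0], [2.0], [3.0]])

    y_reg = np.array([0.0, 1.1, 1.9, 3.2])
    reg = LinearRegression().fit(X, y_reg)
    print(reg._estimator_type, reg.score(X, y_reg))  # "regressor", R^2

    y_clf = np.array([0, 0, 1, 1])
    clf = LogisticRegression().fit(X, y_clf)
    print(clf._estimator_type, clf.score(X, y_clf))  # "classifier", mean accuracy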
Example #9
    async def predict(
        self, sources: SourcesContext
    ) -> AsyncIterator[Tuple[Record, Any, float]]:
        if not os.path.isfile(
            os.path.join(self.parent.config.output_dir, "tf_model.h5")
        ):
            raise ModelNotTrained("Train model before prediction.")
        with self.parent.config.strategy.scope():
            self.model = TFAutoModelForTokenClassification.from_pretrained(
                self.parent.config.output_dir,
                config=self.config,
                cache_dir=self.parent.config.cache_dir,
            )

        async for record in sources.with_features(
            [self.parent.config.words.name]
        ):
            sentence = record.features([self.parent.config.words.name])
            df = self.pd.DataFrame(sentence, index=[0])
            test_dataset = self.get_dataset(df, self.tokenizer, mode="test")
            trainer = TFTrainer(
                model=self.model,
                args=self.parent.config,
                train_dataset=None,
                eval_dataset=None,
                compute_metrics=self.compute_metrics,
            )
            predictions, label_ids, _ = trainer.predict(
                test_dataset.get_dataset()
            )
            preds_list, labels_list = self.align_predictions(
                predictions, label_ids
            )
            preds = [
                {word: preds_list[0][i]}
                for i, word in enumerate(
                    sentence[self.parent.config.words.name].split()
                )
            ]

            record.predicted(self.parent.config.predict.name, preds, float("nan"))
            yield record
Example #10
    async def predict(self, sources: SourcesContext) -> AsyncIterator[Record]:
        target = self.parent.config.predict.name
        async for record in sources.with_features(
                self.parent.config.features.names()):
            record.predicted(target, random.random(), float(record.key))
            yield record