def setUpClass(cls):
        """Prepare the XGBoost regressor and the synthetic data shared by the tests.

        Creates a temporary model directory, instantiates the model with two
        input features and one target, and generates 2000 random records that
        follow ``Target = 2 * Feature1 + 3 * Feature2``. The first 1800
        records back ``cls.trainingsource``; the remaining ones back
        ``cls.testsource``.
        """
        # Temporary directory that will hold the trained model.
        cls.model_dir = tempfile.TemporaryDirectory()

        model_config = XGBRegressorModelConfig(
            features=Features(
                Feature("Feature1", float, 1),
                Feature("Feature2"),
            ),
            predict=Feature("Target", float, 1),
            directory=cls.model_dir.name,
        )
        cls.model = XGBRegressorModel(model_config)

        # Synthetic data: f(x1, x2) = 2*x1 + 3*x2
        sample_count = 2000
        train_count = 1800
        samples = np.random.rand(2, sample_count)
        records = []
        for index in range(sample_count):
            x1 = samples[0][index]
            x2 = samples[1][index]
            records.append(
                Record(
                    "x" + str(random.random()),
                    data={
                        "features": {
                            "Feature1": float(x1),
                            "Feature2": float(x2),
                            "Target": 2 * x1 + 3 * x2,
                        }
                    },
                )
            )
        cls.records = records

        training_memory = MemorySource(
            MemorySourceConfig(records=cls.records[:train_count])
        )
        cls.trainingsource = Sources(training_memory)
        test_memory = MemorySource(
            MemorySourceConfig(records=cls.records[train_count:])
        )
        cls.testsource = Sources(test_memory)
    def setUpClass(cls):
        """Create the anomaly model and the random records used by the tests.

        Draws 1800 samples for ``A`` and ``B`` from a normal distribution
        (mean 2, standard deviation 1) and labels each sample with ``Y = 1``
        when ``A > 1 - B`` and ``Y = 0`` otherwise. The first 1400 records
        back ``cls.trainingsource``; the rest back ``cls.testsource``.
        """
        # Temporary directory that will hold the trained model.
        cls.model_dir = tempfile.TemporaryDirectory()

        cls.model = AnomalyModel(
            features=Features(Feature("A", int, 1), Feature("B", int, 2)),
            predict=Feature("Y", int, 1),
            directory=cls.model_dir.name,
        )

        # Generate the data set.
        sample_count = 1800
        train_count = 1400
        samples = np.random.normal(2, 1, size=(2, sample_count))
        records = []
        for index in range(sample_count):
            a_value = samples[0][index]
            b_value = samples[1][index]
            records.append(
                Record(
                    "x" + str(random.random()),
                    data={
                        "features": {
                            "A": float(a_value),
                            "B": float(b_value),
                            "Y": (a_value > 1 - b_value).astype(int),
                        }
                    },
                )
            )
        cls.records = records

        training_memory = MemorySource(
            MemorySourceConfig(records=cls.records[:train_count])
        )
        cls.trainingsource = Sources(training_memory)
        test_memory = MemorySource(
            MemorySourceConfig(records=cls.records[train_count:])
        )
        cls.testsource = Sources(test_memory)
Exemple #3
0
    def setUpClass(cls):
        """Build the train/test sources and the QA model shared by the tests.

        ``TRAIN_DATA`` and ``TEST_DATA`` are transposed into six columns
        (title, context, question, answer text, answer start position,
        is-impossible flag). One record per row is created, keyed by the row
        index, with an empty ``answers`` list added to every record.
        """
        (
            train_titles,
            train_contexts,
            train_questions,
            train_answers,
            train_starts,
            train_flags,
        ) = list(zip(*TRAIN_DATA))
        (
            test_titles,
            test_contexts,
            test_questions,
            test_answers,
            test_starts,
            test_flags,
        ) = list(zip(*TEST_DATA))

        def build_records(titles, contexts, questions, answers, starts, flags):
            # One record per row; the key is the row index as a string.
            rows = zip(titles, contexts, questions, answers, starts, flags)
            built = []
            for index, row in enumerate(rows):
                title, context, question, answer, start, flag = row
                built.append(
                    Record(
                        str(index),
                        data={
                            "features": {
                                "title": title,
                                "context": context,
                                "question": question,
                                "answer_text": answer,
                                "start_pos_char": start,
                                "is_impossible": flag,
                                "answers": [],
                            }
                        },
                    )
                )
            return built

        cls.train_records = build_records(
            train_titles,
            train_contexts,
            train_questions,
            train_answers,
            train_starts,
            train_flags,
        )
        cls.test_records = build_records(
            test_titles,
            test_contexts,
            test_questions,
            test_answers,
            test_starts,
            test_flags,
        )

        train_memory = MemorySource(
            MemorySourceConfig(records=cls.train_records)
        )
        cls.train_sources = Sources(train_memory)
        test_memory = MemorySource(MemorySourceConfig(records=cls.test_records))
        cls.test_sources = Sources(test_memory)

        # The temporary directory doubles as model directory and log directory.
        cls.model_dir = tempfile.TemporaryDirectory()
        model_config = QAModelConfig(
            model_name_or_path="bert-base-cased",
            cache_dir=CACHE_DIR,
            directory=cls.model_dir.name,
            log_dir=cls.model_dir.name,
            model_type="bert",
            no_cuda=True,
        )
        cls.model = QAModel(model_config)
Exemple #4
0
    def setUpClass(cls):
        """Build the training/prediction sources and the NER model for the tests.

        ``TRAIN_DATA`` is transposed into sentence ids, words and NER tags;
        ``PREDICT_DATA`` into sentence ids and words only. Each row becomes a
        record keyed by its row index.
        """
        train_ids, train_words, train_tags = list(zip(*TRAIN_DATA))
        predict_ids, predict_words = list(zip(*PREDICT_DATA))

        train_records = []
        for index, (sentence_id, words, tag) in enumerate(
            zip(train_ids, train_words, train_tags)
        ):
            features = {
                "sentence_id": sentence_id,
                "words": words,
                "ner_tag": tag,
            }
            train_records.append(Record(str(index), data={"features": features}))
        cls.train_records = train_records
        cls.train_sources = Sources(
            MemorySource(MemorySourceConfig(records=cls.train_records))
        )

        predict_records = []
        for index, (sentence_id, words) in enumerate(
            zip(predict_ids, predict_words)
        ):
            features = {"sentence_id": sentence_id, "words": words}
            predict_records.append(
                Record(str(index), data={"features": features})
            )
        cls.predict_records = predict_records
        cls.predict_sources = Sources(
            MemorySource(MemorySourceConfig(records=cls.predict_records))
        )

        # Temporary directory used as the model's output directory.
        cls.model_dir = tempfile.TemporaryDirectory()
        model_config = NERModelConfig(
            sid=Feature("sentence_id", int, 1),
            words=Feature("words", str, 1),
            predict=Feature("ner_tag", str, 1),
            output_dir=cls.model_dir.name,
            model_architecture_type="bert",
            model_name_or_path="bert-base-cased",
            no_cuda=True,
        )
        cls.model = NERModel(model_config)
 async def train(self, sources: Sources):
     """Fit ``self.parent.clf`` on the records yielded by ``sources``.

     Every record contributes one flattened input row built from
     ``self.features`` (scalar values are kept, sequences are spliced in).
     With ``self.is_multi`` set, the target row is flattened the same way from
     ``self.predictions``; otherwise the single predicted feature value is
     used. In multi-output mode an estimator whose class name does not
     contain "MultiOutput" is wrapped according to ``self.estimator_type``.

     Raises:
         NoMultiOutputSupport: multi-output mode with an estimator type that
             is neither "regressor" nor "classifier".
     """
     np = self.np
     # np.hstack flattens the lists without splitting strings.
     required = list(np.hstack(self.features + [self.predictions]))

     inputs = []
     targets = []
     async for record in sources.with_features(required):
         row = []
         for value in record.features(self.features).values():
             if np.isscalar(value):
                 row.append(value)
             else:
                 row.extend(value)
         inputs.append(row)

         if not self.is_multi:
             targets.append(record.feature(self.predictions))
             continue
         target_row = []
         for value in record.features(self.predictions).values():
             if np.isscalar(value):
                 target_row.append(value)
             else:
                 target_row.extend(value)
         targets.append(target_row)

     xdata = np.array(inputs)
     ydata = np.array(targets)
     self.logger.info("Number of input records: {}".format(len(xdata)))

     if self.is_multi:
         estimator_name = self.parent.clf.__class__.__name__
         if "MultiOutput" not in estimator_name:
             # Wrap the estimator so it can predict several targets at once.
             if self.estimator_type == "regressor":
                 self.parent.clf = MultiOutputRegressor(self.parent.clf)
             elif self.estimator_type == "classifier":
                 self.parent.clf = MultiOutputClassifier(self.parent.clf)
             else:
                 raise NoMultiOutputSupport(
                     "Model does not support multi-output. Please refer the docs to find a suitable model entrypoint."
                 )

     self.parent.clf.fit(xdata, ydata)
     self.is_trained = True
Exemple #6
0
    async def train(self, sources: Sources) -> None:
        """
        Fit an XGBClassifier on the source records.

        Each record contributes one flattened row built from
        ``self.features`` and one target value read from the configured
        ``predict`` feature. The classifier is created from the
        hyperparameters on ``self.config`` and kept on ``self.saved``.
        """
        target_name = self.parent.config.predict.name

        # Load all data into memory.
        rows = []
        targets = []
        async for record in sources.with_features(
            self.features + [target_name]
        ):
            row = []
            for value in record.features(self.features).values():
                if np.isscalar(value):
                    row.append(value)
                else:
                    row.extend(value)
            rows.append(row)
            targets.append(record.feature(target_name))
        x_data = pd.DataFrame(rows)
        y_data = pd.DataFrame(targets)

        # Hyperparameters are copied one-to-one from the config.
        hyperparameter_names = (
            "n_estimators",
            "learning_rate",
            "max_depth",
            "objective",
            "subsample",
            "gamma",
            "n_jobs",
            "colsample_bytree",
            "booster",
            "min_child_weight",
            "reg_lambda",
            "reg_alpha",
        )
        hyperparameters = {
            name: getattr(self.config, name) for name in hyperparameter_names
        }
        self.saved = XGBClassifier(**hyperparameters)

        self.saved.fit(x_data, y_data, eval_metric="merror")
        self.is_trained = True
 async def get_input_data(self, sources: Sources) -> list:
     """Collect every record of ``sources`` that has the configured features.

     Returns:
         The records yielded by ``sources.with_features`` for the feature
         names in ``self.config.features``, in iteration order.
     """
     feature_names = self.config.features.names()
     return [
         record async for record in sources.with_features(feature_names)
     ]
Exemple #8
0
 def setUpClass(cls):
     """Create the MiscModel and a balanced two-class repo set for the tests.

     1000 repos with key prefix "a" carry feature value 1 and string "a";
     1000 repos with key prefix "b" carry feature value 0 and string
     "not a". All of them are exposed through ``cls.sources``.
     """
     cls.feature = StartsWithA()
     cls.features = Features(cls.feature)
     cls.model_dir = tempfile.TemporaryDirectory()
     model_config = MiscModelConfig(
         directory=cls.model_dir.name,
         classifications=["not a", "a"],
         features=cls.features,
     )
     cls.model = MiscModel(model_config)

     # (key prefix, feature value, "string" value) for each class.
     groups = (("a", 1, "a"), ("b", 0, "not a"))
     repos = []
     for prefix, flag, label in groups:
         for _ in range(1000):
             repos.append(
                 Repo(
                     prefix + str(random.random()),
                     data={
                         "features": {
                             cls.feature.NAME: flag,
                             "string": label,
                         }
                     },
                 )
             )
     cls.repos = repos
     cls.sources = Sources(
         MemorySource(MemorySourceConfig(repos=cls.repos))
     )
Exemple #9
0
 def setUpClass(cls):
     """Build features, repos, sources and the model for ``cls.MODEL_TYPE``.

     For "CLASSIFICATION" the features ``A``..``I`` are defined and the rows
     of ``FEATURE_DATA_CLASSIFICATION`` are used; for "REGRESSION" the
     features ``A``..``C`` and ``FEATURE_DATA_REGRESSION``. In both cases the
     last column of each row is stored as ``X``, the feature to predict. Any
     other model type leaves ``cls.features`` empty and does not assign
     ``cls.repos`` here.

     Raises:
         ValueError: the data rows do not have exactly one column per
             feature plus the ``X`` column.
     """
     cls.model_dir = tempfile.TemporaryDirectory()
     cls.features = Features()

     # Compare with ==, not `is`: identity of equal string objects is an
     # implementation detail (and `is` with a literal is a SyntaxWarning
     # since Python 3.8).
     if cls.MODEL_TYPE == "CLASSIFICATION":
         feature_names = list("ABCDEFGHI")
         rows = FEATURE_DATA_CLASSIFICATION
     elif cls.MODEL_TYPE == "REGRESSION":
         feature_names = list("ABC")
         rows = FEATURE_DATA_REGRESSION
     else:
         feature_names = None

     if feature_names is not None:
         for name in feature_names:
             cls.features.append(DefFeature(name, float, 1))
         column_names = feature_names + ["X"]
         columns = list(zip(*rows))
         if len(columns) != len(column_names):
             raise ValueError(
                 "expected %d columns per row, got %d"
                 % (len(column_names), len(columns))
             )
         # One repo per row, keyed by the row index.
         cls.repos = [
             Repo(
                 str(index),
                 data={"features": dict(zip(column_names, values))},
             )
             for index, values in enumerate(zip(*columns))
         ]

     cls.sources = Sources(MemorySource(
         MemorySourceConfig(repos=cls.repos)))
     cls.model = cls.MODEL(
         cls.MODEL_CONFIG(
             directory=cls.model_dir.name,
             predict="X",
             features=cls.features,
         ))
Exemple #10
0
    async def sources_to_array(self, sources: Sources):
        """Convert the usable records of ``sources`` into numpy columns.

        A record is used when its predicted feature, converted with the
        configured ``clstype``, is a key of ``self.classifications``.

        Returns:
            ``(x_cols, y_cols)``: ``x_cols`` maps each feature name to an
            array of that feature's values, ``y_cols`` is the array of
            ``self.classifications`` entries for the records' labels.

        Raises:
            ValueError: no record had a known classification.
        """
        config = self.parent.config
        target_name = config.predict.name
        x_cols: Dict[str, Any] = {feature: [] for feature in self.features}

        # First pass: keep only records with a known classification.
        selected = []
        async for record in sources.with_features(
            self.features + [target_name]
        ):
            label = config.clstype(record.feature(target_name))
            if label in self.classifications:
                selected.append((record, label))

        # Second pass: split the kept records into feature columns and labels.
        labels = []
        for record, label in selected:
            for feature, results in record.features(self.features).items():
                x_cols[feature].append(self.np.array(results))
            labels.append(self.classifications[label])

        if not labels:
            raise ValueError("No records to train on")
        y_cols = self.np.array(labels)
        for feature, values in x_cols.items():
            x_cols[feature] = self.np.array(values)

        return x_cols, y_cols
Exemple #11
0
 def setUpClass(cls):
     """Create the DNN classifier and a balanced two-class repo set.

     1000 repos with key prefix "a" carry feature value 1 and string "a";
     1000 repos with key prefix "b" carry feature value 0 and string
     "not a". All of them are exposed through ``cls.sources``.
     """
     cls.model_dir = tempfile.TemporaryDirectory()
     model_config = DNNClassifierModelConfig(
         directory=cls.model_dir.name,
         steps=1000,
         epochs=30,
         hidden=[10, 20, 10],
         classification="string",
         classifications=["a", "not a"],
         clstype=str,
     )
     cls.model = DNNClassifierModel(model_config)
     cls.feature = StartsWithA()
     cls.features = Features(cls.feature)

     # (key prefix, feature value, "string" value) for each class.
     groups = (("a", 1, "a"), ("b", 0, "not a"))
     repos = []
     for prefix, flag, label in groups:
         for _ in range(1000):
             repos.append(
                 Repo(
                     prefix + str(random.random()),
                     data={
                         "features": {
                             cls.feature.NAME: flag,
                             "string": label,
                         }
                     },
                 )
             )
     cls.repos = repos
     cls.sources = Sources(
         MemorySource(MemorySourceConfig(repos=cls.repos))
     )
Exemple #12
0
 async def accuracy_input_fn(self, sources: Sources, **kwargs):
     """
     Build a numpy input function from the features of the source repos.

     Only repos whose predicted feature value is in
     ``self.classifications`` are used. Batch size and shuffling come from
     the parent's config, the data is passed through once
     (``num_epochs=1``), and extra keyword arguments are forwarded to
     ``numpy_input_fn``.
     """
     target_name = self.parent.config.predict.NAME
     x_cols: Dict[str, Any] = {feature: [] for feature in self.features}
     labels = []
     async for repo in sources.with_features(self.features + [target_name]):
         label = repo.feature(target_name)
         if label not in self.classifications:
             # Unknown classification: repo is not usable here.
             continue
         for feature, results in repo.features(self.features).items():
             x_cols[feature].append(np.array(results))
         labels.append(self.classifications[label])

     y_cols = np.array(labels)
     for feature, values in x_cols.items():
         x_cols[feature] = np.array(values)

     first_column = list(x_cols.values())[0]
     self.logger.info("------ Repo Data ------")
     self.logger.info("x_cols:    %d", len(first_column))
     self.logger.info("y_cols:    %d", len(y_cols))
     self.logger.info("-----------------------")

     config = self.parent.config
     return tf.compat.v1.estimator.inputs.numpy_input_fn(
         x_cols,
         y_cols,
         batch_size=config.batchsize,
         shuffle=config.shuffle,
         num_epochs=1,
         **kwargs,
     )
Exemple #13
0
 def setUpClass(cls):
     """Create the DNN regression model and 2000 synthetic repos.

     Every repo holds two random feature values in [0, 1) and a ``TARGET``
     computed as ``2 * feature1 + 3 * feature2``.
     """
     cls.model_dir = tempfile.TemporaryDirectory()
     cls.feature1 = Feature_1()
     cls.feature2 = Feature_2()
     cls.features = Features(cls.feature1, cls.feature2)
     model_config = DNNRegressionModelConfig(
         directory=cls.model_dir.name,
         steps=1000,
         epochs=40,
         hidden=[50, 20, 10],
         predict=DefFeature("TARGET", float, 1),
         features=cls.features,
     )
     cls.model = DNNRegressionModel(model_config)

     # Synthetic data: f(x1, x2) = 2*x1 + 3*x2
     sample_count = 2000
     samples = np.random.rand(2, sample_count)
     repos = []
     for index in range(sample_count):
         x1 = samples[0][index]
         x2 = samples[1][index]
         repos.append(
             Repo(
                 "x" + str(random.random()),
                 data={
                     "features": {
                         cls.feature1.NAME: float(x1),
                         cls.feature2.NAME: float(x2),
                         "TARGET": 2 * x1 + 3 * x2,
                     }
                 },
             )
         )
     cls.repos = repos
     cls.sources = Sources(
         MemorySource(MemorySourceConfig(repos=cls.repos))
     )
Exemple #14
0
 def setUpClass(cls):
     """Create a balanced two-class record set and the DNN classifier.

     1000 records with key prefix "a" carry ``starts_with_a = 1`` and string
     "a"; 1000 records with key prefix "b" carry ``starts_with_a = 0`` and
     string "not a". The model predicts the ``string`` feature.
     """
     cls.model_dir = tempfile.TemporaryDirectory()
     cls.feature = Feature("starts_with_a", int, 1)
     cls.features = Features(cls.feature)

     # (key prefix, feature value, "string" value) for each class.
     groups = (("a", 1, "a"), ("b", 0, "not a"))
     records = []
     for prefix, flag, label in groups:
         for _ in range(1000):
             records.append(
                 Record(
                     prefix + str(random.random()),
                     data={
                         "features": {
                             cls.feature.name: flag,
                             "string": label,
                         }
                     },
                 )
             )
     cls.records = records
     cls.sources = Sources(
         MemorySource(MemorySourceConfig(records=cls.records))
     )

     model_config = DNNClassifierModelConfig(
         directory=cls.model_dir.name,
         steps=1000,
         epochs=40,
         hidden=[50, 20, 10],
         predict=Feature("string", str, 1),
         classifications=["a", "not a"],
         clstype=str,
         features=cls.features,
     )
     cls.model = DNNClassifierModel(model_config)
Exemple #15
0
 async def test_02_predict(self):
     """Predict one hand-built repo and check the relative error is below 0.3.

     The expected target is computed as ``2 * x1 + 3 * x2``, which must stay
     the same function that generated the training data in the class setup.
     """
     # Leading zero is a placeholder so the feature values are 1-indexable.
     test_feature_val = [0, 1.5, 2]
     test_target = 2 * test_feature_val[1] + 3 * test_feature_val[2]
     a = Repo(
         "a",
         data={
             "features": {
                 self.feature1.NAME: test_feature_val[1],
                 self.feature2.NAME: test_feature_val[2],
             }
         },
     )
     async with Sources(MemorySource(MemorySourceConfig(
             repos=[a]))) as sources, self.model as model:
         target_name = model.config.predict.NAME
         async with sources() as sctx, model() as mctx:
             res = [repo async for repo in mctx.predict(sctx.repos())]
             self.assertEqual(len(res), 1)
         self.assertEqual(res[0].key, a.key)
         predicted = res[0].prediction(target_name).value
         # The epsilon belongs in the denominator: there it guards against a
         # zero target, whereas added to the quotient it only skewed the error.
         test_error_norm = abs(
             (test_target - predicted) / (test_target + 1e-6))
         error_threshold = 0.3
         self.assertLess(test_error_norm, error_threshold)
Exemple #16
0
    async def evaluate_input_fn(
        self,
        sources: Sources,
        batch_size=20,
        shuffle=False,
        epochs=1,
        **kwargs,
    ):
        """
        Build a numpy input function from the features of the source repos.

        Every repo that has ``self.all_features`` contributes one entry per
        feature column and one target value read from the configured
        ``predict`` feature. ``batch_size``, ``shuffle`` and ``epochs`` are
        passed to ``numpy_input_fn`` together with any extra keyword
        arguments.
        """
        x_cols: Dict[str, Any] = {feature: [] for feature in self.features}
        targets = []

        async for repo in sources.with_features(self.all_features):
            repo_features = repo.features(self.features)
            for feature, results in repo_features.items():
                x_cols[feature].append(np.array(results))
            targets.append(repo.feature(self.parent.config.predict))

        y_cols = np.array(targets)
        for feature, values in x_cols.items():
            x_cols[feature] = np.array(values)

        first_column = list(x_cols.values())[0]
        self.logger.info("------ Repo Data ------")
        self.logger.info("x_cols:    %d", len(first_column))
        self.logger.info("y_cols:    %d", len(y_cols))
        self.logger.info("-----------------------")

        return tensorflow.estimator.inputs.numpy_input_fn(
            x_cols,
            y_cols,
            batch_size=batch_size,
            shuffle=shuffle,
            num_epochs=epochs,
            **kwargs,
        )
Exemple #17
0
 def setUpClass(cls):
     """Build the records, sources and text classification model.

     ``DATA`` is transposed into the text column ``A`` and the label column
     ``X``; each row becomes a record keyed by its row index. The model
     predicts ``X`` (classifications 0 and 1) and is configured with three
     additional layers.
     """
     cls.features = Features()
     cls.features.append(Feature("A", str, 1))

     texts, labels = list(zip(*DATA))
     cls.records = [
         Record(str(index), data={"features": {"A": text, "X": label}})
         for index, (text, label) in enumerate(zip(texts, labels))
     ]
     cls.sources = Sources(
         MemorySource(MemorySourceConfig(records=cls.records))
     )

     cls.model_dir = tempfile.TemporaryDirectory()
     extra_layers = [
         "Dense(units = 120, activation='relu')",
         "Dense(units = 64, activation=relu)",
         "Dense(units = 2, activation='softmax')",
     ]
     model_config = TextClassifierConfig(
         directory=cls.model_dir.name,
         classifications=[0, 1],
         features=cls.features,
         predict=Feature("X", int, 1),
         add_layers=True,
         layers=extra_layers,
         model_path="https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim-with-oov/1",
         epochs=30,
     )
     cls.model = TextClassificationModel(model_config)
Exemple #18
0
 def setUpClass(cls):
     """Create the Misc model and a balanced two-class repo set.

     The classification is stored next to ``features`` in each repo's data
     (not inside it): 1000 repos with key prefix "a", feature value 1 and
     classification "a", then 1000 repos with key prefix "b", feature
     value 0 and classification "not a".
     """
     cls.model_dir = tempfile.TemporaryDirectory()
     cls.model = Misc(ModelConfig(directory=cls.model_dir.name))
     cls.feature = StartsWithA()
     cls.features = Features(cls.feature)
     cls.classifications = ["a", "not a"]

     # (key prefix, feature value, classification) for each class.
     groups = (("a", 1, "a"), ("b", 0, "not a"))
     repos = []
     for prefix, flag, classification in groups:
         for _ in range(1000):
             repos.append(
                 Repo(
                     prefix + str(random.random()),
                     data={
                         "features": {cls.feature.NAME: flag},
                         "classification": classification,
                     },
                 )
             )
     cls.repos = repos
     cls.sources = Sources(
         MemorySource(MemorySourceConfig(repos=cls.repos))
     )
Exemple #19
0
    def setUpClass(cls):
        """Build the features, records, VW regression model and scorer.

        ``DATA`` is transposed into the columns ``A``..``H`` plus the target
        ``X``; each row becomes a record keyed by its row index. ``A``..``F``
        are declared as float features, ``G`` and ``H`` as int features.
        """
        cls.model_dir = tempfile.TemporaryDirectory()

        cls.features = Features()
        feature_types = (
            ("A", float),
            ("B", float),
            ("C", float),
            ("D", float),
            ("E", float),
            ("F", float),
            ("G", int),
            ("H", int),
        )
        for name, dtype in feature_types:
            cls.features.append(Feature(name, dtype, 1))

        # Unpacking enforces that DATA has exactly nine columns.
        (
            col_a,
            col_b,
            col_c,
            col_d,
            col_e,
            col_f,
            col_g,
            col_h,
            col_x,
        ) = list(zip(*DATA))
        column_names = ("A", "B", "C", "D", "E", "F", "G", "H", "X")
        rows = zip(
            col_a, col_b, col_c, col_d, col_e, col_f, col_g, col_h, col_x
        )
        cls.records = [
            Record(str(index), data={"features": dict(zip(column_names, row))})
            for index, row in enumerate(rows)
        ]

        cls.sources = Sources(
            MemorySource(MemorySourceConfig(records=cls.records))
        )

        model_config = VWConfig(
            location=cls.model_dir.name,
            features=cls.features,
            predict=Feature("X", float, 1),
            # A and B will be namespace n1
            # A and C will be in namespace n2
            namespace=["n1_A_B", "n2_A_C"],
            importance=Feature("H", int, 1),
            tag=Feature("G", int, 1),
            task="regression",
            vwcmd=["l2", "0.1", "loss_function", "squared", "passes", "10"],
        )
        cls.model = VWModel(model_config)
        cls.scorer = MeanSquaredErrorAccuracy()
Exemple #20
0
 async def train(self, sources: Sources):
     """Accumulate x/y samples from ``sources`` and fit the regression line.

     Only the first entry of ``self.features`` is used as x; the configured
     ``predict`` feature is y. Values are appended to ``self.xData`` and
     ``self.yData``, then ``self.regression_line`` is set from
     ``self.best_fit_line()``.
     """
     target_name = self.parent.config.predict.NAME
     wanted = self.features + [target_name]
     x_name = self.features[0]
     async for repo in sources.with_features(wanted):
         values = repo.features(wanted)
         self.xData = np.append(self.xData, values[x_name])
         self.yData = np.append(self.yData, values[target_name])
     self.regression_line = await self.best_fit_line()
Exemple #21
0
 async def train(self, sources: Sources):
     """Fit ``self.parent.clf`` on the feature values of every source record.

     Each record contributes one row holding the values of
     ``self.features``; no target is passed to ``fit``. ``self.is_trained``
     is set once fitting has finished.
     """
     rows = [
         list(record.features(self.features).values())
         async for record in sources.with_features(self.features)
     ]
     xdata = self.np.array(rows)
     self.logger.info("Number of input records: {}".format(len(xdata)))
     self.parent.clf.fit(xdata)
     self.is_trained = True
Exemple #22
0
 async def train(self, sources: Sources):
     """Fit ``self.clf`` on the source records and dump it with joblib.

     Each record contributes one row holding the values of
     ``self.features``; no target is passed to ``fit``. The fitted estimator
     is written to ``str(self._filepath)``.
     """
     rows = [
         list(record.features(self.features).values())
         async for record in sources.with_features(self.features)
     ]
     xdata = self.np.array(rows)
     self.logger.info("Number of input records: {}".format(len(xdata)))
     self.clf.fit(xdata)
     self.joblib.dump(self.clf, str(self._filepath))
Exemple #23
0
 async def test_02_predict(self):
     """Predict a single repo and check the result matches it and is truthy."""
     a = Repo("a", data={"features": {self.feature.NAME: 1}})
     memory_source = MemorySource(MemorySourceConfig(repos=[a]))
     async with Sources(memory_source) as sources, self.model as model:
         async with sources() as sctx, model() as mctx:
             res = []
             async for repo in mctx.predict(sctx.repos()):
                 res.append(repo)
             self.assertEqual(len(res), 1)
         predicted_repo = res[0]
         self.assertEqual(predicted_repo.src_url, a.src_url)
         self.assertTrue(predicted_repo.prediction().value)
Exemple #24
0
    async def accuracy(self, sources: Sources) -> Accuracy:
        """Score the trained model against the records in ``sources``.

        The records are loaded into a DataFrame; the predicted column is
        split off as ground truth and the remaining columns are either
        converted with ``df_to_vw_format`` or, with ``noconvert`` set,
        stripped of the extra columns and flattened. Each resulting example
        is predicted one at a time with ``self.clf``.

        Returns:
            ``self.confidence``: the r2 score for the "regression" task or
            the accuracy score for the "classification" task. For any other
            task ``self.confidence`` is returned as previously set.

        Raises:
            ModelNotTrained: the model file does not exist yet.
        """
        if not os.path.isfile(self._filename()):
            raise ModelNotTrained("Train model before assessing for accuracy.")

        config = self.parent.config
        predict_name = config.predict.name
        # Optional column names; None when not configured.
        importance = config.importance.name if config.importance else None
        tag = config.tag.name if config.tag else None
        base = config.base.name if config.base else None

        wanted = self.features + [predict_name] + config.extra_cols
        data = []
        async for record in sources.with_features(self.features):
            data.append(record.features(wanted))
        df = pd.DataFrame(data)
        # axis must be passed by keyword: pandas 2.0 made every argument of
        # DataFrame.drop except `labels` keyword-only.
        xdata = df.drop([predict_name], axis=1)
        self.logger.debug("Number of input records: {}".format(len(xdata)))
        if not config.noconvert:
            xdata = df_to_vw_format(
                xdata,
                vwcmd=config.vwcmd,
                target=None,
                namespace=config.namespace,
                importance=importance,
                tag=tag,
                base=base,
                task=config.task,
                use_binary_label=config.use_binary_label,
            )
        else:
            xdata = (
                xdata.drop(config.extra_cols, axis=1)
                .to_numpy()
                .flatten()
            )
        ydata = np.array(df[predict_name])
        shape = [len(xdata)]
        # TODO support probabilites
        # if 'oaa' in self.parent.config.vwcmd and 'probabilities' in self.parent.config.vwcmd:
        #     shape.append(self.parent.config.vwcmd['oaa'])
        y_pred = np.empty(shape)
        for idx, x in enumerate(xdata):
            y_pred[idx] = self.clf.predict(x)

        if config.task in ["regression"]:
            self.confidence = r2_score(ydata, y_pred)
        elif config.task in ["classification"]:
            self.confidence = accuracy_score(ydata, y_pred)
        self.logger.debug("Model Accuracy: {}".format(self.confidence))
        return self.confidence
Exemple #25
0
 async def train(self, sources: Sources):
     """Fit ``self.clf`` on the source repos and dump it with joblib.

     The feature dictionaries of all repos are loaded into a DataFrame and
     converted to an array; no target is passed to ``fit``. The fitted
     estimator is written to ``self._filename()``.
     """
     rows = [
         repo.features(self.features)
         async for repo in sources.with_features(self.features)
     ]
     frame = pd.DataFrame(rows)
     xdata = np.array(frame)
     self.logger.info("Number of input repos: {}".format(len(xdata)))
     self.clf.fit(xdata)
     joblib.dump(self.clf, self._filename())
Exemple #26
0
 async def train(self, sources: Sources):
     """Fit ``self.clf`` on the source records and dump it with joblib.

     The feature dictionaries of all records are loaded into a DataFrame
     and converted to an array; no target is passed to ``fit``. The fitted
     estimator is written to ``str(self._filepath)``.
     """
     rows = [
         record.features(self.features)
         async for record in sources.with_features(self.features)
     ]
     frame = self.pd.DataFrame(rows)
     xdata = self.np.array(frame)
     self.logger.info("Number of input records: {}".format(len(xdata)))
     self.clf.fit(xdata)
     self.joblib.dump(self.clf, str(self._filepath))
Exemple #27
0
 async def test_02_predict(self):
     """Predict a single record and check the result matches it and is truthy."""
     a = Record("a", data={"features": {self.feature.NAME: 1}})
     memory_source = MemorySource(MemorySourceConfig(records=[a]))
     async with Sources(memory_source) as sources, self.model as model:
         target_name = model.config.predict.NAME
         async with sources() as sctx, model() as mctx:
             res = []
             async for record in mctx.predict(sctx.records()):
                 res.append(record)
             self.assertEqual(len(res), 1)
         predicted_record = res[0]
         self.assertEqual(predicted_record.key, a.key)
         self.assertTrue(predicted_record.prediction(target_name).value)
Exemple #28
0
    async def train_data_generator(self, sources: Sources):
        """Turn the usable source records into model inputs and label indices.

        A record is used when the value of its ``self.classification``
        feature is in ``self.classifications``. Only the first of the
        parent's features is used as input: for the "bert" embed type it is
        tokenized into word ids, mask and segment ids; for every other embed
        type the raw column is returned.

        Returns:
            ``(x_cols, y_cols)`` as described above.

        Raises:
            ValueError: no record had a known classification.
        """
        feature_names = self.parent.features
        self.logger.debug("Training on features: %r", feature_names)

        x_cols: Dict[str, Any] = {feature: [] for feature in feature_names}
        labels = []
        async for record in sources.with_features(
            feature_names + [self.classification]
        ):
            label = record.feature(self.classification)
            if label not in self.classifications:
                # Unknown classification: record is not usable for training.
                continue
            for feature, results in record.features(feature_names).items():
                x_cols[feature].append(self.np.array(results))
            labels.append(self.classifications[label])

        if not labels:
            raise ValueError("No records to train on")
        y_cols = self.np.array(labels)
        for feature, values in x_cols.items():
            x_cols[feature] = self.np.array(values)

        first_column = list(x_cols.values())[0]
        self.logger.info("------ Record Data ------")
        self.logger.info("x_cols:    %d", len(first_column))
        self.logger.info("y_cols:    %d", len(y_cols))
        self.logger.info("-----------------------")

        if len(self.parent.features) > 1:
            self.logger.critical(
                "Found more than one feature to train on. Only first feature will be used"
            )

        first_feature = self.parent.features[0]
        # TODO add more embedTypes
        # so far the only model available on tensorflow hub which requires
        # special input preprocessing is `bert`
        if self.parent.config.embedType not in ["bert"]:
            # Universal Sentence Encoder, Neural Network Language Model,
            # Swivel Embeddings: no preprocessing needed
            return x_cols[first_feature], y_cols

        tokenized = bert_tokenizer(
            x_cols[first_feature],
            self.parent.config.max_seq_length,
            self.parent._model.vocab_file.asset_path.numpy(),
            self.parent._model.do_lower_case.numpy(),
        )
        x_cols = {
            "input_word_ids": tokenized[0],
            "input_mask": tokenized[1],
            "segment_ids": tokenized[2],
        }
        return x_cols, y_cols
Exemple #29
0
    def setUpClass(cls):
        """
        Build the record sets, sources, model and scorer and store them on
        ``cls``.

        ``TRAIN_DATA`` and ``TEST_DATA`` are each split into two parallel
        sequences; the first supplies the ``sentence`` feature and the second
        the ``entities`` feature of every record. Each record is keyed by its
        position in its dataset, converted to a string.
        """

        def make_records(sentences, entities):
            # One Record per (sentence, entities) pair, keyed by its index
            return [
                Record(
                    str(position),
                    data={
                        "features": {"sentence": sentence, "entities": spans}
                    },
                )
                for position, (sentence, spans) in enumerate(
                    zip(sentences, entities)
                )
            ]

        train_sentences, train_entities = zip(*TRAIN_DATA)
        test_sentences, test_entities = zip(*TEST_DATA)

        cls.train_records = make_records(train_sentences, train_entities)
        cls.test_records = make_records(test_sentences, test_entities)

        # Both record sets are served from in-memory sources
        train_config = MemorySourceConfig(records=cls.train_records)
        cls.train_sources = Sources(MemorySource(train_config))
        test_config = MemorySourceConfig(records=cls.test_records)
        cls.test_sources = Sources(MemorySource(test_config))

        # The model is given a temporary directory as its location
        cls.model_dir = tempfile.TemporaryDirectory()
        model_config = SpacyNERModelConfig(
            model_name="en_core_web_sm",
            location=cls.model_dir.name,
            n_iter=10,
            dropout=0.4,
        )
        cls.model = SpacyNERModel(model_config)
        cls.scorer = SpacyNerAccuracy()
Exemple #30
0
    async def train(self, sources: Sources):
        """
        Feed every record from ``sources`` to ``self.parent.clf.learn``.

        The parent's features, the ``predict`` column and any ``extra_cols``
        are collected from each record into a pandas ``DataFrame``. Unless
        ``noconvert`` is set in the config, the frame is passed through
        ``df_to_vw_format``; otherwise each example is the ``predict`` value
        and the single feature value joined by a space. The resulting examples
        are then learned ``config.passes`` times and ``self.is_trained`` is set
        to ``True``.

        Raises ``InputError`` when ``noconvert`` is set and more than one
        feature is configured.
        """
        data = []
        # Optional column names handed to df_to_vw_format; each stays None
        # unless the matching config entry is set
        importance, tag, base, class_cost = None, None, None, None
        if self.parent.config.importance:
            importance = self.parent.config.importance.name

        if self.parent.config.tag:
            tag = self.parent.config.tag.name

        if self.parent.config.base:
            base = self.parent.config.base.name
        if self.parent.config.class_cost:
            class_cost = [
                feature.name for feature in self.parent.config.class_cost
            ]
        # Collect features, the predict column and extra_cols of every record
        async for record in sources.with_features(
                self.parent.features + [self.parent.config.predict.name] +
                self.parent.config.extra_cols):
            feature_data = record.features(self.parent.features +
                                           [self.parent.config.predict.name] +
                                           self.parent.config.extra_cols)
            data.append(feature_data)
        vw_data = pd.DataFrame(data)
        if not self.parent.config.noconvert:
            vw_data = df_to_vw_format(
                vw_data,
                vwcmd=self.parent.config.vwcmd,
                target=self.parent.config.predict.name,
                namespace=self.parent.config.namespace,
                importance=importance,
                tag=tag,
                base=base,
                task=self.parent.config.task,
                use_binary_label=self.parent.config.use_binary_label,
                class_cost=class_cost,
            )
        # Support data that is already in vw format: the single feature column
        # is used as is and the `predict` value is placed in front of it
        else:
            if len(self.parent.features) > 1:
                raise InputError(
                    "Training features should be in vw format or `noconvert` should be false."
                )
            vw_data = (vw_data[self.parent.config.predict.name].map(str) +
                       " " + vw_data[self.parent.features[0]].map(str))
        self.logger.info("Number of input records: {}".format(len(vw_data)))
        for n in range(self.parent.config.passes):
            # NOTE(review): n is zero-based, so passes 0 and 1 both use the
            # original order and shuffling only starts with the third pass.
            # Confirm `n > 1` (rather than `n > 0`) is intended.
            if n > 1:
                X = shuffle(vw_data)
            else:
                X = vw_data
            # Examples are learned one at a time
            for x in X:
                self.parent.clf.learn(x)
        self.is_trained = True