def setUpClass(cls):
    """
    Prepare the shared fixtures: a temporary model directory, the
    XGBRegressorModel under test, and synthetic training/test sources.
    """
    # Temporary directory that will hold the trained model
    cls.model_dir = tempfile.TemporaryDirectory()
    # Model under test
    config = XGBRegressorModelConfig(
        features=Features(Feature("Feature1", float, 1), Feature("Feature2")),
        predict=Feature("Target", float, 1),
        directory=cls.model_dir.name,
    )
    cls.model = XGBRegressorModel(config)
    # Synthetic data following f(x1, x2) = 2*x1 + 3*x2
    _n_data = 2000
    _temp_data = np.random.rand(2, _n_data)
    records = []
    for i in range(_n_data):
        x1 = _temp_data[0][i]
        x2 = _temp_data[1][i]
        records.append(
            Record(
                "x" + str(random.random()),
                data={
                    "features": {
                        "Feature1": float(x1),
                        "Feature2": float(x2),
                        "Target": 2 * x1 + 3 * x2,
                    }
                },
            )
        )
    cls.records = records
    # First 1800 records are used for training, the remaining ones for testing
    training_memory = MemorySource(MemorySourceConfig(records=cls.records[:1800]))
    cls.trainingsource = Sources(training_memory)
    test_memory = MemorySource(MemorySourceConfig(records=cls.records[1800:]))
    cls.testsource = Sources(test_memory)
def setUpClass(cls):
    """
    Prepare the shared fixtures: a temporary model directory, the
    AnomalyModel under test, and randomly generated training/test sources.
    """
    # Temporary directory that will hold the trained model
    cls.model_dir = tempfile.TemporaryDirectory()
    # Model under test
    cls.model = AnomalyModel(
        features=Features(Feature("A", int, 1), Feature("B", int, 2)),
        predict=Feature("Y", int, 1),
        directory=cls.model_dir.name,
    )
    # Two rows of normally distributed samples (loc=2, scale=1)
    _n_data = 1800
    _temp_data = np.random.normal(2, 1, size=(2, _n_data))
    records = []
    for i in range(_n_data):
        a = _temp_data[0][i]
        b = _temp_data[1][i]
        records.append(
            Record(
                "x" + str(random.random()),
                data={
                    "features": {
                        "A": float(a),
                        "B": float(b),
                        # Label is 1 when a > 1 - b, otherwise 0
                        "Y": (a > 1 - b).astype(int),
                    }
                },
            )
        )
    cls.records = records
    # First 1400 records are used for training, the remaining ones for testing
    training_memory = MemorySource(MemorySourceConfig(records=cls.records[:1400]))
    cls.trainingsource = Sources(training_memory)
    test_memory = MemorySource(MemorySourceConfig(records=cls.records[1400:]))
    cls.testsource = Sources(test_memory)
def setUpClass(cls):
    """
    Build train/test records and sources from TRAIN_DATA and TEST_DATA, then
    create the QAModel under test inside a temporary directory.
    """
    field_names = (
        "title",
        "context",
        "question",
        "answer_text",
        "start_pos_char",
        "is_impossible",
    )

    def build_records(columns):
        # Unpacking enforces that exactly six columns are present
        first, second, third, fourth, fifth, sixth = columns
        records = []
        rows = zip(first, second, third, fourth, fifth, sixth)
        for i, row in enumerate(rows):
            features = dict(zip(field_names, row))
            features["answers"] = []
            # Record key is the row index as a string
            records.append(Record(str(i), data={"features": features}))
        return records

    # Transpose the row-oriented data into per-field columns
    train_columns = list(zip(*TRAIN_DATA))
    test_columns = list(zip(*TEST_DATA))
    cls.train_records = build_records(train_columns)
    cls.test_records = build_records(test_columns)
    cls.train_sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.train_records))
    )
    cls.test_sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.test_records))
    )
    # Model output and logs both go to the same temporary directory
    cls.model_dir = tempfile.TemporaryDirectory()
    model_config = QAModelConfig(
        model_name_or_path="bert-base-cased",
        cache_dir=CACHE_DIR,
        directory=cls.model_dir.name,
        log_dir=cls.model_dir.name,
        model_type="bert",
        no_cuda=True,
    )
    cls.model = QAModel(model_config)
def setUpClass(cls):
    """
    Build training and prediction records/sources from TRAIN_DATA and
    PREDICT_DATA, then create the NERModel under test in a temporary
    directory.
    """
    # Transpose the row-oriented data into per-field columns
    A_train, B_train, X = list(zip(*TRAIN_DATA))
    A_predict, B_predict = list(zip(*PREDICT_DATA))

    # Training records carry the tag to learn; keys are the row index
    train_records = []
    for i, (sentence_id, words, ner_tag) in enumerate(zip(A_train, B_train, X)):
        train_records.append(
            Record(
                str(i),
                data={
                    "features": {
                        "sentence_id": sentence_id,
                        "words": words,
                        "ner_tag": ner_tag,
                    }
                },
            )
        )
    cls.train_records = train_records
    cls.train_sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.train_records))
    )

    # Prediction records have no tag
    predict_records = []
    for i, (sentence_id, words) in enumerate(zip(A_predict, B_predict)):
        predict_records.append(
            Record(
                str(i),
                data={"features": {"sentence_id": sentence_id, "words": words}},
            )
        )
    cls.predict_records = predict_records
    cls.predict_sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.predict_records))
    )

    cls.model_dir = tempfile.TemporaryDirectory()
    model_config = NERModelConfig(
        sid=Feature("sentence_id", int, 1),
        words=Feature("words", str, 1),
        predict=Feature("ner_tag", str, 1),
        output_dir=cls.model_dir.name,
        model_architecture_type="bert",
        model_name_or_path="bert-base-cased",
        no_cuda=True,
    )
    cls.model = NERModel(model_config)
async def train(self, sources: Sources):
    """
    Fit ``self.parent.clf`` on the records provided by ``sources``.

    Feature values of each record are flattened into one input row. When
    ``self.is_multi`` is set, the prediction features are flattened the same
    way and the estimator is wrapped in a multi-output wrapper if it is not
    one already.

    Raises:
        NoMultiOutputSupport: multi-output was requested but
            ``self.estimator_type`` is neither "regressor" nor "classifier".
    """

    def flatten(values):
        # Scalars are kept as single items, sequences are spliced in
        flat = []
        for value in values:
            if self.np.isscalar(value):
                flat.append(value)
            else:
                flat.extend(value)
        return flat

    # np.hstack flattens the (possibly nested) list of names without
    # splitting the strings into characters.
    wanted = list(self.np.hstack(self.features + [self.predictions]))
    inputs = []
    targets = []
    async for record in sources.with_features(wanted):
        inputs.append(flatten(record.features(self.features).values()))
        if self.is_multi:
            target = flatten(record.features(self.predictions).values())
        else:
            target = record.feature(self.predictions)
        targets.append(target)
    xdata = self.np.array(inputs)
    ydata = self.np.array(targets)
    self.logger.info("Number of input records: {}".format(len(xdata)))

    already_wrapped = "MultiOutput" in self.parent.clf.__class__.__name__
    if self.is_multi and not already_wrapped:
        if self.estimator_type == "regressor":
            wrapper = MultiOutputRegressor
        elif self.estimator_type == "classifier":
            wrapper = MultiOutputClassifier
        else:
            raise NoMultiOutputSupport(
                "Model does not support multi-output. Please refer the docs to find a suitable model entrypoint."
            )
        self.parent.clf = wrapper(self.parent.clf)

    self.parent.clf.fit(xdata, ydata)
    self.is_trained = True
async def train(self, sources: Sources) -> None:
    """
    Fit an XGBClassifier on the records from ``sources``.

    The fitted classifier is stored on ``self.saved`` and ``self.is_trained``
    is set. Hyperparameters are read from ``self.config``.
    """
    target = self.parent.config.predict.name

    # Load everything into memory: one flattened row per record
    rows = []
    labels = []
    async for record in sources.with_features(self.features + [target]):
        row = []
        for value in record.features(self.features).values():
            if np.isscalar(value):
                row.append(value)
            else:
                row.extend(value)
        rows.append(row)
        labels.append(record.feature(target))
    x_data = pd.DataFrame(rows)
    y_data = pd.DataFrame(labels)

    # Every hyperparameter is forwarded under the same name it has in config
    hyperparameter_names = (
        "n_estimators",
        "learning_rate",
        "max_depth",
        "objective",
        "subsample",
        "gamma",
        "n_jobs",
        "colsample_bytree",
        "booster",
        "min_child_weight",
        "reg_lambda",
        "reg_alpha",
    )
    hyperparameters = {
        name: getattr(self.config, name) for name in hyperparameter_names
    }
    self.saved = XGBClassifier(**hyperparameters)
    self.saved.fit(x_data, y_data, eval_metric="merror")
    self.is_trained = True
async def get_input_data(self, sources: Sources) -> list:
    """
    Return a list of every record in ``sources`` that has all of the
    features named by ``self.config.features``.
    """
    wanted = self.config.features.names()
    return [record async for record in sources.with_features(wanted)]
def setUpClass(cls):
    """
    Create the MiscModel under test in a temporary directory together with
    2000 repos: 1000 labelled "a" and 1000 labelled "not a".
    """
    cls.feature = StartsWithA()
    cls.features = Features(cls.feature)
    cls.model_dir = tempfile.TemporaryDirectory()
    model_config = MiscModelConfig(
        directory=cls.model_dir.name,
        classifications=["not a", "a"],
        features=cls.features,
    )
    cls.model = MiscModel(model_config)

    def make_repos(prefix, flag, label):
        # 1000 repos with a random key suffix sharing the same feature values
        repos = []
        for _ in range(1000):
            repos.append(
                Repo(
                    prefix + str(random.random()),
                    data={"features": {cls.feature.NAME: flag, "string": label}},
                )
            )
        return repos

    positives = make_repos("a", 1, "a")
    negatives = make_repos("b", 0, "not a")
    cls.repos = positives + negatives
    cls.sources = Sources(MemorySource(MemorySourceConfig(repos=cls.repos)))
def setUpClass(cls):
    """
    Build features, repos, sources and the model for the configured model type.

    ``cls.MODEL_TYPE`` selects the dataset: "CLASSIFICATION" uses
    FEATURE_DATA_CLASSIFICATION (features A-I), "REGRESSION" uses
    FEATURE_DATA_REGRESSION (features A-C). In both cases "X" is the value
    to predict. The model is built from ``cls.MODEL`` and
    ``cls.MODEL_CONFIG``.
    """
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.features = Features()
    # Compare by value: `is` tests object identity, which is not guaranteed
    # for equal strings (and raises a SyntaxWarning on literals).
    if cls.MODEL_TYPE == "CLASSIFICATION":
        for name in "ABCDEFGHI":
            cls.features.append(DefFeature(name, float, 1))
        A, B, C, D, E, F, G, H, I, X = list(zip(*FEATURE_DATA_CLASSIFICATION))
        cls.repos = [
            Repo(
                str(i),
                data={
                    "features": {
                        "A": A[i],
                        "B": B[i],
                        "C": C[i],
                        "D": D[i],
                        "E": E[i],
                        "F": F[i],
                        "G": G[i],
                        "H": H[i],
                        "I": I[i],
                        "X": X[i],
                    }
                },
            )
            for i in range(len(A))
        ]
    elif cls.MODEL_TYPE == "REGRESSION":
        for name in "ABC":
            cls.features.append(DefFeature(name, float, 1))
        A, B, C, X = list(zip(*FEATURE_DATA_REGRESSION))
        cls.repos = [
            Repo(
                str(i),
                data={
                    "features": {
                        "A": A[i],
                        "B": B[i],
                        "C": C[i],
                        "X": X[i],
                    }
                },
            )
            for i in range(len(A))
        ]
    cls.sources = Sources(MemorySource(MemorySourceConfig(repos=cls.repos)))
    cls.model = cls.MODEL(
        cls.MODEL_CONFIG(
            directory=cls.model_dir.name,
            predict="X",
            features=cls.features,
        )
    )
async def sources_to_array(self, sources: Sources):
    """
    Convert the records in ``sources`` into arrays for training.

    Only records whose target value (converted with the configured
    ``clstype``) is a key of ``self.classifications`` are used.

    Returns:
        A tuple ``(x_cols, y_cols)``: ``x_cols`` maps each feature name to
        an array of its values, ``y_cols`` is an array of the values
        looked up in ``self.classifications``.

    Raises:
        ValueError: no record qualified for training.
    """
    config = self.parent.config
    target = config.predict.name
    x_cols: Dict[str, Any] = {}
    for name in self.features:
        x_cols[name] = []
    labels = []
    async for record in sources.with_features(self.features + [target]):
        if config.clstype(record.feature(target)) not in self.classifications:
            # Unknown classification, skip the record
            continue
        for name, value in record.features(self.features).items():
            x_cols[name].append(self.np.array(value))
        labels.append(
            self.classifications[config.clstype(record.feature(target))]
        )
    if not labels:
        raise ValueError("No records to train on")
    y_cols = self.np.array(labels)
    for name in x_cols:
        x_cols[name] = self.np.array(x_cols[name])
    return x_cols, y_cols
def setUpClass(cls):
    """
    Create the DNNClassifierModel under test in a temporary directory
    together with 2000 repos: 1000 labelled "a" and 1000 labelled "not a".
    """
    cls.model_dir = tempfile.TemporaryDirectory()
    model_config = DNNClassifierModelConfig(
        directory=cls.model_dir.name,
        steps=1000,
        epochs=30,
        hidden=[10, 20, 10],
        classification="string",
        classifications=["a", "not a"],
        clstype=str,
    )
    cls.model = DNNClassifierModel(model_config)
    cls.feature = StartsWithA()
    cls.features = Features(cls.feature)

    def make_repos(prefix, flag, label):
        # 1000 repos with a random key suffix sharing the same feature values
        repos = []
        for _ in range(1000):
            repos.append(
                Repo(
                    prefix + str(random.random()),
                    data={"features": {cls.feature.NAME: flag, "string": label}},
                )
            )
        return repos

    positives = make_repos("a", 1, "a")
    negatives = make_repos("b", 0, "not a")
    cls.repos = positives + negatives
    cls.sources = Sources(MemorySource(MemorySourceConfig(repos=cls.repos)))
async def accuracy_input_fn(self, sources: Sources, **kwargs):
    """
    Build a numpy input function from the repos in ``sources``.

    Only repos whose target feature is a key of ``self.classifications``
    are used. Batch size and shuffling come from the parent's config, the
    data is iterated for a single epoch, and ``kwargs`` are forwarded to
    ``numpy_input_fn``.
    """
    config = self.parent.config
    target = config.predict.NAME
    x_cols: Dict[str, Any] = {}
    for name in self.features:
        x_cols[name] = []
    labels = []
    async for repo in sources.with_features(self.features + [target]):
        if repo.feature(target) not in self.classifications:
            # Unknown classification, skip the repo
            continue
        for name, value in repo.features(self.features).items():
            x_cols[name].append(np.array(value))
        labels.append(self.classifications[repo.feature(target)])
    y_cols = np.array(labels)
    for name in x_cols:
        x_cols[name] = np.array(x_cols[name])
    self.logger.info("------ Repo Data ------")
    self.logger.info("x_cols: %d", len(list(x_cols.values())[0]))
    self.logger.info("y_cols: %d", len(y_cols))
    self.logger.info("-----------------------")
    return tf.compat.v1.estimator.inputs.numpy_input_fn(
        x_cols,
        y_cols,
        batch_size=config.batchsize,
        shuffle=config.shuffle,
        num_epochs=1,
        **kwargs,
    )
def setUpClass(cls):
    """
    Create the DNNRegressionModel under test in a temporary directory and
    2000 synthetic repos following f(x1, x2) = 2*x1 + 3*x2.
    """
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.feature1 = Feature_1()
    cls.feature2 = Feature_2()
    cls.features = Features(cls.feature1, cls.feature2)
    model_config = DNNRegressionModelConfig(
        directory=cls.model_dir.name,
        steps=1000,
        epochs=40,
        hidden=[50, 20, 10],
        predict=DefFeature("TARGET", float, 1),
        features=cls.features,
    )
    cls.model = DNNRegressionModel(model_config)
    # Synthetic data following f(x1, x2) = 2*x1 + 3*x2
    _n_data = 2000
    _temp_data = np.random.rand(2, _n_data)
    repos = []
    for i in range(_n_data):
        x1 = _temp_data[0][i]
        x2 = _temp_data[1][i]
        repos.append(
            Repo(
                "x" + str(random.random()),
                data={
                    "features": {
                        cls.feature1.NAME: float(x1),
                        cls.feature2.NAME: float(x2),
                        "TARGET": 2 * x1 + 3 * x2,
                    }
                },
            )
        )
    cls.repos = repos
    cls.sources = Sources(MemorySource(MemorySourceConfig(repos=cls.repos)))
def setUpClass(cls):
    """
    Create 2000 records (1000 labelled "a", 1000 labelled "not a"), their
    source, and the DNNClassifierModel under test in a temporary directory.
    """
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.feature = Feature("starts_with_a", int, 1)
    cls.features = Features(cls.feature)

    def make_records(prefix, flag, label):
        # 1000 records with a random key suffix sharing the same feature values
        records = []
        for _ in range(1000):
            records.append(
                Record(
                    prefix + str(random.random()),
                    data={"features": {cls.feature.name: flag, "string": label}},
                )
            )
        return records

    positives = make_records("a", 1, "a")
    negatives = make_records("b", 0, "not a")
    cls.records = positives + negatives
    cls.sources = Sources(MemorySource(MemorySourceConfig(records=cls.records)))
    model_config = DNNClassifierModelConfig(
        directory=cls.model_dir.name,
        steps=1000,
        epochs=40,
        hidden=[50, 20, 10],
        predict=Feature("string", str, 1),
        classifications=["a", "not a"],
        clstype=str,
        features=cls.features,
    )
    cls.model = DNNClassifierModel(model_config)
async def test_02_predict(self):
    """
    Predict on a single repo and check that the relative error against
    f(x1, x2) = 2*x1 + 3*x2 stays below the threshold.
    """
    # Leading zero so that the feature values are 1-indexable
    test_feature_val = [0, 1.5, 2]
    # Should be the same function used to generate data in setUpClass
    test_target = 2 * test_feature_val[1] + 3 * test_feature_val[2]
    a = Repo(
        "a",
        data={
            "features": {
                self.feature1.NAME: test_feature_val[1],
                self.feature2.NAME: test_feature_val[2],
            }
        },
    )
    async with Sources(
        MemorySource(MemorySourceConfig(repos=[a]))
    ) as sources, self.model as model:
        target_name = model.config.predict.NAME
        async with sources() as sctx, model() as mctx:
            res = [repo async for repo in mctx.predict(sctx.repos())]
            self.assertEqual(len(res), 1)
            self.assertEqual(res[0].key, a.key)
            predicted = res[0].prediction(target_name).value
            # The epsilon guards the division, so it belongs in the
            # denominator; adding it to the quotient only biased the error.
            test_error_norm = abs(
                (test_target - predicted) / (test_target + 1e-6)
            )
            error_threshold = 0.3
            self.assertLess(test_error_norm, error_threshold)
async def evaluate_input_fn(
    self, sources: Sources, batch_size=20, shuffle=False, epochs=1, **kwargs,
):
    """
    Build a numpy input function from the repos in ``sources``.

    ``batch_size``, ``shuffle`` and ``epochs`` (as ``num_epochs``) are
    passed to ``numpy_input_fn`` together with any extra ``kwargs``.
    """
    x_cols: Dict[str, Any] = {}
    for name in self.features:
        x_cols[name] = []
    targets = []
    async for repo in sources.with_features(self.all_features):
        for name, value in repo.features(self.features).items():
            x_cols[name].append(np.array(value))
        targets.append(repo.feature(self.parent.config.predict))
    y_cols = np.array(targets)
    for name in x_cols:
        x_cols[name] = np.array(x_cols[name])
    self.logger.info("------ Repo Data ------")
    self.logger.info("x_cols: %d", len(list(x_cols.values())[0]))
    self.logger.info("y_cols: %d", len(y_cols))
    self.logger.info("-----------------------")
    return tensorflow.estimator.inputs.numpy_input_fn(
        x_cols,
        y_cols,
        batch_size=batch_size,
        shuffle=shuffle,
        num_epochs=epochs,
        **kwargs,
    )
def setUpClass(cls):
    """
    Build records and sources from DATA and create the
    TextClassificationModel under test in a temporary directory.
    """
    cls.features = Features()
    cls.features.append(Feature("A", str, 1))
    # Transpose DATA into the text column (A) and the label column (X)
    A, X = list(zip(*DATA))
    records = []
    for i, (text, label) in enumerate(zip(A, X)):
        records.append(Record(str(i), data={"features": {"A": text, "X": label}}))
    cls.records = records
    cls.sources = Sources(MemorySource(MemorySourceConfig(records=cls.records)))
    cls.model_dir = tempfile.TemporaryDirectory()
    # NOTE(review): the second layer passes relu unquoted, unlike the first
    # layer -- confirm this is intended.
    extra_layers = [
        "Dense(units = 120, activation='relu')",
        "Dense(units = 64, activation=relu)",
        "Dense(units = 2, activation='softmax')",
    ]
    model_config = TextClassifierConfig(
        directory=cls.model_dir.name,
        classifications=[0, 1],
        features=cls.features,
        predict=Feature("X", int, 1),
        add_layers=True,
        layers=extra_layers,
        model_path="https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim-with-oov/1",
        epochs=30,
    )
    cls.model = TextClassificationModel(model_config)
def setUpClass(cls):
    """
    Create the Misc model under test in a temporary directory together with
    2000 repos: 1000 classified "a" and 1000 classified "not a".
    """
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.model = Misc(ModelConfig(directory=cls.model_dir.name))
    cls.feature = StartsWithA()
    cls.features = Features(cls.feature)
    cls.classifications = ["a", "not a"]

    def make_repos(prefix, flag, classification):
        # The classification is stored next to "features", not inside it
        repos = []
        for _ in range(1000):
            repos.append(
                Repo(
                    prefix + str(random.random()),
                    data={
                        "features": {cls.feature.NAME: flag},
                        "classification": classification,
                    },
                )
            )
        return repos

    positives = make_repos("a", 1, "a")
    negatives = make_repos("b", 0, "not a")
    cls.repos = positives + negatives
    cls.sources = Sources(MemorySource(MemorySourceConfig(repos=cls.repos)))
def setUpClass(cls):
    """
    Build features, records and sources from DATA and create the VWModel
    and the scorer used by the tests.
    """
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.features = Features()
    # A-F are float features, G and H are int features
    feature_types = (
        ("A", float),
        ("B", float),
        ("C", float),
        ("D", float),
        ("E", float),
        ("F", float),
        ("G", int),
        ("H", int),
    )
    for name, dtype in feature_types:
        cls.features.append(Feature(name, dtype, 1))
    # Transpose DATA into one column per feature; X is the target
    A, B, C, D, E, F, G, H, X = list(zip(*DATA))
    column_names = ("A", "B", "C", "D", "E", "F", "G", "H", "X")
    records = []
    for i, row in enumerate(zip(A, B, C, D, E, F, G, H, X)):
        records.append(
            Record(str(i), data={"features": dict(zip(column_names, row))})
        )
    cls.records = records
    cls.sources = Sources(MemorySource(MemorySourceConfig(records=cls.records)))
    model_config = VWConfig(
        location=cls.model_dir.name,
        features=cls.features,
        predict=Feature("X", float, 1),
        # A and B will be in namespace n1
        # A and C will be in namespace n2
        namespace=["n1_A_B", "n2_A_C"],
        importance=Feature("H", int, 1),
        tag=Feature("G", int, 1),
        task="regression",
        vwcmd=["l2", "0.1", "loss_function", "squared", "passes", "10"],
    )
    cls.model = VWModel(model_config)
    cls.scorer = MeanSquaredErrorAccuracy()
async def train(self, sources: Sources):
    """
    Add the data from ``sources`` to the stored x/y arrays and refit the
    regression line.

    Only the first entry of ``self.features`` is used as the x value. New
    values are appended to ``self.xData`` / ``self.yData``, then
    ``self.regression_line`` is set from ``self.best_fit_line()``.
    """
    target = self.parent.config.predict.NAME
    wanted = self.features + [target]
    # Collect first and append once: calling np.append per record copies the
    # whole array each time, which is quadratic in the number of records.
    x_values = []
    y_values = []
    async for repo in sources.with_features(wanted):
        feature_data = repo.features(wanted)
        x_values.append(feature_data[self.features[0]])
        y_values.append(feature_data[target])
    # Skip the append when nothing was read so the arrays stay untouched
    if x_values:
        self.xData = np.append(self.xData, x_values)
        self.yData = np.append(self.yData, y_values)
    self.regression_line = await self.best_fit_line()
async def train(self, sources: Sources):
    """
    Fit ``self.parent.clf`` on the feature values of every record in
    ``sources`` (no target values are used) and mark the model as trained.
    """
    rows = [
        list(record.features(self.features).values())
        async for record in sources.with_features(self.features)
    ]
    xdata = self.np.array(rows)
    self.logger.info("Number of input records: {}".format(len(xdata)))
    self.parent.clf.fit(xdata)
    self.is_trained = True
async def train(self, sources: Sources):
    """
    Fit ``self.clf`` on the feature values of every record in ``sources``
    (no target values are used) and dump the fitted estimator to
    ``self._filepath`` with joblib.
    """
    rows = [
        list(record.features(self.features).values())
        async for record in sources.with_features(self.features)
    ]
    xdata = self.np.array(rows)
    self.logger.info("Number of input records: {}".format(len(xdata)))
    self.clf.fit(xdata)
    self.joblib.dump(self.clf, str(self._filepath))
async def test_02_predict(self):
    """
    Predict on a single repo and check that exactly that repo comes back
    with a truthy prediction value.
    """
    a = Repo("a", data={"features": {self.feature.NAME: 1}})
    memory = MemorySource(MemorySourceConfig(repos=[a]))
    async with Sources(memory) as sources, self.model as model:
        async with sources() as sctx, model() as mctx:
            res = []
            async for repo in mctx.predict(sctx.repos()):
                res.append(repo)
            self.assertEqual(len(res), 1)
            predicted = res[0]
            self.assertEqual(predicted.src_url, a.src_url)
            self.assertTrue(predicted.prediction().value)
async def accuracy(self, sources: Sources) -> Accuracy:
    """
    Score the trained model against the records in ``sources``.

    The records are converted to VW format unless ``noconvert`` is set in
    the config, each example is predicted with ``self.clf``, and the result
    is stored on ``self.confidence``: ``r2_score`` for the "regression"
    task, ``accuracy_score`` for the "classification" task.

    Returns:
        The value stored on ``self.confidence``.

    Raises:
        ModelNotTrained: the file at ``self._filename()`` does not exist.
    """
    if not os.path.isfile(self._filename()):
        raise ModelNotTrained("Train model before assessing for accuracy.")
    config = self.parent.config
    importance = config.importance.name if config.importance else None
    tag = config.tag.name if config.tag else None
    base = config.base.name if config.base else None
    # Request the same columns that are read from each record below, so the
    # filter matches what is actually used (as train does).
    columns = self.features + [config.predict.name] + config.extra_cols
    data = []
    async for record in sources.with_features(columns):
        data.append(record.features(columns))
    df = pd.DataFrame(data)
    # `axis` must be given by keyword (or via `columns=`) on current pandas
    xdata = df.drop(columns=[config.predict.name])
    self.logger.debug("Number of input records: {}".format(len(xdata)))
    if not config.noconvert:
        xdata = df_to_vw_format(
            xdata,
            vwcmd=config.vwcmd,
            target=None,
            namespace=config.namespace,
            importance=importance,
            tag=tag,
            base=base,
            task=config.task,
            use_binary_label=config.use_binary_label,
        )
    else:
        # Data is already in vw format, only the extra columns are removed
        xdata = xdata.drop(config.extra_cols, axis=1).to_numpy().flatten()
    ydata = np.array(df[config.predict.name])
    shape = [len(xdata)]
    # TODO support probabilities
    # if 'oaa' in self.parent.config.vwcmd and 'probabilities' in self.parent.config.vwcmd:
    #     shape.append(self.parent.config.vwcmd['oaa'])
    y_pred = np.empty(shape)
    for idx, x in enumerate(xdata):
        y_pred[idx] = self.clf.predict(x)
    if config.task in ["regression"]:
        self.confidence = r2_score(ydata, y_pred)
    elif config.task in ["classification"]:
        self.confidence = accuracy_score(ydata, y_pred)
    self.logger.debug("Model Accuracy: {}".format(self.confidence))
    return self.confidence
async def train(self, sources: Sources):
    """
    Fit ``self.clf`` on the features of every repo in ``sources`` (no
    target values are used) and dump the fitted estimator to
    ``self._filename()`` with joblib.
    """
    rows = [
        repo.features(self.features)
        async for repo in sources.with_features(self.features)
    ]
    # The feature dicts go through a DataFrame to become a 2-D array
    frame = pd.DataFrame(rows)
    xdata = np.array(frame)
    self.logger.info("Number of input repos: {}".format(len(xdata)))
    self.clf.fit(xdata)
    joblib.dump(self.clf, self._filename())
async def train(self, sources: Sources):
    """
    Fit ``self.clf`` on the features of every record in ``sources`` (no
    target values are used) and dump the fitted estimator to
    ``self._filepath`` with joblib.
    """
    rows = [
        record.features(self.features)
        async for record in sources.with_features(self.features)
    ]
    # The feature dicts go through a DataFrame to become a 2-D array
    frame = self.pd.DataFrame(rows)
    xdata = self.np.array(frame)
    self.logger.info("Number of input records: {}".format(len(xdata)))
    self.clf.fit(xdata)
    self.joblib.dump(self.clf, str(self._filepath))
async def test_02_predict(self):
    """
    Predict on a single record and check that exactly that record comes
    back with a truthy prediction for the model's target.
    """
    a = Record("a", data={"features": {self.feature.NAME: 1}})
    memory = MemorySource(MemorySourceConfig(records=[a]))
    async with Sources(memory) as sources, self.model as model:
        target_name = model.config.predict.NAME
        async with sources() as sctx, model() as mctx:
            res = []
            async for record in mctx.predict(sctx.records()):
                res.append(record)
            self.assertEqual(len(res), 1)
            predicted = res[0]
            self.assertEqual(predicted.key, a.key)
            self.assertTrue(predicted.prediction(target_name).value)
async def train_data_generator(self, sources: Sources):
    """
    Turn the records in ``sources`` into model inputs and labels.

    Only records whose classification is a key of ``self.classifications``
    are used, and only the first feature is fed to the model. For the
    "bert" embed type the text is tokenized into word ids, mask and segment
    ids; otherwise the raw feature array is returned.

    Returns:
        A tuple ``(x_cols, y_cols)``.

    Raises:
        ValueError: no record qualified for training.
    """
    feature_names = self.parent.features
    self.logger.debug("Training on features: %r", feature_names)
    x_cols: Dict[str, Any] = {}
    for name in feature_names:
        x_cols[name] = []
    labels = []
    wanted = feature_names + [self.classification]
    async for record in sources.with_features(wanted):
        if record.feature(self.classification) not in self.classifications:
            # Unknown classification, skip the record
            continue
        for name, value in record.features(feature_names).items():
            x_cols[name].append(self.np.array(value))
        labels.append(self.classifications[record.feature(self.classification)])
    if not labels:
        raise ValueError("No records to train on")
    y_cols = self.np.array(labels)
    for name in x_cols:
        x_cols[name] = self.np.array(x_cols[name])
    self.logger.info("------ Record Data ------")
    self.logger.info("x_cols: %d", len(list(x_cols.values())[0]))
    self.logger.info("y_cols: %d", len(y_cols))
    self.logger.info("-----------------------")
    if len(feature_names) > 1:
        self.logger.critical(
            "Found more than one feature to train on. Only first feature will be used"
        )
    first_feature = feature_names[0]
    # TODO add more embedTypes
    # So far `bert` is the only model available on tensorflow hub which
    # requires special input preprocessing
    if self.parent.config.embedType in ["bert"]:
        tokenized = bert_tokenizer(
            x_cols[first_feature],
            self.parent.config.max_seq_length,
            self.parent._model.vocab_file.asset_path.numpy(),
            self.parent._model.do_lower_case.numpy(),
        )
        x_cols = dict(
            input_word_ids=tokenized[0],
            input_mask=tokenized[1],
            segment_ids=tokenized[2],
        )
    else:
        # Universal Sentence Encoder, Neural Network Language Model,
        # Swivel Embeddings: no preprocessing needed
        x_cols = x_cols[first_feature]
    return x_cols, y_cols
def setUpClass(cls):
    """
    Build train/test records and sources from TRAIN_DATA and TEST_DATA, then
    create the SpacyNERModel under test and the scorer used by the tests.
    """

    def build_records(sentences, entities):
        # One Record per (sentence, entities) pair, keyed by its index
        records = []
        for i, (sentence, entity) in enumerate(zip(sentences, entities)):
            records.append(
                Record(
                    str(i),
                    data={"features": {"sentence": sentence, "entities": entity}},
                )
            )
        return records

    # Transpose the row-oriented data into sentence and entity columns
    A_train, X_train = list(zip(*TRAIN_DATA))
    A_test, X_test = list(zip(*TEST_DATA))
    cls.train_records = build_records(A_train, X_train)
    cls.test_records = build_records(A_test, X_test)
    cls.train_sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.train_records))
    )
    cls.test_sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.test_records))
    )
    cls.model_dir = tempfile.TemporaryDirectory()
    model_config = SpacyNERModelConfig(
        model_name="en_core_web_sm",
        location=cls.model_dir.name,
        n_iter=10,
        dropout=0.4,
    )
    cls.model = SpacyNERModel(model_config)
    cls.scorer = SpacyNerAccuracy()
async def train(self, sources: Sources):
    """
    Train ``self.parent.clf`` on the records from ``sources``.

    Unless ``noconvert`` is set in the config, the records are converted to
    VW format first. With ``noconvert`` the single training feature is
    expected to be in VW format already and is prefixed with the target
    value. The examples are then learned one by one for ``passes`` passes.

    Raises:
        InputError: ``noconvert`` is set and there is more than one
            training feature.
    """
    config = self.parent.config
    importance = config.importance.name if config.importance else None
    tag = config.tag.name if config.tag else None
    base = config.base.name if config.base else None
    class_cost = None
    if config.class_cost:
        class_cost = [feature.name for feature in config.class_cost]

    columns = self.parent.features + [config.predict.name] + config.extra_cols
    rows = []
    async for record in sources.with_features(columns):
        rows.append(record.features(columns))
    vw_data = pd.DataFrame(rows)

    if config.noconvert:
        # Support data already in vw format: prepend `predict` to the feature
        if len(self.parent.features) > 1:
            raise InputError(
                "Training features should be in vw format or `noconvert` should be false."
            )
        targets = vw_data[config.predict.name].map(str)
        examples = vw_data[self.parent.features[0]].map(str)
        vw_data = targets + " " + examples
    else:
        vw_data = df_to_vw_format(
            vw_data,
            vwcmd=config.vwcmd,
            target=config.predict.name,
            namespace=config.namespace,
            importance=importance,
            tag=tag,
            base=base,
            task=config.task,
            use_binary_label=config.use_binary_label,
            class_cost=class_cost,
        )
    self.logger.info("Number of input records: {}".format(len(vw_data)))
    for n in range(config.passes):
        # NOTE(review): the first two passes (n = 0 and n = 1) use the
        # original order -- confirm `n > 1` is intended rather than `n > 0`.
        X = shuffle(vw_data) if n > 1 else vw_data
        for x in X:
            self.parent.clf.learn(x)
    self.is_trained = True