async def train(self, sources: Sources) -> None:
    """
    Fit an ``XGBClassifier`` on the records in ``sources`` and keep it on
    this context.

    Each record yielded by ``sources.with_features`` (configured features
    plus the prediction target) becomes one training row: scalar feature
    values are appended as-is, non-scalar values are spliced in element by
    element. The classifier is built from the hyperparameters found on
    ``self.config``, fitted, stored as ``self.saved``, and
    ``self.is_trained`` is set to ``True``.
    """
    target_name = self.parent.config.predict.name

    # Load the whole data set into memory, one flat row per record.
    rows = []
    labels = []
    async for record in sources.with_features(self.features + [target_name]):
        row = []
        for value in record.features(self.features).values():
            if np.isscalar(value):
                row.append(value)
            else:
                row.extend(value)
        rows.append(row)
        labels.append(record.feature(target_name))

    inputs = pd.DataFrame(rows)
    targets = pd.DataFrame(labels)

    # Every hyperparameter is forwarded under the same name it has on the config.
    hyperparameters = {
        name: getattr(self.config, name)
        for name in (
            "n_estimators",
            "learning_rate",
            "max_depth",
            "objective",
            "subsample",
            "gamma",
            "n_jobs",
            "colsample_bytree",
            "booster",
            "min_child_weight",
            "reg_lambda",
            "reg_alpha",
        )
    }
    self.saved = XGBClassifier(**hyperparameters)
    self.saved.fit(inputs, targets, eval_metric="merror")
    self.is_trained = True
async def get_input_data(self, sources: Sources) -> list:
    """
    Return, as a list, every record ``sources`` yields for the configured
    feature names.
    """
    feature_names = self.config.features.names()
    return [record async for record in sources.with_features(feature_names)]
async def accuracy_input_fn(self, sources: Sources, **kwargs):
    """
    Build a numpy input function from the repos in ``sources``.

    Only repos whose target value is a key of ``self.classifications`` are
    used. Feature values are gathered per feature name, targets are mapped
    through ``self.classifications``, and both are handed to
    ``numpy_input_fn`` for a single epoch. Extra keyword arguments are
    forwarded to ``numpy_input_fn``.
    """
    label_name = self.parent.config.predict.NAME

    # First pass: drain the source, keeping only repos with a known label.
    usable = []
    async for repo in sources.with_features(self.features + [label_name]):
        if repo.feature(label_name) in self.classifications:
            usable.append(repo)

    # Second pass: split the kept repos into per-feature columns and labels.
    columns: Dict[str, Any] = {name: [] for name in self.features}
    labels = []
    for repo in usable:
        for name, value in repo.features(self.features).items():
            columns[name].append(np.array(value))
        labels.append(self.classifications[repo.feature(label_name)])

    labels = np.array(labels)
    columns = {name: np.array(values) for name, values in columns.items()}

    self.logger.info("------ Repo Data ------")
    self.logger.info("x_cols: %d", len(list(columns.values())[0]))
    self.logger.info("y_cols: %d", len(labels))
    self.logger.info("-----------------------")

    return tf.compat.v1.estimator.inputs.numpy_input_fn(
        columns,
        labels,
        batch_size=self.parent.config.batchsize,
        shuffle=self.parent.config.shuffle,
        num_epochs=1,
        **kwargs,
    )
async def train(self, sources: Sources):
    """
    Fit ``self.parent.clf`` on the records in ``sources``.

    Feature values (and, for multi-output models, prediction values) are
    flattened into one row per record. When ``self.is_multi`` is set and the
    estimator is not already a ``MultiOutput*`` wrapper, it is wrapped in
    ``MultiOutputRegressor`` or ``MultiOutputClassifier`` according to
    ``self.estimator_type``.

    Raises:
        NoMultiOutputSupport: multi-output was requested but the estimator
            type is neither "regressor" nor "classifier".
    """

    def flatten(values):
        # Scalars are kept as single entries; sequences are spliced in.
        flat = []
        for value in values:
            flat.extend([value] if self.np.isscalar(value) else value)
        return flat

    # np.hstack flattens the name lists without splitting strings.
    wanted = list(self.np.hstack(self.features + [self.predictions]))

    inputs = []
    targets = []
    async for record in sources.with_features(wanted):
        inputs.append(flatten(record.features(self.features).values()))
        if self.is_multi:
            target = flatten(record.features(self.predictions).values())
        else:
            target = record.feature(self.predictions)
        targets.append(target)

    inputs = self.np.array(inputs)
    targets = self.np.array(targets)
    self.logger.info("Number of input records: {}".format(len(inputs)))

    if self.is_multi:
        if "MultiOutput" not in self.parent.clf.__class__.__name__:
            if self.estimator_type == "regressor":
                self.parent.clf = MultiOutputRegressor(self.parent.clf)
            elif self.estimator_type == "classifier":
                self.parent.clf = MultiOutputClassifier(self.parent.clf)
            else:
                raise NoMultiOutputSupport(
                    "Model does not support multi-output. Please refer the docs to find a suitable model entrypoint."
                )

    self.parent.clf.fit(inputs, targets)
    self.is_trained = True
async def evaluate_input_fn(
    self,
    sources: Sources,
    batch_size=20,
    shuffle=False,
    epochs=1,
    **kwargs,
):
    """
    Build a numpy input function from the repos in ``sources``.

    Feature values are gathered per feature name and the raw value of the
    configured prediction feature is used as the target. ``batch_size``,
    ``shuffle`` and ``epochs`` (as ``num_epochs``) plus any extra keyword
    arguments are forwarded to ``numpy_input_fn``.
    """
    columns: Dict[str, Any] = {name: [] for name in self.features}
    targets = []
    async for repo in sources.with_features(self.all_features):
        for name, value in repo.features(self.features).items():
            columns[name].append(np.array(value))
        targets.append(repo.feature(self.parent.config.predict))

    targets = np.array(targets)
    columns = {name: np.array(values) for name, values in columns.items()}

    self.logger.info("------ Repo Data ------")
    self.logger.info("x_cols: %d", len(list(columns.values())[0]))
    self.logger.info("y_cols: %d", len(targets))
    self.logger.info("-----------------------")

    return tensorflow.estimator.inputs.numpy_input_fn(
        columns,
        targets,
        batch_size=batch_size,
        shuffle=shuffle,
        num_epochs=epochs,
        **kwargs,
    )
async def sources_to_array(self, sources: Sources):
    """
    Convert the records in ``sources`` into per-feature arrays and a label
    array.

    Only records whose target, converted with ``config.clstype``, is a key
    of ``self.classifications`` are used; the label stored for a record is
    the value ``self.classifications`` maps that key to.

    Returns:
        ``(x_cols, y_cols)``: a dict of arrays keyed by feature name, and
        the array of mapped labels.

    Raises:
        ValueError: no record had a known classification.
    """
    config = self.parent.config
    label_name = config.predict.name

    # First pass: drain the source, keeping records with a known label.
    usable = []
    async for record in sources.with_features(self.features + [label_name]):
        if config.clstype(record.feature(label_name)) in self.classifications:
            usable.append(record)

    # Second pass: split the kept records into columns and labels.
    columns: Dict[str, Any] = {name: [] for name in self.features}
    labels = []
    for record in usable:
        for name, value in record.features(self.features).items():
            columns[name].append(self.np.array(value))
        key = config.clstype(record.feature(label_name))
        labels.append(self.classifications[key])

    if not labels:
        raise ValueError("No records to train on")

    labels = self.np.array(labels)
    columns = {name: self.np.array(values) for name, values in columns.items()}
    return columns, labels
async def train(self, sources: Sources):
    """
    Fit ``self.parent.clf`` on the feature values of the records in
    ``sources`` (no target is passed to ``fit``) and mark the context as
    trained.
    """
    rows = [
        list(record.features(self.features).values())
        async for record in sources.with_features(self.features)
    ]
    inputs = self.np.array(rows)
    self.logger.info("Number of input records: {}".format(len(inputs)))
    self.parent.clf.fit(inputs)
    self.is_trained = True
async def train(self, sources: Sources):
    """
    Append each repo's first-feature value to ``self.xData`` and its target
    value to ``self.yData``, then store the awaited result of
    ``self.best_fit_line()`` as ``self.regression_line``.
    """
    target_name = self.parent.config.predict.NAME
    async for repo in sources.with_features(self.features + [target_name]):
        values = repo.features(self.features + [target_name])
        # Only the first configured feature is used as the input variable.
        self.xData = np.append(self.xData, values[self.features[0]])
        self.yData = np.append(self.yData, values[target_name])
    self.regression_line = await self.best_fit_line()
async def train(self, sources: Sources):
    """
    Fit ``self.clf`` on the feature values of the records in ``sources``
    (no target is passed to ``fit``) and dump the fitted estimator to
    ``self._filepath`` with joblib.
    """
    rows = [
        list(record.features(self.features).values())
        async for record in sources.with_features(self.features)
    ]
    inputs = self.np.array(rows)
    self.logger.info("Number of input records: {}".format(len(inputs)))
    self.clf.fit(inputs)
    self.joblib.dump(self.clf, str(self._filepath))
async def accuracy(self, sources: Sources) -> Accuracy:
    """
    Score the trained model against the records in ``sources``.

    The records are loaded into a DataFrame, the target column is split
    off, and the remaining columns are either converted with
    ``df_to_vw_format`` or (when ``config.noconvert`` is set) stripped of
    ``config.extra_cols`` and flattened. Each resulting example is passed to
    ``self.clf.predict``. The score is ``r2_score`` for the "regression"
    task and ``accuracy_score`` for the "classification" task; for any other
    task ``self.confidence`` is left as it was.

    Returns:
        ``self.confidence``.

    Raises:
        ModelNotTrained: no file exists at ``self._filename()``.
    """
    if not os.path.isfile(self._filename()):
        raise ModelNotTrained("Train model before assessing for accuracy.")

    config = self.parent.config
    predict_name = config.predict.name
    importance = config.importance.name if config.importance else None
    tag = config.tag.name if config.tag else None
    base = config.base.name if config.base else None

    # Ask the source for every column read below (features, target and the
    # extra columns), not just the features, so the frame has no holes.
    wanted = self.features + [predict_name] + config.extra_cols
    data = []
    async for record in sources.with_features(wanted):
        data.append(record.features(wanted))

    df = pd.DataFrame(data)
    # `axis` is passed by keyword: pandas 2 no longer accepts it positionally.
    xdata = df.drop([predict_name], axis=1)
    self.logger.debug("Number of input records: {}".format(len(xdata)))

    if not config.noconvert:
        xdata = df_to_vw_format(
            xdata,
            vwcmd=config.vwcmd,
            target=None,
            namespace=config.namespace,
            importance=importance,
            tag=tag,
            base=base,
            task=config.task,
            use_binary_label=config.use_binary_label,
        )
    else:
        xdata = xdata.drop(config.extra_cols, axis=1).to_numpy().flatten()

    ydata = np.array(df[predict_name])
    shape = [len(xdata)]
    # TODO support probabilities
    # if 'oaa' in self.parent.config.vwcmd and 'probabilities' in self.parent.config.vwcmd:
    #     shape.append(self.parent.config.vwcmd['oaa'])
    y_pred = np.empty(shape)
    for idx, x in enumerate(xdata):
        y_pred[idx] = self.clf.predict(x)

    if config.task in ["regression"]:
        self.confidence = r2_score(ydata, y_pred)
    elif config.task in ["classification"]:
        self.confidence = accuracy_score(ydata, y_pred)
    self.logger.debug("Model Accuracy: {}".format(self.confidence))
    return self.confidence
async def train(self, sources: Sources):
    """
    Fit ``self.clf`` on the feature values of the repos in ``sources`` (no
    target is passed to ``fit``) and dump the fitted estimator to
    ``self._filename()`` with joblib.
    """
    feature_rows = [
        repo.features(self.features)
        async for repo in sources.with_features(self.features)
    ]
    # Going through a DataFrame turns the list of dicts into a 2-D array.
    inputs = np.array(pd.DataFrame(feature_rows))
    self.logger.info("Number of input repos: {}".format(len(inputs)))
    self.clf.fit(inputs)
    joblib.dump(self.clf, self._filename())
async def train(self, sources: Sources):
    """
    Fit ``self.clf`` on the feature values of the records in ``sources``
    (no target is passed to ``fit``) and dump the fitted estimator to
    ``self._filepath`` with joblib.
    """
    feature_rows = [
        record.features(self.features)
        async for record in sources.with_features(self.features)
    ]
    # Going through a DataFrame turns the list of dicts into a 2-D array.
    inputs = self.np.array(self.pd.DataFrame(feature_rows))
    self.logger.info("Number of input records: {}".format(len(inputs)))
    self.clf.fit(inputs)
    self.joblib.dump(self.clf, str(self._filepath))
async def train_data_generator(self, sources: Sources):
    """
    Turn the records in ``sources`` into model inputs and labels.

    Only records whose classification is a key of ``self.classifications``
    are used. Although values are gathered for every configured feature,
    only the first feature's column is returned as input. For the "bert"
    embed type that column is run through ``bert_tokenizer`` and returned
    as a dict with ``input_word_ids``, ``input_mask`` and ``segment_ids``;
    otherwise the column is returned unchanged.

    Returns:
        ``(x_cols, y_cols)``.

    Raises:
        ValueError: no record had a known classification.
    """
    feature_names = self.parent.features
    self.logger.debug("Training on features: %r", feature_names)

    # First pass: drain the source, keeping records with a known label.
    usable = [
        record
        async for record in sources.with_features(
            feature_names + [self.classification]
        )
        if record.feature(self.classification) in self.classifications
    ]

    # Second pass: split the kept records into columns and labels.
    columns: Dict[str, Any] = {name: [] for name in feature_names}
    labels = []
    for record in usable:
        for name, value in record.features(feature_names).items():
            columns[name].append(self.np.array(value))
        labels.append(self.classifications[record.feature(self.classification)])

    if not labels:
        raise ValueError("No records to train on")

    labels = self.np.array(labels)
    columns = {name: self.np.array(values) for name, values in columns.items()}

    self.logger.info("------ Record Data ------")
    self.logger.info("x_cols: %d", len(list(columns.values())[0]))
    self.logger.info("y_cols: %d", len(labels))
    self.logger.info("-----------------------")

    if len(self.parent.features) > 1:
        self.logger.critical(
            "Found more than one feature to train on. Only first feature will be used"
        )

    # TODO add more embedTypes
    # So far `bert` is the only embed type handled here that gets special
    # input preprocessing.
    first_column = columns[self.parent.features[0]]
    if self.parent.config.embedType in ["bert"]:
        tokenized = bert_tokenizer(
            first_column,
            self.parent.config.max_seq_length,
            self.parent._model.vocab_file.asset_path.numpy(),
            self.parent._model.do_lower_case.numpy(),
        )
        return (
            dict(
                input_word_ids=tokenized[0],
                input_mask=tokenized[1],
                segment_ids=tokenized[2],
            ),
            labels,
        )

    # Universal Sentence Encoder, Neural Network Language Model, Swivel
    # Embeddings: no preprocessing needed.
    return first_column, labels
async def train(self, sources: Sources):
    """
    Feed the records in ``sources`` to ``self.parent.clf.learn``.

    Records are loaded into a DataFrame. Unless ``config.noconvert`` is set
    the frame is converted with ``df_to_vw_format``; otherwise exactly one
    feature is allowed and each example is built as
    ``"<target> <feature>"``. The examples are then learned
    ``config.passes`` times and ``self.is_trained`` is set.

    Raises:
        InputError: ``noconvert`` is set and more than one feature is
            configured.
    """
    config = self.parent.config
    predict_name = config.predict.name

    importance = config.importance.name if config.importance else None
    tag = config.tag.name if config.tag else None
    base = config.base.name if config.base else None
    class_cost = (
        [feature.name for feature in config.class_cost]
        if config.class_cost
        else None
    )

    rows = [
        record.features(
            self.parent.features + [predict_name] + config.extra_cols
        )
        async for record in sources.with_features(
            self.parent.features + [predict_name] + config.extra_cols
        )
    ]
    vw_data = pd.DataFrame(rows)

    if config.noconvert:
        # Data is already in vw format: prefix each example with its target.
        if len(self.parent.features) > 1:
            raise InputError(
                "Training features should be in vw format or `noconvert` should be false."
            )
        vw_data = (
            vw_data[predict_name].map(str)
            + " "
            + vw_data[self.parent.features[0]].map(str)
        )
    else:
        vw_data = df_to_vw_format(
            vw_data,
            vwcmd=config.vwcmd,
            target=predict_name,
            namespace=config.namespace,
            importance=importance,
            tag=tag,
            base=base,
            task=config.task,
            use_binary_label=config.use_binary_label,
            class_cost=class_cost,
        )

    self.logger.info("Number of input records: {}".format(len(vw_data)))

    for n in range(config.passes):
        # NOTE(review): passes 0 and 1 both run on the unshuffled data;
        # `n > 0` may have been intended - confirm before changing.
        examples = shuffle(vw_data) if n > 1 else vw_data
        for example in examples:
            self.parent.clf.learn(example)
    self.is_trained = True
async def train(self, sources: Sources):
    """
    Fit ``self.clf`` on the repos in ``sources`` and dump it to
    ``self._filename()`` with joblib.

    The repos are loaded into a DataFrame; the column named by
    ``config.predict`` is the target and all remaining columns are the
    inputs.
    """
    predict_name = self.parent.config.predict
    # Filter on the target as well as the features: the target column is
    # read from every repo below.
    wanted = self.features + [predict_name]
    data = [
        repo.features(wanted) async for repo in sources.with_features(wanted)
    ]
    df = pd.DataFrame(data)
    # `axis` is passed by keyword: pandas 2 no longer accepts it positionally.
    xdata = np.array(df.drop([predict_name], axis=1))
    ydata = np.array(df[predict_name])
    self.logger.info("Number of input repos: {}".format(len(xdata)))
    self.clf.fit(xdata, ydata)
    joblib.dump(self.clf, self._filename())
async def score(self, mctx: ModelContext, sctx: Sources, features: Feature):
    """
    Compute the F1 score of the stored anomaly-detection parameters on the
    records in ``sctx``.

    The stored ``"anomalies"`` entry is unpacked as
    ``(epsilon, F1, mu, sigma2)``. Probabilities from
    ``multivariateGaussian`` below ``epsilon`` are treated as predicted
    anomalies and compared with the values of the feature ``features.name``
    via ``getF1``. The stored entry is rewritten with the new F1.

    Returns:
        The F1 value returned by ``getF1``.

    Raises:
        ModelNotTrained: ``mctx.is_trained`` is false.
    """
    # Load saved anomalies
    stored = mctx.storage.get("anomalies", None)
    # Ensure the model has been trained before we try to make a prediction
    if not mctx.is_trained:
        raise ModelNotTrained("Train model before assessing for accuracy.")
    epsilon, _previous_f1, mu, sigma2 = stored

    # Go through all records that have the features we trained on and the
    # feature we want to predict.
    inputs = []
    truths = []
    async for record in sctx.with_features(mctx.features + [features.name]):
        row = []
        for value in record.features(mctx.features).values():
            if np.isscalar(value):
                row.append(value)
            else:
                row.extend(value)
        inputs.append(row)
        truths.append(record.feature(features.name))
    mctx.logger.debug("Number of test records: %d", len(inputs))

    # One column per configured feature.
    feature_count = len(mctx.features)
    inputs = np.reshape(inputs, (len(inputs), feature_count))
    truths = np.reshape(truths, (len(truths), 1))

    mu = np.array(mu)
    sigma2 = np.array(sigma2)
    probabilities = multivariateGaussian(inputs, mu, sigma2)
    outlier_mask = probabilities < epsilon
    f1 = getF1(truths, outlier_mask.astype(int))
    # The outlier indices are computed as before; nothing reads them here.
    outlier_indices = findIndices(outlier_mask)

    # Update the accuracy
    mctx.storage["anomalies"] = epsilon, f1, mu.tolist(), sigma2.tolist()
    return f1
async def accuracy(self, sources: Sources) -> Accuracy:
    """
    Score ``self.clf`` on the repos in ``sources``.

    The repos are loaded into a DataFrame; the column named by
    ``config.predict`` is the target and all remaining columns are the
    inputs. The result of ``self.clf.score`` is stored as
    ``self.confidence``.

    Returns:
        ``self.confidence``.
    """
    predict_name = self.parent.config.predict
    # Filter on the target as well as the features: the target column is
    # read from every repo below.
    wanted = self.features + [predict_name]
    data = [
        repo.features(wanted) async for repo in sources.with_features(wanted)
    ]
    df = pd.DataFrame(data)
    # `axis` is passed by keyword: pandas 2 no longer accepts it positionally.
    xdata = np.array(df.drop([predict_name], axis=1))
    ydata = np.array(df[predict_name])
    self.logger.debug("Number of input repos: {}".format(len(xdata)))
    self.confidence = self.clf.score(xdata, ydata)
    self.logger.debug("Model Accuracy: {}".format(self.confidence))
    return self.confidence
async def _preprocess_data(self, sources: Sources):
    """
    Return one ``(sentence, {"entities": entities})`` tuple for every
    record in ``sources`` that has the "sentence" and "entities" features.
    """
    wanted = ["sentence", "entities"]
    return [
        (record.feature("sentence"), {"entities": record.feature("entities")})
        async for record in sources.with_features(wanted)
    ]
async def dataset_generator(self, sources: Sources):
    """
    Read the records in ``sources`` and wrap them in a ``NumpyToTensor``
    dataset.

    Values are gathered for every configured feature, but only the first
    feature's column is passed on. Targets are mapped through
    ``self.classifications`` when it is non-empty and used raw otherwise.

    Returns:
        ``(dataset, len(dataset))``.

    Raises:
        ValueError: the source yielded no records.
    """
    self.logger.debug("Training on features: %r", self.features)
    target_name = self.parent.config.predict.name

    columns: Dict[str, Any] = {name: [] for name in self.features}
    targets = []
    async for record in sources.with_features(self.features + [target_name]):
        for name, value in record.features(self.features).items():
            columns[name].append(np.array(value))
        if self.classifications:
            targets.append(self.classifications[record.feature(target_name)])
        else:
            targets.append(record.feature(target_name))

    if len(self.features) > 1:
        self.logger.critical(
            "Found more than one feature to train on. Only first feature will be used"
        )
    if not targets:
        raise ValueError("No records to train on")

    targets = np.array(targets)
    columns = {name: np.array(values) for name, values in columns.items()}

    self.logger.info("------ Record Data ------")
    self.logger.info("x_cols: %d", len(list(columns.values())[0]))
    self.logger.info("y_cols: %d", len(targets))
    self.logger.info("-----------------------")

    # Hand x and y to NumpyToTensor together with the configured image size
    # and normalisation mean/std.
    dataset = NumpyToTensor(
        columns[self.features[0]],
        targets,
        size=self.parent.config.imageSize,
        norm_mean=self.parent.config.normalize_mean,
        norm_std=self.parent.config.normalize_std,
    )
    return dataset, len(dataset)
async def train(self, sources: Sources):
    """
    Fit ``self.clf`` on the records in ``sources`` and dump it to
    ``self._filepath`` with joblib.

    The records are loaded into a DataFrame; the column named by
    ``config.predict.NAME`` is the target and all remaining columns are the
    inputs.
    """
    predict_name = self.parent.config.predict.NAME
    wanted = self.features + [predict_name]
    data = [
        record.features(wanted)
        async for record in sources.with_features(wanted)
    ]
    df = self.pd.DataFrame(data)
    # `axis` is passed by keyword: pandas 2 no longer accepts it positionally.
    xdata = self.np.array(df.drop([predict_name], axis=1))
    ydata = self.np.array(df[predict_name])
    self.logger.info("Number of input records: {}".format(len(xdata)))
    self.clf.fit(xdata, ydata)
    self.joblib.dump(self.clf, str(self._filepath))
async def train(self, sources: Sources):
    """
    Load the repos in ``sources`` into ``self.xData`` / ``self.yData`` and
    await ``self.best_fit_line()``.

    Target values are appended to the existing ``self.yData``;
    ``self.xData`` is replaced by a float array with one column per
    configured feature.
    """
    target_name = self.parent.config.predict.NAME
    rows = []
    async for repo in sources.with_features(self.features + [target_name]):
        values = repo.features(self.features + [target_name])
        # Keep the inputs in the order the features are configured.
        rows.append([values[name] for name in self.features])
        self.yData = np.append(self.yData, values[target_name])
    self.xData = np.asarray(rows, dtype=float).reshape(-1, len(self.features))
    await self.best_fit_line()
async def sources_to_array(self, sources: Sources):
    """
    Convert the records in ``sources`` into per-feature arrays and an array
    of raw target values.

    Returns:
        ``(x_cols, y_cols)``: a dict of arrays keyed by feature name, and
        the array of target values.
    """
    columns: Dict[str, Any] = {name: [] for name in self.features}
    targets = []
    async for record in sources.with_features(self.all_features):
        for name, value in record.features(self.features).items():
            columns[name].append(self.np.array(value))
        targets.append(record.feature(self.parent.config.predict.name))

    targets = self.np.array(targets)
    columns = {name: self.np.array(values) for name, values in columns.items()}
    return columns, targets
async def accuracy(self, sources: Sources) -> Accuracy:
    """
    Score ``self.clf`` on the records in ``sources``.

    The records are loaded into a DataFrame; the column named by
    ``config.predict.name`` is the target and all remaining columns are the
    inputs. The result of ``self.clf.score`` is stored as
    ``self.confidence``.

    Returns:
        ``self.confidence``.

    Raises:
        ModelNotTrained: ``self._filepath`` is not an existing file.
    """
    if not self._filepath.is_file():
        raise ModelNotTrained("Train model before assessing for accuracy.")

    predict_name = self.parent.config.predict.name
    # Filter on the target as well as the features: the target column is
    # read from every record below.
    wanted = self.features + [predict_name]
    data = [
        record.features(wanted)
        async for record in sources.with_features(wanted)
    ]
    df = self.pd.DataFrame(data)
    # `axis` is passed by keyword: pandas 2 no longer accepts it positionally.
    xdata = self.np.array(df.drop([predict_name], axis=1))
    ydata = self.np.array(df[predict_name])
    self.logger.debug("Number of input records: {}".format(len(xdata)))
    self.confidence = self.clf.score(xdata, ydata)
    self.logger.debug("Model Accuracy: {}".format(self.confidence))
    return self.confidence
async def accuracy(self, sources: Sources) -> Accuracy:
    """
    Score the clustering estimator ``self.clf`` on the records in
    ``sources``.

    When the estimator is a clusterer and ``config.tcluster`` is set, that
    feature is used as ground truth and the score is ``mutual_info_score``;
    otherwise the score is ``silhouette_score`` over the inputs. Cluster
    labels come from ``self.clf.predict`` when the estimator has it, and
    from ``self.clf.labels_`` otherwise (in which case the data passed in
    must be the training data).

    Returns:
        ``self.confidence``.

    Raises:
        ModelNotTrained: ``self._filepath`` is not an existing file.
    """
    if not self._filepath.is_file():
        raise ModelNotTrained("Train model before assessing for accuracy.")

    target = []
    if (
        self.clf._estimator_type == "clusterer"
        and self.parent.config.tcluster is not None
    ):
        target = [self.parent.config.tcluster.name]

    # Filter on the ground-truth column too, so every input row has a
    # matching truth value and the two arrays stay aligned.
    xdata = []
    ydata = []
    async for record in sources.with_features(self.features + target):
        xdata.append(list(record.features(self.features).values()))
        ydata.append(list(record.features(target).values()))
    xdata = self.np.array(xdata)
    self.logger.debug("Number of input records: {}".format(len(xdata)))

    if target:
        ydata = self.np.array(ydata).flatten()

    if hasattr(self.clf, "predict"):
        # Inductive clusterer: xdata can be training data or unseen data.
        cluster_labels = self.clf.predict(xdata)
    else:
        # Transductive clusterer: labels exist only for the training data.
        self.logger.critical(
            "Accuracy found transductive clusterer, ensure data being passed is training data"
        )
        cluster_labels = self.clf.labels_

    if target:
        self.confidence = mutual_info_score(ydata, cluster_labels)
    else:
        self.confidence = silhouette_score(xdata, cluster_labels)
    self.logger.debug("Model Accuracy: {}".format(self.confidence))
    return self.confidence
async def train(self, sources: Sources):
    """
    Fit ``self.clf`` on the records in ``sources`` and dump it to
    ``self._filepath`` with joblib.

    Each record becomes one input row in which scalar feature values are
    kept as single entries and non-scalar values are spliced in; the value
    of ``config.predict.name`` is the record's target.
    """
    target_name = self.parent.config.predict.name
    inputs = []
    targets = []
    async for record in sources.with_features(self.features + [target_name]):
        row = []
        for value in record.features(self.features).values():
            if self.np.isscalar(value):
                row.append(value)
            else:
                row.extend(value)
        inputs.append(row)
        targets.append(record.feature(target_name))

    inputs = self.np.array(inputs)
    targets = self.np.array(targets)
    self.logger.info("Number of input records: {}".format(len(inputs)))
    self.clf.fit(inputs, targets)
    self.joblib.dump(self.clf, str(self._filepath))
async def training_input_fn(
    self,
    sources: Sources,
    batch_size=20,
    shuffle=False,
    epochs=1,
    **kwargs,
):
    """
    Build a numpy input function for training from the repos in
    ``sources``.

    Only repos whose target value is a key of ``self.classifications`` are
    used; targets are mapped through ``self.classifications``.
    ``batch_size``, ``shuffle`` and ``epochs`` (as ``num_epochs``) plus any
    extra keyword arguments are forwarded to ``numpy_input_fn``.

    Raises:
        ValueError: no repo had a known classification.
    """
    self.logger.debug("Training on features: %r", self.features)
    label_name = self.parent.config.predict.NAME

    # First pass: drain the source, keeping only repos with a known label.
    usable = []
    async for repo in sources.with_features(self.features + [label_name]):
        if repo.feature(label_name) in self.classifications:
            usable.append(repo)

    # Second pass: split the kept repos into per-feature columns and labels.
    columns: Dict[str, Any] = {name: [] for name in self.features}
    labels = []
    for repo in usable:
        for name, value in repo.features(self.features).items():
            columns[name].append(np.array(value))
        labels.append(self.classifications[repo.feature(label_name)])

    if not labels:
        raise ValueError("No repos to train on")

    labels = np.array(labels)
    columns = {name: np.array(values) for name, values in columns.items()}

    self.logger.info("------ Repo Data ------")
    self.logger.info("x_cols: %d", len(list(columns.values())[0]))
    self.logger.info("y_cols: %d", len(labels))
    self.logger.info("-----------------------")

    return tensorflow.estimator.inputs.numpy_input_fn(
        columns,
        labels,
        batch_size=batch_size,
        shuffle=shuffle,
        num_epochs=epochs,
        **kwargs,
    )
async def train(self, sources: Sources):
    """
    Feed the records in ``sources`` to ``self.clf.learn`` and save the
    model.

    Records are loaded into a DataFrame which, when
    ``config.convert_to_vw`` is set, is converted with ``df_to_vw_format``.
    The examples are then learned ``config.passes`` times before
    ``self._save_model()`` is called.
    """
    config = self.parent.config
    predict_name = config.predict.name

    importance = config.importance.name if config.importance else None
    tag = config.tag.name if config.tag else None
    base = config.base.name if config.base else None
    class_cost = (
        [feature.name for feature in config.class_cost]
        if config.class_cost
        else None
    )

    rows = [
        record.features(self.features + [predict_name] + config.extra_cols)
        async for record in sources.with_features(
            self.features + [predict_name] + config.extra_cols
        )
    ]
    vw_data = pd.DataFrame(rows)

    # NOTE(review): when convert_to_vw is false vw_data stays a DataFrame,
    # and iterating a DataFrame yields its column labels, not its rows -
    # confirm that is what the loop below is meant to learn from.
    if config.convert_to_vw:
        vw_data = df_to_vw_format(
            vw_data,
            vwcmd=config.vwcmd,
            target=predict_name,
            namespace=config.namespace,
            importance=importance,
            tag=tag,
            base=base,
            task=config.task,
            use_binary_label=config.use_binary_label,
            class_cost=class_cost,
        )

    self.logger.info("Number of input records: {}".format(len(vw_data)))

    for n in range(config.passes):
        # NOTE(review): passes 0 and 1 both run on the unshuffled data;
        # `n > 0` may have been intended - confirm before changing.
        examples = shuffle(vw_data) if n > 1 else vw_data
        for example in examples:
            self.clf.learn(example)
    self._save_model()
async def accuracy(self, sources: Sources) -> Accuracy:
    """
    Score the clustering estimator ``self.clf`` on the repos in
    ``sources``.

    When the estimator is a clusterer and ``config.tcluster`` is set, that
    feature is used as ground truth and the score is ``mutual_info_score``;
    otherwise the score is ``silhouette_score`` over the inputs. Cluster
    labels come from ``self.clf.predict`` when the estimator has it, and
    from ``self.clf.labels_`` otherwise (in which case the data passed in
    must be the training data).

    Returns:
        ``self.confidence``.

    Raises:
        ModelNotTrained: no file exists at ``self._filename()``.
    """
    if not os.path.isfile(self._filename()):
        raise ModelNotTrained("Train model before assessing for accuracy.")

    target = []
    # Compare by value: `is` against a string literal tests object identity
    # and is not guaranteed to hold for equal strings.
    if (
        self.clf._estimator_type == "clusterer"
        and self.parent.config.tcluster is not None
    ):
        target = [self.parent.config.tcluster.NAME]

    # Filter on the ground-truth column too, since it is read from every
    # repo below.
    wanted = self.features + target
    data = [
        repo.features(wanted) async for repo in sources.with_features(wanted)
    ]
    df = pd.DataFrame(data)
    xdata = np.array(df.drop(target, axis=1))
    self.logger.debug("Number of input repos: {}".format(len(xdata)))

    if target:
        ydata = np.array(df[target]).flatten()

    if hasattr(self.clf, "predict"):
        # Inductive clusterer: xdata can be training data or unseen data.
        cluster_labels = self.clf.predict(xdata)
    else:
        # Transductive clusterer: labels exist only for the training data.
        self.logger.critical(
            "Accuracy found transductive clusterer, ensure data being passed is training data"
        )
        cluster_labels = self.clf.labels_

    if target:
        self.confidence = mutual_info_score(ydata, cluster_labels)
    else:
        self.confidence = silhouette_score(xdata, cluster_labels)
    self.logger.debug("Model Accuracy: {}".format(self.confidence))
    return self.confidence
async def accuracy(self, sources: Sources) -> Accuracy:
    """
    Score ``self.clf`` on the records in ``sources``.

    Each record becomes one input row in which scalar feature values are
    kept as single entries and non-scalar values are spliced in; the value
    of ``config.predict.name`` is the record's target. The result of
    ``self.clf.score`` is stored as ``self.confidence``.

    Returns:
        ``self.confidence``.

    Raises:
        ModelNotTrained: ``self._filepath`` is not an existing file.
    """
    if not self._filepath.is_file():
        raise ModelNotTrained("Train model before assessing for accuracy.")

    target_name = self.parent.config.predict.name
    inputs = []
    targets = []
    async for record in sources.with_features(self.features + [target_name]):
        row = []
        for value in record.features(self.features).values():
            if self.np.isscalar(value):
                row.append(value)
            else:
                row.extend(value)
        inputs.append(row)
        targets.append(record.feature(target_name))

    inputs = self.np.array(inputs)
    targets = self.np.array(targets)
    self.logger.debug("Number of input records: {}".format(len(inputs)))
    self.confidence = self.clf.score(inputs, targets)
    self.logger.debug("Model Accuracy: {}".format(self.confidence))
    return self.confidence
async def _preprocess_data(self, sources: Sources):
    """
    Load the records in ``sources`` into a DataFrame with the sentence-id
    column, the words column and the target column.

    Only records whose target value is in ``config.ner_tags`` are used.

    Returns:
        A DataFrame with one row per kept record.

    Raises:
        ValueError: no record had a target in ``config.ner_tags``.
    """
    config = self.parent.config
    input_names = [config.sid.name, config.words.name]
    target_name = config.predict.name

    # First pass: drain the source, keeping only records with a known tag.
    usable = [
        record
        async for record in sources.with_features(
            [config.sid.name, config.words.name, target_name]
        )
        if record.feature(target_name) in config.ner_tags
    ]

    # Second pass: split the kept records into columns and targets.
    columns: Dict[str, Any] = {name: [] for name in input_names}
    targets = []
    for record in usable:
        for name, value in record.features(input_names).items():
            columns[name].append(self.np.array(value))
        targets.append(record.feature(target_name))

    if not targets:
        raise ValueError("No records to train on")

    targets = self.np.array(targets)
    columns = {name: self.np.array(values) for name, values in columns.items()}

    self.logger.info("------ Record Data ------")
    self.logger.info("x_cols: %d", len(list(columns.values())[0]))
    self.logger.info("y_cols: %d", len(targets))
    self.logger.info("-----------------------")

    frame = self.pd.DataFrame.from_dict(columns)
    frame[target_name] = targets
    return frame