def generate(self, dataset, dataset_name):
    """
    Featurize `dataset` and serialize the result under `dataset_name`.

    If a feature set already exists at the location computed by
    `feature_set_location` for this `dataset_name` and featurizer class
    name, a message is printed and nothing is recomputed or rewritten.

    Featurization is delegated to :func:`_features_from_text`. If a given
    featurizer exposes a :func:`featurize_batch` method, that method will be
    called to perform featurization. Otherwise, :class:`Featurizer`'s will
    fall back to calling :func:`featurize` on each individual example.

    Two input formats are accepted:

    * a `list` of records, where item ``0`` of each record is the text and
      item ``1`` is the target; a new `pd.DataFrame` with `Text`, `Targets`
      and `Features` columns is built from it.
    * a `pd.DataFrame` that contains a `Text` column; a copy of it gains a
      `Features` column, the caller's frame is left unmodified.

    :param dataset: `list` of (text, target) records or `pd.DataFrame`
        object that must contain a `Text` column.
    :param dataset_name: `str` name to use as a save location in the
        `config.FEATURES_DIRECTORY`.
    :raises ValueError: if `dataset` is neither a `list` nor a
        `pd.DataFrame`.
    """
    if os.path.exists(feature_set_location(dataset_name, self.__class__.__name__)):
        print("Skipping, already have this feature combination.")
        return

    # isinstance (rather than an exact type comparison) so that subclasses of
    # list / pd.DataFrame are accepted as well.
    if isinstance(dataset, list):
        text = [d[0] for d in dataset]
        features = self._features_from_text(text)
        new_dataset = pd.DataFrame(data={
            "Text": text,
            "Targets": [d[1] for d in dataset],
            "Features": features,
        })
    elif isinstance(dataset, pd.DataFrame):
        features = self._features_from_text(dataset["Text"])
        # Work on a copy: the caller's dataframe must not be modified.
        new_dataset = dataset.copy()
        new_dataset['Features'] = features
    else:
        raise ValueError("Unrecognised data format!!")

    self._write(new_dataset, dataset_name)
def generate(self, dataset, dataset_name):
    """
    Build a feature set from a `dataset` dict and serialize it under
    `dataset_name`.

    If a feature set already exists at the location computed by
    `feature_set_location` for this `dataset_name` and featurizer class
    name, a message is printed and nothing is recomputed or rewritten.

    No featurizer is invoked here: each entry of the `Features` column is
    simply the `(text, context)` pair of the corresponding example.

    :param dataset: `dict` with the keys `'text'`, `'context'` and
        `'labels'`. `'text'` and `'context'` are iterated in lockstep to
        build the `Features` column; `'text'` and `'labels'` become the
        `Text` and `Targets` columns respectively.
    :param dataset_name: `str` name used to build the save location of the
        serialized feature set.
    :raises ValueError: if `dataset` is not a `dict`.
    :raises KeyError: if one of the required keys is missing.
    """
    if os.path.exists(feature_set_location(dataset_name, self.__class__.__name__)):
        print("Skipping, already have this feature combination.")
        return

    # isinstance (rather than an exact type comparison) so that dict
    # subclasses such as OrderedDict are accepted as well.
    if not isinstance(dataset, dict):
        raise ValueError("dataset must be a dict")

    text = dataset['text']
    context = dataset['context']
    labels = dataset['labels']

    new_dataset = pd.DataFrame.from_dict({
        'Text': text,
        'Features': list(zip(text, context)),
        'Targets': labels,
    })
    self._write(new_dataset, dataset_name)
def _write(self, featurized_dataset, dataset_name):
    """
    Persist a featurized dataset to the filesystem.

    The destination is computed by `feature_set_location` from
    `dataset_name` and the name of this featurizer's class; the object is
    serialized there with `joblib.dump`.

    :param featurized_dataset: the object to serialize.
    :param dataset_name: `str` name used to build the save location.
    """
    featurizer_name = self.__class__.__name__
    target_path = feature_set_location(dataset_name, featurizer_name)
    joblib.dump(featurized_dataset, target_path)
def _load_dataset(dataset_name, featurizer_name):
    """
    Load a previously serialized feature set.

    The file location is computed by `feature_set_location` from
    `dataset_name` and `featurizer_name`, logged at INFO level, and then
    read back with `joblib.load`.

    :param dataset_name: `str` name of the dataset the features belong to.
    :param featurizer_name: `str` name of the featurizer that produced them.
    :return: whatever object `joblib.load` deserializes from that location.
    """
    read_location = feature_set_location(dataset_name, featurizer_name)
    # Pass the argument lazily so the message is only formatted when the
    # record is actually emitted.
    logging.info("Loading Dataset: %s", read_location)
    # NOTE(review): joblib.load unpickles the file, so it must only ever be
    # pointed at feature sets written by trusted code.
    return joblib.load(read_location)