def _train_model(self, model: Sequential = None, train_dataset: Dataset = None,
                 validation_dataset: Dataset = None, n_epochs: int = None):
    model = model or self.model
    train_dataset = train_dataset or self.train_dataset
    validation_dataset = validation_dataset or self.validation_dataset
    n_epochs = n_epochs or self.config.NUM_EPOCHS
    validate_variables(model, train_dataset, validation_dataset, n_epochs)

    if isinstance(model, TFDistilBertForSequenceClassification):
        model.fit(train_dataset.batch(self.config.BATCH_SIZE),
                  validation_data=validation_dataset.batch(self.config.BATCH_SIZE),
                  epochs=n_epochs)
    else:
        model.fit(train_dataset, validation_data=validation_dataset, epochs=n_epochs)

    self.model = model
    self.logger.info("Successfully trained tf model")
    return model
def get_modeling_pipeline(self, img_size: Tuple[int, int] = None, learning_rate: float = None,
                          metrics: List[str] = None, n_epochs: int = None,
                          train_dataset: Dataset = None, validation_dataset: Dataset = None):
    img_size = img_size or self.img_size
    learning_rate = learning_rate or self.config.LEARNING_RATE
    metrics = metrics or self.config.METRICS
    n_epochs = n_epochs or self.config.NUM_EPOCHS
    train_dataset = train_dataset or self.train_dataset
    validation_dataset = validation_dataset or self.validation_dataset

    processing_pipeline = self.get_processing_pipeline()
    model = self.get_model()
    validate_variables(img_size, learning_rate, metrics, n_epochs, processing_pipeline, model)

    modeling_pipeline = tf.keras.Sequential([
        tf.keras.Input(shape=img_size + (3,)),
        processing_pipeline,
        model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(len(self.config.DISEASES), activation="softmax")
    ])

    # TODO: Add weighted Adam
    # _compile_model returns the compiled modeling pipeline
    model = self._compile_model(modeling_pipeline, learning_rate, metrics)
    modeling_pipeline = self._train_model(model, train_dataset, validation_dataset, n_epochs)

    self.modeling_pipeline = modeling_pipeline
    self.logger.info("Successfully loaded modeling pipeline")
    return modeling_pipeline
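# Minimal sketch of the transfer-learning head assembled above, using
# EfficientNetB0 with random weights so the example stays lightweight; the
# 224x224 input size and the 6-class output are illustrative assumptions.
import tensorflow as tf

base = tf.keras.applications.EfficientNetB0(include_top=False, weights=None)
base.trainable = False

head = tf.keras.Sequential([
    tf.keras.Input(shape=(224, 224, 3)),
    base,
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(6, activation="softmax"),
])
head.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
             loss="sparse_categorical_crossentropy",
             metrics=["accuracy"])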
def get_best_modeling_pipeline_type(self, transformer_modeling_pipeline: TransformersModelingPipeline = None,
                                    sklearn_modeling_pipeline: Pipeline = None,
                                    x_test: DataFrame = None, y_test: Series = None,
                                    test_dataset: Dataset = None)\
        -> Union[Pipeline, TransformersModelingPipeline]:
    x_test, y_test = self._set_dfs_test(x_test, y_test)
    test_dataset = test_dataset or self.test_dataset
    # Validate the candidate pipelines as well, since both are dereferenced below
    validate_variables(transformer_modeling_pipeline, sklearn_modeling_pipeline,
                       x_test, y_test, test_dataset)

    # TODO: Fix this to use specified metric
    transformer_results_metric = transformer_modeling_pipeline.evaluate(
        test_dataset, batch_size=self.config.BATCH_SIZE)[1]
    sklearn_predictions = sklearn_modeling_pipeline.predict(x_test)
    sklearn_results_metric = self.scoring_function(y_test, sklearn_predictions)

    if transformer_results_metric > sklearn_results_metric:
        modeling_pipeline = transformer_modeling_pipeline
    else:
        modeling_pipeline = sklearn_modeling_pipeline

    self.modeling_pipeline = modeling_pipeline
    self.logger.info("Successfully found best modeling pipeline type")
    return modeling_pipeline
def _load_dataset(self, batch_size: int = None, data_path: Path = None) -> Tuple[Dataset, Dataset]:
    """
    Utility function for loading a tensorflow dataset from the provided directory.
    The function splits the data into train and validation datasets
    :param batch_size: A batch size for the datasets
    :param data_path: Path to data directory. Files should be organized for tensorflow's
    text_dataset_from_directory
    :return: Returns a tuple with train and validation datasets
    """
    batch_size = batch_size or self.config.BATCH_SIZE
    data_path = data_path or self.config.DATA_PATH
    validate_variables(batch_size, data_path)

    train_dataset = tf.keras.preprocessing.text_dataset_from_directory(
        directory=data_path, validation_split=self.config.TEST_SIZE, batch_size=batch_size,
        subset="training", seed=self.config.SEED, shuffle=True)
    validation_dataset = tf.keras.preprocessing.text_dataset_from_directory(
        directory=data_path, validation_split=self.config.TEST_SIZE, batch_size=batch_size,
        subset="validation", seed=self.config.SEED, shuffle=True)

    self.train_dataset = train_dataset  # was mistakenly assigned validation_dataset
    self.validation_dataset = validation_dataset
    self.logger.info("Successfully loaded train and validation datasets")
    return train_dataset, validation_dataset
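# Expected on-disk layout for text_dataset_from_directory: one sub-directory
# per class label, each holding plain-text files (names illustrative):
#
# data/
#     psoriasis/
#         sample_001.txt
#         sample_002.txt
#     lichen_planus/
#         sample_001.txt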
def _split_train_test_structured(self, x: DataFrame = None, y: Series = None,
                                 test_size: float = 0.2, random_state: int = 42)\
        -> Tuple[DataFrame, DataFrame, Series, Series]:
    if x is None:
        x = self.x
    if y is None:
        y = self.y
    test_size = test_size or self.config.TEST_SIZE
    random_state = random_state or self.config.SEED
    validate_variables(x, y, test_size, random_state)

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state)

    self.x_train = x_train
    self.x_test = x_test
    self.y_train = y_train
    self.y_test = y_test
    self.logger.info("Successfully split train and test data")
    return x_train, x_test, y_train, y_test
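# Minimal illustration of the `or`-default pattern used throughout these
# methods: falsy arguments such as 0 or 0.0 are silently replaced by the
# fallback, so an explicit test_size of 0.0 would be discarded.
value = 0.0
value = value or 0.2
assert value == 0.2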
def _get_avg_img_size(self, path: Path = None) -> Tuple[int, int]:
    """
    Utility function to get the average image size from your data, necessary to choose
    the proper EfficientNet version
    :param path: Path to data files
    :return: Returns a tuple with mean image size from provided image data
    """
    path = path or self.config.DATA_PATH
    validate_variables(path)

    height_list = []
    width_list = []
    for subclass_dir in path.iterdir():
        for img_path in subclass_dir.iterdir():
            img = cv2.imread(str(img_path))
            height, width, _ = img.shape
            height_list.append(height)
            width_list.append(width)

    mean_height = int(sum(height_list) / len(height_list))
    mean_width = int(sum(width_list) / len(width_list))

    self.img_size = (mean_height, mean_width)
    self.logger.info(f"Mean height is: {mean_height}, mean width is: {mean_width}")
    return self.img_size
def _get_efficientnet_and_size(
        self, img_size: Tuple[int, int] = None) -> Tuple[Tuple[int, int], Sequential]:
    """
    Utility function to get the proper type of EfficientNet; the version is chosen
    based on the mean image size
    More on: https://keras.io/examples/vision/image_classification_efficientnet_fine_tuning/
    :param img_size: Mean input size of image files
    :return: Returns a tuple of image_size after changing to the right static value
    and the EfficientNet object
    """
    img_size = img_size or self.img_size
    validate_variables(img_size)

    mean_img_size = (img_size[0] + img_size[1]) / 2
    if mean_img_size > 564:
        img_size = (600, 600)
        model = tf.keras.applications.EfficientNetB7
    elif 492 < mean_img_size <= 564:
        img_size = (528, 528)
        model = tf.keras.applications.EfficientNetB6
    else:
        img_size = (456, 456)
        model = tf.keras.applications.EfficientNetB5

    self.img_size = img_size
    self.model = model
    self.logger.info(f"Chosen model is {model} with img_size {img_size}")
    return img_size, model
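# Worked example of the size-to-variant mapping above: a mean image size of
# (500, 520) averages to 510.0, which falls in the (492, 564] bucket, so
# EfficientNetB6 with a static 528x528 input would be selected.
mean = (500 + 520) / 2
assert 492 < mean <= 564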
def _make_sklearn_prediction(self, data: dict) -> Tuple[np.ndarray, str]:
    """
    Utility function for making predictions with sklearn
    :param data: Data to make predictions on
    :return: Returns a tuple of class probabilities and str with class prediction
    """
    # Pass path by keyword: load_pipeline's first parameter is backend
    modeling_pipeline = self.modeling_pipeline or self.load_pipeline(path=self.path)
    validate_variables(modeling_pipeline, data)

    prediction = modeling_pipeline.predict_proba(data)[0]
    max_pred_idx = np.argmax(prediction)
    prediction_proba = prediction[max_pred_idx]
    try:
        map_ = self.config.LABEL_MAPPING
    except AttributeError:
        map_ = self.config.DISEASES
    prediction_string = map_[max_pred_idx]

    self.logger.info(f"Made predictions with model version: {dermclass_models_version} "
                     f"Inputs: {data} "
                     f"Prediction: {prediction_string} "
                     f"Probability: {prediction_proba}")
    return prediction_proba, prediction_string
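# Toy illustration of the argmax decoding used above; the probabilities and
# disease labels are invented for the example.
import numpy as np

probs = np.array([0.05, 0.80, 0.15])
labels = ["psoriasis", "lichen planus", "pityriasis rosea"]
idx = int(np.argmax(probs))
print(labels[idx], probs[idx])  # lichen planus 0.8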
def _hyper_param_optimization(trial: Trial, model_name: str, trial_func: Callable, max_overfit: float,
                              cv: int, x_train: DataFrame, x_test: DataFrame,
                              y_train: Series, y_test: Series):
    validate_variables(trial, model_name, trial_func, x_train, x_test, y_train, y_test)

    # Resolve the model class from its name; assumes the class is importable in this module
    model_obj = eval(model_name)
    cv_score = np.mean(cross_val_score(model_obj(**trial_func(trial)), x_train, y_train,
                                       scoring="accuracy", cv=cv))

    model = model_obj(**trial_func(trial))
    model.fit(x_train, y_train)
    train_score = accuracy_score(y_train, model.predict(x_train))
    test_score = accuracy_score(y_test, model.predict(x_test))

    # Penalize overfit trials: return 0 when the train/test gap exceeds the threshold
    if abs(train_score - test_score) > max_overfit:
        output = 0
    else:
        output = cv_score
    return output
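# Sketch of wiring _hyper_param_optimization into an Optuna study on a toy
# dataset. The RandomForestClassifier model name, search space, and thresholds
# are illustrative assumptions; eval(model_name) additionally requires the
# class to be importable in the module where the objective actually runs.
import optuna
from functools import partial
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

def rf_trial_params(trial):
    return {"max_depth": trial.suggest_int("max_depth", 2, 8),
            "n_estimators": trial.suggest_int("n_estimators", 50, 200)}

x, y = load_iris(return_X_y=True, as_frame=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

objective = partial(_hyper_param_optimization, model_name="RandomForestClassifier",
                    trial_func=rf_trial_params, max_overfit=0.1, cv=5,
                    x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test)
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)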
def __init__(self, config: StructuredConfig = StructuredConfig):
    """
    A class for preprocessing structured data
    :param config: Config object for the class
    """
    validate_variables(config)
    super().__init__(config)
def _make_tf_prediction(self, data: np.array, diseases: List[str] = None) -> Tuple[np.array, str]:
    """
    Utility function for making predictions with tensorflow
    :param data: Data to make predictions on in numpy ndarray format
    :param diseases: A list of disease names in proper order
    :return: Returns a tuple of class probabilities and str with class prediction
    """
    modeling_pipeline = self.modeling_pipeline or self.load_pipeline(path=self.path)
    diseases = diseases or self.config.DISEASES
    validate_variables(modeling_pipeline, data, diseases)

    prediction = modeling_pipeline.predict(data)[0]
    max_pred_idx = np.argmax(prediction)
    prediction_proba = prediction[max_pred_idx]
    try:
        map_ = self.config.LABEL_MAPPING
    except AttributeError:
        map_ = self.config.DISEASES
    prediction_string = map_[max_pred_idx]

    self.logger.info(f"Made predictions with model version: {dermclass_models_version} "
                     f"Inputs: {data} "
                     f"Prediction: {prediction_string} "
                     f"Probability: {prediction_proba}")
    return prediction_proba, prediction_string
def save_pipeline(self, pipeline_object: Union[TransformersModelingPipeline, SklearnPipeline, Sequential],
                  backend: str = None, path: Path = None):
    """
    A function for saving a pipeline using the provided backend to the given path
    :param pipeline_object: A pipeline object to save
    :param backend: Type of backend used for saving given pipeline, has to be one of ["joblib", "tf", "tfm"]
    :param path: Path to save file or directory
    """
    if backend not in ["joblib", "tf", "tfm"]:
        raise ValidationError("Please choose proper backend from ['joblib', 'tf', 'tfm']")
    path = path or self.config.PICKLE_DIR / f"{self.config.PIPELINE_TYPE}_{self.pipeline_version}"
    validate_variables(pipeline_object, backend, path)

    self.remove_old_pipelines()
    if backend == "joblib":
        joblib.dump(pipeline_object, str(path) + ".joblib")
    elif backend == "tf":
        pipeline_object.save(path)
    elif backend == "tfm":
        pipeline_object.processing_pipeline.tokenizer.save_pretrained(path)
        pipeline_object.model.save_pretrained(path)

    self.logger.info(f"Saved pipeline {str(pipeline_object)}, to path {path}")
def load_pipeline(self, backend: str = None, path: Path = None)\
        -> Union[TransformersModelingPipeline, SklearnPipeline, Sequential]:
    """
    Function for loading a pipeline from the given path using the provided backend.
    Can be used either with set params or params from the config
    :param backend: Type of backend used for loading given pipeline, has to be one of ["joblib", "tf", "tfm"]
    :param path: Path to loaded file or directory
    :return: Returns a pipeline for making predictions
    """
    if backend not in ["joblib", "tf", "tfm"]:
        raise ValidationError("Please choose proper backend from ['joblib', 'tf', 'tfm']")
    path = path or self.config.PICKLE_DIR / f"{self.config.PIPELINE_TYPE}_{self.pipeline_version}"
    validate_variables(backend, path)

    if backend == "joblib":
        pipeline = joblib.load(str(path) + ".joblib")
    elif backend == "tf":
        pipeline = load_model(path)
    else:  # "tfm" is the only remaining option after the check above
        pipeline = TransformersModelingPipeline.load_from_pretrained(path)

    self.logger.info(f"{path.name} loaded")
    return pipeline
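# Round-trip sketch for the "joblib" backend; BasePersistence comes from this
# module, while the toy sklearn pipeline and `config` object are illustrative
# assumptions (config must define PICKLE_DIR, PIPELINE_TYPE, etc.).
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline as SklearnPipeline

toy_pipeline = SklearnPipeline([("clf", LogisticRegression())])
persister = BasePersistence(config)
persister.save_pipeline(toy_pipeline, backend="joblib")
restored = persister.load_pipeline(backend="joblib")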
def set_img_size_and_model_obj(self, img_size: Tuple[int, int], model_obj: Sequential):
    validate_variables(img_size, model_obj)
    self.img_size = img_size
    self.model_obj = model_obj
    self.logger.info("Successfully set img size and model obj")
def fit_datasets(self, train_dataset: Dataset, validation_dataset: Dataset, test_dataset: Dataset):
    validate_variables(train_dataset, validation_dataset, test_dataset)
    self.train_dataset = train_dataset
    self.validation_dataset = validation_dataset
    self.test_dataset = test_dataset
def test_validate_variables(structured_training_df):
    args_with_none = ["test", 1, None]
    args_pd = ["test2", 2, structured_training_df]

    with pytest.raises(TypeError):
        validate_variables(*args_with_none)
    # Check that valid arguments don't raise an error
    validate_variables(*args_pd)
@classmethod
def load_from_pretrained(cls, path: Path):
    model = TFDistilBertForSequenceClassification.from_pretrained(path)
    tokenizer = DistilBertTokenizerFast.from_pretrained(path)
    processing_pipeline = TransformersProcessingPipeline(TextPipeline.encode_dataset, tokenizer)
    validate_variables(model, tokenizer, processing_pipeline)
    return cls(model=model, processing_pipeline=processing_pipeline)
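# Usage sketch: restoring the tokenizer/model pair that save_pipeline writes
# with the "tfm" backend; the directory name is illustrative.
from pathlib import Path

pipeline = TransformersModelingPipeline.load_from_pretrained(Path("pickle_dir/text_pipeline"))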
def __init__(self, config):
    """
    Class for saving and loading pipeline objects.
    :param config: Config object for the class
    """
    validate_variables(config)
    self.config = config
    self.pipeline_version = _version
    self.logger = logging.getLogger(__name__)
def __init__(self, config):
    """
    Abstract base class for training a pipeline and saving it
    :param config: Config object for the class
    """
    validate_variables(config)
    self.config = config
    self.logger = logging.getLogger(__name__)
    self.modeling_pipeline = None
def make_prediction(self, input_data: dict) -> Tuple[np.ndarray, str]:
    """
    Function to make a prediction on given data
    :param input_data: Input data to make prediction on
    :return: Returns a tuple of class probabilities and str with class prediction
    """
    validate_variables(input_data)
    data = self._prepare_data(input_data)
    prediction_probabilities, prediction_string = self._make_sklearn_prediction(data)
    return prediction_probabilities, prediction_string
def _load_structured_data(self, path: Path = None) -> DataFrame:
    """
    Utility function to load structured data from the csv
    :param path: Path to data file
    :return: Returns a pandas DataFrame with data loaded
    """
    path = path or self.config.DATA_PATH
    validate_variables(path)

    df = pd.read_csv(path)
    self.df = df
    self.logger.info("Successfully loaded data from csv")
    return df
def _prepare_data(self, input_data: dict) -> DataFrame:
    """
    Utility function to prepare and validate data into a format which can be used
    in the modeling pipeline
    :param input_data: Input data to make prediction on
    :return: Returns a pandas DataFrame with data ready for making predictions using modeling pipeline
    """
    validate_variables(input_data)
    if not self.validator:
        raise RuntimeError("No validator object fitted")

    df = pd.DataFrame(input_data, index=[0])
    df_validated = self.validator.validate(df)
    return df_validated
def __init__(self, config):
    """
    Abstract base class used for making predictions
    :param config: Config object for the class
    """
    validate_variables(config)
    self.config = config
    self.logger = logging.getLogger(__name__)
    self.persister = BasePersistence(config)
    self.modeling_pipeline = None
    self.backend = None
    self.path = None
def __init__(self, config):
    """
    An abstract class for preprocessing data with tensorflow
    :param config: Config object for the class
    """
    validate_variables(config)
    self.config = config
    self.logger = logging.getLogger(__name__)
    # Use None placeholders so the `dataset or self.dataset` fallbacks behave
    # correctly; the originals held the Dataset class itself, which is always truthy
    self.train_dataset = None
    self.validation_dataset = None
    self.test_dataset = None
    self.prefetch = False
def load_data(self, path: Path = None) -> Tuple[DataFrame, DataFrame, Series, Series]:
    """
    Function to load structured data using sklearn
    :param path: Path to data directory
    :return: Returns a tuple with x_train, x_test, y_train, y_test data
    """
    path = path or self.config.DATA_PATH
    validate_variables(path)

    df = self._load_structured_data(path)
    x_train, x_test, y_train, y_test = self._load_data_structured(df)
    return x_train, x_test, y_train, y_test
def get_model(self, x_train: DataFrame = None, x_test: DataFrame = None,
              y_train: Series = None, y_test: Series = None):
    x_train, x_test, y_train, y_test = self._set_dfs(x_train, x_test, y_train, y_test)
    validate_variables(x_train, x_test, y_train, y_test)

    model = self._get_sklearn_model(x_train, x_test, y_train, y_test)
    self.model = model
    self.logger.info("Successfully loaded structured model")
    return model
def load_pipeline(self, backend: str = None, path: Path = None):
    """Function to load a pipeline using the persister and fit it as a modeling pipeline
    :param backend: Type of backend used for loading given pipeline, has to be one of ["joblib", "tf", "tfm"]
    :param path: Path to loaded file or directory
    :return: Returns a modeling pipeline to make predictions with
    """
    backend = backend or self.backend
    validate_variables(backend)
    if not self.persister:
        raise RuntimeError("No persister object fitted")

    modeling_pipeline = self.persister.load_pipeline(backend=backend, path=path)
    self.modeling_pipeline = modeling_pipeline
    return modeling_pipeline
def get_model(self, model_obj=None):
    model_obj = model_obj or self.model_obj
    validate_variables(model_obj)

    model = model_obj(include_top=False, weights="imagenet", classes=len(self.config.DISEASES))
    model.trainable = False

    self.model = model
    self.logger.warning("Warning! get_model function in ImagePipeline returns unfitted model")
    return model
def _prepare_data(self, input_data: dict, img_shape: Tuple[int, int]) -> np.array:
    """
    Utility function to prepare data to a format which can be used in the modeling pipeline
    :param input_data: Input data to make prediction on
    :param img_shape: Shape of image to resize data
    :return: An array with data ready for making predictions using modeling pipeline
    """
    img_shape = img_shape or self.img_size  # _get_img_shape stores the shape as self.img_size
    validate_variables(input_data, img_shape)

    data = input_data["img_array"]
    # Note: np.resize repeats or truncates pixel values rather than interpolating;
    # an image-aware resize (e.g. tf.image.resize) may be intended here
    data = np.resize(data, img_shape)
    data = np.expand_dims(data, 0)
    return data
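# Shape check for the resize and batch dimension added above; the input and
# target sizes are illustrative.
import numpy as np

img = np.zeros((450, 500, 3))
batch = np.expand_dims(np.resize(img, (600, 600, 3)), 0)
assert batch.shape == (1, 600, 600, 3)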
def _get_img_shape(self, modeling_pipeline: Sequential) -> Tuple[int, int, int]:
    """
    Utility function to get the image shape, necessary for resizing input data
    :param modeling_pipeline: A tensorflow model object to get image shape from
    :return: A tuple with image shape
    """
    validate_variables(modeling_pipeline)

    if modeling_pipeline.layers[1].name == "efficientnetb7":
        img_size = (600, 600, 3)
    elif modeling_pipeline.layers[1].name == "efficientnetb6":
        img_size = (528, 528, 3)
    else:
        img_size = (456, 456, 3)

    self.img_size = img_size
    return img_size