import click from src.utils.click import PathlibPath from pathlib import Path from joblib import load import numpy as np MODEL_DIR = os.environ["MODEL_DIR"] MODEL_FILE = os.environ["MODEL_FILE"] METADATA_FILE = os.environ["METADATA_FILE"] MODEL_PATH = os.path.join(MODEL_DIR, MODEL_FILE) METADATA_PATH = os.path.join(MODEL_DIR, METADATA_FILE) @click.command() @click.argument('input_file', type=PathlibPath(exists=True, dir_okay=False)) @click.option( '-o', 'output_file', type=PathlibPath(dir_okay=False), help= 'Path to the CSV file with generated predictions (default: <input directory>/predictions.csv)' ) @click.option( '-p', 'probabilities', is_flag=True, help= 'Whether to generate probabilities of class 1 instead of predicted classes' ) def main(input_file, output_file, probabilities):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y) ujob.dump_multiple( [X_train, X_test, y_train, y_test], output_path, ['X_train.joblib', 'X_test.joblib', 'y_train.joblib', 'y_test.joblib']) @click.command() @click.option( '-i', 'input_path', type=PathlibPath(exists=True, file_okay=False), default=DATASET_DIR, help= 'Input directory for the dataset (default: <project_dir>/data/processed/dataset)' ) @click.option( '-o', 'output_path', type=PathlibPath(exists=True, file_okay=False), default=TRAIN_TEST_DIR, help= 'Output directory for the train/test dataset parts (default: <project_dir>/data/processed/train_test)' ) def cli(input_path, output_path): """ Split dataset into train and test parts. """
def main(input_path=DATASET_PROCESSED_DIR, output_path=TRAIN_TEST_DIR):
    """Split the processed dataset into train and test parts.

    Loads ``X.joblib`` and ``y.joblib`` from ``input_path``, splits them
    70/30 without shuffling, and dumps the four resulting parts into
    ``output_path``.

    Args:
        input_path: Directory holding ``X.joblib`` and ``y.joblib``.
        output_path: Directory that receives ``X_train.joblib``,
            ``X_test.joblib``, ``y_train.joblib`` and ``y_test.joblib``.
            It is created (including parents) when missing, so it must
            expose ``mkdir`` -- presumably a ``pathlib.Path``; confirm for
            the module-level defaults.
    """
    # Create the target directory up front so a fresh checkout works.
    output_path.mkdir(parents=True, exist_ok=True)

    X, y = ujob.load_multiple(input_path, ['X.joblib', 'y.joblib'])

    # shuffle=False: rows are split by position, not sampled at random.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, shuffle=False)

    ujob.dump_multiple(
        [X_train, X_test, y_train, y_test],
        output_path,
        ['X_train.joblib', 'X_test.joblib', 'y_train.joblib', 'y_test.joblib'])


# NOTE: the output option deliberately does NOT require the directory to
# exist -- main() creates it. Requiring existence here made click reject a
# missing output directory before main() ever had the chance to create it.
@click.command()
@click.option(
    '-i',
    'input_path',
    type=PathlibPath(exists=True, file_okay=False),
    default=DATASET_PROCESSED_DIR,
    help='Input directory for the processed dataset (default: <project_dir>/data/processed/dataset_processed)')
@click.option(
    '-o',
    'output_path',
    type=PathlibPath(file_okay=False),
    default=TRAIN_TEST_DIR,
    help='Output directory for the train/test dataset parts (default: <project_dir>/data/processed/train_test)')
def cli(input_path, output_path):
    """ Splits dataset into train and test parts. """
    # The docstring above doubles as the command's --help text; keep it short.
    logger = logging.getLogger(__name__)
    logger.info('splitting data into train and test')

    main(input_path, output_path)


if __name__ == '__main__':
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'