コード例 #1
0
import os
from pathlib import Path

import click
import numpy as np
from joblib import load

from src.utils.click import PathlibPath

# Model artifact locations are read from the environment when the module is
# imported; a missing variable raises KeyError immediately instead of failing
# later at prediction time. These lines need the `os` module in scope.
MODEL_DIR = os.environ["MODEL_DIR"]
MODEL_FILE = os.environ["MODEL_FILE"]
METADATA_FILE = os.environ["METADATA_FILE"]

# Full paths (plain strings) to the model file and its metadata file, both
# located inside MODEL_DIR.
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_FILE)
METADATA_PATH = os.path.join(MODEL_DIR, METADATA_FILE)


@click.command()
@click.argument('input_file', type=PathlibPath(exists=True, dir_okay=False))
@click.option(
    '-o',
    'output_file',
    type=PathlibPath(dir_okay=False),
    help=
    'Path to the CSV file with generated predictions (default: <input directory>/predictions.csv)'
)
@click.option(
    '-p',
    'probabilities',
    is_flag=True,
    help=
    'Whether to generate probabilities of class 1 instead of predicted classes'
)
def main(input_file, output_file, probabilities):
コード例 #2
0
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=42,
                                                        stratify=y)

    ujob.dump_multiple(
        [X_train, X_test, y_train, y_test], output_path,
        ['X_train.joblib', 'X_test.joblib', 'y_train.joblib', 'y_test.joblib'])


# Command-line entry point for the train/test split step.
#   -i  input_path   directory to read the dataset from; defaults to DATASET_DIR
#   -o  output_path  directory for the split parts; defaults to TRAIN_TEST_DIR
# Both options are declared with exists=True and file_okay=False, so each must
# name a directory that already exists.
# NOTE(review): exists=True on the output option means the directory has to be
# created beforehand -- confirm this is intended.
# NOTE(review): presumably PathlibPath hands the values over as pathlib.Path
# objects -- verify against src.utils.click.
@click.command()
@click.option(
    '-i',
    'input_path',
    type=PathlibPath(exists=True, file_okay=False),
    default=DATASET_DIR,
    help=
    'Input directory for the dataset (default: <project_dir>/data/processed/dataset)'
)
@click.option(
    '-o',
    'output_path',
    type=PathlibPath(exists=True, file_okay=False),
    default=TRAIN_TEST_DIR,
    help=
    'Output directory for the train/test dataset parts (default: <project_dir>/data/processed/train_test)'
)
def cli(input_path, output_path):
    """ Split dataset into train and test parts.
    """
コード例 #3
0
def main(input_path=DATASET_PROCESSED_DIR, output_path=TRAIN_TEST_DIR):
    """Split the processed dataset into train and test parts and store them.

    Reads ``X.joblib`` and ``y.joblib`` from ``input_path``, keeps the first
    70% of the rows for training and the remaining 30% for testing (the rows
    are not shuffled), and writes the four resulting parts to ``output_path``.
    The output directory is created first when it does not exist yet.
    """
    output_path.mkdir(parents=True, exist_ok=True)

    features, target = ujob.load_multiple(input_path, ['X.joblib', 'y.joblib'])

    # train_test_split yields the parts in the order
    # X_train, X_test, y_train, y_test -- the same order as the file names.
    parts = train_test_split(features, target, test_size=0.3, shuffle=False)
    part_files = [
        'X_train.joblib',
        'X_test.joblib',
        'y_train.joblib',
        'y_test.joblib',
    ]

    ujob.dump_multiple(list(parts), output_path, part_files)


# Command-line entry point for the train/test split step.
#   -i  input_path   existing directory holding the processed dataset
#   -o  output_path  directory that receives the train/test parts
@click.command()
@click.option(
    '-i',
    'input_path',
    type=PathlibPath(exists=True, file_okay=False),
    default=DATASET_PROCESSED_DIR,
    help='Input directory for the processed dataset (default: <project_dir>/data/processed/dataset_processed)',
)
@click.option(
    '-o',
    'output_path',
    # No exists=True here: main() creates the output directory on demand, so
    # requiring it to exist would reject a fresh output location up front.
    type=PathlibPath(file_okay=False),
    default=TRAIN_TEST_DIR,
    help='Output directory for the train/test dataset parts (default: <project_dir>/data/processed/train_test)',
)
def cli(input_path, output_path):
    """ Splits dataset into train and test parts.
    """
    # The docstring above doubles as the command's --help text, so it stays
    # short; option details live in the `help=` strings.
    logger = logging.getLogger(__name__)
    logger.info('splitting data into train and test')

    main(input_path, output_path)


if __name__ == '__main__':
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'