def run_training():
    """Load the training data, fit the Titanic pipeline, and persist it.

    Side effects: reads ``config.TRAINING_DATA_FILE`` from disk and writes
    the fitted pipeline to ``config.PIPELINE_NAME``.
    """
    # Load the raw training dataset.
    dataset = pd.read_csv(config.TRAINING_DATA_FILE)

    # Hold out 20% of the rows for evaluation; the fixed seed makes the
    # split reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.drop(config.TARGET, axis=1),
        dataset[config.TARGET],
        test_size=0.2,
        random_state=0,
    )

    # Fit the end-to-end ML pipeline on the training split only.
    titanic_pipe.fit(X_train, y_train)

    # Persist the fitted pipeline so it can be reloaded for scoring.
    joblib.dump(titanic_pipe, config.PIPELINE_NAME)
def run_training():
    """Train the model.

    Reads the training CSV, splits it 80/20 with a fixed seed, fits the
    pipeline on the training portion, and dumps the fitted pipeline to disk.
    """
    # Read the full dataset from the configured location.
    df = pd.read_csv(config.TRAINING_DATA_FILE)

    predictors = df.drop(config.TARGET, axis=1)
    target = df[config.TARGET]

    # 80/20 split; random_state=0 pins the seed for reproducibility.
    training_data, testing_data, training_target, testing_target = (
        train_test_split(predictors, target, test_size=0.2, random_state=0)
    )

    # Train the pipeline, then persist the fitted artifact.
    titanic_pipe.fit(training_data, training_target)
    joblib.dump(titanic_pipe, config.PIPELINE_NAME)
def run_training():
    """Train the model.

    Reads the training data, performs an 80/20 train/test split, fits the
    Titanic pipeline, and saves it to ``config.PIPELINE_NAME``.
    """
    # read training data
    data = pd.read_csv(config.TRAINING_DATA_FILE)

    # divide train and test.
    # FIX: use config.TARGET instead of the hard-coded 'survived' column
    # name, so the target is defined in one place (consistent with the
    # rest of this module, which always reads it from config).
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(config.TARGET, axis=1),  # predictors
        data[config.TARGET],               # target
        test_size=0.2,                     # percentage of obs in test set
        random_state=0)                    # seed to ensure reproducibility

    # fit pipeline
    titanic_pipe.fit(X_train, y_train)

    # save pipeline
    joblib.dump(titanic_pipe, config.PIPELINE_NAME)
def run_training():
    """Train the model.

    Loads the training data, restricts it to the configured feature
    columns, splits 80/20, fits the pipeline, and persists it.
    """
    # read training data (renamed from `X` — this frame also holds the target)
    data = pd.read_csv(config.TRAINING_DATA_FILE)

    # the model is trained only on the configured numerical + categorical
    # variables; everything else in the file is ignored
    features = config.NUMERICAL_VARS + config.CATEGORICAL_VARS

    # divide train and test (fixed seed for reproducibility)
    X_train, X_test, y_train, y_test = train_test_split(
        data[features],
        data[config.TARGET],
        test_size=0.2,
        random_state=0)

    # fit pipeline.
    # FIX: dropped the redundant X_train[features] re-selection — X_train
    # already contains exactly the `features` columns from the split above.
    titanic_pipe.fit(X_train, y_train)

    # save pipeline
    joblib.dump(titanic_pipe, config.PIPELINE_NAME)
def run_training():
    """Train the model.

    Loads the training data, splits it 90/10, fits the pipeline on the
    configured feature set, and saves the fitted pipeline.
    """
    # read training data
    data = pd.read_csv(config.TRAINING_DATA_FILE)

    # divide train and test
    # NOTE(review): test_size is 0.1 here while sibling variants use 0.2 —
    # confirm the 90/10 split is intentional.
    X_train, X_test, y_train, y_test = train_test_split(
        data[config.FEATURES],
        data[config.TARGET],
        test_size=0.1,
        random_state=0)  # we are setting the seed here

    # fit pipeline.
    # FIX: dropped the redundant X_train[config.FEATURES] re-selection —
    # X_train already contains exactly those columns from the split above.
    titanic_pipe.fit(X_train, y_train)

    # save pipeline
    joblib.dump(titanic_pipe, config.PIPELINE_NAME)
def run_training() -> None: """Train the model.""" # read training data data = pd.read_csv(config.CLEANED_DATA) # divide train and test X_train, X_test, y_train, y_test = train_test_split( data.drop(config.TARGET, axis=1), data[config.TARGET], test_size=0.2, random_state=config.SEED) # we are setting the seed here # Fit ML pipeline titanic_pipe.fit(X_train, y_train) logger.info("**Training ML Pipeline started") # save pipeline save_pipeline(pipeline_to_persist=titanic_pipe) logger.info("**Saved trained ML pipeline")
def run_training():
    """Train the model.

    Reads paths/feature settings from ``config.yaml``, loads the data,
    casts the target to int, splits 80/20, fits the pipeline, and dumps
    it to the configured output path.
    """
    # read training configuration.
    # Hoist each config section once instead of repeating the fragile
    # positional config[0]/config[2] lookups throughout the function.
    config = ut.read_config_file('config.yaml')
    paths = config[0]['Paths']
    feature_groups = config[2]['Feature_Groups']

    path = paths.get('directory')
    filename = paths.get('data_filename')
    extension = paths.get('data_extension')
    cols = feature_groups.get('data_columns')
    target = feature_groups.get('target')

    # load training data
    data = ut.load_data(path=path, filename=filename,
                        extension=extension, cols=cols)

    # divide train and test; target is cast to int before the split
    data[target] = data[target].astype(int)
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(target, axis=1),
        data[target],
        test_size=0.2,
        random_state=0)  # seed for reproducibility

    # fit pipeline
    titanic_pipe.fit(X_train, y_train)

    # save pipeline
    joblib.dump(titanic_pipe, paths.get('output_model_path'))