Example no. 1
from sklearn.model_selection import StratifiedShuffleSplit


def run_training():
    """Train the model."""

    # read training data
    data = load_dataset(config.TRAINING_DATA_FILE)

    # select only the customer conversations and the two columns of interest
    data = data[data["message_source"] == "customer"][["message", "case_type"]]

    # divide train and test with a single stratified split
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=config.SEED)

    X = data.drop(config.TARGET, axis=1)
    # binarise the target: 0 for "cancel_order", 1 for everything else
    y = data[config.TARGET].apply(lambda x: 0 if x == "cancel_order" else 1)

    train_index, test_index = next(sss.split(X, y))
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # keep only the configured features
    X_train = X_train[config.FEATURES]
    X_test = X_test[config.FEATURES]

    # fit and persist the pipeline
    pipeline.full_pipe.fit(X_train, y_train)

    save_pipeline(pipeline_to_persist=pipeline.full_pipe)
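
For reference, StratifiedShuffleSplit with n_splits=1 yields exactly one train/test split that preserves the class proportions of y. A minimal, self-contained sketch on toy data (no project config assumed):

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# toy data: 10 samples, binary labels with a 70/30 class balance
X = np.arange(20).reshape(10, 2)
y = np.array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1])

# a single stratified split; next() pulls the one (train, test) index pair
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
train_idx, test_idx = next(sss.split(X, y))

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
print(y_train.mean(), y_test.mean())  # both close to the original 0.3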
Example no. 2
from sklearn.model_selection import train_test_split


def run_training() -> None:
    """Train the model."""

    # read training data
    data = load_data(filename=config.DATA_FILE)

    # divide train and test
    X_train, X_test, y_train, y_test = train_test_split(data[config.FEATURES],
                                                        data[config.TARGET],
                                                        test_size=0.4,
                                                        random_state=42)

    # encode the target, then fit and persist the pipeline
    y_train = encode_target(y_train)
    status_pipeline.fit(X_train, y_train)
    save_pipeline(pipeline_to_persist=status_pipeline)
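
encode_target is not shown in this snippet; a minimal sketch of what such a helper might look like, assuming it simply maps string class labels to integer codes (the name comes from the snippet, the body is an assumption):

from sklearn.preprocessing import LabelEncoder

def encode_target(y):
    # hypothetical body: map string class labels to integer codes
    # (the real project may encode the target differently)
    return LabelEncoder().fit_transform(y)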
Example no. 3
from sklearn.model_selection import train_test_split


def run_training() -> None:
    """Train the model."""
    # read training data
    data = load_dataset(file_name=config.TRAINING_DATA_FILE)
    # divide train and test
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(config.TARGET, axis=1),
        data[config.TARGET],
        test_size=0.2,
        random_state=config.RANDOM_SEED)  # we are setting the seed here
    # fit pipeline
    pipeline.titanic_pipe.fit(X_train, y_train)

    _logger.info(f"saving model version: {_version}")
    # save pipeline
    save_pipeline(pipeline_to_persist=pipeline.titanic_pipe)
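
All of these variants delegate persistence to a save_pipeline helper that is not shown here. A minimal sketch of such a helper using joblib, assuming a fixed output path (the path and file name are illustrative, not the source's):

from pathlib import Path

import joblib

def save_pipeline(pipeline_to_persist) -> None:
    # hypothetical helper: persist the fitted pipeline to disk
    save_path = Path("trained_models") / "pipeline.pkl"
    save_path.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(pipeline_to_persist, save_path)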
Example no. 4
from sklearn.model_selection import train_test_split


def run_training():
    """Train the model."""

    # read training data
    data = load_dataset(file_name=config.TRAINING_DATA_FILE)

    # divide train and test
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(config.TARGET, axis=1),
        data[config.TARGET],
        test_size=0.2,
        random_state=0)  # we are setting the seed here

    # fit the pipeline, then persist it once via save_pipeline
    pipeline.titanic_pipe.fit(X_train, y_train)

    _logger.info(f"saving model version: {_version}")
    save_pipeline(pipeline_to_persist=pipeline.titanic_pipe)
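
A pipeline persisted with joblib can be reloaded for inference with joblib.load; a minimal round-trip sketch on a toy pipeline:

import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# fit a toy pipeline, persist it, and reload it for prediction
pipe = Pipeline([("scale", StandardScaler()),
                 ("clf", LogisticRegression())])
pipe.fit([[0.0], [1.0], [2.0], [3.0]], [0, 0, 1, 1])

joblib.dump(pipe, "toy_pipe.pkl")
reloaded = joblib.load("toy_pipe.pkl")
print(reloaded.predict([[2.5]]))  # same output as pipe.predict([[2.5]])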
Example no. 5
from sklearn.model_selection import train_test_split


def run_training() -> None:
    """Train the model."""

    # read training data
    data = load_dataset(file_name=config.TRAINING_DATA_FILE)

    # divide train and test
    X_train, X_test, y_train, y_test = train_test_split(data[config.FEATURES],
                                                        data[config.TARGET],
                                                        test_size=0.1,
                                                        random_state=0)

    # cast the discrete features to strings so downstream encoders treat
    # them as categorical rather than numeric
    discrete_features = (config.DISCRETE_SET1_FEATURES +
                         config.DISCRETE_SET2_FEATURES +
                         config.DISCRETE_SET3_FEATURES)
    X_train[discrete_features] = X_train[discrete_features].astype(str)
    X_test[discrete_features] = X_test[discrete_features].astype(str)

    # fit and persist the pipeline
    pipeline.rf_pipe.fit(X_train[config.FEATURES], y_train)

    _logger.info(f"saving model version: {_version}")
    save_pipeline(pipeline_to_persist=pipeline.rf_pipe)
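
The astype(str) cast matters because many categorical encoders decide which columns to transform by dtype: integer-coded categories look numeric until they are cast. Presumably rf_pipe relies on something similar (an assumption; the pipeline itself is not shown). A minimal sketch of dtype-based selection:

import pandas as pd

# integer codes that are really categories, not quantities
X = pd.DataFrame({"rooms": [1, 2, 2, 3], "area": [50.0, 70.0, 65.0, 90.0]})

# before the cast, dtype-based selection sees no categorical columns
print(X.select_dtypes(include="object").columns.tolist())  # []

# after the cast, encoders that pick object columns will treat "rooms"
# as categorical
X["rooms"] = X["rooms"].astype(str)
print(X.select_dtypes(include="object").columns.tolist())  # ['rooms']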