def train(config_file, experiment_name):
    """Train a Ludwig model on a shuffled copy of the Higgs dataset.

    The dataset is shuffled, staged to S3 as a partitioned parquet file,
    and then passed to ``LudwigModel.train``. Summary information about
    the training statistics and preprocessed splits is printed at the end.

    Args:
        config_file: Path to the Ludwig model-config YAML file.
        experiment_name: Name under which Ludwig records this experiment.
    """
    output_directory = SCRIPT_DIR / 'output' / 'results'
    model = LudwigModel(config=str(config_file))

    # frac=1 draws every row, i.e. a full random shuffle of the dataset.
    higgs_df = higgs.load().sample(frac=1)

    # NOTE: removed leftover debug lines (`print(higgs_df.shape)` followed by
    # `sys.exit()`) that aborted the process here and made everything below
    # unreachable, and a dead local-path assignment to `higgs_data_file` that
    # was immediately overwritten by the S3 path.

    # Stage the shuffled data on S3 so the trainer can read it back as parquet.
    higgs_data_file = (
        "s3://data-science.s3.liftoff.io/datascience/temp/higgs_small.parquet")
    higgs_ddf = dd.from_pandas(higgs_df, npartitions=30)
    higgs_ddf.to_parquet(higgs_data_file, engine="pyarrow")

    (training_statistics, preprocessed_data, output_directory) = model.train(
        dataset=higgs_data_file,
        data_format="parquet",
        experiment_name=experiment_name,
        skip_save_processed_input=False,
        output_directory=str(output_directory),
    )

    print('training_statistics keys: {}'.format(training_statistics.keys()))
    (
        training_set,
        validation_set,
        test_set,
        training_set_metadata,
    ) = preprocessed_data
    print(f'{training_set.size=}, {validation_set.size=}, {test_set.size=}')
    print('training_set_metadata keys {}'.format(training_set_metadata.keys()))
    print(f'{output_directory=}')
"""Train a TabNet model on the Higgs dataset using the medium config."""
import logging

from ludwig.api import LudwigModel
from ludwig.datasets import higgs

# Build the model from the medium-sized config, logging at INFO level.
model = LudwigModel(
    config='medium_config.yaml',
    logging_level=logging.INFO,
)

# Load the full Higgs dataset and kick off training under the
# 'higgs_medium' experiment.
dataset_df = higgs.load()
model.train(
    dataset=dataset_df,
    experiment_name='higgs_medium',
    model_name='higgs_tabnet_medium',
)