def predict_chains(chains: Iterable[List[Method]],
                   sources: Iterable[Method],
                   method_feats: Dict[Method, MethodFeature],
                   proj_feat: ProjectFeature,
                   d2v_model: Doc2Vec,
                   predictor: TabularPredictor) -> List[List[ChainEntry]]:
    # NOTE: `chains` is iterated twice below (once to build the batch
    # DataFrame, once to slice probabilities back out), so pass a concrete
    # sequence rather than a one-shot generator.
    df_list: List[pd.DataFrame] = []
    for chain, source in zip(chains, sources):
        if len(chain) == 0:
            continue
        df = chain_to_df(chain=chain,
                         source=source,
                         method_features=method_feats,
                         project_feature=proj_feat,
                         d2v_model=d2v_model)
        df_list.append(df)
    large_df = pd.concat(df_list)
    prob: np.ndarray = predictor.predict_proba(large_df)
    results: List[List[ChainEntry]] = []
    cur = 0  # row cursor of large df
    for chain in chains:
        chain_prob: List[ChainEntry] = []
        for method in chain:
            chain_prob.append(ChainEntry(method, prob[cur]))
            cur += 1
        results.append(chain_prob)
    assert cur == len(large_df)
    return results
def predict_chain(chain: List[Method],
                  source: Method,
                  method_features: Dict[Method, MethodFeature],
                  project_feature: ProjectFeature,
                  d2v_model: Doc2Vec,
                  predictor: TabularPredictor) -> List[ChainEntry]:
    if len(chain) == 0:
        return []
    df = chain_to_df(chain=chain,
                     source=source,
                     method_features=method_features,
                     project_feature=project_feature,
                     d2v_model=d2v_model)
    probabilities: np.ndarray = predictor.predict_proba(df)
    result = [
        ChainEntry(method, probabilities[i])
        for i, method in enumerate(chain)
    ]
    result.append(ChainEntry(None, 0.5))
    return result
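
# Usage sketch for the two helpers above. All inputs here are hypothetical
# stand-ins: `all_chains`/`all_sources` come from whatever call-chain miner
# produced them, and the feature dicts / models are assumed to be prebuilt.
ranked = predict_chains(chains=all_chains,
                        sources=all_sources,
                        method_feats=method_feats,
                        proj_feat=proj_feat,
                        d2v_model=d2v_model,
                        predictor=predictor)
# Single-chain variant; note it appends a sentinel ChainEntry(None, 0.5)
# that the batch version does not.
single = predict_chain(chain=all_chains[0],
                       source=all_sources[0],
                       method_features=method_feats,
                       project_feature=proj_feat,
                       d2v_model=d2v_model,
                       predictor=predictor)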
def train(args):
    model_output_dir = f'{args.output_dir}/data'

    is_distributed = len(args.hosts) > 1
    host_rank = args.hosts.index(args.current_host)
    dist_ip_addrs = args.hosts
    dist_ip_addrs.pop(host_rank)

    # Load training and validation data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)

    # Extract column info
    target = args.init_args['label']
    columns = train_data.columns.tolist()
    column_dict = {"columns": columns}
    with open('columns.pkl', 'wb') as f:
        pickle.dump(column_dict, f)

    # Train models
    args.init_args['path'] = args.model_dir
    # args.fit_args.pop('label', None)
    predictor = TabularPredictor(**args.init_args).fit(train_data, **args.fit_args)

    # Results summary
    predictor.fit_summary(verbosity=3)
    # model_summary_fname_src = os.path.join(predictor.output_directory, 'SummaryOfModels.html')
    model_summary_fname_src = os.path.join(args.model_dir, 'SummaryOfModels.html')
    model_summary_fname_tgt = os.path.join(model_output_dir, 'SummaryOfModels.html')

    if os.path.exists(model_summary_fname_src):
        shutil.copy(model_summary_fname_src, model_summary_fname_tgt)

    # Ensemble visualization
    G = predictor._trainer.model_graph
    remove = [node for node, degree in dict(G.degree()).items() if degree < 1]
    G.remove_nodes_from(remove)
    A = nx.nx_agraph.to_agraph(G)
    A.graph_attr.update(rankdir='BT')
    A.node_attr.update(fontsize=10)
    for node in A.iternodes():
        node.attr['shape'] = 'rectangle'
    A.draw(os.path.join(model_output_dir, 'ensemble-model.png'), format='png', prog='dot')

    # Optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        # Test data must be labeled for scoring
        if target in test_data:
            # Leaderboard on test data
            print('Running model on test data and getting Leaderboard...')
            leaderboard = predictor.leaderboard(test_data, silent=True)
            print(format_for_print(leaderboard), end='\n\n')
            leaderboard.to_csv(f'{model_output_dir}/leaderboard.csv', index=False)

            # Feature importance on test data
            # Note: Feature importance must be calculated on held-out (test) data.
            # If calculated on training data it will be biased due to overfitting.
            if args.feature_importance:
                print('Feature importance:')
                # Increase rows to print feature importance
                pd.set_option('display.max_rows', 500)
                feature_importance_df = predictor.feature_importance(test_data)
                print(feature_importance_df)
                feature_importance_df.to_csv(
                    f'{model_output_dir}/feature_importance.csv', index=True)

            # Classification report and confusion matrix for classification model
            if predictor.problem_type in [BINARY, MULTICLASS]:
                from sklearn.metrics import classification_report, confusion_matrix

                X_test = test_data.drop(target, axis=1)
                y_test_true = test_data[target]
                y_test_pred = predictor.predict(X_test)
                y_test_pred_prob = predictor.predict_proba(X_test, as_multiclass=True)

                report_dict = classification_report(
                    y_test_true, y_test_pred,
                    output_dict=True,
                    labels=predictor.class_labels)
                report_dict_df = pd.DataFrame(report_dict).T
                report_dict_df.to_csv(
                    f'{model_output_dir}/classification_report.csv', index=True)

                cm = confusion_matrix(y_test_true, y_test_pred,
                                      labels=predictor.class_labels)
                cm_df = pd.DataFrame(cm, predictor.class_labels, predictor.class_labels)
                sns.set(font_scale=1)
                cmap = 'coolwarm'
                sns.heatmap(cm_df, annot=True, fmt='d', cmap=cmap)
                plt.title('Confusion Matrix')
                plt.ylabel('true label')
                plt.xlabel('predicted label')
                # Save before show(); show() destroys the active figure, so the
                # reverse order would write out a blank image.
                plt.savefig(f'{model_output_dir}/confusion_matrix.png')
                plt.show()

                get_roc_auc(y_test_true, y_test_pred_prob,
                            predictor.class_labels,
                            predictor.class_labels_internal,
                            model_output_dir)
        else:
            warnings.warn(
                'Skipping eval on test data since label column is not included.')

    # Files summary
    print('Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")
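
# A minimal sketch of the namespace train() expects, modeled on a
# SageMaker-style entry point. The field names come from the attribute
# accesses above; the paths and values are illustrative assumptions.
from types import SimpleNamespace

args = SimpleNamespace(
    output_dir='/opt/ml/output',
    model_dir='/opt/ml/model',
    train='/opt/ml/input/data/train',
    test='/opt/ml/input/data/test',          # or '' to skip test evaluation
    hosts=['algo-1'],
    current_host='algo-1',
    feature_importance=True,
    init_args={'label': 'target'},           # forwarded to TabularPredictor(**init_args)
    fit_args={'presets': 'medium_quality'},  # forwarded to .fit(train_data, **fit_args)
)
train(args)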
def run(args):
    if args.task == 'product_sentiment':
        train_df, test_df, label_column = load_machine_hack_product_sentiment(
            args.train_file, args.test_file)
    elif args.task == 'mercari_price':
        train_df, test_df, label_column = load_mercari_price_prediction(
            args.train_file, args.test_file)
    elif args.task == 'price_of_books':
        train_df, test_df, label_column = load_price_of_books(
            args.train_file, args.test_file)
    elif args.task == 'data_scientist_salary':
        train_df, test_df, label_column = load_data_scientist_salary(
            args.train_file, args.test_file)
    else:
        raise NotImplementedError
    hyperparameters = get_hyperparameter_config('multimodal')
    if args.preset is not None and args.mode in ['stacking', 'weighted']:
        hyperparameters['AG_TEXT_NN']['presets'] = args.preset

    if args.mode == 'stacking':
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=args.eval_metric,
                                     path=args.exp_dir)
        predictor.fit(train_data=train_df,
                      hyperparameters=hyperparameters,
                      num_bag_folds=5,
                      num_stack_levels=1)
    elif args.mode == 'weighted':
        predictor = TabularPredictor(label=label_column,
                                     eval_metric=args.eval_metric,
                                     path=args.exp_dir)
        predictor.fit(train_data=train_df, hyperparameters=hyperparameters)
    elif args.mode == 'single':
        # When no ensembling is used, we just use TextPredictor, which trains
        # a single model internally.
        predictor = TextPredictor(label=label_column,
                                  eval_metric=args.eval_metric,
                                  path=args.exp_dir)
        predictor.fit(train_data=train_df, presets=args.preset, seed=args.seed)
    else:
        raise NotImplementedError

    if args.task == 'product_sentiment':
        test_probabilities = predictor.predict_proba(test_df,
                                                     as_pandas=True,
                                                     as_multiclass=True)
        test_probabilities.to_csv(os.path.join(args.exp_dir, 'submission.csv'),
                                  index=False)
    elif args.task == 'data_scientist_salary':
        predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_excel(args.sample_submission, engine='openpyxl')
        submission.loc[:, label_column] = predictions
        submission.to_excel(os.path.join(args.exp_dir, 'submission.xlsx'))
    elif args.task == 'price_of_books':
        predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_excel(args.sample_submission, engine='openpyxl')
        # Labels were trained on a log10(1 + price) scale; invert the transform.
        submission.loc[:, label_column] = np.power(10, predictions) - 1
        submission.to_excel(os.path.join(args.exp_dir, 'submission.xlsx'))
    elif args.task == 'mercari_price':
        test_predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_csv(args.sample_submission)
        # Labels were trained on a log(1 + price) scale; invert the transform.
        submission.loc[:, label_column] = np.exp(test_predictions) - 1
        submission.to_csv(os.path.join(args.exp_dir, 'submission.csv'),
                          index=False)
    else:
        raise NotImplementedError
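
# Hypothetical CLI wiring for run(); the flag names mirror the attributes
# read above (task, train_file, test_file, sample_submission, eval_metric,
# mode, preset, exp_dir, seed) and the defaults are illustrative assumptions.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--task', choices=['product_sentiment', 'mercari_price',
                                       'price_of_books', 'data_scientist_salary'])
parser.add_argument('--train_file', type=str)
parser.add_argument('--test_file', type=str)
parser.add_argument('--sample_submission', type=str, default=None)
parser.add_argument('--eval_metric', type=str, default=None)
parser.add_argument('--mode', choices=['stacking', 'weighted', 'single'],
                    default='weighted')
parser.add_argument('--preset', type=str, default=None)
parser.add_argument('--exp_dir', type=str, default='./exp')
parser.add_argument('--seed', type=int, default=123)
run(parser.parse_args())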
# distill() returns the names of the newly distilled models; keep the first
# call's result so we can pick a model to deploy below.
distilled_model_names = predictor.distill(
    time_limit=time_limit,
    hyperparameters={'GBM': {}, 'NN': {}},
    teacher_preds='soft',
    augment_method='munge',
    augment_args={'size_factor': 1, 'max_size': 100},
    models_name_suffix='munge')
predictor.distill(
    augmentation_data=aug_data,
    time_limit=time_limit,
    teacher_preds='soft',
    models_name_suffix='extra')  # augmentation with "extra" unlabeled data
predictor.distill(
    time_limit=time_limit,
    teacher_preds=None,
    models_name_suffix='noteacher')  # standard training without distillation

# Compare performance of different models on test data after distillation:
ldr = predictor.leaderboard(test_data)
model_to_deploy = distilled_model_names[0]

y_pred = predictor.predict_proba(test_data, model=model_to_deploy)
print(y_pred[:5])
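
# The distillation snippet above assumes a fitted `predictor` plus
# `time_limit`, `aug_data`, and `test_data` already in scope. A minimal,
# illustrative setup (the file names are hypothetical):
from autogluon.tabular import TabularDataset, TabularPredictor

train_data = TabularDataset('train.csv')    # hypothetical labeled data
aug_data = TabularDataset('unlabeled.csv')  # hypothetical unlabeled rows, same schema
test_data = TabularDataset('test.csv')      # hypothetical held-out data
time_limit = 600                            # seconds granted to each distill() call
predictor = TabularPredictor(label='target').fit(train_data, time_limit=time_limit)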
ag_predictor_args = config["ag_predictor_args"]
ag_predictor_args["path"] = args.model_dir
ag_fit_args = config["ag_fit_args"]

predictor = TabularPredictor(**ag_predictor_args).fit(train_data, **ag_fit_args)

# --------------------------------------------------------------- Inference
if args.test_dir:
    test_file = get_input_path(args.test_dir)
    test_data = TabularDataset(test_file)

    # Predictions
    y_pred_proba = predictor.predict_proba(test_data)
    if config.get("output_prediction_format", "csv") == "parquet":
        y_pred_proba.to_parquet(f"{args.output_data_dir}/predictions.parquet")
    else:
        y_pred_proba.to_csv(f"{args.output_data_dir}/predictions.csv")

    # Leaderboard
    if config.get("leaderboard", False):
        lb = predictor.leaderboard(test_data, silent=False)
        lb.to_csv(f"{args.output_data_dir}/leaderboard.csv")

    # Feature importance
    if config.get("feature_importance", False):
        feature_importance = predictor.feature_importance(test_data)
        feature_importance.to_csv(
            f"{args.output_data_dir}/feature_importance.csv")
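
# A sketch of the `config` dict this script reads; the key names come from
# the lookups above, the values are illustrative assumptions:
config = {
    "ag_predictor_args": {"label": "target", "eval_metric": "roc_auc"},
    "ag_fit_args": {"presets": "medium_quality", "time_limit": 3600},
    "output_prediction_format": "csv",  # or "parquet"
    "leaderboard": True,
    "feature_importance": True,
}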
def train_model(dataset_name,
                text_presets,
                save_dir,
                model,
                tabular_presets,
                num_gpus=None,
                get_competition_results=False,
                seed=123):
    set_seed(seed)
    if get_competition_results:
        train_dataset = dataset_registry.create(dataset_name, 'train')
        test_dataset = dataset_registry.create(dataset_name, 'competition')
    else:
        train_dataset = dataset_registry.create(dataset_name, 'train')
        test_dataset = dataset_registry.create(dataset_name, 'test')
    feature_columns = train_dataset.feature_columns
    label_columns = train_dataset.label_columns
    metric = train_dataset.metric
    problem_type = train_dataset.problem_type
    train_data1, tuning_data1 = sklearn.model_selection.train_test_split(
        train_dataset.data,
        test_size=0.05,
        random_state=np.random.RandomState(seed))
    train_data = train_dataset.data
    test_data = test_dataset.data
    column_types, inferred_problem_type = infer_column_problem_types(
        train_data1,
        tuning_data1,
        label_columns=label_columns,
        problem_type=problem_type)
    train_data = train_data[feature_columns + label_columns]
    # tuning_data = tuning_data[feature_columns + label_columns]
    if not get_competition_results:
        test_data = test_data[feature_columns + label_columns]
    train_tic = time.time()
    if model == 'ag_tabular_quick':
        MAX_NGRAM = 300
        feature_generator = AutoMLPipelineFeatureGenerator(
            vectorizer=CountVectorizer(min_df=30,
                                       ngram_range=(1, 3),
                                       max_features=MAX_NGRAM,
                                       dtype=np.uint8))
        predictor = TabularPredictor(label=label_columns[0],
                                     path=save_dir,
                                     problem_type=problem_type)
        predictor.fit(train_data,
                      time_limit=30,
                      feature_generator=feature_generator)
    elif model == 'ag_tabular_without_text':
        no_text_feature_columns = []
        for col_name in feature_columns:
            if column_types[col_name] != _TEXT:
                no_text_feature_columns.append(col_name)
        train_data = train_data[no_text_feature_columns + label_columns]
        # tuning_data = tuning_data[no_text_feature_columns + label_columns]
        test_data = test_data[no_text_feature_columns + label_columns]
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets in ['best_quality']:
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          presets=tabular_presets)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          num_bag_folds=5,
                          num_stack_levels=1)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_tabular_old':
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5,
                          num_stack_levels=1,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_text_only':
        text_feature_columns = [
            col_name for col_name in feature_columns
            if column_types[col_name] == _TEXT
        ]
        train_data = train_data[text_feature_columns + label_columns]
        test_data = test_data[text_feature_columns + label_columns]
        predictor = TextPredictor(path=save_dir,
                                  label=label_columns[0],
                                  problem_type=problem_type,
                                  eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        if len(train_data) > 500000:
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data,
                      hyperparameters=hparams,
                      num_gpus=num_gpus,
                      seed=seed)
    elif model == 'ag_text_multimodal':
        predictor = TextPredictor(path=save_dir,
                                  label=label_columns[0],
                                  problem_type=problem_type,
                                  eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        if len(train_data) > 500000:
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data,
                      hyperparameters=hparams,
                      num_gpus=num_gpus,
                      seed=seed)
    elif model in ('pre_embedding', 'tune_embedding_multimodal', 'tune_embedding_text'):
        feature_generator = AutoMLPipelineFeatureGenerator(
            enable_text_special_features=False,
            enable_text_ngram_features=False)
        pre_embedding_folder = os.path.join(_CURR_DIR, 'pre_computed_embeddings')
        if model == 'pre_embedding':
            train_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'pretrain_text_embedding', 'train.npy'))
            test_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'pretrain_text_embedding', 'test.npy'))
        elif model == 'tune_embedding_multimodal':
            train_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'multimodal_embedding', 'train.npy'))
            test_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'multimodal_embedding', 'test.npy'))
        elif model == 'tune_embedding_text':
            train_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'tuned_text_embedding', 'train.npy'))
            test_features = np.load(
                os.path.join(pre_embedding_folder, dataset_name,
                             'tuned_text_embedding', 'test.npy'))
        else:
            raise NotImplementedError
        train_data = train_data.join(
            pd.DataFrame(train_features,
                         columns=[
                             f'pre_feat{i}'
                             for i in range(train_features.shape[1])
                         ]))
        train_data.reset_index(drop=True, inplace=True)
        test_data = test_data.join(
            pd.DataFrame(test_features,
                         columns=[
                             f'pre_feat{i}'
                             for i in range(test_features.shape[1])
                         ]))
        test_data.reset_index(drop=True, inplace=True)
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5,
                          num_stack_levels=1,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model in ('tabular_multimodal', 'tabular_multimodal_just_table'):
        if model == 'tabular_multimodal':
            MAX_NGRAM = 300
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30,
                                           ngram_range=(1, 3),
                                           max_features=MAX_NGRAM,
                                           dtype=np.uint8),
                enable_raw_text_features=True)
            hyperparameters = get_multimodal_tabular_hparam_just_gbm(
                text_presets=text_presets)
        else:
            MAX_NGRAM = 300
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30,
                                           ngram_range=(1, 3),
                                           max_features=MAX_NGRAM,
                                           dtype=np.uint8),
                enable_raw_text_features=True,
                enable_text_special_features=False,
                enable_text_ngram_features=False)
            hyperparameters = multimodal_tabular_just_table_hparam(
                text_presets=text_presets)
        predictor = TabularPredictor(path=save_dir,
                                     label=label_columns[0],
                                     problem_type=problem_type,
                                     eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data,
                          presets=tabular_presets,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=5,
                          num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '3fold_1stack':
            predictor.fit(train_data=train_data,
                          num_bag_folds=3,
                          num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError
    train_toc = time.time()
    inference_tic = time.time()
    predictions = predictor.predict(test_data, as_pandas=True)
    predictor.save()
    inference_toc = time.time()
    if problem_type == MULTICLASS or problem_type == BINARY:
        prediction_prob = predictor.predict_proba(test_data, as_pandas=True)
        prediction_prob.to_csv(
            os.path.join(save_dir, 'test_prediction_prob.csv'))
    predictions.to_csv(os.path.join(save_dir, 'test_prediction.csv'))
    gt = test_data[label_columns[0]]
    gt.to_csv(os.path.join(save_dir, 'ground_truth.csv'))
    if not get_competition_results:
        score = predictor.evaluate(test_data)
        with open(os.path.join(save_dir, 'test_score.json'), 'w') as of:
            json.dump({metric: score}, of)
    with open(os.path.join(save_dir, 'speed_stats.json'), 'w') as of:
        json.dump(
            {
                'train_time': train_toc - train_tic,
                'inference_time': inference_toc - inference_tic,
                'cpuinfo': cpuinfo.get_cpu_info()
            }, of)
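
# Hypothetical invocation of train_model(); the dataset name and presets are
# illustrative and must match entries in dataset_registry / ag_text_presets:
train_model(dataset_name='prod_sentiment',
            text_presets='electra_base',
            save_dir='experiments/prod_sentiment',
            model='tabular_multimodal',
            tabular_presets='5fold_1stack',
            num_gpus=1,
            get_competition_results=False,
            seed=123)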
label_columns = train_dataset.label_columns
train_data = train_dataset.data
test_data = test_dataset.data
concat_df = pd.concat([train_data, test_data])
concat_df.reset_index(drop=True, inplace=True)
competition_df = competition_dataset.data[feature_columns]
if args.model_type == 'base':
    tabular_hparams = get_tabular_hparams(
        electra_base_late_fusion_concate_e10_avg3())
elif args.model_type == 'large':
    tabular_hparams = get_tabular_hparams(
        electra_large_late_fusion_concate_e10_avg3())
else:
    raise NotImplementedError
time_str = strftime("%Y-%m-%d_%H-%M-%S", gmtime())
predictor = TabularPredictor(
    path=os.path.join(args.save_dir, args.model_type, time_str),
    problem_type=train_dataset.problem_type,
    eval_metric='log_loss',
    label=label_columns[0])
predictor.fit(concat_df[feature_columns + [label_columns[0]]],
              feature_generator=feature_generator,
              num_bag_folds=5,
              num_stack_levels=1,
              hyperparameters=tabular_hparams)
predictor.save()
predictions = predictor.predict_proba(competition_df, as_pandas=True)
predictions.to_csv(
    os.path.join(args.save_dir, args.model_type, time_str,
                 'pred_probabilities.csv'))
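
# The fragment above assumes a preamble along these lines (names inferred
# from the attribute accesses; the registry splits mirror those used in
# train_model() earlier, and feature_generator is an assumption):
from time import strftime, gmtime

train_dataset = dataset_registry.create(args.dataset, 'train')
test_dataset = dataset_registry.create(args.dataset, 'test')
competition_dataset = dataset_registry.create(args.dataset, 'competition')
feature_columns = train_dataset.feature_columns
feature_generator = AutoMLPipelineFeatureGenerator(enable_raw_text_features=True)  # assumption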
def run(dataset, config):
    log.info(f"\n**** AutoGluon [v{__version__}] ****\n")

    metrics_mapping = dict(
        acc=metrics.accuracy,
        auc=metrics.roc_auc,
        f1=metrics.f1,
        logloss=metrics.log_loss,
        mae=metrics.mean_absolute_error,
        mse=metrics.mean_squared_error,
        r2=metrics.r2,
        rmse=metrics.root_mean_squared_error,
    )

    perf_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if perf_metric is None:
        # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported.", config.metric)

    is_classification = config.type == 'classification'
    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }

    train, test = dataset.train.path, dataset.test.path
    label = dataset.target.name
    problem_type = dataset.problem_type

    models_dir = tempfile.mkdtemp() + os.sep  # passed to AG

    with Timer() as training:
        predictor = TabularPredictor(
            label=label,
            eval_metric=perf_metric.name,
            path=models_dir,
            problem_type=problem_type,
        ).fit(train_data=train,
              time_limit=config.max_runtime_seconds,
              **training_params)
    del train

    if is_classification:
        with Timer() as predict:
            probabilities = predictor.predict_proba(test, as_multiclass=True)
        predictions = probabilities.idxmax(axis=1).to_numpy()
    else:
        with Timer() as predict:
            predictions = predictor.predict(test, as_pandas=False)
        probabilities = None

    prob_labels = probabilities.columns.values.astype(str).tolist() if probabilities is not None else None

    _leaderboard_extra_info = config.framework_params.get(
        '_leaderboard_extra_info', False)  # whether to get extra model info (very verbose)
    _leaderboard_test = config.framework_params.get(
        '_leaderboard_test', False)  # whether to compute test scores in leaderboard (expensive)

    leaderboard_kwargs = dict(silent=True, extra_info=_leaderboard_extra_info)
    # Disabled leaderboard test data input by default to avoid long-running computation; remove the 7200s timeout limitation to re-enable
    if _leaderboard_test:
        leaderboard_kwargs['data'] = test

    leaderboard = predictor.leaderboard(**leaderboard_kwargs)
    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None, 'display.width', 1000):
        log.info(leaderboard)

    num_models_trained = len(leaderboard)
    if predictor._trainer.model_best is not None:
        num_models_ensemble = len(
            predictor._trainer.get_minimum_model_set(
                predictor._trainer.model_best))
    else:
        num_models_ensemble = 1

    save_artifacts(predictor, leaderboard, config)
    shutil.rmtree(predictor.path, ignore_errors=True)

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  probabilities=probabilities,
                  probabilities_labels=prob_labels,
                  target_is_encoded=False,
                  models_count=num_models_trained,
                  models_ensemble_count=num_models_ensemble,
                  training_duration=training.duration,
                  predict_duration=predict.duration)
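
# Why the idxmax(axis=1) trick above recovers class predictions: with
# as_multiclass=True, predict_proba returns one column per class label, so
# the per-row argmax column name is the predicted class. Tiny illustration
# with a hand-built frame (values made up):
import pandas as pd

proba = pd.DataFrame({'cat': [0.7, 0.2], 'dog': [0.3, 0.8]})
print(proba.idxmax(axis=1).to_numpy())  # -> ['cat' 'dog']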
predictor = TabularPredictor(label=label, path=save_path).fit(
    df_train, presets='best_quality')
y_test = df_test[label]  # values to predict
test_data_nolab = df_test.drop(
    columns=[label])  # delete label column to prove we're not cheating
predictor = TabularPredictor.load(
    save_path
)  # unnecessary, just demonstrates how to load previously-trained predictor from file
y_pred = predictor.predict(test_data_nolab)
perf = predictor.evaluate_predictions(y_true=y_test,
                                      y_pred=y_pred,
                                      auxiliary_metrics=True)
leaderboard = predictor.leaderboard(df_test, silent=True)
st.dataframe(leaderboard)
y_predproba = predictor.predict_proba(df_pred)

# Enter text for testing
s = 'pd.DataFrame'
sample_dtypes = {
    'list': [1, 'a', [2, 'c'], {'b': 2}],
    'str': 'Hello Streamlit!',
    'int': 17,
    'float': 17.0,
    'dict': {
        1: 'a',
        'x': [2, 'c'],
        2: {
            'b': 2
def run(dataset, config):
    log.info(f"\n**** AutoGluon [v{__version__}] ****\n")
    save_metadata(config, version=__version__)

    metrics_mapping = dict(
        acc=metrics.accuracy,
        auc=metrics.roc_auc,
        f1=metrics.f1,
        logloss=metrics.log_loss,
        mae=metrics.mean_absolute_error,
        mse=metrics.mean_squared_error,
        r2=metrics.r2,
        rmse=metrics.root_mean_squared_error,
    )

    label = dataset.target.name
    problem_type = dataset.problem_type

    perf_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if perf_metric is None:
        # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported.", config.metric)

    is_classification = config.type == 'classification'
    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}

    load_raw = config.framework_params.get('_load_raw', False)
    if load_raw:
        train, test = load_data_raw(dataset=dataset)
    else:
        column_names, _ = zip(*dataset.columns)
        column_types = dict(dataset.columns)
        train = pd.DataFrame(dataset.train.data, columns=column_names).astype(column_types, copy=False)
        print(f"Columns dtypes:\n{train.dtypes}")
        test = pd.DataFrame(dataset.test.data, columns=column_names).astype(column_types, copy=False)

    del dataset
    gc.collect()

    output_dir = output_subdir("models", config)
    with utils.Timer() as training:
        predictor = TabularPredictor(
            label=label,
            eval_metric=perf_metric.name,
            path=output_dir,
            problem_type=problem_type,
        ).fit(
            train_data=train,
            time_limit=config.max_runtime_seconds,
            **training_params
        )
    del train

    y_test = test[label]
    test = test.drop(columns=label)

    if is_classification:
        with utils.Timer() as predict:
            probabilities = predictor.predict_proba(test, as_multiclass=True)
        predictions = probabilities.idxmax(axis=1).to_numpy()
    else:
        with utils.Timer() as predict:
            predictions = predictor.predict(test, as_pandas=False)
        probabilities = None

    prob_labels = probabilities.columns.values.tolist() if probabilities is not None else None

    # Removed test data input to avoid long-running computation; remove the 7200s timeout limitation to re-enable
    leaderboard = predictor.leaderboard(silent=True)
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):
        print(leaderboard)

    save_artifacts(predictor, leaderboard, config)

    num_models_trained = len(leaderboard)
    if predictor._trainer.model_best is not None:
        num_models_ensemble = len(predictor._trainer.get_minimum_model_set(predictor._trainer.model_best))
    else:
        num_models_ensemble = 1

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  probabilities_labels=prob_labels,
                  target_is_encoded=False,
                  models_count=num_models_trained,
                  models_ensemble_count=num_models_ensemble,
                  training_duration=training.duration,
                  predict_duration=predict.duration)