"""Training entry point: load config and data, split, and fit the preprocessing steps."""

from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    f1_score,
)
from sklearn.model_selection import train_test_split

from utils.io_utils import yaml_loader, get_data, save_model, save_transformers
from utils.training_utils import (
    build_features_pipeline,
    build_label_encoder,
    build_model,
    early_stopping,
)

# Run configuration drives every path and column name used below.
config = yaml_loader("./config/config.yml")
data = get_data(config)

# Preprocessing objects are built from the config; they are fitted further down.
features_pipeline = build_features_pipeline(config)
label_encoder = build_label_encoder(config)

# Predictors are every column except the configured target column.
X = data.loc[:, data.columns != config["features"]["target"]]
y = data[config["features"]["target"]]

# Stratified 75/25 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=12345, test_size=0.25, shuffle=True, stratify=y
)

# Fit the feature pipeline on the training fold only, then apply it to the
# held-out fold to avoid leakage.
X_train = features_pipeline.fit_transform(X_train)
X_test = features_pipeline.transform(X_test)

# Encode training labels to the numeric form the model expects.
# NOTE(review): y_test is not encoded in this span — confirm later code
# transforms it before computing metrics against model predictions.
y_train = label_encoder.transform(y_train)
"""Assemble the full train/test/val dataset and normalize image file paths."""

import pandas as pd

from utils.dataset_utils import (
    split_list,
    mount_dataset,
    process_images,
)
from utils.io_utils import txt_loader, yaml_loader

config = yaml_loader("./config/dataset_config.yml")

# Annotation files carry a two-line header, hence skip_lines=2.
# Each parsed row starts with (name, numeric id); build name -> id lookups.
category_list = split_list(
    txt_loader(config["paths"]["categories_path"], skip_lines=2))
category_dict = {row[0]: int(row[1]) for row in category_list}

attribute_list = split_list(
    txt_loader(config["paths"]["attributes_path"], skip_lines=2))
attribute_dict = {row[0]: int(row[1]) for row in attribute_list}

# Mount each split from its four annotation files, keyed by split name.
datasets = ["train", "test", "val"]
dataset_dict = {
    dataset: mount_dataset(
        files_path=config["paths"][f"{dataset}_files_path"],
        categories_path=config["paths"][f"{dataset}_categories_path"],
        attributes_path=config["paths"][f"{dataset}_attributes_path"],
        bboxes_path=config["paths"][f"{dataset}_bboxes_path"],
        category_dict=category_dict,
        attribute_dict=attribute_dict,
    )
    for dataset in datasets
}

# Stack the splits into a single frame and prepend the configured data-folder
# prefix to every relative file path.
full_dataset = pd.concat(dataset_dict.values()).reset_index(drop=True)
full_dataset["file"] = [
    f'{config["paths"]["data_folder_prefix"]}/{w}' for w in full_dataset["file"]
]