def grid_search():
    segmentation_type = sys.argv[2]
    split_by_expert = sys.argv[3] == "True"
    drop_user_features = sys.argv[4] == "True"

    # get feature and label dataframes
    features_whole, labels_whole = import_data(
        path=DATA_PATH,
        segmentation_type=segmentation_type,
        drop_user_features=drop_user_features,
        return_type='pd',
        drop_expert=not split_by_expert)

    # if we don't split by expert, we only have one pair of features/labels
    if not split_by_expert:
        data = {"whole_data": (features_whole, labels_whole)}
    # otherwise we split them into the three experts
    else:
        temp_data = split_experts(features_whole, labels_whole)
        data = {
            f"expert_{int((i / 2) + 1)}": (temp_data[i], temp_data[i + 1])
            for i in range(0, len(temp_data), 2)
        }

    # go through the dataframes (only one if we don't split)
    for name, (features, labels) in data.items():
        print(f"Looking at -->{name}<-- data!")

        # get the subject indices, in order to avoid putting samples from
        # the same individuals into both the test and training sets
        subject_indices = [l[0] for l in list(features.index)]

        # preprocess our features
        features = preprocessing_pipeline(features, drop_corr=False)

        # cross validation
        cross_validation_nn(features.values,
                            labels.values,
                            subject_indices,
                            K=3,
                            verbose=False,
                            segmentation_type=segmentation_type,
                            using_user_features=not drop_user_features,
                            type_of_data=name)
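# Usage sketch (not part of the module): grid_search() reads its options from
# sys.argv[2:5], so argv[1] is presumably a subcommand selector handled by the
# surrounding entry point. The script name below is hypothetical.
#
#   python run_deep.py grid_search coarse False False
#
#   argv[2] -> segmentation_type   ("no", "coarse" or "fine")
#   argv[3] -> split_by_expert     ("True" / "False")
#   argv[4] -> drop_user_features  ("True" / "False")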
def train_best_models():
    """
    Trains the best models using the grid search results in
    "models/grid_search_results" and saves them in "models/weights".
    """
    segmentation_types = ["no", "coarse", "fine"]
    gs_paths = ["no_gs.pkl", "coarse_gs.pkl", "fine_gs.pkl"]

    for segmentation, path in zip(segmentation_types, gs_paths):
        # get feature and label dataframes
        features, labels = import_data(path=DATA_PATH,
                                       segmentation_type=segmentation,
                                       drop_user_features=False,
                                       return_type='pd',
                                       drop_expert=True)

        # get the best parameters
        df = pd.read_pickle(GS_DIR + "/" + path)
        best_params = df.iloc[df['avg_auc'].argmax()]

        print(f"Generating best models for -->{segmentation}<-- data!")

        # get the subject indices, in order to avoid putting samples from
        # the same individuals into both the test and training sets
        subject_indices = [l[0] for l in list(features.index)]

        # preprocess our features
        features = preprocessing_pipeline(features, drop_corr=False)

        # train a model
        model = train_model(features.values,
                            labels.values,
                            subject_indices,
                            optimizer=best_params["optimizer"],
                            smote=best_params["smote"],
                            hidden_layer_dims=best_params["hidden_layer_dims"],
                            learning_rate=best_params["learning_rate"],
                            dropout=best_params["dropout"],
                            weight_decay=best_params["weight_decay"],
                            split_val=0.2,
                            verbose=True,
                            epochs=1000)

        # save this best model
        torch.save(model, PARAM_DIR + "/" + segmentation + "_model.pth")
def __init__(self, config):
    # map data formats to their loader functions; only audio is supported
    load_formats = {'audio': load_audio}
    assert config['format'] in load_formats, "Pass valid data format"

    self.dir_path = config['path']
    self.loader_params = config['loader']
    self.load_func = load_formats[config['format']]

    # load the label dataframe for the recordings (the features are discarded)
    M_PATH = '../data'
    _, self.metadata_df = import_data(M_PATH,
                                      drop_user_features=True,
                                      segmentation_type='no',
                                      return_type='pd')

    # map the binary label to a human-readable cough type
    self.metadata_df['cough_type'] = self.metadata_df.apply(
        lambda row: 'wet' if row['Label'] == 1 else 'dry', axis=1)

    # derive the class list and the data splits from the metadata
    self.classes = self._get_classes(
        self.metadata_df[['cough_type', 'Label']])
    self.data_splits = self._split_data(self.metadata_df)
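# Minimal sketch of a config dict accepted by this constructor. Only the keys
# ('format', 'path', 'loader') are taken from the code above; the example
# values, and the keyword arguments understood by load_audio, are assumptions.
example_config = {
    'format': 'audio',                 # must be a key of load_formats
    'path': '../data/wav_data',        # hypothetical directory of recordings
    'loader': {'sample_rate': 22050},  # assumed parameters passed to load_audio
}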
def predict_test_results():
    """
    Creates predictions, given that the trained models are in "models/weights",
    and saves them into "data/test/predictions_deep".
    """
    segmentation_types = ["no", "coarse", "fine"]
    model_paths = ["no_model.pth", "coarse_model.pth", "fine_model.pth"]

    # create folder if not already there
    Path(PREDICTION_DATA + "/predictions_deep").mkdir(exist_ok=True)

    for segmentation, model_path in zip(segmentation_types, model_paths):
        # load the best model
        model = torch.load(PARAM_DIR + "/" + model_path)
        model.eval()

        # get the corresponding data
        X = import_data(DATA_PATH,
                        segmentation_type=segmentation,
                        drop_user_features=False,
                        drop_expert=True,
                        is_test=True)

        # preprocess it
        X = preprocessing_pipeline(X, drop_corr=False)

        # make predictions
        predictions = predict(X.values, model)
        predictions = [x[0] for x in predictions]

        # save predictions
        create_csv_submission(predictions,
                              segm_type=segmentation,
                              submission_path=PREDICTION_DATA + "/predictions_deep",
                              expert=False,
                              user_features=True)
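# Note (assumption about the surrounding project layout): torch.load() on a
# whole model object unpickles it, so the model's class definition must be
# importable under the same module path it had when train_best_models() saved it.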
        'models': [GaussianNB(), GaussianNB(), GaussianNB()],
        'oversampling': True,
    },
}

ENSEMBLE_TYPE = "weighted"
DATA_PATH = "./data"
SUBMISSION_PATH = "./data/test/predictions_classical"

if __name__ == "__main__":
    for segm_type, param in BEST_PARAMS_WITH_METADATA.items():
        X_tr, y_tr = import_data(DATA_PATH,
                                 segmentation_type=segm_type,
                                 drop_user_features=False,
                                 drop_expert=True)
        X_te = import_data(DATA_PATH,
                           segmentation_type=segm_type,
                           drop_user_features=False,
                           drop_expert=True,
                           is_test=True)

        X_tr, X_te = preprocessing_pipeline(X_tr, X_te)

        y_pred = train_predict(X_tr, y_tr, X_te, param=param)

        create_csv_submission(y_pred,
                              segm_type=segm_type,
                              submission_path=SUBMISSION_PATH,
                              expert=False,
                              user_features=True)
import subprocess

from tqdm import tqdm

from src.utils.get_data import import_data


def convert_and_split(filename):
    # command_webm = ['ffmpeg', '-i', f'audio_data/{filename}.webm', '-c:a', 'pcm_f32le',
    #                 f'wav_data/{filename}.wav']
    # subprocess.run(command_webm, stdout=subprocess.PIPE, stdin=subprocess.PIPE)
    command_ogg = ['ffmpeg', '-i', f'audio_data/{filename}.ogg',
                   f'wav_data/{filename}.wav']
    subprocess.run(command_ogg, stdout=subprocess.PIPE, stdin=subprocess.PIPE)


DATA_PATH = '../../data'

if __name__ == '__main__':
    X, y = import_data(DATA_PATH,
                       segmentation_type='no',
                       drop_user_features=True,
                       return_type='pd')

    for subject in tqdm(X.index.get_level_values(0)):
        convert_and_split(subject)
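# For a hypothetical recording id "abc123", the argument list above expands to
# the shell command:
#   ffmpeg -i audio_data/abc123.ogg wav_data/abc123.wav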
def train_test():
    # if no command line arguments are given, use predefined ones
    if len(sys.argv) == 1:
        # whether to split the coughs by expert or not
        split_by_expert = False
        # whether to drop the user-supplied features (False = use them)
        drop_user_features = False
        # "no", "coarse" or "fine" segmentation data
        segmentation_type = "coarse"
    # else, take the command line arguments
    else:
        segmentation_type = sys.argv[2]
        split_by_expert = sys.argv[3] == "True"
        drop_user_features = sys.argv[4] == "True"

    # get feature and label dataframes
    features_whole, labels_whole = import_data(
        path=DATA_PATH,
        segmentation_type=segmentation_type,
        drop_user_features=drop_user_features,
        return_type='pd',
        drop_expert=not split_by_expert)

    # if we don't split by expert, we only have one pair of features/labels
    if not split_by_expert:
        data = {"whole_data": (features_whole, labels_whole)}
    # otherwise we split them into the three experts
    else:
        temp_data = split_experts(features_whole, labels_whole)
        data = {
            f"expert_{int((i / 2) + 1)}": (temp_data[i], temp_data[i + 1])
            for i in range(0, len(temp_data), 2)
        }

    # go through the dataframes (only one if we don't split)
    for name, (features, labels) in data.items():
        print(f"Looking at -->{name}<-- data!")

        # get the subject indices, in order to avoid putting samples from
        # the same individuals into both the test and training sets
        subject_indices = [l[0] for l in list(features.index)]

        # preprocess our features
        features = preprocessing_pipeline(features, drop_corr=False)

        # split them into train and test according to the groups
        gss = GroupShuffleSplit(n_splits=1, train_size=0.7, random_state=SEED)
        # since we only split once, this loop just grabs the single pair of
        # train and test indices
        for train_idx, test_idx in gss.split(features.values, labels.values,
                                             subject_indices):
            continue

        # train a model
        model = train_model(features.values[train_idx],
                            labels.values[train_idx],
                            [subject_indices[x] for x in train_idx],
                            verbose=True,
                            epochs=300)

        # calculate the SHAP values
        shap_df = get_shap_values(model,
                                  features.values[train_idx],
                                  features.values[test_idx],
                                  features.columns,
                                  device=device)
        print("\n\n\n SHAP Values")
        print(shap_df)

        # test the model
        test_model(features.values[test_idx],
                   labels.values[test_idx],
                   model,
                   verbose=True)
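# Minimal, self-contained illustration (not project code) of why
# GroupShuffleSplit is used above: samples that share a subject id never end
# up on both sides of the train/test split.
import numpy as np
from sklearn.model_selection import GroupShuffleSplit

X_demo = np.arange(12).reshape(6, 2)
y_demo = np.array([0, 1, 0, 1, 0, 1])
groups = ["s1", "s1", "s2", "s2", "s3", "s3"]   # one id per subject

gss_demo = GroupShuffleSplit(n_splits=1, train_size=0.7, random_state=0)
train_idx, test_idx = next(gss_demo.split(X_demo, y_demo, groups))
# no subject appears in both partitions
assert set(np.array(groups)[train_idx]).isdisjoint(np.array(groups)[test_idx])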