def grid_search():
    segmentation_type = sys.argv[2]
    split_by_expert = sys.argv[3] == "True"
    drop_user_features = sys.argv[4] == "True"

    # get feature and label dataframes
    features_whole, labels_whole = import_data(
        path=DATA_PATH,
        segmentation_type=segmentation_type,
        drop_user_features=drop_user_features,
        return_type='pd',
        drop_expert=not split_by_expert)

    # if we don't split by expert, we only have one pair of features/labels
    if not split_by_expert:
        data = {"whole_data": (features_whole, labels_whole)}
    # otherwise we split them into the three experts
    else:
        temp_data = split_experts(features_whole, labels_whole)
        data = {
            f"expert_{int((i / 2) + 1)}": (temp_data[i], temp_data[i + 1])
            for i in range(0, len(temp_data), 2)
        }

    # go through the dataframes (only one if we don't split)
    for name, (features, labels) in data.items():
        print(f"Looking at -->{name}<-- data!")

        # get the subject indices, in order to avoid putting samples from
        # the same individuals into both the test and training sets
        subject_indices = [l[0] for l in list(features.index)]

        # preprocess our features
        features = preprocessing_pipeline(features, drop_corr=False)

        # cross validation
        cross_validation_nn(features.values,
                            labels.values,
                            subject_indices,
                            K=3,
                            verbose=False,
                            segmentation_type=segmentation_type,
                            using_user_features=not drop_user_features,
                            type_of_data=name)
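# Usage sketch (not part of the module): grid_search() reads its options from
# sys.argv[2:5], so argv[1] is presumably a subcommand selector handled by the
# surrounding entry point. The script name below is hypothetical.
#
#   python run_deep.py grid_search coarse False False
#
#   argv[2] -> segmentation_type   ("no", "coarse" or "fine")
#   argv[3] -> split_by_expert     ("True" / "False")
#   argv[4] -> drop_user_features  ("True" / "False")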
def train_best_models():
    """
    Trains the best models using the grid search results in
    "models/grid_search_results" and saves them in "models/weights".
    """
    segmentation_types = ["no", "coarse", "fine"]
    gs_paths = ["no_gs.pkl", "coarse_gs.pkl", "fine_gs.pkl"]

    for segmentation, path in zip(segmentation_types, gs_paths):
        # get feature and label dataframes
        features, labels = import_data(path=DATA_PATH,
                                       segmentation_type=segmentation,
                                       drop_user_features=False,
                                       return_type='pd',
                                       drop_expert=True)

        # get the best parameters
        df = pd.read_pickle(GS_DIR + "/" + path)
        best_params = df.iloc[df['avg_auc'].argmax()]

        print(f"Generating best models for -->{segmentation}<-- data!")

        # get the subject indices, in order to avoid putting samples from
        # the same individuals into both the test and training sets
        subject_indices = [l[0] for l in list(features.index)]

        # preprocess our features
        features = preprocessing_pipeline(features, drop_corr=False)

        # train a model
        model = train_model(features.values,
                            labels.values,
                            subject_indices,
                            optimizer=best_params["optimizer"],
                            smote=best_params["smote"],
                            hidden_layer_dims=best_params["hidden_layer_dims"],
                            learning_rate=best_params["learning_rate"],
                            dropout=best_params["dropout"],
                            weight_decay=best_params["weight_decay"],
                            split_val=0.2,
                            verbose=True,
                            epochs=1000)

        # save this best model
        torch.save(model, PARAM_DIR + "/" + segmentation + "_model.pth")
def __init__(self, config):
    # map data formats to their loader functions; only audio is supported
    load_formats = {'audio': load_audio}
    assert config['format'] in load_formats, "Pass valid data format"

    self.dir_path = config['path']
    self.loader_params = config['loader']
    self.load_func = load_formats[config['format']]

    # load the label dataframe for the recordings (the features are discarded)
    M_PATH = '../data'
    _, self.metadata_df = import_data(M_PATH,
                                      drop_user_features=True,
                                      segmentation_type='no',
                                      return_type='pd')

    # map the binary label to a human-readable cough type
    self.metadata_df['cough_type'] = self.metadata_df.apply(
        lambda row: 'wet' if row['Label'] == 1 else 'dry', axis=1)

    # derive the class list and the data splits from the metadata
    self.classes = self._get_classes(
        self.metadata_df[['cough_type', 'Label']])
    self.data_splits = self._split_data(self.metadata_df)
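# Minimal sketch of a config dict accepted by this constructor. Only the keys
# ('format', 'path', 'loader') are taken from the code above; the example
# values, and the keyword arguments understood by load_audio, are assumptions.
example_config = {
    'format': 'audio',                 # must be a key of load_formats
    'path': '../data/wav_data',        # hypothetical directory of recordings
    'loader': {'sample_rate': 22050},  # assumed parameters passed to load_audio
}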
def predict_test_results():
    """
    Creates predictions, given that the trained models are in "models/weights",
    and saves them into "data/test/predictions_deep".
    """
    segmentation_types = ["no", "coarse", "fine"]
    model_paths = ["no_model.pth", "coarse_model.pth", "fine_model.pth"]

    # create folder if not already there
    Path(PREDICTION_DATA + "/predictions_deep").mkdir(exist_ok=True)

    for segmentation, model_path in zip(segmentation_types, model_paths):
        # load the best model
        model = torch.load(PARAM_DIR + "/" + model_path)
        model.eval()

        # get the corresponding data
        X = import_data(DATA_PATH,
                        segmentation_type=segmentation,
                        drop_user_features=False,
                        drop_expert=True,
                        is_test=True)

        # preprocess it
        X = preprocessing_pipeline(X, drop_corr=False)

        # make predictions
        predictions = predict(X.values, model)
        predictions = [x[0] for x in predictions]

        # save predictions
        create_csv_submission(predictions,
                              segm_type=segmentation,
                              submission_path=PREDICTION_DATA + "/predictions_deep",
                              expert=False,
                              user_features=True)
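# Note (assumption about the surrounding project layout): torch.load() on a
# whole model object unpickles it, so the model's class definition must be
# importable under the same module path it had when train_best_models() saved it.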
        'models': [GaussianNB(), GaussianNB(), GaussianNB()],
        'oversampling': True,
    },
}

ENSEMBLE_TYPE = "weighted"
DATA_PATH = "./data"
SUBMISSION_PATH = "./data/test/predictions_classical"

if __name__ == "__main__":
    for segm_type, param in BEST_PARAMS_WITH_METADATA.items():
        X_tr, y_tr = import_data(DATA_PATH,
                                 segmentation_type=segm_type,
                                 drop_user_features=False,
                                 drop_expert=True)
        X_te = import_data(DATA_PATH,
                           segmentation_type=segm_type,
                           drop_user_features=False,
                           drop_expert=True,
                           is_test=True)

        X_tr, X_te = preprocessing_pipeline(X_tr, X_te)

        y_pred = train_predict(X_tr, y_tr, X_te, param=param)

        create_csv_submission(y_pred,
                              segm_type=segm_type,
                              submission_path=SUBMISSION_PATH,
                              expert=False,
                              user_features=True)
import subprocess

from tqdm import tqdm

from src.utils.get_data import import_data


def convert_and_split(filename):
    # command_webm = ['ffmpeg', '-i', f'audio_data/{filename}.webm', '-c:a', 'pcm_f32le',
    #                 f'wav_data/{filename}.wav']
    # subprocess.run(command_webm, stdout=subprocess.PIPE, stdin=subprocess.PIPE)
    command_ogg = ['ffmpeg', '-i', f'audio_data/{filename}.ogg',
                   f'wav_data/{filename}.wav']
    subprocess.run(command_ogg, stdout=subprocess.PIPE, stdin=subprocess.PIPE)


DATA_PATH = '../../data'

if __name__ == '__main__':
    X, y = import_data(DATA_PATH,
                       segmentation_type='no',
                       drop_user_features=True,
                       return_type='pd')

    for subject in tqdm(X.index.get_level_values(0)):
        convert_and_split(subject)
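# For a hypothetical recording id "abc123", the argument list above expands to
# the shell command:
#   ffmpeg -i audio_data/abc123.ogg wav_data/abc123.wav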
def train_test():
    # if no command line arguments are given, use predefined ones
    if len(sys.argv) == 1:
        # whether to split the coughs by expert or not
        split_by_expert = False
        # whether to drop the user-supplied features (False = use them)
        drop_user_features = False
        # "no", "coarse" or "fine" segmentation data
        segmentation_type = "coarse"
    # else, take the command line arguments
    else:
        segmentation_type = sys.argv[2]
        split_by_expert = sys.argv[3] == "True"
        drop_user_features = sys.argv[4] == "True"

    # get feature and label dataframes
    features_whole, labels_whole = import_data(
        path=DATA_PATH,
        segmentation_type=segmentation_type,
        drop_user_features=drop_user_features,
        return_type='pd',
        drop_expert=not split_by_expert)

    # if we don't split by expert, we only have one pair of features/labels
    if not split_by_expert:
        data = {"whole_data": (features_whole, labels_whole)}
    # otherwise we split them into the three experts
    else:
        temp_data = split_experts(features_whole, labels_whole)
        data = {
            f"expert_{int((i / 2) + 1)}": (temp_data[i], temp_data[i + 1])
            for i in range(0, len(temp_data), 2)
        }

    # go through the dataframes (only one if we don't split)
    for name, (features, labels) in data.items():
        print(f"Looking at -->{name}<-- data!")

        # get the subject indices, in order to avoid putting samples from
        # the same individuals into both the test and training sets
        subject_indices = [l[0] for l in list(features.index)]

        # preprocess our features
        features = preprocessing_pipeline(features, drop_corr=False)

        # split them into train and test according to the groups
        gss = GroupShuffleSplit(n_splits=1, train_size=0.7, random_state=SEED)
        # since we only split once, this loop just grabs the single pair of
        # train and test indices
        for train_idx, test_idx in gss.split(features.values, labels.values,
                                             subject_indices):
            continue

        # train a model
        model = train_model(features.values[train_idx],
                            labels.values[train_idx],
                            [subject_indices[x] for x in train_idx],
                            verbose=True,
                            epochs=300)

        # calculate the SHAP values
        shap_df = get_shap_values(model,
                                  features.values[train_idx],
                                  features.values[test_idx],
                                  features.columns,
                                  device=device)
        print("\n\n\n SHAP Values")
        print(shap_df)

        # test the model
        test_model(features.values[test_idx],
                   labels.values[test_idx],
                   model,
                   verbose=True)
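# Minimal, self-contained illustration (not project code) of why
# GroupShuffleSplit is used above: samples that share a subject id never end
# up on both sides of the train/test split.
import numpy as np
from sklearn.model_selection import GroupShuffleSplit

X_demo = np.arange(12).reshape(6, 2)
y_demo = np.array([0, 1, 0, 1, 0, 1])
groups = ["s1", "s1", "s2", "s2", "s3", "s3"]   # one id per subject

gss_demo = GroupShuffleSplit(n_splits=1, train_size=0.7, random_state=0)
train_idx, test_idx = next(gss_demo.split(X_demo, y_demo, groups))
# no subject appears in both partitions
assert set(np.array(groups)[train_idx]).isdisjoint(np.array(groups)[test_idx])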