X = df_bands.values
y = df_meta['Megaclasse'].values

# keep a stratified 30% subsample for the search; discard the rest
X, _, y, _ = train_test_split(X,
                              y,
                              train_size=.3,
                              shuffle=True,
                              stratify=y,
                              random_state=random_state)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

## Final model - Parameter tuning
pipelines, param_grids = check_pipelines([anomally_detection, classifiers_1],
                                         random_state=0,
                                         n_runs=1)

fit_params = check_fit_params(pre_fit_params)

model_search = ModelSearchCV(pipelines,
                             param_grids,
                             scoring=scorers,
                             refit='accuracy',
                             n_jobs=-1,
                             cv=cv,
                             verbose=1)
model_search.fit(X, y, **fit_params)

pickle.dump(model_search,
            open(RESULTS_PATH + 'final_pipeline_parameter_tuning.pkl', 'wb'))
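
# A minimal follow-up sketch, assuming report_model_search_results (used in the
# later snippets) is importable here: reload the persisted search and rank its
# cross-validated results by the refit metric ('accuracy' -> 'mean_test_accuracy').
model_search = pickle.load(
    open(RESULTS_PATH + 'final_pipeline_parameter_tuning.pkl', 'rb'))
df_results = report_model_search_results(model_search)\
    .sort_values('mean_test_accuracy', ascending=False)
print(df_results.head())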

## Example 2

#            'selection_strategy': ['combined', 'minority', 'majority'],
#            'truncation_factor': [-.5,0,.5],
#            'deformation_factor': [0,.5,1],
#            'k_neighbors_filter': [3,5]
#            })
#]

classifiers = [
    ('randomforestclassifier',
     RandomForestClassifier(n_estimators=100, random_state=random_state),
     {})
]



objects_list = [noise_objs, data_filters,  # oversamplers,
                classifiers]
pipelines, param_grid = check_pipelines(objects_list, random_state, 1)
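# check_pipelines combines the object groups above into candidate pipelines;
# each pipeline name joins its step names with '|' (split on '|' below) and
# param_grid carries the matching hyper-parameter grid.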


fit_params = {}
for clf_name, _ in pipelines:
    clf_name_split = clf_name.split('|')
    if clf_name_split[1] == 'DenoisedGeometricSMOTE':
        pass
    elif clf_name_split[1] == 'singlefilter':
        fit_params[f'{clf_name_split[1]}__filters'] = [single_filter]
    elif clf_name_split[1] != 'no_filter':
        fit_params[f'{clf_name_split[1]}__filters'] = filts
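# The keys built above follow scikit-learn's '<step name>__<parameter>'
# fit-param convention (e.g. 'singlefilter__filters'), so each filter list
# reaches the matching pipeline step when passed as **fit_params below.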

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
model_search = ModelSearchCV(pipelines, param_grid, n_jobs=-1, cv=cv, verbose=1)
model_search.fit(X, y, **fit_params)

df_meta = df.drop(df.columns[df.columns.str.startswith('X201') |
                             df.columns.str.startswith('ND')],
                  axis=1)
df_bands = df.drop(columns=df_meta.columns)
# normalize
znorm = StandardScaler()
df_bands = pd.DataFrame(znorm.fit_transform(df_bands.values),
                        columns=df_bands.columns,
                        index=df_bands.index)

X = df_bands.values
y = df_meta['Label'].values
ids = df_meta['Object'].values

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

## Experiment 1 (feature selection)
pipelines_feature, param_grid_feature = check_pipelines(
    [feature_selection, classifiers_1],
    random_state=0,
    n_runs=1
)

model_search_feature = ModelSearchCV(pipelines_feature,
                                     param_grid_feature,
                                     n_jobs=-1,
                                     cv=cv,
                                     verbose=1)
model_search_feature.fit(X, y)

df_results_feature = report_model_search_results(model_search_feature)\
    .sort_values('mean_test_score', ascending=False)
#df_results_feature.to_csv('results_feature_selection.csv')
pickle.dump(model_search_feature, open('gini_feature_selection.pkl','wb'))




# Candidate classifiers and parameter grids (step names and the plain
# RandomForestClassifier entry are assumed; the multi_class/penalty grid
# targets LogisticRegression).
classifiers = [
    ('RFC', RandomForestClassifier(random_state=random_state), {}),
    ('LR', LogisticRegression(random_state=random_state), {
        'multi_class': ['ovr', 'multinomial'],
        'penalty': ['l2', 'none']
    }),
]


# setup scorers
def geometric_mean_macro(y_true, y_pred):
    return geometric_mean_score(y_true, y_pred, average='macro')


SCORERS['geometric_mean_macro'] = make_scorer(geometric_mean_macro)
scorers = ['accuracy', 'f1_macro', 'geometric_mean_macro']
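
# Alternative sketch (assumption: ModelSearchCV accepts a `scoring` dict like
# sklearn's GridSearchCV), which avoids registering the custom scorer in the
# global SCORERS mapping:
# scorers = {
#     'accuracy': 'accuracy',
#     'f1_macro': 'f1_macro',
#     'geometric_mean_macro': make_scorer(geometric_mean_macro),
# }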

pipelines, param_grid = check_pipelines([classifiers],
                                        random_state=random_state,
                                        n_runs=1)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

# run experiment
model_search = ModelSearchCV(pipelines,
                             param_grid,
                             scoring=scorers,
                             refit='accuracy',
                             n_jobs=-1,
                             cv=cv,
                             verbose=1)
model_search.fit(X, y)

pickle.dump(model_search,
            open('model_search.pkl', 'wb'))  # output filename assumed

X = df_bands.values
y = df_meta['Megaclasse'].values

# keep a stratified 10% subsample for the search; discard the rest
X, _, y, _ = train_test_split(X,
                              y,
                              train_size=.1,
                              shuffle=True,
                              stratify=y,
                              random_state=random_state)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

## Experiment 1 (feature selection)
pipelines_feature, param_grid_feature = check_pipelines(
    [feature_selection, classifiers], random_state=0, n_runs=1)

model_search_feature = ModelSearchCV(pipelines_feature,
                                     param_grid_feature,
                                     scoring=scorers,
                                     refit='accuracy',
                                     n_jobs=-1,
                                     cv=cv,
                                     verbose=1)
model_search_feature.fit(X, y)

df_results_feature = report_model_search_results(model_search_feature)\
    .sort_values('mean_test_accuracy', ascending=False)
df_results_feature.to_csv('results_feature_selection.csv')
pickle.dump(model_search_feature,
            open('model_search_feature_selection.pkl', 'wb'))

df_meta = df.drop(df.columns[df.columns.str.startswith('X201') |
                             df.columns.str.startswith('ND')],
                  axis=1)
df_bands = df.drop(columns=df_meta.columns)
# normalize
znorm = StandardScaler()
df_bands = pd.DataFrame(znorm.fit_transform(df_bands.values),
                        columns=df_bands.columns,
                        index=df_bands.index)

X = df_bands.values
y = df_meta['Label'].values
ids = df_meta['Object'].values

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

## Experiment 1 (feature selection)
pipelines_feature, param_grid_feature = check_pipelines(
    [feature_selection, classifiers_1], random_state=0, n_runs=1)

model_search_feature = ModelSearchCV(pipelines_feature,
                                     param_grid_feature,
                                     n_jobs=-1,
                                     cv=cv,
                                     verbose=1)
model_search_feature.fit(X, y)

df_results_feature = report_model_search_results(model_search_feature)\
    .sort_values('mean_test_score', ascending=False)
df_results_feature.to_csv('results_feature_selection.csv')
pickle.dump(model_search_feature,
            open('model_search_feature_selection.pkl', 'wb'))

## Experiment 2 (anomaly detection)