def test_classif_pipeline(self):
    """ test the training of classif with expected F1 score close to one """
    data_train, labels_train = generate_data()
    data_test, labels_test = generate_data()
    # train every configured classifier on one synthetic dataset and
    # evaluate it on a second, independently generated one
    for clf_name in CLASSIFIER_NAMES:
        logging.info('created classif.: %s', clf_name)
        clf, _ = seg_clf.create_classif_train_export(clf_name, data_train,
                                                     labels_train)
        self.classif_eval(clf, data_train, labels_train,
                          data_test, labels_test)
def load_train_classifier(params, features, labels, feature_names, sizes,
                          nb_holdout):
    """ load a cached classifier from disk, or train and export a new one,
    then evaluate it via cross-validation

    :param {str: any} params: experiment parameters (paths, classif. config)
    :param ndarray features: feature matrix
    :param ndarray labels: sample labels
    :param [str] feature_names: names of feature columns
    :param [int] sizes: per-image-set sample counts (for leave-P-sets-out CV)
    :param int nb_holdout: number of sets held out per CV fold
    :return: updated params, trained classifier, path to exported classifier
    """
    logging.info('train classifier...')
    seg_clf.feature_scoring_selection(features, labels, feature_names,
                                      path_out=params['path_exp'])
    cv = seg_clf.CrossValidatePSetsOut(sizes, nb_hold_out=nb_holdout)
    # feature norm & train classification
    clf_filename = seg_clf.TEMPLATE_NAME_CLF.format(params['classif'])
    path_classif = os.path.join(params['path_exp'], clf_filename)

    reuse_cached = os.path.isfile(path_classif) and not FORCE_RETRAIN_CLASSIF
    if reuse_cached:
        logging.info('loading classifier: %s', path_classif)
        local_params = params.copy()
        clf_dict = seg_clf.load_classifier(path_classif)
        classif = clf_dict['clf_pipeline']
        params = clf_dict['params']
        # keep current path_* / gc_* entries instead of the persisted ones
        overrides = {k: local_params[k] for k in local_params
                     if k.startswith('path_') or k.startswith('gc_')}
        params.update(overrides)
        logging.debug('loaded PARAMETERS: %s', repr(params))
    else:
        classif, path_classif = seg_clf.create_classif_train_export(
            params['classif'], features, labels, cross_val=cv, params=params,
            feature_names=feature_names,
            nb_search_iter=params['nb_classif_search'],
            nb_jobs=params['nb_jobs'], pca_coef=params['pca_coef'],
            path_out=params['path_exp'])
    params['path_classif'] = path_classif

    cv = seg_clf.CrossValidatePSetsOut(sizes, nb_hold_out=nb_holdout)
    seg_clf.eval_classif_cross_val_scores(params['classif'], classif,
                                          features, labels, cross_val=cv,
                                          path_out=params['path_exp'])
    seg_clf.eval_classif_cross_val_roc(params['classif'], classif, features,
                                       labels, cross_val=cv,
                                       path_out=params['path_exp'])
    return params, classif, path_classif
def train_classif_color2d_slic_features(list_images, list_annots,
                                        clr_space='rgb', sp_size=30,
                                        sp_regul=0.2,
                                        dict_features=FTS_SET_SIMPLE,
                                        clf_name=CLASSIF_NAME,
                                        label_purity=0.9,
                                        feature_balance='unique',
                                        pca_coef=None, nb_classif_search=1,
                                        nb_jobs=1):
    """ train classifier on list of annotated images

    :param [ndarray] list_images: source images
    :param [ndarray] list_annots: annotations, one per image
    :param str clr_space: chose the color space
    :param int sp_size: initial size of a superpixel (meaning edge length)
    :param float sp_regul: regularisation in range (0;1) where "0" gives
        elastic and "1" nearly square segments
    :param {str: [str]} dict_features: list of features to be extracted
    :param str clf_name: select used classifier
    :param float label_purity: set the sample-labels purity for training
    :param str feature_balance: set how to balance datasets
    :param float pca_coef: select PCA coef or None
    :param int nb_classif_search: number of tries for hyper-parameters search
    :param int nb_jobs: parallelism
    :return: trained classifier, and per-image lists of SLIC segmentations,
        features and labels
    """
    logging.info('TRAIN Superpixels-Features-Classifier')
    # FIX: assertion now carries a diagnostic message
    assert len(list_images) == len(list_annots), \
        'size of images (%i) and annotations (%i) do not match' \
        % (len(list_images), len(list_annots))

    list_slic, list_features, list_labels = list(), list(), list()
    logging.debug('run feature extraction in parallel - %i threads', nb_jobs)
    wrapper_compute = partial(wrapper_compute_color2d_slic_features_labels,
                              clr_space=clr_space, sp_size=sp_size,
                              sp_regul=sp_regul, dict_features=dict_features,
                              label_purity=label_purity)
    list_imgs_annot = zip(list_images, list_annots)
    mproc_pool = mproc.Pool(nb_jobs)
    # FIX: terminate the pool if a worker raises, instead of leaking processes
    try:
        for slic, fts, lbs in mproc_pool.imap_unordered(wrapper_compute,
                                                        list_imgs_annot):
            list_slic.append(slic)
            list_features.append(fts)
            list_labels.append(lbs)
        mproc_pool.close()
        mproc_pool.join()
    except Exception:
        mproc_pool.terminate()
        raise

    logging.debug('concentrate features...')
    # concentrate features, labels; drop "do not care" labels which are -1
    features, labels, sizes = seg_clf.convert_set_features_labels_2_dataset(
        dict(zip(range(len(list_features)), list_features)),
        dict(zip(range(len(list_labels)), list_labels)),
        balance=feature_balance, drop_labels=[-1])
    features = np.nan_to_num(features)

    logging.debug('train classifier...')
    if len(sizes) > (CROSS_VAL_LEAVE_OUT * 5):
        cv = seg_clf.CrossValidatePSetsOut(sizes,
                                           nb_hold_out=CROSS_VAL_LEAVE_OUT)
    else:
        # for small number of training images leave-P-sets-out CV does not
        # make sense; fall back to plain 10-fold
        cv = 10
    classif, _ = seg_clf.create_classif_train_export(
        clf_name, features, labels, nb_search_iter=nb_classif_search,
        cross_val=cv, nb_jobs=nb_jobs, pca_coef=pca_coef)

    return classif, list_slic, list_features, list_labels
def main_train(params=CENTER_PARAMS):
    """ PIPELINE for training

    0) load triplets or create triplets from path to images, annotations
    1) load precomputed data or compute them now
    2) train classifier with hyper-parameters
    3) perform Leave-One-Out experiment

    :param {str: any} params:
    """
    logging.info('run TRAINING...')
    params = prepare_experiment_folder(params, FOLDER_EXPERIMENT)

    # set up logging, persist parameters and prepare output folders
    tl_expt.set_experiment_logger(params['path_expt'])
    logging.info(tl_expt.string_dict(params, desc='PARAMETERS'))
    with open(os.path.join(params['path_expt'], NAME_JSON_PARAMS), 'w') as f:
        json.dump(params, f)
    tl_expt.create_subfolders(params['path_expt'], LIST_SUBDIRS)

    df_paths, path_csv = load_df_paths(params)

    # load the precomputed dump if present, otherwise compute and save it
    path_dump_data = os.path.join(params['path_expt'], NAME_DUMP_TRAIN_DATA)
    if not os.path.isfile(path_dump_data) or FORCE_RECOMP_DATA:
        (dict_imgs, dict_segms, dict_slics, dict_points, dict_centers,
         dict_features, dict_labels, feature_names) = \
            dataset_load_images_segms_compute_features(params, df_paths,
                                                       params['nb_jobs'])
        assert len(dict_imgs) > 0, 'missing images'
        save_dump_data(path_dump_data, dict_imgs, dict_segms, dict_slics,
                       dict_points, dict_centers, dict_features, dict_labels,
                       feature_names)
    else:
        (dict_imgs, dict_segms, dict_slics, dict_points, dict_centers,
         dict_features, dict_labels, feature_names) = \
            load_dump_data(path_dump_data)

    if is_drawing(params['path_expt']) and EXPORT_TRAINING_DATA:
        export_dataset_visual(params['path_expt'], dict_imgs, dict_segms,
                              dict_slics, dict_points, dict_labels,
                              params['nb_jobs'])

    # concentrate features, labels
    features, labels, sizes = seg_clf.convert_set_features_labels_2_dataset(
        dict_features, dict_labels, drop_labels=[-1],
        balance=params['balance'])
    # remove all bad values from features space
    features[np.isnan(features)] = 0
    features[np.isinf(features)] = -1
    assert np.sum(sizes) == len(labels), \
        'not equal sizes (%d) and labels (%i)' \
        % (int(np.sum(sizes)), len(labels))

    # feature norm & train classification (hyper-parameter search CV first)
    nb_holdout = int(np.ceil(len(sizes) * CROSS_VAL_LEAVE_OUT_SEARCH))
    cv = seg_clf.CrossValidatePSetsOut(sizes, nb_holdout)
    classif, params['path_classif'] = seg_clf.create_classif_train_export(
        params['classif'], features, labels, cross_val=cv, params=params,
        feature_names=feature_names,
        nb_search_iter=params['nb_classif_search'],
        pca_coef=params.get('pca_coef', None), nb_jobs=params['nb_jobs'],
        path_out=params['path_expt'])

    # evaluation CV uses its own (larger) hold-out fraction
    nb_holdout = int(np.ceil(len(sizes) * CROSS_VAL_LEAVE_OUT_EVAL))
    cv = seg_clf.CrossValidatePSetsOut(sizes, nb_holdout)
    seg_clf.eval_classif_cross_val_scores(params['classif'], classif,
                                          features, labels, cross_val=cv,
                                          path_out=params['path_expt'])
    seg_clf.eval_classif_cross_val_roc(params['classif'], classif, features,
                                       labels, cross_val=cv,
                                       path_out=params['path_expt'])

    if RUN_LEAVE_ONE_OUT:
        experiment_loo(classif, dict_imgs, dict_segms, dict_centers,
                       dict_slics, dict_points, dict_features, feature_names)

    logging.info('DONE')
def main_train(params=CENTER_PARAMS):
    """ PIPELINE for training

    0) load triplets or create triplets from path to images, annotations
    1) load precomputed data or compute them now
    2) train classifier with hyper-parameters
    3) perform Leave-One-Out experiment

    :param {str: any} params:
    """
    # NOTE(review): if this module defines `main_train` more than once,
    # this later definition shadows the earlier one - confirm intent.
    logging.info('run TRAINING...')
    # check_paths_patterns(paths)
    if not os.path.exists(params['path_output']):
        assert os.path.isdir(os.path.dirname(params['path_output'])), \
            'missing: %s' % params['path_output']
        logging.debug('creating missing folder: %s', params['path_output'])
        os.mkdir(params['path_output'])

    tl_expt.set_experiment_logger(params['path_output'])
    logging.info(tl_expt.string_dict(params, desc='PARAMETERS'))
    with open(os.path.join(params['path_output'], NAME_JSON_PARAMS), 'w') as f:
        json.dump(params, f)
    tl_expt.create_subfolders(params['path_output'], LIST_SUBDIRS)

    path_csv = os.path.join(params['path_output'], NAME_CSV_TRIPLES)
    if not os.path.isfile(path_csv) or FORCE_RELOAD:
        # FIX: log the file actually being read (was logging path_csv)
        logging.info('loading path pairs "%s"', params['path_list'])
        # FIX: pd.DataFrame.from_csv was removed from pandas;
        # read_csv with index_col=0 & parse_dates=True matches its defaults
        df_paths = pd.read_csv(params['path_list'], index_col=0,
                               parse_dates=True)
        df_paths.to_csv(path_csv)
    else:
        logging.info('loading path pairs "%s"', path_csv)
        df_paths = pd.read_csv(path_csv, index_col=0, parse_dates=True)
        df_paths.index = list(range(len(df_paths)))

    path_dump_data = os.path.join(params['path_output'], NAME_DUMP_TRAIN_DATA)
    if not os.path.isfile(path_dump_data) or FORCE_RECOMP_DATA:
        dict_imgs, dict_segms, dict_slics, dict_points, dict_centers, \
            dict_features, dict_labels, feature_names = \
            dataset_load_images_segms_compute_features(params, df_paths,
                                                       params['nb_jobs'])
        assert len(dict_imgs) > 0, 'missing images'
        save_dump_data(path_dump_data, dict_imgs, dict_segms, dict_slics,
                       dict_points, dict_centers, dict_features, dict_labels,
                       feature_names)
    else:
        dict_imgs, dict_segms, dict_slics, dict_points, dict_centers, \
            dict_features, dict_labels, feature_names = \
            load_dump_data(path_dump_data)

    if is_drawing(params['path_output']) and EXPORT_TRAINING_DATA:
        export_dataset_visual(params['path_output'], dict_imgs, dict_segms,
                              dict_slics, dict_points, dict_labels,
                              params['nb_jobs'])

    # concentrate features, labels
    features, labels, sizes = seg_clf.convert_set_features_labels_2_dataset(
        dict_features, dict_labels, drop_labels=[-1],
        balance=params['balance'])
    # remove all bad values from features space
    features[np.isnan(features)] = 0
    features[np.isinf(features)] = -1
    assert np.sum(sizes) == len(labels), \
        'not equal sizes (%d) and labels (%i)' \
        % (int(np.sum(sizes)), len(labels))

    # feature norm & train classification
    nb_holdout = int(np.ceil(len(sizes) * CROSS_VAL_LEAVE_OUT_SEARCH))
    cv = seg_clf.CrossValidatePSetsOut(sizes, nb_holdout)
    classif, params['path_classif'] = seg_clf.create_classif_train_export(
        params['classif'], features, labels, cross_val=cv, params=params,
        feature_names=feature_names,
        nb_search_iter=params['nb_classif_search'],
        pca_coef=params.get('pca_coef', None), nb_jobs=params['nb_jobs'],
        path_out=params['path_output'])
    nb_holdout = int(np.ceil(len(sizes) * CROSS_VAL_LEAVE_OUT_EVAL))
    cv = seg_clf.CrossValidatePSetsOut(sizes, nb_holdout)
    seg_clf.eval_classif_cross_val_scores(params['classif'], classif,
                                          features, labels, cross_val=cv,
                                          path_out=params['path_output'])
    seg_clf.eval_classif_cross_val_roc(params['classif'], classif, features,
                                       labels, cross_val=cv,
                                       path_out=params['path_output'])

    if RUN_LEAVE_ONE_OUT:
        logging.info('run LOO prediction on training data...')
        # test classif on images
        gener_data = ((name, dict_imgs[name], dict_segms[name],
                       dict_centers[name], dict_slics[name],
                       dict_points[name], dict_features[name],
                       feature_names) for name in dict_imgs)
        wrapper_detection = partial(wrapper_detect_center_candidates,
                                    params=params, classif=classif,
                                    path_output=params['path_output'])
        df_stat = pd.DataFrame()
        tqdm_bar = tqdm.tqdm(total=len(dict_imgs), desc='experiment LOO')
        pool = mproc.Pool(params['nb_jobs'])
        for dict_stat in pool.imap_unordered(wrapper_detection, gener_data):
            # FIX: DataFrame.append was removed in pandas 2.0 - use concat
            df_stat = pd.concat([df_stat, pd.DataFrame([dict_stat])],
                                ignore_index=True)
            # persist the partial statistic after every finished image
            df_stat.to_csv(os.path.join(params['path_output'],
                                        NAME_CSV_STAT_TRAIN))
            tqdm_bar.update()
        pool.close()
        pool.join()
        tqdm_bar.close()  # FIX: progress bar was never closed

        df_stat.set_index(['image'], inplace=True)
        df_stat.to_csv(os.path.join(params['path_output'],
                                    NAME_CSV_STAT_TRAIN))
        logging.info('STATISTIC: \n %s', repr(df_stat.describe().transpose()))

    logging.info('DONE')