Example #1
 def test_classif_pipeline(self):
     """ test the training of classif with expected F1 score close to one """
     data_train, labels_train = generate_data()
     data_test, labels_test = generate_data()
     for n in CLASSIFIER_NAMES:
         logging.info('training classifier: %s', n)
         clf, _ = seg_clf.create_classif_train_export(n, data_train, labels_train)
         self.classif_eval(clf, data_train, labels_train,
                           data_test, labels_test)
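
For reference, a minimal stand-in for the `generate_data()` helper the test relies on; this is only a sketch built on scikit-learn's `make_classification`, and the repository's actual helper may construct the data differently:

# hypothetical stand-in for the generate_data() helper used in the test above
from sklearn.datasets import make_classification

def generate_data(nb_samples=500, nb_features=10):
    """ create an easily separable two-class dataset, so a reasonable
    classifier should reach an F1 score close to one """
    data, labels = make_classification(n_samples=nb_samples,
                                       n_features=nb_features,
                                       n_informative=5,
                                       n_classes=2,
                                       class_sep=2.,
                                       random_state=0)
    return data, labels
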
def load_train_classifier(params, features, labels, feature_names, sizes,
                          nb_holdout):
    """ load a previously exported classifier if one exists, otherwise train
    a new one, then evaluate it with cross-validation

    :param {str: any} params: experiment parameters
    :param ndarray features: feature vectors
    :param ndarray labels: sample labels
    :param [str] feature_names: names of the used features
    :param [int] sizes: numbers of samples per image set
    :param int nb_holdout: number of sets held out in each CV fold
    :return: updated parameters, classifier, path to the exported classifier
    """
    logging.info('train classifier...')
    seg_clf.feature_scoring_selection(features,
                                      labels,
                                      feature_names,
                                      path_out=params['path_exp'])
    cv = seg_clf.CrossValidatePSetsOut(sizes, nb_hold_out=nb_holdout)
    # feature norm & train classification
    fname_classif = seg_clf.TEMPLATE_NAME_CLF.format(params['classif'])
    path_classif = os.path.join(params['path_exp'], fname_classif)
    if os.path.isfile(path_classif) and not FORCE_RETRAIN_CLASSIF:
        logging.info('loading classifier: %s', path_classif)
        params_local = params.copy()
        dict_classif = seg_clf.load_classifier(path_classif)
        classif = dict_classif['clf_pipeline']
        params = dict_classif['params']
        params.update({
            k: params_local[k]
            for k in params_local
            if k.startswith('path_') or k.startswith('gc_')
        })
        logging.debug('loaded PARAMETERS: %s', repr(params))
    else:
        classif, path_classif = seg_clf.create_classif_train_export(
            params['classif'],
            features,
            labels,
            cross_val=cv,
            params=params,
            feature_names=feature_names,
            nb_search_iter=params['nb_classif_search'],
            nb_jobs=params['nb_jobs'],
            pca_coef=params['pca_coef'],
            path_out=params['path_exp'])
    params['path_classif'] = path_classif
    # re-create the CV splitter so the evaluation below runs on fresh splits
    cv = seg_clf.CrossValidatePSetsOut(sizes, nb_hold_out=nb_holdout)
    seg_clf.eval_classif_cross_val_scores(params['classif'],
                                          classif,
                                          features,
                                          labels,
                                          cross_val=cv,
                                          path_out=params['path_exp'])
    seg_clf.eval_classif_cross_val_roc(params['classif'],
                                       classif,
                                       features,
                                       labels,
                                       cross_val=cv,
                                       path_out=params['path_exp'])
    return params, classif, path_classif
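
The load-or-retrain branch above is a caching pattern worth noting: an exported classifier is reused across runs unless `FORCE_RETRAIN_CLASSIF` is set, and only parameters prefixed `path_` or `gc_` are refreshed from the current configuration. A self-contained sketch of the same idea using plain pickle (the names here are illustrative, not the repository's API):

import os
import pickle

def load_or_train(path_clf, train_fn, force_retrain=False):
    # reuse a previously exported classifier when one is on disk,
    # otherwise train a fresh one and persist it for the next run
    if os.path.isfile(path_clf) and not force_retrain:
        with open(path_clf, 'rb') as fp:
            return pickle.load(fp)
    clf = train_fn()
    with open(path_clf, 'wb') as fp:
        pickle.dump(clf, fp)
    return clf
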
Example #3
def train_classif_color2d_slic_features(list_images,
                                        list_annots,
                                        clr_space='rgb',
                                        sp_size=30,
                                        sp_regul=0.2,
                                        dict_features=FTS_SET_SIMPLE,
                                        clf_name=CLASSIF_NAME,
                                        label_purity=0.9,
                                        feature_balance='unique',
                                        pca_coef=None,
                                        nb_classif_search=1,
                                        nb_jobs=1):
    """ train classifier on list of annotated images

    :param [ndarray] list_images: list of input images
    :param [ndarray] list_annots: list of annotation maps
    :param str clr_space: choose the color space
    :param int sp_size: initial size of a superpixel (meaning edge length)
    :param float sp_regul: regularisation in range (0, 1) where "0" gives
           elastic segments and "1" nearly square segments
    :param {str: [str]} dict_features: list of features to be extracted
    :param str clf_name: select the classifier to be used
    :param float label_purity: required purity of sample labels for training
    :param str feature_balance: set how to balance the datasets
    :param float pca_coef: PCA coefficient, or None to skip PCA
    :param int nb_classif_search: number of tries for hyper-parameter search
    :param int nb_jobs: number of parallel jobs
    :return: trained classifier, list of SLIC segmentations,
        list of per-image features, list of per-image labels
    """
    logging.info('TRAIN Superpixels-Features-Classifier')
    assert len(list_images) == len(list_annots)

    list_slic, list_features, list_labels = list(), list(), list()

    mproc_pool = mproc.Pool(nb_jobs)
    logging.debug('run feature extraction in parallel - %i processes', nb_jobs)
    wrapper_compute = partial(wrapper_compute_color2d_slic_features_labels,
                              clr_space=clr_space,
                              sp_size=sp_size,
                              sp_regul=sp_regul,
                              dict_features=dict_features,
                              label_purity=label_purity)
    list_imgs_annot = zip(list_images, list_annots)
    for slic, fts, lbs in mproc_pool.imap_unordered(wrapper_compute,
                                                    list_imgs_annot):
        list_slic.append(slic)
        list_features.append(fts)
        list_labels.append(lbs)
    mproc_pool.close()
    mproc_pool.join()

    # for img, annot in zip(list_images, list_annots):
    #     assert img.shape[:2] == annot.shape[:2]
    #     slic, features = compute_color2d_superpixels_features(img, clr_space,
    #                                                           sp_size, sp_regul,
    #                                                           dict_features,
    #                                                           fts_norm=False)
    #     list_slic.append(slic)
    #     list_features.append(features)
    #
    #     label_hist = seg_lbs.histogram_regions_labels_norm(slic, annot)
    #     labels = np.argmax(label_hist, axis=1)
    #     purity = np.max(label_hist, axis=1)
    #     labels[purity < label_purity] = -1
    #     list_labels.append(labels)

    logging.debug('concatenate features...')
    # concatenate features, labels
    features, labels, sizes = seg_clf.convert_set_features_labels_2_dataset(
        dict(zip(range(len(list_features)), list_features)),
        dict(zip(range(len(list_labels)), list_labels)),
        balance=feature_balance,
        drop_labels=[-1])
    # drop "do not care" labels, which are -1
    features = np.nan_to_num(features)

    logging.debug('train classifier...')
    # clf_pipeline = seg_clf.create_clf_pipeline(clf_name, pca_coef)
    # clf_pipeline.fit(np.array(features), np.array(labels, dtype=int))

    if len(sizes) > (CROSS_VAL_LEAVE_OUT * 5):
        cv = seg_clf.CrossValidatePSetsOut(sizes,
                                           nb_hold_out=CROSS_VAL_LEAVE_OUT)
    else:
        # for a small number of training images this does not make sense
        cv = 10

    classif, _ = seg_clf.create_classif_train_export(
        clf_name,
        features,
        labels,
        nb_search_iter=nb_classif_search,
        cross_val=cv,
        nb_jobs=nb_jobs,
        pca_coef=pca_coef)

    return classif, list_slic, list_features, list_labels
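
A hedged usage sketch for the trainer above, with synthetic inputs standing in for real data (in practice the images and annotations come from loaded files of matching spatial size):

import numpy as np

# toy image whose right half is bright, with a matching two-class annotation
img = np.zeros((64, 64, 3))
img[:, 32:, :] = 1.
annot = np.zeros((64, 64), dtype=int)
annot[:, 32:] = 1

classif, list_slic, list_fts, list_lbs = train_classif_color2d_slic_features(
    [img] * 4, [annot] * 4, sp_size=15, nb_jobs=1)
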
def main_train(params=CENTER_PARAMS):
    """ PIPELINE for training
    0) load triplets or create triplets from paths to images and annotations
    1) load precomputed data or compute them now
    2) train classifier with hyper-parameters
    3) perform Leave-One-Out experiment

    :param {str: any} params: experiment parameters
    """
    logging.info('run TRAINING...')
    params = prepare_experiment_folder(params, FOLDER_EXPERIMENT)

    tl_expt.set_experiment_logger(params['path_expt'])
    logging.info(tl_expt.string_dict(params, desc='PARAMETERS'))

    with open(os.path.join(params['path_expt'], NAME_JSON_PARAMS), 'w') as f:
        json.dump(params, f)

    tl_expt.create_subfolders(params['path_expt'], LIST_SUBDIRS)

    df_paths, path_csv = load_df_paths(params)

    path_dump_data = os.path.join(params['path_expt'], NAME_DUMP_TRAIN_DATA)
    if not os.path.isfile(path_dump_data) or FORCE_RECOMP_DATA:
        dict_imgs, dict_segms, dict_slics, dict_points, dict_centers, \
        dict_features, dict_labels, feature_names = \
            dataset_load_images_segms_compute_features(params, df_paths,
                                                       params['nb_jobs'])
        assert len(dict_imgs) > 0, 'missing images'
        save_dump_data(path_dump_data, dict_imgs, dict_segms, dict_slics, dict_points,
                       dict_centers, dict_features, dict_labels, feature_names)
    else:
        dict_imgs, dict_segms, dict_slics, dict_points, dict_centers, dict_features, \
        dict_labels, feature_names = load_dump_data(path_dump_data)

    if is_drawing(params['path_expt']) and EXPORT_TRAINING_DATA:
        export_dataset_visual(params['path_expt'], dict_imgs, dict_segms, dict_slics,
                              dict_points, dict_labels, params['nb_jobs'])

    # concatenate features, labels
    features, labels, sizes = seg_clf.convert_set_features_labels_2_dataset(
        dict_features, dict_labels, drop_labels=[-1], balance=params['balance'])
    # remove all bad values from the feature space
    features[np.isnan(features)] = 0
    features[np.isinf(features)] = -1
    assert np.sum(sizes) == len(labels), \
        'number of samples (%d) does not match number of labels (%d)' \
        % (int(np.sum(sizes)), len(labels))

    # feature norm & train classification
    nb_holdout = int(np.ceil(len(sizes) * CROSS_VAL_LEAVE_OUT_SEARCH))
    cv = seg_clf.CrossValidatePSetsOut(sizes, nb_holdout)
    classif, params['path_classif'] = seg_clf.create_classif_train_export(
        params['classif'], features, labels, cross_val=cv, params=params,
        feature_names=feature_names, nb_search_iter=params['nb_classif_search'],
        pca_coef=params.get('pca_coef', None), nb_jobs=params['nb_jobs'],
        path_out=params['path_expt'])
    nb_holdout = int(np.ceil(len(sizes) * CROSS_VAL_LEAVE_OUT_EVAL))
    cv = seg_clf.CrossValidatePSetsOut(sizes, nb_holdout)
    seg_clf.eval_classif_cross_val_scores(params['classif'], classif, features, labels,
                                          cross_val=cv, path_out=params['path_expt'])
    seg_clf.eval_classif_cross_val_roc(params['classif'], classif, features, labels,
                                       cross_val=cv, path_out=params['path_expt'])

    if RUN_LEAVE_ONE_OUT:
        experiment_loo(classif, dict_imgs, dict_segms, dict_centers, dict_slics,
                       dict_points, dict_features, feature_names)

    logging.info('DONE')
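
Both the hyper-parameter search and the evaluation above rely on `CrossValidatePSetsOut`, which apparently splits by whole image sets: the `sizes` vector gives the number of samples per set, and each fold holds out all samples of `nb_hold_out` sets, so superpixels from one image never appear in both train and test. A minimal sketch of that leave-p-sets-out idea, assuming `sizes` lists consecutive per-set sample counts (illustrative, not the repository's implementation):

import numpy as np

def leave_p_sets_out(sizes, nb_hold_out):
    """ yield (train_idx, test_idx) pairs, holding out whole sample sets """
    starts = np.concatenate(([0], np.cumsum(sizes)))  # block boundaries
    all_idx = np.arange(starts[-1])
    for i in range(0, len(sizes), nb_hold_out):
        held = range(i, min(i + nb_hold_out, len(sizes)))
        test = np.concatenate([np.arange(starts[j], starts[j + 1])
                               for j in held])
        yield np.setdiff1d(all_idx, test), test

# e.g. three images with 4, 2 and 3 samples, holding one image out per fold
for train_idx, test_idx in leave_p_sets_out([4, 2, 3], nb_hold_out=1):
    print(train_idx, test_idx)
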
def main_train(params=CENTER_PARAMS):
    """ PIPELINE for training
    0) load triplets or create triplets from paths to images and annotations
    1) load precomputed data or compute them now
    2) train classifier with hyper-parameters
    3) perform Leave-One-Out experiment

    :param {str: any} params: experiment parameters
    """
    logging.info('run TRAINING...')

    # check_paths_patterns(paths)
    if not os.path.exists(params['path_output']):
        assert os.path.isdir(os.path.dirname(params['path_output'])), \
            'missing: %s' % params['path_output']
        logging.debug('creating missing folder: %s', params['path_output'])
        os.mkdir(params['path_output'])

    tl_expt.set_experiment_logger(params['path_output'])
    logging.info(tl_expt.string_dict(params, desc='PARAMETERS'))

    with open(os.path.join(params['path_output'], NAME_JSON_PARAMS), 'w') as f:
        json.dump(params, f)

    tl_expt.create_subfolders(params['path_output'], LIST_SUBDIRS)

    path_csv = os.path.join(params['path_output'], NAME_CSV_TRIPLES)
    if not os.path.isfile(path_csv) or FORCE_RELOAD:
        # df_paths = find_match_images_segms_centers(params['path_images'],
        #                                            params['path_segms'],
        #                                            params['path_centers'])
        logging.info('loading path pairs "%s"', params['path_list'])
        df_paths = pd.read_csv(params['path_list'], index_col=0)
        df_paths.to_csv(path_csv)
    else:
        logging.info('loading path pairs "%s"', path_csv)
        df_paths = pd.read_csv(path_csv, index_col=0)
    df_paths.index = list(range(len(df_paths)))

    path_dump_data = os.path.join(params['path_output'], NAME_DUMP_TRAIN_DATA)
    if not os.path.isfile(path_dump_data) or FORCE_RECOMP_DATA:
        dict_imgs, dict_segms, dict_slics, dict_points, dict_centers, \
        dict_features, dict_labels, feature_names = \
            dataset_load_images_segms_compute_features(params, df_paths, params['nb_jobs'])
        assert len(dict_imgs) > 0, 'missing images'
        save_dump_data(path_dump_data, dict_imgs, dict_segms, dict_slics,
                       dict_points, dict_centers, dict_features, dict_labels,
                       feature_names)
    else:
        dict_imgs, dict_segms, dict_slics, dict_points, dict_centers, dict_features, \
        dict_labels, feature_names = load_dump_data(path_dump_data)

    if is_drawing(params['path_output']) and EXPORT_TRAINING_DATA:
        export_dataset_visual(params['path_output'], dict_imgs, dict_segms,
                              dict_slics, dict_points, dict_labels,
                              params['nb_jobs'])

    # concatenate features, labels
    features, labels, sizes = seg_clf.convert_set_features_labels_2_dataset(
        dict_features,
        dict_labels,
        drop_labels=[-1],
        balance=params['balance'])
    # remove all bad values from the feature space
    features[np.isnan(features)] = 0
    features[np.isinf(features)] = -1
    assert np.sum(sizes) == len(labels), \
        'number of samples (%d) does not match number of labels (%d)' \
        % (int(np.sum(sizes)), len(labels))

    # feature norm & train classification
    nb_holdout = int(np.ceil(len(sizes) * CROSS_VAL_LEAVE_OUT_SEARCH))
    cv = seg_clf.CrossValidatePSetsOut(sizes, nb_holdout)
    classif, params['path_classif'] = seg_clf.create_classif_train_export(
        params['classif'],
        features,
        labels,
        cross_val=cv,
        params=params,
        feature_names=feature_names,
        nb_search_iter=params['nb_classif_search'],
        pca_coef=params.get('pca_coef', None),
        nb_jobs=params['nb_jobs'],
        path_out=params['path_output'])
    nb_holdout = int(np.ceil(len(sizes) * CROSS_VAL_LEAVE_OUT_EVAL))
    cv = seg_clf.CrossValidatePSetsOut(sizes, nb_holdout)
    seg_clf.eval_classif_cross_val_scores(params['classif'],
                                          classif,
                                          features,
                                          labels,
                                          cross_val=cv,
                                          path_out=params['path_output'])
    seg_clf.eval_classif_cross_val_roc(params['classif'],
                                       classif,
                                       features,
                                       labels,
                                       cross_val=cv,
                                       path_out=params['path_output'])

    if RUN_LEAVE_ONE_OUT:
        logging.info('run LOO prediction on training data...')
        # test classif on images
        gener_data = ((name, dict_imgs[name], dict_segms[name],
                       dict_centers[name], dict_slics[name], dict_points[name],
                       dict_features[name], feature_names)
                      for name in dict_imgs)
        wrapper_detection = partial(wrapper_detect_center_candidates,
                                    params=params,
                                    classif=classif,
                                    path_output=params['path_output'])
        df_stat = pd.DataFrame()
        tqdm_bar = tqdm.tqdm(total=len(dict_imgs), desc='experiment LOO')
        pool = mproc.Pool(params['nb_jobs'])
        for dict_stat in pool.imap_unordered(wrapper_detection, gener_data):
            df_stat = pd.concat([df_stat, pd.DataFrame([dict_stat])],
                                ignore_index=True)
            df_stat.to_csv(
                os.path.join(params['path_output'], NAME_CSV_STAT_TRAIN))
            tqdm_bar.update()
        pool.close()
        pool.join()

        df_stat.set_index(['image'], inplace=True)
        df_stat.to_csv(os.path.join(params['path_output'],
                                    NAME_CSV_STAT_TRAIN))
        logging.info('STATISTIC: \n %s', repr(df_stat.describe().transpose()))

    logging.info('DONE')
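
The LOO loop above rewrites the statistics CSV after every finished image, so partial results survive an interrupted run. A minimal sketch of the same accumulate-and-checkpoint pattern, with a hypothetical per-image worker and output path (`evaluate_item` and 'stat_partial.csv' are placeholders, not the repository's names):

import multiprocessing as mproc
import pandas as pd

def evaluate_item(name):
    # hypothetical worker; the real one runs detection on a single image
    return {'image': name, 'f1': 0.9}

if __name__ == '__main__':
    names = ['img_%i' % i for i in range(5)]
    rows = []
    with mproc.Pool(2) as pool:
        for stat in pool.imap_unordered(evaluate_item, names):
            rows.append(stat)
            # checkpoint partial results after each finished item
            pd.DataFrame(rows).to_csv('stat_partial.csv', index=False)
    df_stat = pd.DataFrame(rows).set_index('image')
    print(df_stat.describe())
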