from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit


def train_pipeline(df, pipe, clf_params, folds=10):
    """
    trains the  pipeline by the given classifier and predefined dimension reducer parameters.
    :return: tuple of best estimator and best parameters
    """
    features, target = split_features_labels(df)
    # parameters to tune (reducers: PCA, RFE and KBest)
    max_n = features.columns.size
    params = create_pipeline_params(clf_params, max_n)
    # use stratified splits; random_state is assumed to be a module-level seed
    folder = StratifiedShuffleSplit(n_splits=folds, test_size=.3,
                                    train_size=.7, random_state=random_state)
    # use F1 scoring, which balances precision and recall
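    # (F1 = 2 * precision * recall / (precision + recall))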
    grid = GridSearchCV(pipe,
                        cv=folder,
                        n_jobs=4,
                        param_grid=params,
                        scoring='f1')
    grid.fit(features, target)
    return grid.best_estimator_, grid.best_params_
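
# Usage sketch (an assumption, not part of the original project): the step
# names, the classifier and the param grid below are hypothetical stand-ins;
# split_features_labels and create_pipeline_params are project helpers whose
# exact contracts are not shown here.
#
#     from sklearn.decomposition import PCA
#     from sklearn.pipeline import Pipeline
#     from sklearn.tree import DecisionTreeClassifier
#
#     pipe = Pipeline([('reduce_dim', PCA()), ('clf', DecisionTreeClassifier())])
#     clf_params = {'clf__min_samples_split': [2, 4, 8]}
#     best_clf, best_params = train_pipeline(df, pipe, clf_params, folds=10)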
Example #2
from collections import defaultdict
from typing import List, Optional, Tuple

import torch
from sklearn.model_selection import StratifiedShuffleSplit


def run(
    word_params: Optional["CNNParams"],
    char_params: Optional["CNNParams"],
    training_sizes: List[int],
    window_sizes: List[Tuple[int, int]],
    k: int = 5,
    nocluster_dropout: float = 0.5,
    kmeans_path: str = "../clustered",
    gmm_path: str = "../clustered_gmm",
    num_clusters: int = 10,
    num_clusters_gmm: int = 10,
    use_cluster_cnn: bool = False,
    use_only_clusters: bool = False,
    use_bow: bool = False,
) -> Tuple[Results, Results]:
    if not (word_params or char_params):
        print("Need at least one of {word_params, char_params")
        return Results(None, None, None), Results(None, None, None)

    both_models = word_params and char_params

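    # result dicts, filled below as results[window_size][training_size] = per-fold values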
    baseline = defaultdict(dict)
    dbscan = defaultdict(dict)
    gmm = defaultdict(dict)
    char_baseline = defaultdict(dict)
    char_dbscan = defaultdict(dict)
    char_gmm = defaultdict(dict)

    # dropout for the cluster models; falls back to char_params when only the
    # char model is configured
    cluster_dropout = (word_params or char_params).dropout
    if use_cluster_cnn:
        def fn(w, n):
            return lambda r: CNNClusterLabels(r, w, n, cluster_dropout)
    elif use_only_clusters:
        def fn(w, n):
            return lambda r: OnlyClusterLabels(r, n * (sum(w) + 1), cluster_dropout)
    else:
        def fn(w, n):
            return lambda r: CategoricalClusterLabels(r, n * (sum(w) + 1), cluster_dropout)
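    # note: fn is a closure factory, so each returned lambda captures its own
    # (w, n); a bare lambda defined inside the loop below would late-bind and
    # see only the final window_size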

    for training_size in training_sizes:
        for window_size in window_sizes:
            optim_fn = torch.optim.Adam
            model_fns = []

            # per enabled model family: optional no-cluster baseline, then the
            # two cluster models (kept index-aligned with params_list,
            # use_dist_list and result_order below)
            if word_params:
                if nocluster_dropout >= 0:
                    model_fns.append(lambda r: NoClusterLabels(r, nocluster_dropout))
                model_fns += [
                    fn(window_size, num_clusters),
                    fn(window_size, num_clusters_gmm),
                ]

            if char_params:
                if nocluster_dropout >= 0:
                    model_fns.append(lambda r: NoClusterLabels(r, nocluster_dropout))
                model_fns += [
                    fn(window_size, num_clusters),
                    fn(window_size, num_clusters_gmm),
                ]

            dataset, validset, testset = load_dataset(
                kmeans_path, gmm_path, num_clusters, num_clusters_gmm, window_size[0], window_size[1], old_test=True
            )
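            # an integer train_size is an absolute sample count, not a
            # fraction; test_size=None makes the remainder the test split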
            splitter = StratifiedShuffleSplit(
                n_splits=k,
                train_size=training_size,
                test_size=None,
                random_state=100,
            )

            params_list = []
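            # one params entry per model_fn: the optional no-cluster baseline
            # plus the two cluster models for each enabled family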
            multiplier = 3 if nocluster_dropout >= 0 else 2
            params_list += ([word_params] * multiplier) if word_params else []
            params_list += ([char_params] * multiplier) if char_params else []

            use_dist_list: List[bool]
            if nocluster_dropout >= 0:
                use_dist_list = [False, False, True] * (2 if both_models else 1)
            else:
                use_dist_list = [False, True] * (2 if both_models else 1)
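            # one flag per model; only the final (gmm) entry in each family's
            # group has the distribution flag set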

            if use_bow:
                values = cross_val_bow(k, splitter, dataset, testset=testset)
            else:
                values = cross_val(
                    k,
                    splitter,
                    model_fns,
                    use_dist_list,
                    optim_fn,
                    dataset,
                    params=params_list,
                    early_stopping=3,
                    validation_set=validset,
                    batch_size=128,
                    testset=testset,
                )

            result_order = []
            if word_params:
                if nocluster_dropout >= 0:
                    result_order.append(baseline)
                result_order += [dbscan, gmm]
            if char_params:
                if nocluster_dropout >= 0:
                    result_order.append(char_baseline)
                result_order += [char_dbscan, char_gmm]

            if use_bow:
                # special case, override the order
                result_order = [baseline]

            num_iter = len(values[0])
            assert num_iter == len(result_order)

            for i, var in enumerate(result_order):
                var[window_size][training_size] = [v[i] for v in values]

    return (
        Results(baseline, dbscan, gmm),
        Results(char_baseline, char_dbscan, char_gmm),
    )
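
# Usage sketch (an assumption, not from the original project): CNNParams,
# Results and the model/helper functions are project-internal, so only the
# call shape and the layout of the returned results are illustrated here.
#
#     word_results, char_results = run(
#         word_params=CNNParams(...),  # hypothetical constructor arguments
#         char_params=None,
#         training_sizes=[100, 500],
#         window_sizes=[(2, 2), (3, 3)],
#     )
#     # each field of Results nests as [window_size][training_size] -> k per-fold values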