Example #1
0
def multi_label_crf(
        labels,
        df_train,
        df_valid,
        df_test,
        binary,
        connectivity='full',
        vectorizer_method='count'
    ):
    """
    Train a multi-label structured SVM (CRF over the label graph) on the
    'terms' column of a training frame and evaluate it on a validation frame
    and, optionally, a test frame.

    A text vectoriser is fitted on ``df_train['terms']``, features are built
    through ``prep.select_features`` / ``prep.transform_features``, the
    targets are binarised against ``labels`` with ``prep.binarise_labels``,
    and a ``OneSlackSSVM`` wrapping a ``MultiLabelClf`` is fitted on the
    dense training matrix.

    @param labels: Sequence of label identifiers; its length is the number of
        label nodes in the model and it is passed to ``prep.binarise_labels``.
    @param df_train: Training frame; must provide the 'terms' and 'label'
        columns.
    @param df_valid: Validation frame with the same columns.
    @param df_test: Optional test frame. It is only evaluated when it is a
        ``pd.DataFrame``; any other value (e.g. ``None``) skips the test pass.
    @param binary: Forwarded as the ``binary`` argument of the vectoriser.
    @param connectivity: Label-graph structure.
        'full' -> an edge between every pair of labels, 'ad3' inference.
        'tree' -> edges from ``chow_liu_tree(y_train)``, 'max-product'
        inference.
        anything else -> no edges, 'unary' inference.
    @param vectorizer_method: 'tf-idf' or 'count'.

    @return: The validation ``Statistics`` container, or the tuple
        ``(validation_stats, test_stats)`` when ``df_test`` is a DataFrame.

    @raise TypeError: If ``vectorizer_method`` is not 'tf-idf' or 'count'.
        (Kept as TypeError for backwards compatibility with existing callers.)
    """
    # -------------------- VECTORISER -------------------- #
    # Resolved first so that an unsupported method fails before any work is done.
    # NOTE(review): the two branches use different stop words ('go:' vs 'go');
    # looks unintentional, but kept as-is -- confirm which one is wanted.
    if vectorizer_method == 'tf-idf':
        vectorizer_node = TfidfVectorizer(
            stop_words=['go:', '', ' '], binary=binary, lowercase=True, sublinear_tf=False, max_df=1.0, min_df=0)
    elif vectorizer_method == 'count':
        vectorizer_node = CountVectorizer(stop_words=['go', '', ' '], binary=binary, lowercase=True)
    else:
        raise TypeError(
            "Unsupported vectorizer_method {!r}; expected 'tf-idf' or 'count'.".format(vectorizer_method)
        )
    vectorizer_node.fit(df_train['terms'].values)

    # No feature selection is requested (select_method=None below); these are
    # the same values the original passed for both vectoriser methods.
    alpha = None
    percentile = 100

    stats_container_valid = Statistics()
    stats_container_test = Statistics()
    has_test = isinstance(df_test, pd.DataFrame)
    n_labels = len(labels)

    # -------------------- FEATURES/TARGETS -------------------- #
    x_node_train, y_train, feature_names, selector_node = prep.select_features(
        df=df_train,
        vectorizer=vectorizer_node,
        feature_col='terms',
        label_col='label',
        select_method=None,
        continuous_col=[],
        alpha=alpha,
        percentile=percentile
    )
    x_node_valid, y_valid = prep.transform_features(
        df=df_valid,
        vectorizer=vectorizer_node,
        selector=selector_node,
        feature_col='terms',
        label_col='label',
        continuous_cols=[]
    )

    y_train = np.asarray([prep.binarise_labels(x, labels) for x in y_train], dtype=int)
    y_valid = np.asarray([prep.binarise_labels(x, labels) for x in y_valid], dtype=int)

    # -------------------- LABEL GRAPH -------------------- #
    # The first branch used to test `== 'full' or == 'tree'`, which made the
    # 'tree' branch unreachable; each connectivity now gets its own model.
    if connectivity == 'full':
        edges = np.vstack(list(itertools.combinations(range(n_labels), 2)))
        model = MultiLabelClf(n_labels=n_labels, edges=edges, inference_method='ad3')
    elif connectivity == 'tree':
        edges = chow_liu_tree(y_train)
        model = MultiLabelClf(n_labels=n_labels, edges=edges, inference_method='max-product')
    else:
        edges = None
        model = MultiLabelClf(n_labels=n_labels, edges=edges, inference_method='unary')

    # The learner is fed dense arrays, so the vectoriser output is densified.
    x_train = x_node_train.toarray()
    x_valid = x_node_valid.toarray()

    # -------------------- MAKE THE ESTIMATOR -------------------- #
    # NOTE(review): max_iter=2 is very low for a cutting-plane SSVM -- confirm
    # this is intentional and not a leftover debugging value.
    estimator = OneSlackSSVM(model, max_iter=2, tol=0.001, n_jobs=1)

    # -------------------- LEARN/STATS -------------------- #
    estimator.fit(x_train, y_train)
    stats_container_valid.merge(evaluate_crf_model(x_valid, y_valid, estimator, labels))

    # -------------------- RETURN -------------------- #
    if not has_test:
        return stats_container_valid

    x_node_test, y_test = prep.transform_features(
        df=df_test,
        vectorizer=vectorizer_node,
        selector=selector_node,
        feature_col='terms',
        label_col='label',
        continuous_cols=[]
    )
    y_test = np.asarray([prep.binarise_labels(x, labels) for x in y_test], dtype=int)
    x_test = x_node_test.toarray()
    stats_container_test.merge(evaluate_crf_model(x_test, y_test, estimator, labels))
    return stats_container_valid, stats_container_test
Example #2
0
    df_train, df_hprd = prep.prep_data_frames(selection)

    vectorizer = CountVectorizer(stop_words=['go', '', ' '], binary=False, lowercase=True)
    vectorizer.fit(df_train['terms'].values)

    print("Transforming features...")
    x_train, y_train, feature_names, selector = prep.select_features(
        df = df_train,
        vectorizer=vectorizer,
        feature_col='terms',
        label_col='label',
        continuous_col=['sim'],
        alpha=None,
        percentile=100
    )
    y_train = np.asarray([prep.binarise_labels(y, labels) for y in y_train])

    x_hprd, y_hprd = prep.transform_features(
        df = df_hprd,
        vectorizer=vectorizer,
        selector=selector,
        feature_col='terms',
        label_col='label',
        continuous_cols=['sim']
    )
    y_hprd = np.asarray([prep.binarise_labels(y, labels) for y in y_hprd])

    x_test, y_test = prep.transform_features(
        df = df_test,
        vectorizer=vectorizer,
        selector=selector,