Exemple #1
0
def gen_data_for_stacking(clf, X, y, X_test, n_splits=5, random_state=None):
    """Generate out-of-fold predictions of a single model for stacking.

    The training data is split with a stratified K-fold. For every fold the
    model is refit on the training part, the held-out part receives its
    out-of-fold predictions, and the test-set probabilities are accumulated
    so they can be averaged over all folds afterwards.

    Args:
        clf: single model; must provide ``fit`` and ``predict``
        X: original X, must support indexing with integer index arrays
        y: original y, must support indexing with integer index arrays
        X_test: original X_test
        n_splits: n_splits for skf
        random_state: random_state for skf; ``None`` disables shuffling,
            any other value (including 0) shuffles with that seed

    Returns:
        tuple ``(y_pred_proba, y, y_test_pred_proba)``: the out-of-fold
        probabilities for X with shape ``(X.shape[0], N_CLASSES)``, the
        unchanged y, and the fold-averaged probabilities for X_test with
        shape ``(X_test.shape[0], N_CLASSES)``
    """
    # Decide on shuffling by "is a seed given", not by truthiness: with
    # bool(random_state) a seed of 0 meant shuffle=False plus random_state=0,
    # a combination recent scikit-learn versions reject with a ValueError.
    skf = StratifiedKFold(n_splits=n_splits,
                          shuffle=random_state is not None,
                          random_state=random_state)

    n_samples = X.shape[0]
    y_pred = np.zeros((n_samples,))  # hard labels, only used for printing scores
    y_pred_proba = np.zeros((n_samples, N_CLASSES))
    y_test_pred_proba = np.zeros((X_test.shape[0], N_CLASSES))

    for fold, (train_index, cv_index) in enumerate(skf.split(X, y), start=1):
        X_train, X_cv = X[train_index], X[cv_index]
        y_train, y_cv = y[train_index], y[cv_index]
        clf.fit(X_train, y_train)

        # fill only the rows of the held-out part of this fold
        y_pred[cv_index] = clf.predict(X_cv)
        y_pred_proba[cv_index] = predict_proba(clf, X_cv)
        print("%d/%d cv macro f1 :" % (fold, n_splits),
              f1_score(y_cv, y_pred[cv_index], average='macro'))

        # summed here, divided by n_splits after the loop
        y_test_pred_proba += predict_proba(clf, X_test)

    # overall score over all out-of-fold predictions
    print("macro f1:", f1_score(y, y_pred, average='macro'))

    y_test_pred_proba /= n_splits  # average of the per-fold test predictions
    return y_pred_proba, y, y_test_pred_proba
Exemple #2
0
def train_and_gen_result(clf,
                         X,
                         y,
                         X_test,
                         use_proba=False,
                         save_url=None,
                         n_splits=1,
                         random_state=None):
    """Train ``clf`` and build a result frame with its test predictions.

    With ``n_splits`` greater than 1 the classifier is refit on the training
    part of every fold and the test probabilities are averaged over the
    folds; otherwise it is fit once on all of X.

    Args:
        clf: classifier
        X: vectorized data
        y: target
        X_test: test data
        use_proba: predict probabilities of labels instead of label
        save_url: url to save the result file, falsy to skip saving
        n_splits: n_splits for K-fold, None (or any value <= 1) to not use
            k-fold
        random_state: random_state for K-fold; ``None`` disables shuffling,
            any other value (including 0) shuffles with that seed

    Returns:
        pandas.DataFrame with either one ``class_prob_<i>`` column per class
        (``use_proba`` true) or a single ``class`` column.
    """
    # The documented "None to not use k-fold" used to raise a TypeError on
    # the ``n_splits > 1`` comparison, so test for None first.
    use_kfold = n_splits is not None and n_splits > 1

    if use_kfold:
        # shuffle whenever a seed is given; bool(random_state) mishandled
        # seed 0 (shuffle=False together with random_state=0)
        slf = StratifiedKFold(n_splits=n_splits,
                              shuffle=random_state is not None,
                              random_state=random_state)
        y_pred_proba = np.zeros((X_test.shape[0], N_CLASSES))
        # A constant dummy target is passed to split(), so the real labels
        # never influence how the folds are formed; the held-out indices are
        # not needed because only test predictions are collected.
        for train_index, _ in slf.split(X, np.zeros((len(y), ))):
            X_train = X[train_index]
            y_train = y[train_index]
            clf.fit(X_train, y_train)
            y_pred_proba += predict_proba(clf, X_test, X_train, y_train)
        y_pred_proba /= n_splits  # average over folds
        # assumes class labels are 1..N_CLASSES in column order -- TODO confirm
        y_pred = y_pred_proba.argmax(axis=1) + 1
        # TODO: generate multi-label output correctly and evaluate it properly
    else:
        clf.fit(X, y)
        y_pred_proba = predict_proba(clf, X_test, X, y)
        y_pred = clf.predict(X_test)

    if use_proba:
        proba_columns = ['class_prob_' + str(i + 1) for i in range(N_CLASSES)]
        result_df = pd.DataFrame(y_pred_proba, columns=proba_columns)
    else:
        result_df = pd.DataFrame(y_pred, columns=['class'])

    if save_url:
        result_df.to_csv(save_url, index_label='id')
    return result_df
Exemple #3
0
def gen_multi_data_for_stacking(n_splits=5, random_state=233):
    """Generate stacking data from a one-vs-rest model fit on single-subject rows.

    A ``OneVsRestClassifier(LogisticRegression)`` is cross-validated on the
    rows of the training csv that have exactly one subject. Those rows get
    out-of-fold probabilities; the rows with several subjects and the test
    rows get the average of the per-fold predictions.

    Args:
        n_splits: number of folds of the stratified K-fold
        random_state: seed of the K-fold; ``None`` disables shuffling, any
            other value (including 0) shuffles with that seed

    Returns:
        tuple ``(y_proba, y, y_test_proba)``: probabilities for every row of
        the training csv, the binarized multi-label target of every row, and
        the fold-averaged probabilities for the test rows.
    """
    clf = OneVsRestClassifier(LogisticRegression(solver='liblinear'))
    X_one, _, X_test = joblib.load(from_project_root('data/vector/stacked_one_XyX_test_32_subjects.pk'))
    _, _, X_multi = joblib.load(from_project_root('data/vector/stacked_one_XyX_multi_32_subjects.pk'))

    train_df = pd.read_csv(from_project_root("data/train_2_ex.csv"))
    # shuffle whenever a seed is given; bool(random_state) mishandled seed 0
    # (shuffle=False together with random_state=0)
    skf = StratifiedKFold(n_splits=n_splits,
                          shuffle=random_state is not None,
                          random_state=random_state)
    y = MultiLabelBinarizer().fit_transform(train_df['subjects'].apply(str.split))
    # Width of every probability matrix; the original code hard-coded 10 here.
    n_labels = y.shape[1]

    one_ind = train_df['n_subjects'] == 1
    multi_ind = train_df['n_subjects'] > 1
    # single-subject rows hold one integer label, used only to stratify folds
    y_for_split = train_df['subjects'][one_ind].values.astype(int)
    y_one = y[one_ind]

    y_proba = np.zeros((len(train_df), n_labels))
    y_pred_one = np.zeros((X_one.shape[0], n_labels))  # for printing score of each fold
    y_proba_one = np.zeros((X_one.shape[0], n_labels))
    y_test_proba = np.zeros((X_test.shape[0], n_labels))
    y_proba_multi = np.zeros((X_multi.shape[0], n_labels))

    for fold, (train_index, cv_index) in enumerate(skf.split(X_one, y_for_split), start=1):
        X_train, X_cv = X_one[train_index], X_one[cv_index]
        y_train, y_cv = y_one[train_index], y_one[cv_index]
        clf.fit(X_train, y_train)

        # out-of-fold predictions for the held-out single-subject rows
        y_pred_one[cv_index] = clf.predict(X_cv)
        y_proba_one[cv_index] = predict_proba(clf, X_cv)
        print("%d/%d cv micro f1 :" % (fold, n_splits),
              f1_score(y_cv, y_pred_one[cv_index], average='micro'))

        # summed per fold, averaged after the loop
        y_test_proba += predict_proba(clf, X_test)
        y_proba_multi += predict_proba(clf, X_multi)
    print("micro f1:", f1_score(y_one, y_pred_one, average='micro'))  # calc micro_f1 score

    y_test_proba /= n_splits  # avg
    y_proba_multi /= n_splits  # avg

    # scatter both groups back to their row positions in the training csv
    y_proba[one_ind] = y_proba_one
    y_proba[multi_ind] = y_proba_multi

    print(y_proba.shape, y.shape, y_test_proba.shape)
    return y_proba, y, y_test_proba
Exemple #4
0
def validate(pkl_url=None, cv=5, evaluating=False):
    """Cross-validate every classifier from ``init_clfs`` and optionally evaluate it.

    Args:
        pkl_url: load data from pickle file, set to None to generate data instantly
        cv: cross validation setting forwarded to ``validate_clf``; a falsy
            value skips cross validation, a bare ``True`` uses 5 folds
        evaluating: whether to do evaluating on test_gold

    """
    clfs = init_clfs()
    val_url = from_project_root("data/preliminary/test_gold_ex.csv")
    if pkl_url is not None:
        # load from pickle
        print("loading data from", pkl_url)
        X, y, X_val = joblib.load(pkl_url)
    else:
        train_url = from_project_root("data/preliminary/train_ex.csv")
        # generate from original csv
        X, y, X_val = generate_vectors(train_url,
                                       val_url,
                                       column='article',
                                       max_n=3,
                                       min_df=3,
                                       max_df=0.8,
                                       max_features=20000,
                                       trans_type='dc',
                                       sublinear_tf=True,
                                       balanced=True,
                                       multilabel_out=False,
                                       label_col='subjects',
                                       only_single=True,
                                       shuffle=True)

    # ``cv`` used to be ignored in favour of a hard-coded 5. It is forwarded
    # now; a bare True flag is mapped to those 5 folds because bool is an int
    # subclass and would otherwise be read as a fold count of 1.
    n_folds = 5 if cv is True else cv

    print("data shapes:\n", X.shape, y.shape, X_val.shape)
    for name, clf in clfs.items():
        if len(y.shape) > 1:
            # 2-D target: wrap the classifier so it is fit once per column
            clf = OneVsRestClassifier(clf)
        if n_folds:
            print("cross validation on %s is running" % name)
            validate_clf(clf, X, y, cv=n_folds, scoring='f1_micro')
        if evaluating:
            print("metrics of %s classifier:" % name)
            clf.fit(X, y)
            # gold columns "0".."9" become booleans via ``< 2``
            # NOTE(review): meaning of the threshold 2 is not visible here
            gold_columns = [str(i) for i in range(10)]
            y_true = pd.read_csv(val_url, usecols=gold_columns).values < 2
            y_pred = clf.predict(X_val)
            y_probas = predict_proba(clf, X_val)
            calc_metrics(y_true, y_pred, y_probas)
Exemple #5
0
def gen_10bi_result(train_url, test_url, validating=False, evaluating=False):
    """Fit one LinearSVC per subject and stack the per-subject outputs column-wise.

    For each of the 10 subject ids a pre-built ``(X, y, X_test)`` triple is
    loaded from a pickle file, a fresh ``LinearSVC`` is fit on it, and its
    test predictions are appended as one more column.

    Args:
        train_url: url of csv train data (not read by this function)
        test_url: url of csv test data
        validating: whether to run ``validate_clf`` before fitting
        evaluating: whether to do evaluating on test_gold

    Returns:
        tuple ``(y_pred, y_probas)``: the stacked predictions and the stacked
        probabilities, one column per subject

    """
    n_rows = len(pd.read_csv(test_url)['content_id'])
    # start from zero-width matrices and grow them by one column per subject
    stacked_probas = np.empty(shape=(n_rows, 0))
    stacked_preds = np.empty(shape=(n_rows, 0), dtype=int)

    for subject_id in range(10):
        # NOTE(review): these pickles were presumably produced by
        # generate_vectors(..., only_single=False,
        #                  apply_fun=lambda label: str(col) in label)
        # -- confirm against the data preparation script.
        vector_url = from_project_root(
            "data/vector/stacked_all_XyX_val_32_%d.pk" % subject_id)
        X, y, X_test = joblib.load(vector_url)

        svc = LinearSVC()
        print("running on subject %s" % id2sub(subject_id))
        if validating:
            validate_clf(svc, X, y, scoring='f1')
        svc.fit(X, y)

        # keep only the column at index 1, as a 2-D slice
        # (presumably the positive class -- TODO confirm)
        proba_column = predict_proba(svc, X_test)[:, 1:2]
        stacked_probas = np.hstack((stacked_probas, proba_column))

        pred_column = svc.predict(X_test).reshape(-1, 1)
        stacked_preds = np.hstack((stacked_preds, pred_column))

    if not evaluating:
        return stacked_preds, stacked_probas

    # gold columns "0".."9" of the test csv become booleans via ``< 2``
    gold_columns = list(map(str, range(10)))
    y_true = pd.read_csv(test_url, usecols=gold_columns).values < 2
    calc_metrics(y_true, stacked_preds, stacked_probas)
    return stacked_preds, stacked_probas