Ejemplo n.º 1
0
def rf_model(train, target, test, text_train_tfidf, text_test_tfidf):
    """Fit a random forest on engineered features plus two text metafeatures.

    Pipeline, in order:
      1. Join the "Title" and "BodyMarkdown" columns with ". " to get one
         text per row, then build a word2vec model and word2vec features
         from those texts via the ``w2v`` helpers.
      2. Build the feature tables with ``fe.extract_features``,
         ``fe.categories_to_counters`` and ``fe.transform_features``.
      3. Add two metafeature columns: "LinearModelText" (from
         ``mf.linear_model_as_feature``) and "w2vModelRFText" (from
         ``mf.w2v_model_as_feature`` with ``model_to_train="rf"``).
      4. Standardize with a scaler fitted on the train features only.
      5. Hand a RandomForestClassifier to ``make_predictions`` and save the
         result together with ``test["PostId"]`` through ``io.save_result``.

    Args:
        train: Table indexable by "Title" and "BodyMarkdown"
            (presumably a pandas DataFrame -- confirm against caller).
        target: Training labels, forwarded to the feature helpers and to
            ``make_predictions``.
        test: Table with the same text columns plus "PostId".
        text_train_tfidf: Text representation of the train rows passed to
            the linear metafeature (presumably a TF-IDF matrix -- confirm).
        text_test_tfidf: Same representation for the test rows.

    Returns:
        Whatever ``make_predictions`` returns for the test rows.
    """
    # One text per post: title and body joined by ". ".
    train_text = train["Title"].values + ". " + train["BodyMarkdown"].values
    test_text = test["Title"].values + ". " + test["BodyMarkdown"].values

    print("Creating word2vec model...")
    w2v.make_word2vec_model(train_text, test_text)
    w2v_train, w2v_test = w2v.word2vec_features(train_text, test_text, load=False)

    # Hand-crafted features; train is processed before test.
    features_train = fe.extract_features(train)
    features_test = fe.extract_features(test)
    features_train, features_test = fe.categories_to_counters(
        features_train, features_test, target
    )
    features_train, features_test = fe.transform_features(features_train, features_test)

    print("Creating linear model metafeature...")
    linear_train, linear_test = mf.linear_model_as_feature(
        text_train_tfidf, target, text_test_tfidf, load=False
    )
    features_train["LinearModelText"] = linear_train
    features_test["LinearModelText"] = linear_test

    print("Creating word2vec model metafeature...")
    w2v_meta_train, w2v_meta_test = mf.w2v_model_as_feature(
        w2v_train, target, w2v_test, load=False, model_to_train="rf"
    )
    features_train["w2vModelRFText"] = w2v_meta_train
    features_test["w2vModelRFText"] = w2v_meta_test

    # Scaling statistics come from the train split only and are reused on test.
    standardizer = sklearn.preprocessing.StandardScaler()
    features_train = standardizer.fit_transform(features_train)
    features_test = standardizer.transform(features_test)

    forest_params = {
        "criterion": "entropy",
        "max_depth": 14,
        "n_estimators": 2000,
        "min_samples_leaf": 4,
        "min_samples_split": 16,
        "n_jobs": 4,
        "random_state": 1234,
    }
    forest = sklearn.ensemble.RandomForestClassifier(**forest_params)

    predictions = make_predictions(forest, features_train, target, features_test)
    io.save_result(test["PostId"], predictions)

    return predictions
Ejemplo n.º 2
0
def linear_model(train, target, test, text_train_tfidf, text_test_tfidf):
    """Fit a logistic regression on text features stacked with dense features.

    Pipeline, in order:
      1. Build the feature tables with ``fe.extract_features``,
         ``fe.categories_to_counters`` and ``fe.transform_features``.
      2. Load two precomputed arrays from "w2v/word2vec_feature_train" and
         "w2v/word2vec_feature_test" and append them as extra columns.
      3. Standardize the dense block with a scaler fitted on train only.
      4. Horizontally stack the text matrices (left) with the scaled dense
         block (right) into CSR matrices.
      5. Hand a LogisticRegression (C=0.7, L2 penalty) to
         ``make_predictions`` and save the result together with
         ``test["PostId"]`` through ``io.save_result``.

    Args:
        train: Training table consumed by ``fe.extract_features``.
        target: Training labels, forwarded to ``fe.categories_to_counters``
            and ``make_predictions``.
        test: Test table; must also provide a "PostId" column.
        text_train_tfidf: Matrix of text features for the train rows
            (presumably TF-IDF, going by the name -- confirm).
        text_test_tfidf: Matching matrix for the test rows.

    Returns:
        Whatever ``make_predictions`` returns for the test rows.
    """
    dense_train = fe.extract_features(train)
    dense_test = fe.extract_features(test)
    dense_train, dense_test = fe.categories_to_counters(dense_train, dense_test, target)
    dense_train, dense_test = fe.transform_features(dense_train, dense_test)

    # Precomputed features stored on disk (presumably word2vec-based,
    # judging by the file names -- confirm).
    extra_train = np.load("w2v/word2vec_feature_train")
    extra_test = np.load("w2v/word2vec_feature_test")

    dense_train = np.column_stack((dense_train.values, extra_train))
    dense_test = np.column_stack((dense_test.values, extra_test))

    # Only the dense block is standardized; the text matrices are left as-is.
    standardizer = sklearn.preprocessing.StandardScaler()
    dense_train = standardizer.fit_transform(dense_train)
    dense_test = standardizer.transform(dense_test)

    # Text columns first, dense columns after them.
    train_blocks = (text_train_tfidf, dense_train)
    design_train = scipy.sparse.hstack(train_blocks, format="csr")
    test_blocks = (text_test_tfidf, dense_test)
    design_test = scipy.sparse.hstack(test_blocks, format="csr")

    classifier = sklearn.linear_model.LogisticRegression(C=0.7, penalty="l2")

    predictions = make_predictions(classifier, design_train, target, design_test)
    io.save_result(test["PostId"], predictions)

    return predictions