Beispiel #1
0
def do_submission():
    """Extract word/char features for train and test tweets, then write a
    submission file using the ridge predictor."""
    train_df, test_df = load_dataset()
    labels = get_labels(train_df)
    test_ids = get_test_ids(test_df)

    feature_types = ["wordcount", "char"]
    X_train_feat, X_test_feat = get_extracted_features(
        feature_types, train_df["tweet"], test_df["tweet"]
    )

    n_samples, n_features = X_train_feat.shape
    print("n_samples: %d, n_features: %d" % (n_samples, n_features))

    predict_and_sub(X_train_feat, labels.values, X_test_feat, test_ids, predict_ridge)
Beispiel #2
0
def train():
    """Run 3-fold cross-validation on a held-out training split and report
    the average RMSE together with the wall-clock training time."""
    data, _ = load_dataset()
    labels = get_labels(data)
    tweets = data["tweet"]

    n = len(labels)
    X_tr, _, y_tr, _ = train_test_split(
        tweets[:n], labels[:n], test_size=0.2, random_state=1
    )

    started = time()

    rmse_avg = do_cross_val(X_tr, y_tr, ["wordcount", "char"], nfolds=3)
    print("Average RMSE %.6f" % rmse_avg)

    print("training time: %fs" % (time() - started))
Beispiel #3
0
def train_model():
    """Grid-search the ridge pipeline on a training split, scored by RMSE."""
    data, _ = load_dataset()
    labels = get_labels(data)
    tweets = data["tweet"]

    n = len(labels)
    X_tr, _, y_tr, _ = train_test_split(
        tweets[:n], labels[:n], test_size=0.2, random_state=1
    )

    # greater_is_better=False: grid search maximizes, so RMSE is negated internally.
    scorer = make_scorer(rmse_score, greater_is_better=False)

    # Other candidate pipelines tried during experimentation:
    # get_three_predictor_model, get_elasticnet_model, get_three_predictor_model2,
    # get_three_predictor_model3, get_ridge_model2, get_ridge_model3,
    # get_advanced_ridge.
    pipeline, parameters = get_ridge_model()

    do_gridsearch(X_tr, y_tr, pipeline, parameters, scorer)
Beispiel #4
0
def train_final():
    """
    Train the final model and produce a submission.

    Fits word-level and char-level TF-IDF vectorizers on the combined
    train+test corpus, grid-searches a ridge model on a training split
    (scored by RMSE), then predicts the test set and saves the predictions.
    """

    train, test = load_dataset()
    train_X = train["tweet"]
    train_Y = get_labels(train)
    test_X = test["tweet"]

    # Word n-grams (1-3). token_pattern is a raw string: "\w" in a plain
    # string is an invalid escape sequence (SyntaxWarning since Python 3.12).
    tfidf1 = TfidfVectorizer(
        max_df=0.6,
        min_df=0.0000003,
        stop_words="english",
        strip_accents="unicode",
        token_pattern=r"\w{1,}",
        max_features=5000,
        norm="l2",
        use_idf=False,
        smooth_idf=False,
        ngram_range=(1, 3),
    )

    # Character n-grams (1-7).
    tfidf2 = TfidfVectorizer(
        max_df=0.6,
        analyzer="char",
        min_df=0.00001,
        stop_words="english",
        strip_accents="unicode",
        norm="l2",
        max_features=5000,
        ngram_range=(1, 7),
        smooth_idf=False,
        use_idf=False,
    )

    # Fit both vectorizers on train+test combined so the vocabulary also
    # covers test tweets; build the combined corpus only once.
    corpus = np.hstack((train_X, test_X))
    tfidf1.fit(corpus)
    tfidf2.fit(corpus)

    train_X1 = tfidf1.transform(train_X)
    train_X2 = tfidf2.transform(train_X)

    train_X = hstack([train_X1, train_X2]).tocsr()

    n_samples = len(train_Y)

    X_train, _, y_train, _ = train_test_split(train_X[:n_samples], train_Y[:n_samples], test_size=0.2, random_state=1)

    # greater_is_better=False: grid search maximizes, so RMSE is negated internally.
    scorer = make_scorer(rmse_score, greater_is_better=False)

    # Other candidate pipelines tried during experimentation:
    # get_three_predictor_model, get_elasticnet_model, get_three_predictor_model2,
    # get_three_predictor_model3, get_ridge_model2, get_ridge_model3,
    # get_advanced_ridge.
    pipeline, parameters = get_advanced_ridge2()

    best_estimator = do_gridsearch(X_train, y_train, pipeline, parameters, n_jobs=5, verbose=1, scoring=scorer)

    # Predict the test data with the same two feature spaces.
    test_1 = tfidf1.transform(test_X)
    test_2 = tfidf2.transform(test_X)

    # Convert to CSR for consistency with the train-side matrix.
    test_d = hstack([test_1, test_2]).tocsr()

    final_preds = best_estimator.predict(test_d)
    save_prediction_subs(test["id"], final_preds)