def train_kaggle(dataset, alg="rig", data="bow"):
    """Train a linear classifier on the Kaggle dataset and save test predictions.

    Parameters
    ----------
    dataset : tuple
        ``(train_x, train_y, test_x)`` feature matrices / labels.
    alg : str
        One of "svm", "svm_sq", "log", "per", "rig", "pa".
    data : str
        Feature-set tag appended to the saved prediction file name.

    Returns
    -------
    tuple
        ``(train_scores, test_scores)`` from ``decision_function``
        (``predict_proba`` for "nb" — see note below).

    Raises
    ------
    NotImplementedError
        For any ``alg`` not listed above.
    """
    train_x, train_y, test_x = dataset
    # Parenthesized single-argument print: identical output on Python 2 and 3
    # (the original `print x, y` statement form is a SyntaxError under Py3).
    print("shape for training data is %s" % (train_x.shape,))

    if alg == "svm":
        clf = SGDClassifier(verbose=1, n_jobs=2, n_iter=20)
    elif alg == "svm_sq":
        clf = SGDClassifier(verbose=1, n_jobs=2, n_iter=20, loss="squared_hinge")
    elif alg == "log":
        clf = LogisticRegression(verbose=1, n_jobs=2)
    elif alg == "per":
        clf = Perceptron(verbose=1, n_jobs=2, n_iter=25)
    elif alg == "rig":
        clf = RidgeClassifier()
    elif alg == "pa":
        clf = PassiveAggressiveClassifier(n_jobs=2, n_iter=25)
    else:
        raise NotImplementedError

    print("training with %s..." % alg)

    clf.fit(train_x, train_y)
    predicted = clf.predict(test_x)
    save_csv(predicted, fname=alg + "_" + data)

    # NOTE(review): no "nb" branch constructs a classifier above (it falls
    # through to NotImplementedError), so the predict_proba arm below is
    # currently unreachable; kept for when a Naive Bayes option is added.
    if alg != "nb":
        return clf.decision_function(train_x), clf.decision_function(test_x)
    else:
        return clf.predict_proba(train_x), clf.predict_proba(test_x)
def multi_learner(n_estimators=200, alg="et"):
    """Train a stacked meta-learner on base-model scores plus BoW features.

    Horizontally stacks the per-model prediction scores from
    ``read_all_predict_score`` with the bag-of-words features from
    ``bow_kaggle_dataset``, fits the chosen ensemble, and saves the test
    predictions via ``save_csv``.

    Parameters
    ----------
    n_estimators : int
        Number of trees for the forest-based algorithms.
    alg : str
        One of "rf" (random forest), "et" (extra trees), "log"
        (logistic regression).

    Raises
    ------
    NotImplementedError
        For any ``alg`` not listed above.
    """
    train_x_2, train_y, test_x_2 = bow_kaggle_dataset()
    train_x, test_x = read_all_predict_score()
    train_x = sparse.hstack([train_x, train_x_2])
    test_x = sparse.hstack([test_x, test_x_2])
    # Py2/3-compatible print (original Py2 print statement is a SyntaxError
    # under Python 3); %-format reproduces the original space-separated output.
    print("training with %s %s" % (alg, n_estimators))
    if alg == "rf":
        clf = RandomForestClassifier(n_estimators=n_estimators, oob_score=True, verbose=1)
    elif alg == "et":
        clf = ExtraTreesClassifier(n_estimators=n_estimators, verbose=1)
    elif alg == "log":
        clf = LogisticRegression(verbose=1, n_jobs=2)
    else:
        raise NotImplementedError
    clf.fit(train_x, train_y)
    predicted = clf.predict(test_x)
    save_csv(predicted, "test")
def train(data=SST_KAGGLE, alg='logcv'):
    """Train a classifier on aggregated (google word2vec) document vectors.

    NOTE(review): this function is shadowed by a second ``def train`` later
    in this file — as written, this definition is dead code at import time.
    Rename one of the two (e.g. ``train_aggregated_vectors``) to make both
    callable; left unchanged here to avoid breaking any direct callers.

    Parameters
    ----------
    data : dataset identifier passed to ``read_aggregated_vectors``.
    alg : str
        One of "svm", "log", "logcv".

    Raises
    ------
    NotImplementedError
        For any ``alg`` not listed above.
    """
    train_x, train_y, test_x = read_aggregated_vectors(google=True, data=data)

    train_x = np.asarray(train_x)
    train_y = np.asarray(train_y)
    test_x = np.asarray(test_x)

    # Py2/3-compatible print (the original Py2 print statement is a
    # SyntaxError under Python 3).
    print("shape for training data is %s" % (train_x.shape,))

    if alg == 'svm':
        clf = SVC(verbose=1)
    elif alg == 'log':
        clf = LogisticRegression(verbose=1)
    elif alg == 'logcv':
        clf = LogisticRegressionCV(cv=5, verbose=1)
    else:
        raise NotImplementedError

    print("training...")
    clf.fit(train_x, train_y)
    predicted = clf.predict(test_x)
    save_csv(predicted)
def train(data=SST_KAGGLE, alg='log'):
    """Train a classifier on doc2vec (DBOW) document vectors.

    NOTE(review): this redefines the earlier ``def train`` in this file,
    silently shadowing it — consider renaming (e.g. ``train_doc2vec``).

    Labels come from ``vectorize_text``; features from
    ``read_doc2vec_pickle(dm=False)``. Test predictions are saved via
    ``save_csv``.

    Parameters
    ----------
    data : dataset identifier passed to ``vectorize_text``.
    alg : str
        One of "svm", "log", "nb".

    Raises
    ------
    NotImplementedError
        For any ``alg`` not listed above.
    """
    _, train_y, _ = vectorize_text(data=data)
    train_x, test_x = read_doc2vec_pickle(dm=False)
    # train_x_1, test_x_1 = senti_lexicon_vectorizor(data=data, tfidf=True)
    # train_x_2, test_x_2 = senti_wordnet_vectorizer(data=data, tfidf=True)
    #
    # train_x = sparse.hstack((train_x_1, train_x_2))
    # test_x = sparse.hstack((test_x_1, test_x_2))

    # Py2/3-compatible print (the original Py2 print statement is a
    # SyntaxError under Python 3).
    print("shape for training data is %s" % (train_x.shape,))

    if alg == 'svm':
        clf = SVC(verbose=1)
    elif alg == 'log':
        clf = LogisticRegression(verbose=1)     # 61.756, no phrase,
    elif alg == 'nb':
        clf = MultinomialNB()
    else:
        raise NotImplementedError

    print("training...")
    clf.fit(train_x, train_y)
    predicted = clf.predict(test_x)
    save_csv(predicted)