def wrapper_kaggle(epochs=40, validate_ratio=0.1, save_prob=True):
    train_x_1, test_x_1 = read_all_predict_score()
    train_x_2, train_y, test_x_2 = read_aggregated_vectors()
    train_x_3, train_y, test_x_3 = read_aggregated_vectors(google=False)

    old_train_x = train_x_1  # np.concatenate((train_x_1, train_x_2, train_x_3), axis=1)
    test_x = test_x_1  # np.concatenate((test_x_1, test_x_2, test_x_3), axis=1)
    old_train_y = np.asarray(train_y)
    # split train validate data
    sss_indices = StratifiedShuffleSplit(y=old_train_y, n_iter=1, test_size=validate_ratio)
    for indices in sss_indices:
        train_index, test_index = indices
    train_x = old_train_x[train_index]
    validate_x = old_train_x[test_index]
    train_y = old_train_y[train_index]
    validate_y = old_train_y[test_index]

    # add validation set for training
    # train_x = np.concatenate((train_x, validate_x))
    # train_y = np.concatenate((train_y, validate_y))

    # get dataset info
    dim = train_x[0].shape[0]
    n_out = len(np.unique(validate_y))
    datasets = (train_x, train_y, validate_x, validate_y, test_x)

    n_layers = 1

    print "input dimension is %d, output dimension is %d" % (dim, n_out)

    return_val = train_dropout_net(
        datasets=datasets,
        use_bias=True,
        n_epochs=epochs,
        dim=dim,
        lr_rate=0.02,
        n_out=n_out,
        dropout=True,
        dropout_rates=[0.7],
        n_hidden=[100],
        activations=[tanh] * n_layers,
        batch_size=50,
        update_rule='adagrad',
        no_test_y=True,
        save_prob=save_prob
    )
    if not save_prob:
        save_csv(return_val)
    else:
        train_prob, validate_prob, test_prob = return_val
        saved_train_prob = np.zeros((old_train_x.shape[0], n_out))
        saved_train_prob[train_index] = train_prob
        saved_train_prob[test_index] = validate_prob
        save_path = "D:/data/nlpdata/pickled_data/" + SST_KAGGLE + "_prob.pkl"
        print "saving probability feature to %s" % save_path

        f = open(Path(save_path), "wb")
        pkl.dump((saved_train_prob, test_prob), f, -1)
        f.close()
def train(data=SST_KAGGLE, alg='logcv'):
    train_x, train_y, test_x = read_aggregated_vectors(google=True, data=data)

    train_x = np.asarray(train_x)
    train_y = np.asarray(train_y)
    test_x = np.asarray(test_x)

    print "shape for training data is", train_x.shape

    if alg == 'svm':
        clf = SVC(verbose=1)
    elif alg == 'log':
        clf = LogisticRegression(verbose=1)
    elif alg == 'logcv':
        clf = LogisticRegressionCV(cv=5, verbose=1)
    else:
        raise NotImplementedError

    print "training..."
    clf.fit(train_x, train_y)
    # clf.fit(validate_x, validate_y)
    predicted = clf.predict(test_x)
    save_csv(predicted)
def average_word_kaggle_dataset(google=True):
    """Return the Kaggle SST averaged-word-vector dataset as numpy arrays.

    :param google: forwarded to read_aggregated_vectors (vector source choice).
    :return: (train_x, train_y, test_x), each passed through np.asarray.
    """
    features, labels, test_features = read_aggregated_vectors(data=SST_KAGGLE, google=google)
    return np.asarray(features), np.asarray(labels), np.asarray(test_features)