import numpy as np
# pre-0.18 scikit-learn API: StratifiedKFold(y, n_folds=...) and the stratify kwarg
from sklearn.cross_validation import StratifiedKFold, train_test_split

# project-local helpers; the module paths below are assumptions
from data_utils import (read_word2index_data, resplit_train_data,
                        TREC, ROTTEN_TOMATOES, SST_SENT_POL)
from activations import tanh, leaky_relu
from train import train_ngram_net, train_ngram_net_embedding


def prepare_datasets(data, resplit=True, validation_ratio=0.2, google=True):
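    """Load a word2index dataset split and, for TREC, optionally re-split
    train/validation with the given ratio.

    Returns (train_x, train_y, validate_x, validate_y, test_x, test_y, W, mask),
    where W is the embedding matrix and mask is the auxiliary mask from the reader.
    """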
    datasets, W, mask = read_word2index_data(data=data, google=google, cv=False, huge=not google)
    train_x, train_y, validate_x, validate_y, test_x, test_y = datasets
    if data == TREC and resplit:
        train_x, train_y, validate_x, validate_y, mask = resplit_train_data(train_x, train_y, validate_x, validate_y,
                                                                            validation_ratio, mask=mask)
    return train_x, train_y, validate_x, validate_y, test_x, test_y, W, mask


def cross_validation_embedding(validation_ratio=0.1, data=ROTTEN_TOMATOES, shuffle=True):
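    """Run 10-fold stratified cross-validation of the ngram embedding net,
    carving a validation set out of each fold's training split, and print
    the mean test accuracy over the folds.
    """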
    datasets, W, _ = read_word2index_data(data=data, google=True, cv=True)
    x, y = datasets
    # get input shape
    input_shape = (x[0].shape[0], W.shape[1])
    print "input data shape", input_shape
    n_out = len(np.unique(y))
    skf = StratifiedKFold(y, n_folds=10)
    accuracy_list = []
    for i, (train, test) in enumerate(skf):
        print "\nat cross validation iter %i" % i
        print "\n**********************\n"
        train_x = x[train]
        train_y = y[train]
        test_x = x[test]
        test_y = y[test]
        train_x, validate_x, train_y, validate_y = train_test_split(train_x, train_y, test_size=validation_ratio,
                                                                    random_state=42, stratify=train_y)
        shuffle_indices = np.random.permutation(train_x.shape[0]) if shuffle else np.arange(train_x.shape[0])
        datasets = (train_x[shuffle_indices], train_y[shuffle_indices], validate_x, validate_y, test_x, test_y)
        test_accuracy = train_ngram_net_embedding(
            U=W,
            datasets=datasets,
            n_epochs=15,
            non_static=False,
            ngrams=(1, 2),
            input_shape=input_shape,
            ngram_bias=False,
            multi_kernel=True,
            concat_out=False,
            n_kernels=(4, 4),
            ngram_out=(300, 250),
            use_bias=False,
            lr_rate=0.02,
            dropout=True,
            dropout_rate=0.,  # note: a rate of 0. makes dropout a no-op despite dropout=True
            n_hidden=200,
            n_out=n_out,
            l2_ratio=1e-4,
            ngram_activation=tanh,
            activation=leaky_relu,
            batch_size=25,
            update_rule='adagrad',
            mean_pool=False
        )
        accuracy_list.append(test_accuracy)

    print "\n**********************\nfinal result: %f" % np.mean(accuracy_list)


def ngram_wrapper(
        data=SST_SENT_POL,
        n_epochs=25,
        ngrams=(1, 2),
        multi_kernel=True,
        n_kernels=(4, 3),
        ngram_out=(300, 200),
        use_bias=False,
        batch_size=50,
        dropout=True,
        n_hidden=100,
        ngram_layers=2,
        dropout_rate=0.3,
        lr_rate=0.01,
        mean_pool=False,
        non_static=True,
        l2_ratio=1
):
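    """Objective function for an external hyperparameter optimizer.

    The arguments arrive as coarse values and are rescaled below into their
    actual ranges; returns 1 - validation accuracy, i.e. an error to minimize.
    """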
    # getting the datasets
    datasets, W, _ = read_word2index_data(data=data, google=True, cv=False)
    train_x, train_y, validate_x, validate_y, test_x, test_y = datasets
    # get input shape
    input_shape = (train_x[0].shape[0], W.shape[1])
    print "input data shape", input_shape
    n_out = len(np.unique(test_y))  # number of output classes, inferred from the test labels
    shuffle_indices = np.random.permutation(train_x.shape[0])
    datasets = (train_x[shuffle_indices], train_y[shuffle_indices], validate_x, validate_y, test_x, test_y)
    # network configuration: rescale the coarse optimizer-supplied values into usable ranges
    l2_ratio /= 1e4
    lr_rate /= 1e2
    n_epochs *= 2
    batch_size *= 10
    n_hidden *= 10
    dropout_rate /= 10.0
    # truncate the per-layer settings to the requested number of ngram layers
    n_kernels = tuple(n_kernels[:ngram_layers])
    ngrams = tuple(ngrams[:ngram_layers])
    ngram_out = tuple(ngram_out[:ngram_layers])
    validation_accuracy = train_ngram_net(
        U=W,
        datasets=datasets,
        n_epochs=n_epochs,
        ngrams=ngrams,
        non_static=non_static,
        input_shape=input_shape,
        ngram_out=ngram_out,
        ngram_bias=False,
        multi_kernel=multi_kernel,
        concat_out=False,
        n_kernels=n_kernels,
        use_bias=use_bias,
        lr_rate=lr_rate,
        mean_pool=mean_pool,
        dropout=dropout,
        dropout_rate=dropout_rate,
        n_hidden=n_hidden,
        n_out=n_out,
        l2_ratio=l2_ratio,
        ngram_activation=leaky_relu,
        activation=leaky_relu,
        batch_size=batch_size,
        update_rule='adagrad',
        validation_only=True    # return validation accuracy; converted to an error below
    )
    return 1 - validation_accuracy
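

# Minimal sketch of driving ngram_wrapper from hyperopt; any minimizer that
# calls the wrapper with keyword arguments would work. The search-space bounds
# below are illustrative assumptions, not values taken from this project.
if __name__ == '__main__':
    from hyperopt import fmin, tpe, hp

    space = {
        'lr_rate': hp.uniform('lr_rate', 0.5, 5.0),            # rescaled inside ngram_wrapper to 0.005-0.05
        'dropout_rate': hp.uniform('dropout_rate', 1.0, 5.0),  # rescaled to 0.1-0.5
        'n_hidden': hp.choice('n_hidden', [10, 20, 30]),       # rescaled to 100-300
    }
    best = fmin(fn=lambda params: ngram_wrapper(**params),
                space=space, algo=tpe.suggest, max_evals=50)
    print "best hyperparameters:", best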