import numpy as np

# The read_* helpers (read_glove_model, read_google_model, read_*_pickle,
# read_train_data, read_test_data, get_reviews_vectors) and the data-set
# constants (SST_KAGGLE, ROTTEN_TOMATOES, SUBJ, CUSTOMER_REVIEW, MPQA, IMDB,
# SST_SENT, SST_SENT_POL, TREC) are assumed to be imported from the project's
# preprocessing / I-O modules.


def get_naive_bigram_vectors(average=True, int_label=True, dim=300, kernel=(1, 1)):
    """Build bigram-based word-vector representations for the train/validate/test splits."""
    model = read_glove_model(dim=dim)
    train_x, train_y, validate_x, validate_y = read_train_data(int_label=int_label)
    test_x, test_y = read_test_data(int_label=int_label)
    print "getting bigram word vectors for documents..."
    train_x = get_reviews_vectors(train_x, model, average=average, kernel=kernel)
    validate_x = get_reviews_vectors(validate_x, model, average=average, kernel=kernel)
    test_x = get_reviews_vectors(test_x, model, average=average, kernel=kernel)
    return train_x, train_y, validate_x, validate_y, test_x, test_y
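
# Example usage -- a hedged sketch, assuming the GloVe pickle for the chosen
# dimensionality and the train/validate/test pickles are already on disk:
#
#   train_x, train_y, validate_x, validate_y, test_x, test_y = \
#       get_naive_bigram_vectors(average=True, int_label=True, dim=300, kernel=(1, 1))
#   # train_x / validate_x / test_x now hold one bigram-based representation per review

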
def get_aggregated_vectors(google=True, data=SST_KAGGLE, average=True, dim=300):
    """Build aggregated word-vector representations, one per document."""
    if google:
        model = read_google_model()
    else:
        model = read_glove_model(dim=dim)
    print "getting aggregate word vectors for documents..."
    if data == SST_KAGGLE:
        train_x, train_y, test_x = read_sst_kaggle_pickle()
        train_x = get_reviews_vectors(train_x, model, average=average, aggregate=True)
        test_x = get_reviews_vectors(test_x, model, average=average, aggregate=True)
        return train_x, train_y, test_x
    else:
        raise NotImplementedError('No such aggregated data set: %s' % data)
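
# Example usage -- a hedged sketch, assuming the SST Kaggle pickle and the chosen
# pre-trained embeddings (Google News or GloVe) are available locally:
#
#   train_x, train_y, test_x = get_aggregated_vectors(google=True, data=SST_KAGGLE)
#   # with google=False the GloVe model of dimensionality `dim` is used instead

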
def get_document_matrices(google=False, dim=100, cutoff=50, uniform=True, data=ROTTEN_TOMATOES, cv=True, huge=False):
    """Build per-document matrices of concatenated word vectors, cut off at `cutoff` words."""
    print "getting concatenated word vectors for documents..."
    model = read_google_model() if google else read_glove_model(dim=dim, huge=huge)
    if cv:  # data sets evaluated with cross validation: a single (x, y) split
        if data == ROTTEN_TOMATOES:
            x, y = read_rotten_pickle()
            cutoff = 56
        elif data == SUBJ:
            x, y = read_subj_pickle()
        elif data == CUSTOMER_REVIEW:
            cutoff = 45
            x, y = read_cr_pickle()
        elif data == MPQA:
            x, y = read_mpqa_pickle()
            cutoff = 20
        else:
            raise NotImplementedError('No such cross-validation data set: %s' % data)
        x = get_reviews_vectors(x, model, aggregate=False, cutoff=cutoff, uniform=uniform)
        x = np.asarray(x)
        y = np.asarray(y)
        return x, y
    else:  # data sets with a fixed train/dev/test split
        if data == IMDB:
            train_x, train_y, validate_x, validate_y, test_x, test_y = read_imdb_pickle()
            cutoff = 75
        elif data == SST_SENT:
            cutoff = 50
            train_x, train_y, validate_x, validate_y, test_x, test_y = read_sst_sent_pickle()
        elif data == SST_SENT_POL:
            cutoff = 50
            train_x, train_y, validate_x, validate_y, test_x, test_y = read_sst_sent_pickle(polarity=True)
        elif data == TREC:
            train_x, train_y, validate_x, validate_y, test_x, test_y = read_trec_pickle()
            cutoff = 30
        else:
            raise NotImplementedError('No such train/dev/test data set: %s' % data)
        train_x = get_reviews_vectors(train_x, model, aggregate=False, cutoff=cutoff, uniform=uniform)
        validate_x = get_reviews_vectors(validate_x, model, aggregate=False, cutoff=cutoff, uniform=uniform)
        test_x = get_reviews_vectors(test_x, model, aggregate=False, cutoff=cutoff, uniform=uniform)

        train_x = np.asarray(train_x)
        train_y = np.asarray(train_y)
        validate_x = np.asarray(validate_x)
        validate_y = np.asarray(validate_y)
        test_x = np.asarray(test_x)
        test_y = np.asarray(test_y)

        return train_x, train_y, validate_x, validate_y, test_x, test_y
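
# Example usage -- a hedged sketch; the exact array shapes depend on `cutoff`,
# `dim` and the pickled data sets assumed to exist on disk:
#
#   # cross-validation data set: a single (x, y) pair is returned
#   x, y = get_document_matrices(google=False, dim=100, data=ROTTEN_TOMATOES, cv=True)
#
#   # fixed-split data set: train/validate/test arrays are returned
#   train_x, train_y, validate_x, validate_y, test_x, test_y = \
#       get_document_matrices(google=False, dim=100, data=SST_SENT, cv=False)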