def prepare_datasets(data, resplit=True, validation_ratio=0.2, google=True):
    # read the index-encoded data and the word embedding matrix W
    datasets, W, mask = read_word2index_data(data=data, google=google, cv=False, huge=not google)
    train_x, train_y, validate_x, validate_y, test_x, test_y = datasets
    # optionally carve a fresh validation split out of the TREC training data
    if data == TREC and resplit:
        train_x, train_y, validate_x, validate_y, mask = resplit_train_data(train_x, train_y, validate_x,
                                                                            validate_y, validation_ratio,
                                                                            mask=mask)
    return train_x, train_y, validate_x, validate_y, test_x, test_y, W, mask
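
# Illustrative call (TREC, read_word2index_data and resplit_train_data come from
# elsewhere in this project; the exact arguments are an example, not the only
# supported configuration):
#
#   train_x, train_y, validate_x, validate_y, test_x, test_y, W, mask = \
#       prepare_datasets(data=TREC, resplit=True, validation_ratio=0.2)
#
# W is the embedding matrix returned by read_word2index_data; the training
# routines below receive it as U.
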
def cross_validation_embedding(validation_ratio=0.1, data=ROTTEN_TOMATOES, shuffle=True):
    datasets, W, _ = read_word2index_data(data=data, google=True, cv=True)
    x, y = datasets
    # get input shape
    input_shape = (x[0].shape[0], W.shape[1])
    print "input data shape", input_shape
    n_out = len(np.unique(y))
    skf = StratifiedKFold(y, n_folds=10)
    accuracy_list = []
    for i, indices in enumerate(skf):
        print "\nat cross validation iter %i" % i
        print "\n**********************\n"
        train, test = indices
        train_x = x[train]
        train_y = y[train]
        test_x = x[test]
        test_y = y[test]
        # hold out part of the training fold as a stratified validation set
        train_x, validate_x, train_y, validate_y = train_test_split(train_x, train_y,
                                                                    test_size=validation_ratio,
                                                                    random_state=42,
                                                                    stratify=train_y)
        # optionally shuffle the remaining training examples
        shuffle_indices = np.random.permutation(train_x.shape[0]) if shuffle else np.arange(train_x.shape[0])
        datasets = (train_x[shuffle_indices], train_y[shuffle_indices], validate_x, validate_y, test_x, test_y)
        test_accuracy = train_ngram_net_embedding(
            U=W,
            datasets=datasets,
            n_epochs=15,
            non_static=False,
            ngrams=(1, 2),
            input_shape=input_shape,
            ngram_bias=False,
            multi_kernel=True,
            concat_out=False,
            n_kernels=(4, 4),
            ngram_out=(300, 250),
            use_bias=False,
            lr_rate=0.02,
            dropout=True,
            dropout_rate=0.,
            n_hidden=200,
            n_out=n_out,
            l2_ratio=1e-4,
            ngram_activation=tanh,
            activation=leaky_relu,
            batch_size=25,
            update_rule='adagrad',
            mean_pool=False
        )
        accuracy_list.append(test_accuracy)
    print "\n**********************\nfinal result: %f" % np.mean(accuracy_list)
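
# Note: the cross-validation above relies on the pre-0.18 scikit-learn interface,
# where StratifiedKFold(y, n_folds=10) is built from the labels and iterated
# directly. Under newer scikit-learn versions the equivalent loop, shown here
# only as a sketch, would be:
#
#   from sklearn.model_selection import StratifiedKFold
#   skf = StratifiedKFold(n_splits=10)
#   for i, (train, test) in enumerate(skf.split(x, y)):
#       ...
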
def ngram_wrapper(
        data=SST_SENT_POL,
        n_epochs=25,
        ngrams=(1, 2),
        multi_kernel=True,
        n_kernels=(4, 3),
        ngram_out=(300, 200),
        use_bias=False,
        batch_size=50,
        dropout=True,
        n_hidden=100,
        ngram_layers=2,
        dropout_rate=0.3,
        lr_rate=0.01,
        mean_pool=False,
        non_static=True,
        l2_ratio=1
):
    # get the datasets
    datasets, W, _ = read_word2index_data(data=data, google=True, cv=False)
    train_x, train_y, validate_x, validate_y, test_x, test_y = datasets
    # get input shape
    input_shape = (train_x[0].shape[0], W.shape[1])
    print "input data shape", input_shape
    n_out = len(np.unique(test_y))
    shuffle_indices = np.random.permutation(train_x.shape[0])
    datasets = (train_x[shuffle_indices], train_y[shuffle_indices], validate_x, validate_y, test_x, test_y)
    # network configuration: rescale the wrapper arguments to the values used by the network
    l2_ratio /= 1e4
    lr_rate /= 1e2
    n_epochs *= 2
    batch_size *= 10
    n_hidden *= 10
    dropout_rate /= 10.0
    # ngram layers configurations
    n_kernels = tuple(n_kernels[:ngram_layers])
    ngrams = tuple(ngrams[:ngram_layers])
    ngram_out = tuple(ngram_out[:ngram_layers])
    validation_accuracy = train_ngram_net(
        U=W,
        datasets=datasets,
        n_epochs=n_epochs,
        ngrams=ngrams,
        non_static=non_static,
        input_shape=input_shape,
        ngram_out=ngram_out,
        ngram_bias=False,
        multi_kernel=multi_kernel,
        concat_out=False,
        n_kernels=n_kernels,
        use_bias=use_bias,
        lr_rate=lr_rate,
        mean_pool=mean_pool,
        dropout=dropout,
        dropout_rate=dropout_rate,
        n_hidden=n_hidden,
        n_out=n_out,
        l2_ratio=l2_ratio,
        ngram_activation=leaky_relu,
        activation=leaky_relu,
        batch_size=batch_size,
        update_rule='adagrad',
        validation_only=True  # return the validation error to minimize
    )
    return 1 - validation_accuracy
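

# Hypothetical tuning harness (not part of the original pipeline): ngram_wrapper
# returns 1 - validation accuracy, i.e. an error to be minimized, so it can be
# handed to a black-box optimizer. The sketch below assumes hyperopt is
# available; the search-space values are illustrative and chosen so that the
# rescaling inside ngram_wrapper maps them back to sensible hyperparameters.
def tune_ngram_wrapper(max_evals=20):
    from hyperopt import fmin, tpe, hp
    space = {
        'lr_rate': hp.choice('lr_rate', [1, 2, 5]),            # -> 0.01, 0.02, 0.05 after /= 1e2
        'batch_size': hp.choice('batch_size', [3, 5, 10]),     # -> 30, 50, 100 after *= 10
        'n_hidden': hp.choice('n_hidden', [10, 20, 30]),       # -> 100, 200, 300 after *= 10
        'dropout_rate': hp.choice('dropout_rate', [2, 3, 5]),  # -> 0.2, 0.3, 0.5 after /= 10
    }
    # minimize the validation error returned by ngram_wrapper
    best = fmin(fn=lambda params: ngram_wrapper(**params),
                space=space, algo=tpe.suggest, max_evals=max_evals)
    return best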