def get(dataset_name):
    # Datasets known to work with the current model
    datasets = ['adult', 'binarized_mnist', 'connect4', 'dna', 'mushrooms',
                'nips', 'ocr_letters', 'rcv1', 'rcv2_russ', 'web']

    # Set up the dataset environment
    if dataset_name not in datasets:
        raise ValueError('Dataset unknown: ' + dataset_name)
    mldataset = __import__('mlpython.datasets.' + dataset_name,
                           globals(), locals(), [dataset_name], -1)
    datadir = os.path.join(os.getenv("MLPYTHON_DATASET_REPO"), dataset_name)

    # Download the dataset if it is not already on disk
    if not os.path.exists(datadir):
        dataset_store.download(dataset_name)

    print '### Loading dataset [{0}] ...'.format(dataset_name),
    start_time = t.time()

    all_data = mldataset.load(datadir, load_to_memory=True)
    train_data, train_metadata = all_data['train']
    if dataset_name == 'binarized_mnist' or dataset_name == 'nips':
        trainset = mlpb.MLProblem(train_data, train_metadata)
    else:
        trainset = mlpb.SubsetFieldsProblem(train_data, train_metadata)
    trainset.setup()

    valid_data, valid_metadata = all_data['valid']
    validset = trainset.apply_on(valid_data, valid_metadata)
    test_data, test_metadata = all_data['test']
    testset = trainset.apply_on(test_data, test_metadata)

    # Clean up, package and move each split into a Theano shared variable
    full_dataset = {'input_size': trainset.metadata['input_size']}
    trainset_theano = theano.shared(value=Dataset._clean(trainset), borrow=True)
    validset_theano = theano.shared(value=Dataset._clean(validset), borrow=True)
    testset_theano = theano.shared(value=Dataset._clean(testset), borrow=True)
    full_dataset['train'] = {'data': trainset_theano, 'length': all_data['train'][1]['length']}
    full_dataset['valid'] = {'data': validset_theano, 'length': all_data['valid'][1]['length']}
    full_dataset['test'] = {'data': testset_theano, 'length': all_data['test'][1]['length']}

    print "(Dim:{0} Train:{1} Valid:{2} Test:{3})".format(
        trainset.metadata['input_size'],
        full_dataset['train']['length'],
        full_dataset['valid']['length'],
        full_dataset['test']['length']),
    print get_done_text(start_time), "###"
    return full_dataset
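# A hedged usage sketch for get() above (hypothetical helper, not part of the
# original module). It assumes the module's own imports (theano, and
# theano.tensor as T) and shows how the returned dictionary is meant to be
# consumed: each split holds a Theano shared variable plus its example count,
# so minibatches can be sliced out of the shared data by a compiled function.
import theano
import theano.tensor as T

def iterate_minibatches(full_dataset, split='train', batch_size=100):
    data = full_dataset[split]['data']    # Theano shared variable
    n = full_dataset[split]['length']     # number of examples in the split
    index = T.lscalar('index')
    # A real model would build its cost and updates around this slice;
    # here the compiled function simply returns one minibatch.
    fetch = theano.function([index],
                            data[index * batch_size:(index + 1) * batch_size])
    for i in range(n // batch_size):
        yield fetch(i)

# e.g. for batch in iterate_minibatches(get('adult'), 'train', 100): ...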
def sklearn_convex(classifier, algorithm, max_evals=100, seed=1,
                   filename='none', preproc=[], loss=None):
    global suppress_output
    if suppress_output:
        dump_file = None
    else:
        dump_file = filename + '.dump'

    estim = hyperopt_estimator(classifier=classifier,
                               algo=algorithm,
                               preprocessing=preproc,
                               max_evals=max_evals,
                               trial_timeout=240,
                               fit_increment_dump_filename=dump_file,
                               loss_fn=loss)

    filename = filename + '.out'

    dataset_store.download('convex')
    trainset, validset, testset = dataset_store.get_classification_problem('convex')

    X_train = trainset.data.mem_data[0]
    y_train = trainset.data.mem_data[1]
    X_valid = validset.data.mem_data[0]
    y_valid = validset.data.mem_data[1]
    X_test = testset.data.mem_data[0]
    y_test = testset.data.mem_data[1]

    # Train on the union of the training and validation splits
    X_fulltrain = np.concatenate((X_train, X_valid))
    y_fulltrain = np.concatenate((y_train, y_valid))

    print(y_train.shape)
    print(y_valid.shape)
    print(y_test.shape)

    # find_model(X_train, y_train, X_test, y_test, estim, filename)
    find_model(X_fulltrain, y_fulltrain, X_test, y_test, estim, filename)
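# A hedged usage sketch for sklearn_convex() above, assuming the surrounding
# module defines suppress_output and find_model as used in the function body.
# any_classifier and tpe.suggest are hpsklearn/hyperopt names; whether this
# project drives sklearn_convex with exactly these is an assumption.
from hpsklearn import any_classifier
from hyperopt import tpe

sklearn_convex(classifier=any_classifier('clf'),
               algorithm=tpe.suggest,
               max_evals=50,
               filename='convex_tpe')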
def convex():
    dataset_store.download('convex')
    trainset, validset, testset = dataset_store.get_classification_problem('convex')

    X_train = trainset.data.mem_data[0]
    y_train = trainset.data.mem_data[1]
    X_valid = validset.data.mem_data[0]
    y_valid = validset.data.mem_data[1]
    X_test = testset.data.mem_data[0]
    y_test = testset.data.mem_data[1]

    # Train on the union of the training and validation splits
    X_fulltrain = np.concatenate((X_train, X_valid))
    y_fulltrain = np.concatenate((y_train, y_valid))

    # Fit PCA on the training data only, then apply the same projection to
    # the test data; re-fitting on the test set would learn a different
    # basis and leak test statistics.
    pca = PCA()
    X_train_pca = pca.fit_transform(X_fulltrain)
    X_test_pca = pca.transform(X_test)

    clfs = [MultinomialNB(), SVC(), KNeighborsClassifier(), SGDClassifier()]
    # MultinomialNB is excluded here: PCA output can be negative
    pca_clfs = [SVC(), KNeighborsClassifier(), SGDClassifier()]

    print("Convex\n")
    with open("convex_baselines.txt", 'w') as f:
        for clf in clfs:
            clf.fit(X_fulltrain, y_fulltrain)
            pred = clf.predict(X_test)
            score = metrics.f1_score(y_test, pred)
            print("Classifier: %s\nScore: %f\n" % (clf, score))
            f.write("Classifier: %s\nScore: %f\n\n" % (clf, score))
        for clf in pca_clfs:
            clf.fit(X_train_pca, y_fulltrain)
            pred = clf.predict(X_test_pca)
            score = metrics.f1_score(y_test, pred)
            print("Classifier: PCA + %s\nScore: %f\n" % (clf, score))
            f.write("Classifier: PCA + %s\nScore: %f\n\n" % (clf, score))
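# A minimal, self-contained sketch of the fit/transform contract the fix
# above relies on: the PCA basis is learned once on training data and then
# reused to project held-out data. The toy arrays are made up for
# illustration.
import numpy as np
from sklearn.decomposition import PCA

X_train = np.random.RandomState(0).randn(100, 20)
X_test = np.random.RandomState(1).randn(10, 20)

pca = PCA(n_components=5)
Z_train = pca.fit_transform(X_train)   # learn the basis and project
Z_test = pca.transform(X_test)         # project with the same basis
assert Z_train.shape == (100, 5) and Z_test.shape == (10, 5)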
def setUp():
    try:
        dataset_store.download('mnist_rotated_background_images')
    except:
        print 'Could not download the dataset : ', 'mnist_rotated_background_images'
        assert False

def setUp():
    try:
        dataset_store.download('binarized_mnist')
    except:
        print 'Could not download the dataset : ', 'binarized_mnist'
        assert False

def setUp():
    try:
        dataset_store.download('letor_mq2007')
    except:
        print 'Could not download the dataset : ', 'letor_mq2007'
        assert False

def setUp():
    try:
        dataset_store.download('face_completion_lfw')
    except:
        print 'Could not download the dataset : ', 'face_completion_lfw'
        assert False

def setUp():
    try:
        dataset_store.download("cadata")
    except:
        print "Could not download the dataset : ", "cadata"
        assert False

def setUp():
    try:
        dataset_store.download('bibtex')
    except:
        print 'Could not download the dataset : ', 'bibtex'
        assert False

def setUp():
    try:
        dataset_store.download('occluded_faces_lfw')
    except:
        print 'Could not download the dataset : ', 'occluded_faces_lfw'
        assert False

def setUp():
    try:
        dataset_store.download('newsgroups')
    except:
        print 'Could not download the dataset : ', 'newsgroups'
        assert False

def setUp():
    try:
        dataset_store.download('housing')
    except:
        print 'Could not download the dataset : ', 'housing'
        assert False

def setUp():
    try:
        dataset_store.download('medical')
    except:
        print 'Could not download the dataset : ', 'medical'
        assert False

def setUp():
    try:
        dataset_store.download('ocr_letters')
    except:
        print 'Could not download the dataset : ', 'ocr_letters'
        assert False
import mlpython.datasets.store as store

store.download('adult')
store.download('connect4')
store.download('dna')
store.download('mushrooms')
store.download('nips')
store.download('ocr_letters')
store.download('rcv1')
store.download('web')
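# Once downloaded, a dataset can be loaded back through the same store module.
# A minimal sketch, assuming the splits expose in-memory arrays the way the
# convex() baseline above accesses them (data.mem_data); 'adult' is just one
# of the names downloaded here.
trainset, validset, testset = store.get_classification_problem('adult')
X_train, y_train = trainset.data.mem_data[0], trainset.data.mem_data[1]
print X_train.shape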
def setUp():
    try:
        dataset_store.download('corrupted_ocr_letters')
    except:
        print 'Could not download the dataset : ', 'corrupted_ocr_letters'
        assert False

def setUp():
    try:
        dataset_store.download('mnist_basic')
    except:
        print 'Could not download the dataset : ', 'mnist_basic'
        assert False

def setUp():
    try:
        dataset_store.download('mnist_background_random')
    except:
        print 'Could not download the dataset : ', 'mnist_background_random'
        assert False

def setUp():
    try:
        dataset_store.download('connect4')
    except:
        print 'Could not download the dataset : ', 'connect4'
        assert False

def setUp():
    try:
        dataset_store.download('rectangles')
    except:
        print 'Could not download the dataset : ', 'rectangles'
        assert False

def setUp():
    try:
        dataset_store.download('mediamill')
    except:
        print 'Could not download the dataset : ', 'mediamill'
        assert False

def setUp():
    try:
        dataset_store.download('adult')
    except:
        print 'Could not download the dataset : ', 'adult'
        assert False

def setUp():
    try:
        dataset_store.download('occluded_mnist')
    except:
        print 'Could not download the dataset : ', 'occluded_mnist'
        assert False

def setUp():
    try:
        dataset_store.download('mushrooms')
    except:
        print 'Could not download the dataset : ', 'mushrooms'
        assert False

def setUp():
    try:
        dataset_store.download('letor_mq2008')
    except:
        print 'Could not download the dataset : ', 'letor_mq2008'
        assert False

def setUp():
    try:
        dataset_store.download('abalone')
    except:
        print 'Could not download the dataset : ', 'abalone'
        assert False
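# All of the setUp() fixtures above follow one pattern. A hedged refactoring
# sketch (hypothetical helper, not in the original test files) that builds
# the fixture from the dataset name; it assumes the same dataset_store
# import the fixtures already use.
def make_setup(dataset_name):
    def setUp():
        try:
            dataset_store.download(dataset_name)
        except:
            print 'Could not download the dataset : ', dataset_name
            assert False
    return setUp

setUp = make_setup('abalone')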