Code Example #1
File: test_dataset_store.py Project: MarcCote/mldata
def load_mnist(lazy):
    """
    Load mnist dataset from a hdf5 file and test if it matches mlpython's one.
    """
    dataset_name = 'mnist'

    start = time.time()
    import mlpython.datasets.store as mlstore
    mldatasets = mlstore.get_classification_problem(dataset_name, load_to_memory=(not lazy))
    print "mlpython version loaded ({0:.2f}sec).".format(time.time() - start)

    start = time.time()
    dataset_name = os.path.join(os.environ['MLPYTHON_DATASET_REPO'], dataset_name + ".h5")
    dataset = mldata.dataset_store.load(dataset_name, lazy=lazy)
    print "mldata version loaded ({0:.2f}sec).".format(time.time() - start)

    print "Comparing first 1000..."
    count = 0
    for (e1, t1), (e2,  t2) in itertools.izip(dataset, itertools.chain(*mldatasets)):
        #print t1, t2
        assert_array_almost_equal(e1, e2)
        assert_equal(t1, t2)
        
        count += 1
        if count >= 1000:
            break
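
The function above only defines the comparison; a minimal driver is sketched below. It assumes load_mnist is importable from test_dataset_store and that the MLPYTHON_DATASET_REPO environment variable points at a directory containing mnist.h5, which the function itself already requires.

# Hypothetical driver for the check above (assumes load_mnist is importable
# from test_dataset_store and that MLPYTHON_DATASET_REPO is set).
from test_dataset_store import load_mnist

for lazy in (False, True):
    print "Running the comparison with lazy={0}".format(lazy)
    load_mnist(lazy)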
Code Example #2
def train():

	sys.argv.pop(0)    # Remove the script name from the argument list

	# Check that all of the options expected from the parent script are present.
	if 5 != len(sys.argv):
	    print "Usage: python run_stacked_autoencoders_nnet.py lr hidden_size n_epochs n_cdk seed"
	    print ""
	    print "Ex.: python run_stacked_autoencoders_nnet.py 0.01 50 10 10 1234"
	    sys.exit()

	# Build the RBM constructor call from the command-line hyper-parameters
	str_ParamOption = "lr=" + sys.argv[0] + ", " + "hidden_size=" + sys.argv[1] + ", " + "n_epochs=" + sys.argv[2] + ", " +\
		"CDk=" + sys.argv[3] + ", " + "seed=" + sys.argv[4]
	try:
	    objectString = 'myObject = RBM(' + str_ParamOption + ')'
	    exec objectString
	    #code = compile(objectString, '<string>', 'exec')
	    #exec code
	except Exception as inst:
	    print "Error while instantiating RBM (required hyper-parameters are probably missing)"
	    print inst

	print "Loading dataset..."
	trainset,validset,testset = dataset_store.get_classification_problem('ocr_letters')
	
	print "Training..."
	myObject.train(trainset)

	# Store the trained weights (W, b) and the hidden layer size to a file.
	pickle.dump((myObject.W, myObject.b, myObject.hidden_size), open("Models/RBM/model%d.pkl"%experiment_number, 'wb'))
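
train() is an excerpt: it relies on module-level imports and an experiment_number global that are not shown. A plausible preamble is sketched below; only the mlpython and rbm imports are confirmed by Code Example #9, everything else (module names, the experiment_number value) is an assumption.

# Hypothetical module preamble for the snippet above; only the mlpython and
# rbm imports appear elsewhere in these examples, the rest is assumed.
import sys
import pickle
import mlpython.datasets.store as dataset_store
from rbm import RBM

experiment_number = 1  # assumption: index used to name the files under Models/RBM/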
Code Example #3
def get_representation():
	# Load the saved weights and hidden layer size.
	(W, b, hidden_size) = pickle.load(open("Models/RBM/model%d.pkl"%experiment_number,'rb'))

	# Set the constructor
	myObject = RBM(hidden_size=hidden_size)

	print "Loading dataset..."
	trainset,validset,testset = dataset_store.get_classification_problem('ocr_letters')

	encoded_trainset = []
	encoded_validset = []
	encoded_testset = []

	print "Initializing..."
	myObject.initialize(W,b)

	print "Encoding the trainset..."
	counter = 0 #Inelegant, I know! I use this to only use the first 1000 values.
	for input,target in trainset:    
		#Encode the sample.
		h = myObject.encode(input)
		encoded_trainset.append(h)

		# counter +=1
		# if counter == 1000:
		#     break

	# Save the datasets to files. 
	filename = "Models/RBM/trainset%d.pkl"%(experiment_number)
	pickle.dump( np.asarray(encoded_trainset) , open(filename, 'wb'))

	counter = 0
	print "Encoding the validset..."
	for input,target in validset:
		#Encode the sample.
		h = myObject.encode(input)
		encoded_validset.append(h)

		# counter +=1
		# if counter == 1000:
		#     break

	filename = "Models/RBM/validset%d.pkl"%(experiment_number)
	pickle.dump( np.asarray(encoded_validset) , open(filename, 'wb'))

	#Note: only need to do it for the best hyper-params at the end.	
	print "Encoding the testset..."
	for input,target in testset:
		#Encode the sample.
		h = myObject.encode(input)
		encoded_testset.append(h)	    

	filename = "Models/RBM/testset%d.pkl"%(experiment_number)
	pickle.dump( np.asarray(encoded_testset), open(filename, 'wb'))
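
How the saved encodings are consumed afterwards is not shown; a short, assumed sketch of reading one of them back:

# Hypothetical reader for the files written above; the downstream use of the
# encodings is not part of the original, so this is only an assumption.
import pickle

experiment_number = 1  # assumption: must match the value used when saving
encoded_trainset = pickle.load(open("Models/RBM/trainset%d.pkl" % experiment_number, 'rb'))
print "Encoded trainset shape:", encoded_trainset.shape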
Code Example #4
def sklearn_convex(classifier,
                   algorithm,
                   max_evals=100,
                   seed=1,
                   filename='none',
                   preproc=[],
                   loss=None):

    global suppress_output
    if suppress_output:
        dump_file = None
    else:
        dump_file = filename + '.dump'

    estim = hyperopt_estimator(classifier=classifier,
                               algo=algorithm,
                               preprocessing=preproc,
                               max_evals=max_evals,
                               trial_timeout=240,
                               fit_increment_dump_filename=dump_file,
                               loss_fn=loss)

    filename = filename + '.out'

    dataset_store.download('convex')
    trainset, validset, testset = dataset_store.get_classification_problem(
        'convex')

    X_train = trainset.data.mem_data[0]
    y_train = trainset.data.mem_data[1]

    X_valid = validset.data.mem_data[0]
    y_valid = validset.data.mem_data[1]

    X_test = testset.data.mem_data[0]
    y_test = testset.data.mem_data[1]

    X_fulltrain = np.concatenate((X_train, X_valid))
    y_fulltrain = np.concatenate((y_train, y_valid))

    print(y_train.shape)
    print(y_valid.shape)
    print(y_test.shape)

    #find_model( X_train, y_train, X_test, y_test, estim, filename )
    find_model(X_fulltrain, y_fulltrain, X_test, y_test, estim, filename)
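
sklearn_convex hands the data to a find_model helper that is not included in the excerpt. A minimal sketch of what such a helper could look like with the hyperopt_estimator interface is given below; the body is an assumption, not the project's actual implementation.

# Hypothetical sketch of the find_model helper called above; the project's
# real implementation is not shown, so this body is an assumption.
def find_model(X_train, y_train, X_test, y_test, estim, filename):
    estim.fit(X_train, y_train)           # run the hyper-parameter search
    score = estim.score(X_test, y_test)   # accuracy on the held-out test set
    with open(filename, 'w') as f:
        f.write("Test score: %f\n" % score)
        f.write("Best model: %s\n" % (estim.best_model(),))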
Code Example #5
def convex():

  dataset_store.download('convex')
  trainset,validset,testset = dataset_store.get_classification_problem('convex')

  X_train = trainset.data.mem_data[0]
  y_train = trainset.data.mem_data[1]
  
  X_valid = validset.data.mem_data[0]
  y_valid = validset.data.mem_data[1]
  
  X_test = testset.data.mem_data[0]
  y_test = testset.data.mem_data[1]
  
  X_fulltrain = np.concatenate((X_train, X_valid))
  y_fulltrain = np.concatenate((y_train, y_valid))
  
  pca = PCA()
  X_train_pca = pca.fit_transform( X_fulltrain )
  X_test_pca = pca.transform( X_test )  # reuse the PCA fitted on the training data
  
  clfs = [ MultinomialNB(), SVC(),
           KNeighborsClassifier(),
           SGDClassifier() ]
  
  pca_clfs = [ SVC(),
               KNeighborsClassifier(),
               SGDClassifier() ]
  
  print("Convex\n")
  with open( "convex_baselines.txt", 'w' ) as f:
    for clf in clfs:
      clf.fit( X_fulltrain, y_fulltrain )
      pred = clf.predict( X_test )
      score = metrics.f1_score( y_test, pred )
      print( "Classifier: %s\nScore: %f\n" % (clf, score) )
      f.write("Classifier: %s\nScore: %f\n\n" % (clf, score))
    for clf in pca_clfs:
      clf.fit( X_train_pca, y_fulltrain )
      pred = clf.predict( X_test_pca )
      score = metrics.f1_score( y_test, pred )
      print( "Classifier: PCA + %s\nScore: %f\n" % (clf, score) )
      f.write("Classifier: PCA + %s\nScore: %f\n\n" % (clf, score))
Code Example #6
def convex():

    dataset_store.download('convex')
    trainset, validset, testset = dataset_store.get_classification_problem(
        'convex')

    X_train = trainset.data.mem_data[0]
    y_train = trainset.data.mem_data[1]

    X_valid = validset.data.mem_data[0]
    y_valid = validset.data.mem_data[1]

    X_test = testset.data.mem_data[0]
    y_test = testset.data.mem_data[1]

    X_fulltrain = np.concatenate((X_train, X_valid))
    y_fulltrain = np.concatenate((y_train, y_valid))

    pca = PCA()
    X_train_pca = pca.fit_transform(X_fulltrain)
    X_test_pca = pca.transform(X_test)  # reuse the PCA fitted on the training data

    clfs = [MultinomialNB(), SVC(), KNeighborsClassifier(), SGDClassifier()]

    pca_clfs = [SVC(), KNeighborsClassifier(), SGDClassifier()]

    print("Convex\n")
    with open("convex_baselines.txt", 'w') as f:
        for clf in clfs:
            clf.fit(X_fulltrain, y_fulltrain)
            pred = clf.predict(X_test)
            score = metrics.f1_score(y_test, pred)
            print("Classifier: %s\nScore: %f\n" % (clf, score))
            f.write("Classifier: %s\nScore: %f\n\n" % (clf, score))
        for clf in pca_clfs:
            clf.fit(X_train_pca, y_fulltrain)
            pred = clf.predict(X_test_pca)
            score = metrics.f1_score(y_test, pred)
            print("Classifier: PCA + %s\nScore: %f\n" % (clf, score))
            f.write("Classifier: PCA + %s\nScore: %f\n\n" % (clf, score))
Code Example #7
def sklearn_convex( classifier, algorithm, max_evals=100, seed=1,
                    filename = 'none', preproc=[], loss=None ):

  
  global suppress_output
  if suppress_output:
    dump_file = None
  else:
    dump_file = filename+'.dump'
  
  estim = hyperopt_estimator( classifier=classifier, algo=algorithm,
                              preprocessing=preproc,
                              max_evals=max_evals, trial_timeout=240,
                              fit_increment_dump_filename=dump_file,
                              loss_fn=loss)
  
  filename = filename + '.out'

  dataset_store.download('convex')
  trainset,validset,testset = dataset_store.get_classification_problem('convex')

  X_train = trainset.data.mem_data[0]
  y_train = trainset.data.mem_data[1]
  
  X_valid = validset.data.mem_data[0]
  y_valid = validset.data.mem_data[1]
  
  X_test = testset.data.mem_data[0]
  y_test = testset.data.mem_data[1]

  X_fulltrain = np.concatenate((X_train, X_valid))
  y_fulltrain = np.concatenate((y_train, y_valid))

  print(y_train.shape)
  print(y_valid.shape)
  print(y_test.shape)
  
  #find_model( X_train, y_train, X_test, y_test, estim, filename )
  find_model( X_fulltrain, y_fulltrain, X_test, y_test, estim, filename )
Code Example #8
def get_dictionary():
    """
    Train the sparse coding model 
    and save the dictionary and params to
    a file.
    """
    sys.argv.pop(0)    # Remove the script name from the argument list

    # Check that all of the options expected from the parent script are present.
    if 5 != len(sys.argv):
        print "Usage: python run_sparse_code.py lr size L1 n_epochs seed"
        print ""
        print "Ex.: python run_sparse_code.py 0.1 20 0.1 5 1234"
        sys.exit()

    # Build the SparseCode constructor call from the command-line hyper-parameters
    str_ParamOption = "lr=" + sys.argv[0] + ", " + "size=" + sys.argv[1] + ", " + "L1=" + sys.argv[2] + ", " + "n_epochs=" + sys.argv[3] + ", " + "seed=" + sys.argv[4]
    str_ParamOptionValue = sys.argv[0] + "\t" + sys.argv[1] + "\t" + sys.argv[2] + "\t" + sys.argv[3] + "\t" + sys.argv[4]
    try:
        objectString = 'myObject = SparseCode(' + str_ParamOption + ')'
        exec objectString
        #code = compile(objectString, '<string>', 'exec')
        #exec code
    except Exception as inst:
        print "Error while instantiating SparseCode (required hyper-parameters are probably missing)"
        print inst

    print "Loading dataset..."
    trainset,validset,testset = dataset_store.get_classification_problem('ocr_letters')
    
    print "Training..."
    myObject.train(trainset)
    #Store the trained dictionary and the parameters to a file.
    pickle.dump((myObject.dictionary, myObject.lr, myObject.hidden_size, myObject.L1), open("Models/SC/dictionary%d.pkl"%experiment_number, 'wb'))

    myObject.show_filters()
Code Example #9
import os
import itertools
import numpy as np
import fcntl
import copy
from string import Template
import mlpython.datasets.store as dataset_store
import mlpython.mlproblems.generic as mlpb
from rbm import RBM
#from autoencoder import Autoencoder

print "Loading dataset..."
trainset,validset,testset = dataset_store.get_classification_problem('ocr_letters')
print "Train RBM for 10 iterations... (this might take a few minutes)"
rbm = RBM(n_epochs=10,
          hidden_size=200,
          lr=0.01,
          CDk=1,
          seed=1234)

rbm.train(mlpb.SubsetFieldsProblem(trainset))
rbm.show_filters()

Code Example #10
File: run_nnet.py Project: JonnyTran/ML-algorithms
# Build the NeuralNetwork constructor call from the command-line hyper-parameters
str_ParamOption = "lr=" + sys.argv[0] + ", " + "dc=" + sys.argv[1] + ", " + "sizes=" + sys.argv[2] + ", " + "L2=" + \
                  sys.argv[3] + ", " + "L1=" + sys.argv[4] + ", " + "seed=" + sys.argv[5] + ", " + "tanh=" + sys.argv[6]
str_ParamOptionValue = sys.argv[0] + "\t" + sys.argv[1] + "\t" + sys.argv[2] + "\t" + \
                       sys.argv[3] + "\t" + sys.argv[4] + "\t" + sys.argv[5] + "\t" + sys.argv[6]
try:
    objectString = 'myObject = NeuralNetwork(n_epochs=1,' + str_ParamOption + ')'
    exec objectString
    # code = compile(objectString, '<string>', 'exec')
    # exec code
except Exception as inst:
    print "Error while instantiating NeuralNetwork (required hyper-parameters are probably missing)"
    print inst

print "Loading dataset..."
trainset, validset, testset = dataset_store.get_classification_problem('ocr_letters')
print "Training..."
# Early stopping code
best_val_error = np.inf
best_it = 0
str_header = 'best_it\t'
look_ahead = 5
n_incr_error = 0
for stage in range(1, 500 + 1, 1):
    if not n_incr_error < look_ahead:
        break
    myObject.n_epochs = stage
    myObject.train(trainset)
    n_incr_error += 1
    outputs, costs = myObject.test(trainset)
    errors = np.mean(costs, axis=0)
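    # Hypothetical continuation of the truncated loop above: the usual early
    # stopping pattern would also evaluate on validset, keep track of the best
    # iteration, and reset the patience counter on improvement (this is an
    # assumption, not the project's actual code).
    outputs, costs = myObject.test(validset)
    valid_error = np.mean(costs, axis=0)[0]
    if valid_error < best_val_error:
        best_val_error = valid_error
        best_it = stage
        n_incr_error = 0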
Code Example #11
def get_representation():

    """
    Grab the dictionary, convert
    the datasets to a sparse representation and
    save them to a file.
    """

    # Load the dictionary and corresponding args.
    (dictionary, lr, hidden_size, L1) = pickle.load(open("Models/SC/dictionary%d.pkl"%experiment_number,'rb'))

    # Set the constructor
    myObject = SparseCode(lr,hidden_size,L1)

    print "Loading dataset..."
    trainset,validset,testset = dataset_store.get_classification_problem('ocr_letters')

    encoded_trainset = []
    trainset_out = []
    encoded_validset = []
    validset_out = []
    encoded_testset = []
    testset_out = []

    print "Initializing..."
    myObject.initialize_dictionary(dictionary)

    print "Encoding the trainset..."
    #counter = 0 #Inelegant, I know! I use this to only use the first 1000 values.
    for input,target in trainset:    
        #Run ISTA
        h = myObject.infer(input)
        encoded_trainset.append(h)
        trainset_out.append(target)

        # counter +=1
        # if counter == 1000:
        #     break

    # Save the datasets to files. 
    filename = "Models/SC/trainset%d.pkl"%(experiment_number)
    pickle.dump( np.asarray(encoded_trainset) , open(filename, 'wb'))
    filename = "Models/train_outputs.pkl"
    pickle.dump( np.asarray(trainset_out) , open(filename, 'wb'))

    #counter = 0
    print "Encoding the validset..."
    for input,target in validset:
        #Run ISTA
        h = myObject.infer(input)
        encoded_validset.append(h)
        validset_out.append(target)

        # counter +=1
        # if counter == 1000:
        #     break

    filename = "Models/SC/validset%d.pkl"%(experiment_number)
    pickle.dump( np.asarray(encoded_validset) , open(filename, 'wb'))
    filename = "Models/valid_outputs.pkl"
    pickle.dump( np.asarray(validset_out) , open(filename, 'wb'))

    # Note: only need to do this for the best hyper-parameters at the end.
    
    print "Encoding the testset..."
    for input,target in testset:
        #Run ISTA
        h = myObject.infer(input)
        encoded_testset.append(h)
        testset_out.append(target)        

    filename = "Models/SC/testset%d.pkl"%(experiment_number)
    pickle.dump( np.asarray(encoded_testset), open(filename, 'wb'))
    filename = "Models/test_outputs.pkl"
    pickle.dump( np.asarray(testset_out) , open(filename, 'wb'))