def run():
    """Run a 1-NN classifier on MNIST using cosine distance.

    Loads the dataset, converts it to Instance objects, predicts a label
    for each instance of a small test subset by scanning (part of) the
    training set, then reports a confusion matrix, overall accuracy and
    per-label precision/recall, plus the elapsed comparison time.
    """
    (train, undef, test) = data.load_data_wrapper()
    log("Training set : %8d instances", (len(train)))  # fixed "Trainning" typo
    log("Testing set : %8d instances", (len(test)))

    txt = "Converting %s set to Instance objects"
    log(txt, ("training"))
    train = [Instance(t[0], label_array=t[1]) for t in train]
    log(txt, ("test"))
    test = [Instance(t[0], label=t[1]) for t in test]

    instance = 1
    start_compare = time.time()

    # Iterate through (a subset of) the testing set.
    test_subset = test[0:10]
    for i in test_subset:
        log("Instance %d", (instance))

        # Find the closest instance among (the first 1000 of) the training set.
        closest_pair = train[0]
        # Renamed from 'max_dist': this variable tracks the MINIMUM distance.
        min_dist = cosine_dist(i, closest_pair)
        for j in train[1:1000]:
            dist = cosine_dist(i, j)
            if dist < min_dist:
                min_dist = dist
                closest_pair = j
            # A distance of exactly 0 cannot be improved on; stop early.
            if dist == 0:
                break
        i.predicted_label = closest_pair.label

        # BUG FIX: the original printed test[0]'s labels for every instance;
        # report the current instance's actual/predicted labels instead.
        log(">>> %d, actual : %s , predict : %s",
            (instance, i.label, i.predicted_label))
        instance += 1
    end_compare = time.time()

    # Compute confusion matrix, accuracy, and precision/recall per label.
    log("----- Confusion Matrix -----")
    matrix = confusion_matrix(test_subset)
    log("%s", (pandas.DataFrame(matrix)))
    log("Accuracy : %0.2f", (accuracy(matrix)))
    for i in range(NUM_LABEL):
        log("Label %d : precision: %.2f \t recall: %.2f",
            (i, precision(matrix, i), recall(matrix, i)))
    log("----------------")
    log("Time spent : %.0f sec", (end_compare - start_compare))
def run():
    """Project MNIST training samples to k dimensions with a random
    projection and write every pair's distance distortion to a CSV file.

    Command-line options:
        -k/--dimension : target dimensionality of the projection (default 50)
        -n/--data      : number of training samples to use (default 20)

    Output file: ``k-<k>.csv`` with columns Instance 1, Instance 2,
    Distortion, where distortion = ||a-b|| in k dims / ||a-b|| in d dims.
    """
    parser = OptionParser()
    parser.add_option("-k", "--dimension", dest="k", type="int", default=50)
    parser.add_option("-n", "--data", dest="n", type="int", default=20)
    (opt, args) = parser.parse_args()

    (train, validation, test) = data_loader.load_data_wrapper()
    train = train[:opt.n]
    dim = len(train[0][0])
    print("k: %d" % (opt.k))
    print("dimension: %d" % (dim))
    print("---")

    R = RandomProjection(opt.k, dim)
    train = [Sample(t[0], R.project(t[0]), t[1]) for t in train]

    total = 0
    # 'with' guarantees the file is closed even on error (replaces try/finally).
    with open("k-" + str(opt.k) + ".csv", 'wt') as f:
        writer = csv.writer(f)
        writer.writerow(('Instance 1', 'Instance 2', 'Distortion'))
        for i in range(len(train)):
            a = train[i]
            for j in range(i + 1, len(train)):
                # (Removed dead code 'if i == j: continue' — j starts at i+1,
                # so i can never equal j here.)
                total += 1
                b = train[j]
                dist_k_dim = np.linalg.norm(a.low_features - b.low_features)
                dist_d_dim = np.linalg.norm(a.features - b.features)
                # NOTE(review): if two samples are identical, dist_d_dim is 0
                # and numpy yields inf here — confirm duplicates cannot occur.
                distortion = dist_k_dim / dist_d_dim
                writer.writerow((i + 1, j + 1, "%.4f" % (distortion)))

    # total should be C(n, 2) unordered pairs (C(20, 2) = 190 with defaults).
    print(total)
import numpy as np
import mnist_dataloader, time

# load data set (training / validation / test splits)
training_data, validation_data, test_data = mnist_dataloader.load_data_wrapper()

start_time = time.time()

# compute length (Euclidean norm) of each instance in training_data;
# precomputed once so the cosine similarity below only needs inner products
training_data_lengths = [np.linalg.norm(training_instance[0]) for training_instance in training_data]
# compute the length of each instance in test_data
test_data_lengths = [np.linalg.norm(test_instance[0]) for test_instance in test_data]

# for i in range(1, 100):
#     print test_data_length[i]

# classify test_data by nearest-neighbour search with cosine similarity
classified_results = []
for test_instance, test_instance_index in zip(test_data, range(len(test_data))):
    # find the nearest training instance with cosine similarity;
    # track the best similarity and its training-set index seen so far
    maximal_cosine_similarity = -1
    maximal_cosine_similarity_index = 0
    for training_instance, training_instance_index in zip(training_data, range(len(training_data))):
        # compute the cosine similarity
        # first, compute the inner product (vectors flattened to 1-D)
        inner_product = np.inner(test_instance[0].reshape(-1), training_instance[0].reshape(-1))
        # divide by both precomputed norms to get the cosine similarity
        normalized_inner_product = inner_product / test_data_lengths[test_instance_index] / training_data_lengths[training_instance_index]
        # NOTE(review): this chunk appears truncated — normalized_inner_product
        # is computed but the maximal-similarity update and the filling of
        # classified_results are not visible here; confirm against the full file.
import numpy as np
import mnist_dataloader, time

# Load the MNIST dataset (training / validation / test splits).
training_data, validation_data, test_data = mnist_dataloader.load_data_wrapper()

start_time = time.time()

# Precompute the Euclidean norm of every instance so that cosine
# similarity later only requires inner products.
training_data_lengths = [np.linalg.norm(sample[0]) for sample in training_data]
test_data_lengths = [np.linalg.norm(sample[0]) for sample in test_data]

# Classify each test instance via nearest-neighbour search.
classified_results = []
for test_instance_index, test_instance in enumerate(test_data):
    # Best cosine similarity seen so far and the index of the matching
    # training instance; -1 is a lower bound for cosine similarity.
    maximal_cosine_similarity = -1
    maximal_cosine_similarity_index = 0
def accuracy(confusion_matrix):
    """Return overall accuracy (correct predictions / total instances).

    Rows are actual labels, columns predicted labels; diagonal entries
    are correct predictions. Generalized to iterate over the matrix's
    own size rather than the module-level NUM_LABEL constant — identical
    result when the matrix is NUM_LABEL x NUM_LABEL.
    """
    total_instance = 0
    correct_predict = 0
    for i in range(len(confusion_matrix)):
        total_instance += sum(confusion_matrix[i])
        correct_predict += confusion_matrix[i][i]
    return correct_predict * 1.0 / total_instance


def log(format, data=()):
    """Print ``format % data``. ('format' shadows the builtin, but is
    kept for backward compatibility with keyword callers.)"""
    text = format % data
    print(text)


if __name__ == '__main__':
    parser = OptionParser()
    # Option strings kept byte-identical: they are the CLI interface.
    parser.add_option("-d", "--dist_metric: 1.cosine 2.euclidean",
                      dest="dist_type", type="string", default="euclidean")
    parser.add_option("-r", "--reduce dimensions: Y(es) ,N(o)",
                      dest="reduce_dims_flag", type="string", default="yes")
    parser.add_option("-k", "--new_dimensions", dest="k", type="int", default=100)
    parser.add_option("-n", "--train_data_len", dest="n", type="int", default=1000)
    parser.add_option("-t", "--test_data_len", dest="t", type="int", default=100)
    (opt, args) = parser.parse_args()

    print("distance_metric: " + opt.dist_type)
    print("reduce_dims_flag: " + opt.reduce_dims_flag)
    # BUG FIX: corrected typos in the two messages below
    # ("lenght" -> "length", "dinmesnions" -> "dimensions").
    print("data length: " + str(opt.n))
    print("new dimensions: " + str(opt.k))

    (train, validation, test) = data_loader.load_data_wrapper(opt.n, opt.t)
    runNNClassifier(train, test, opt.dist_type,
                    opt.reduce_dims_flag.lower(), opt.k)