def matrixDemoTestWorker(size=None, dimensions=None, tree_type=None,
                         spill_rate=None, samp=None):
    this_run = dict()

    # Create a random data matrix
    N = size or 5000
    D = dimensions or 20
    tree = tree_type or 'kd'
    spill = spill_rate or .25
    samples = samp or 100

    # Python interprets spill_rate = 0 as False, which would reset spill
    # to .25 above, so restore the explicit zero here.
    if spill_rate == 0:
        spill = 0

    X = numpy.random.randn(N, D)

    # Random projection to liven up the data
    P = numpy.random.randn(D, D)
    X = numpy.dot(X, P)

    # Construct the type of tree specified with the spill specified.
    # Defaults are a KD-spill-tree with spill = 25%.
    print "Building tree...", tree, spill
    T = spatialtree(X, rule=tree, spill=spill)
    print "Done"

    # A height-0 tree does a brute-force linear scan, giving the true k-NN
    T_root = spatialtree(X, rule=tree, spill=spill, height=0)

    # Test recall on items already in the tree
    in_tree_count = 0
    in_tree_recall = 0
    for countvar in range(samples):
        rand = random.randint(0, N - 1)
        knn_a = T.k_nearest(X, k=10, index=rand)
        knn_t = T_root.k_nearest(X, k=10, index=rand)
        in_tree_count += 1
        in_tree_recall += (len(set(knn_a) & set(knn_t)) * 1.0
                           / len(set(knn_t)))

    index = 'in_' + tree + '_' + str(spill) + '_' + str(N) + '_' + str(D)
    this_run[index] = in_tree_recall / in_tree_count

    # We can also search with a new vector not already in the tree
    out_of_tree_count = 0
    out_of_tree_recall = 0
    for countvar in range(samples):
        query = numpy.dot(numpy.random.randn(D), P)
        knn_a = T.k_nearest(X, k=10, vector=query)
        knn_t = T_root.k_nearest(X, k=10, vector=query)
        out_of_tree_count += 1
        out_of_tree_recall += (len(set(knn_a) & set(knn_t)) * 1.0
                               / len(set(knn_t)))

    index = 'out_' + tree + '_' + str(spill) + '_' + str(N) + '_' + str(D)
    this_run[index] = out_of_tree_recall / out_of_tree_count

    return this_run
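# A minimal usage sketch for the worker above (the demo name is ours, and it
# assumes numpy, random, and spatialtree are imported at module level as the
# other snippets in this file do). It sweeps a few spill rates and merges the
# per-run recall dictionaries.
def demoRecallSweep():
    results = dict()
    for s in [0.0, 0.05, 0.25]:
        results.update(matrixDemoTestWorker(size=2000, dimensions=10,
                                            tree_type='kd', spill_rate=s))
    for key in sorted(results):
        print key, ':', results[key]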
def split_by_spatial_tree(X, n_anchors):
    '''Data partitioning via spatial trees.

    Dependency: http://cseweb.ucsd.edu/~naverma/SpatialTrees/index.html

    Args:
        X: matrix of data points
        n_anchors: number of partitions; should be a power of 2

    Returns:
        A: centroids of each spatial partition
    '''
    import numpy as np
    from spatialtree import spatialtree

    height = np.log2(n_anchors)
    height_int = np.int(height)
    if height_int != height:
        print "number of anchors is not a power of 2"

    # A zero-spill RP tree of depth log2(n_anchors) yields n_anchors leaves
    T = spatialtree(X, rule='rp', height=height_int, spill=0.0, min_items=1)

    A = np.zeros((n_anchors, X.shape[1]))
    c = 0
    for t in T.traverse():
        if t.isLeaf():
            indices = [index for index in t]
            A[c, :] = np.average(X[indices, :], axis=0)
            c += 1
    return A
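# A quick usage sketch for split_by_spatial_tree (hypothetical demo; assumes
# the spatialtree package is installed and n_anchors is a power of 2 so the
# leaf count matches the anchor count):
def demoSplitBySpatialTree():
    import numpy as np
    X = np.random.randn(1000, 8)
    A = split_by_spatial_tree(X, n_anchors=16)  # height-4 tree -> 16 leaves
    print A.shape  # (16, 8)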
def matrixDemoTestWorker(size=None, dimensions=None, tree_type=None,
                         spill_rate=None, samp=None, k_neighbors=None,
                         tree_depth=None, files=None):
    # Create testing variables
    tree = tree_type or 'kd'
    spill = spill_rate or .25
    samples = samp or 100
    k_near = k_neighbors or 1
    filename = files

    # Python interprets spill_rate = 0 as False, which would reset spill
    # to .25 above, so restore the explicit zero here.
    if spill_rate == 0:
        spill = 0

    # Read in the data
    X = list(csv_reader(filename))
    N = len(X)
    D = len(X[0])

    # Divide the data into 5 groups for cross-validation
    random.shuffle(X)
    Y = [[] for i in xrange(5)]
    for i, item in enumerate(X):
        Y[i % 5].append(item)

    # Build-time log, named after the tree rule
    timer = open(tree, 'w')

    # For each group, train on the other 4 groups and test on the remainder
    for item in Y:
        t = []
        for x in Y:
            if x != item:
                t.extend(x)
        training = numpy.array(t)

        print "Building tree...", tree, spill, tree_depth
        start_time = timeit.default_timer()
        T = spatialtree(training, rule=tree, spill=spill, height=tree_depth)
        elapsed = timeit.default_timer() - start_time
        timer.write(str(elapsed) + ",")

        # Height-0 tree over the training data (not X) for exact k-NN
        T_root = spatialtree(training, rule=tree, spill=spill, height=0)

        recall = 0
        index = tree + '_' + str(spill) + '_' + str(tree_depth)
        f = open(index + ".txt", 'w')
        for test in item:
            knn_a = T.k_nearest(training, k=k_near, vector=test)
            knn_t = T_root.k_nearest(training, k=k_near, vector=test)
            true_pos = len(set(knn_a) & set(knn_t)) * 1.0
            false_pos = len(set(knn_t)) - true_pos
            true_neg = len(training) - false_pos
            f.write(str(true_pos) + '-' + str(false_pos) + '-'
                    + str(true_neg) + ',')
            recall += true_pos / len(set(knn_t))
        f.write("\n")
        f.close()
    timer.close()

    # Return the number of entries and dimensions so they can be passed
    # on to data post-processing.
    return N, D
def matrixDemoTestWorker(size=None, dimensions=None, tree_type=None,
                         spill_rate=None, samp=None, k_neighbors=None,
                         tree_depth=None):
    # Create testing variables
    tree = tree_type or 'kd'
    spill = spill_rate or .25
    samples = samp or 100
    k_near = k_neighbors or 10

    # Python interprets spill_rate = 0 as False, which would reset spill
    # to .25 above, so restore the explicit zero here.
    if spill_rate == 0:
        spill = 0

    # To use synthetic data instead of a file, swap in:
    #   X = numpy.random.randn(size or 5000, dimensions or 20)
    #   X = numpy.dot(X, numpy.random.randn(X.shape[1], X.shape[1]))
    X = csv_reader("testdata.csv")
    D = len(X[0])
    N = len(X)
    P = numpy.random.randn(D, D)  # used below to project out-of-tree queries

    # Construct the type of tree specified with the spill specified.
    # Defaults are a KD-spill-tree with spill = 25%.
    print "Building tree...", tree, spill, tree_depth
    T = spatialtree(X, rule=tree, spill=spill, height=tree_depth)
    print "Done"
    T_root = spatialtree(X, rule=tree, spill=spill, height=0)

    print "Running tests..."

    # Test recall on items in the tree
    in_tree_recall = 0
    out_of_tree_recall = 0
    index = tree + '_' + str(spill) + '_' + str(tree_depth)
    f = open(index + ".txt", 'w')
    for countvar in range(samples):
        rand = random.randint(0, N - 1)
        knn_a = T.k_nearest(X, k=k_near, index=rand)
        knn_t = T_root.k_nearest(X, k=k_near, index=rand)
        value = len(set(knn_a) & set(knn_t)) * 1.0 / len(set(knn_t))
        f.write(str(value))
        if countvar != samples - 1:
            f.write(", ")
        in_tree_recall += value
    f.write("\n")

    # We can also search with a new vector not already in the tree
    for countvar in range(samples):
        query = numpy.dot(numpy.random.randn(D), P)
        knn_a = T.k_nearest(X, k=k_near, vector=query)
        knn_t = T_root.k_nearest(X, k=k_near, vector=query)
        value = len(set(knn_a) & set(knn_t)) * 1.0 / len(set(knn_t))
        f.write(str(value))
        if countvar != samples - 1:
            f.write(', ')
        out_of_tree_recall += value
    f.write("\n")
    f.close()

    print "in-tree recall\t", in_tree_recall * 1.0 / samples
    print "out-of-tree recall\t", out_of_tree_recall * 1.0 / samples
    print "Done"
def transferLearning(preface, files, outpath, undersample=False,
                     globalLocal=False, knn=False):
    for f1 in files:
        print("Starting transfer learning for: " + f1)

        # Pool every project except f1 into the training set
        # (alternatively: csv_reader_remove_duplicates_and_normalize)
        filedata = None
        for f2 in files:
            if f1 != f2:
                if filedata is None:
                    filedata = csv_reader2(preface + f2, mini=False)
                else:
                    filedata.extend(csv_reader2(preface + f2, mini=False))
        training = deepcopy(filedata)
        label = f1.split('.')[0]

        if undersample or globalLocal:
            bugs = [i for i, x in enumerate(training) if x[-1] > 0.5]
            nonbugs = [i for i, x in enumerate(training) if x[-1] < 0.5]
            if undersample:
                # Keep all buggy rows plus an equal-sized sample of clean rows
                undersampleTraining = [
                    filedata[x] for x in bugs + sample(nonbugs, len(bugs))
                ]
                bugs = [i for i, x in enumerate(undersampleTraining)
                        if x[-1] > 0.5]
                nonbugs = [i for i, x in enumerate(undersampleTraining)
                           if x[-1] < 0.5]
                training = deepcopy(undersampleTraining)

        testing = csv_reader2(preface + f1, mini=False)

        if undersample:
            label += '_Und'
        if globalLocal:
            label += '_GL'

        if not knn:
            outLogReg = open(outpath + label + "_Tran.txt", 'w')
            outGauss = open(outpath + label + "_Tran_Gauss.txt", 'w')
            outLogReg.write("Logistic Transfer,Logistic Transfer\n")
            outGauss.write("GaussianNB Transfer,GaussianNB Transfer\n")
        else:
            outKNN1 = open(outpath + label + "_Tran_KNN1.txt", 'w')
            outKNN1.write("KNN, KNN\n")
            outKNN3 = open(outpath + label + "_Tran_KNN3.txt", 'w')
            outKNN3.write("KNN, KNN\n")
            outKNN5 = open(outpath + label + "_Tran_KNN5.txt", 'w')
            outKNN5.write("KNN, KNN\n")
            outKNN10 = open(outpath + label + "_Tran_KNN10.txt", 'w')
            outKNN10.write("KNN, KNN\n")

        if globalLocal:
            # Index the training data (class column stripped) in a spill tree
            runfiledata = deepcopy(training)
            for row in runfiledata:
                del row[-1]
            runfiledata = numpy.array(runfiledata)
            nearest = min(max(len(runfiledata) // 10, 10),
                          len(runfiledata) // 3)
            T = spatialtree(runfiledata, spill=0.25, rule='kd')
            for test in testing:
                # Build a local training set around the test point
                miniTraining = [
                    training[k]
                    for k in T.k_nearest_with_both(
                        runfiledata, bugs, k=nearest, vector=test[:-1])
                    if k != -1
                ]
                if len(miniTraining) < 1:
                    continue
                if not knn:
                    naiveBayes(miniTraining, [test], outGauss)
                    logisticRegression(miniTraining, [test], outLogReg)
                else:
                    knna = T.k_nearest(runfiledata, k=10, vector=test[:-1])
                    predicted1 = 1 if round(sum(
                        [1 if training[kl][-1] > 0 else 0
                         for kl in knna[:1]])) else 0
                    predicted3 = 1 if round(sum(
                        [1 if training[kl][-1] > 0 else 0
                         for kl in knna[:3]]) / 3.0) else 0
                    predicted5 = 1 if round(sum(
                        [1 if training[kl][-1] > 0 else 0
                         for kl in knna[:5]]) / 5.0) else 0
                    predicted10 = 1 if round(sum(
                        [1 if training[kl][-1] > 0 else 0
                         for kl in knna]) / 10.0) else 0
                    actual = 1 if test[-1] > 0.5 else 0
                    outKNN1.write(str(actual) + ',' + str(predicted1) + "\n")
                    outKNN3.write(str(actual) + ',' + str(predicted3) + "\n")
                    outKNN5.write(str(actual) + ',' + str(predicted5) + "\n")
                    outKNN10.write(str(actual) + ',' + str(predicted10) + "\n")
        else:
            if not knn:
                naiveBayes(training, testing, outGauss)
                logisticRegression(training, testing, outLogReg)
            else:
                runfiledata = deepcopy(training)
                for row in runfiledata:
                    del row[-1]
                runfiledata = numpy.array(runfiledata)
                T = spatialtree(runfiledata, spill=0.25, rule='kd')
                for test in testing:
                    knna = T.k_nearest(runfiledata, k=10, vector=test[:-1])
                    predicted1 = 1 if round(sum(
                        [1 if training[kl][-1] > 0 else 0
                         for kl in knna[:1]])) else 0
                    predicted3 = 1 if round(sum(
                        [1 if training[kl][-1] > 0 else 0
                         for kl in knna[:3]]) / 3.0) else 0
                    predicted5 = 1 if round(sum(
                        [1 if training[kl][-1] > 0 else 0
                         for kl in knna[:5]]) / 5.0) else 0
                    predicted10 = 1 if round(sum(
                        [1 if training[kl][-1] > 0 else 0
                         for kl in knna]) / 10.0) else 0
                    actual = 1 if test[-1] > 0.5 else 0
                    outKNN1.write(str(actual) + ',' + str(predicted1) + "\n")
                    outKNN3.write(str(actual) + ',' + str(predicted3) + "\n")
                    outKNN5.write(str(actual) + ',' + str(predicted5) + "\n")
                    outKNN10.write(str(actual) + ',' + str(predicted10) + "\n")
def regularLearning(preface, files, outpath, undersample=False,
                    globalLocal=False, knn=False):
    for f1 in files:
        print("Starting regular learning for: " + f1)
        # (alternatively: csv_reader_remove_duplicates_and_normalize)
        filedata = csv_reader2(preface + f1, mini=False)

        # 25 shuffled 80/20 splits; a split with no bugs or no non-bugs is
        # degenerate, so reshuffle and retry instead of counting it
        retries = 0
        k = 0
        while k < 25:
            shuffle(filedata)
            index = len(filedata) // 5
            training = deepcopy(filedata[index:])

            if undersample or globalLocal:
                bugs = [i for i, x in enumerate(training) if x[-1] > 0.5]
                nonbugs = [i for i, x in enumerate(training) if x[-1] < 0.5]
                if len(bugs) == 0 or len(nonbugs) == 0:
                    retries += 1
                    continue
                if undersample:
                    undersampleTraining = [
                        filedata[x]
                        for x in bugs + sample(nonbugs,
                                               min(len(bugs), len(nonbugs)))
                    ]
                    bugs = [i for i, x in enumerate(undersampleTraining)
                            if x[-1] > 0.5]
                    nonbugs = [i for i, x in enumerate(undersampleTraining)
                               if x[-1] < 0.5]
                    training = deepcopy(undersampleTraining)

            testing = deepcopy(filedata[:index])
            bugs = [i for i, x in enumerate(training) if x[-1] > 0.5]
            nonbugs = [i for i, x in enumerate(training) if x[-1] < 0.5]
            if len(bugs) == 0 or len(nonbugs) == 0:
                retries += 1
                continue

            label = str(k) + "-" + f1.split('.')[0]
            if not os.path.exists(outpath):
                os.makedirs(outpath)
            if undersample:
                label += '_Und'
            if globalLocal:
                label += '_GL'

            if not knn:
                outLogReg = open(outpath + label + ".txt", 'w')
                outGauss = open(outpath + label + "_Gauss.txt", 'w')
                outLogReg.write("Logistic,Logistic\n")
                outGauss.write("GaussianNB,GaussianNB\n")
            else:
                outKNN1 = open(outpath + label + "_Tran_KNN1.txt", 'w')
                outKNN1.write("KNN, KNN\n")
                outKNN3 = open(outpath + label + "_Tran_KNN3.txt", 'w')
                outKNN3.write("KNN, KNN\n")
                outKNN5 = open(outpath + label + "_Tran_KNN5.txt", 'w')
                outKNN5.write("KNN, KNN\n")
                outKNN10 = open(outpath + label + "_Tran_KNN10.txt", 'w')
                outKNN10.write("KNN, KNN\n")

            if globalLocal:
                # Index the training data (class column stripped)
                runfiledata = deepcopy(training)
                for row in runfiledata:
                    del row[-1]
                nearest = min(max(len(runfiledata) // 10, 10),
                              len(runfiledata) // 3)
                runfiledata = numpy.array(runfiledata)
                T = spatialtree(runfiledata, spill=0.25, rule='kd', height=5)
                for test in testing:
                    x = T.k_nearest_with_both(runfiledata, bugs, k=nearest,
                                              vector=test[:-1])
                    if x == -1:
                        continue
                    miniTraining = [training[idx] for idx in x]
                    if len(miniTraining) < 1:
                        continue
                    if not knn:
                        naiveBayes(miniTraining, [test], outGauss)
                        logisticRegression(miniTraining, [test], outLogReg)
                    else:
                        knna = T.k_nearest(runfiledata, k=10,
                                           vector=test[:-1])
                        predicted1 = 1 if round(sum(
                            [1 if training[kl][-1] > 0 else 0
                             for kl in knna[:1]])) else 0
                        predicted3 = 1 if round(sum(
                            [1 if training[kl][-1] > 0 else 0
                             for kl in knna[:3]]) / 3.0) else 0
                        predicted5 = 1 if round(sum(
                            [1 if training[kl][-1] > 0 else 0
                             for kl in knna[:5]]) / 5.0) else 0
                        predicted10 = 1 if round(sum(
                            [1 if training[kl][-1] > 0 else 0
                             for kl in knna]) / 10.0) else 0
                        actual = 1 if test[-1] > 0.5 else 0
                        outKNN1.write(str(actual) + ',' + str(predicted1) + "\n")
                        outKNN3.write(str(actual) + ',' + str(predicted3) + "\n")
                        outKNN5.write(str(actual) + ',' + str(predicted5) + "\n")
                        outKNN10.write(str(actual) + ',' + str(predicted10) + "\n")
            else:
                if not knn:
                    naiveBayes(training, testing, outGauss)
                    logisticRegression(training, testing, outLogReg)
                else:
                    runfiledata = deepcopy(training)
                    for row in runfiledata:
                        del row[-1]
                    runfiledata = numpy.array(runfiledata)
                    T = spatialtree(runfiledata, spill=0.25, rule='kd')
                    for test in testing:
                        knna = T.k_nearest(runfiledata, k=10,
                                           vector=test[:-1])
                        predicted1 = 1 if round(sum(
                            [1 if training[kl][-1] > 0 else 0
                             for kl in knna[:1]])) else 0
                        predicted3 = 1 if round(sum(
                            [1 if training[kl][-1] > 0 else 0
                             for kl in knna[:3]]) / 3.0) else 0
                        predicted5 = 1 if round(sum(
                            [1 if training[kl][-1] > 0 else 0
                             for kl in knna[:5]]) / 5.0) else 0
                        predicted10 = 1 if round(sum(
                            [1 if training[kl][-1] > 0 else 0
                             for kl in knna]) / 10.0) else 0
                        actual = 1 if test[-1] > 0.5 else 0
                        outKNN1.write(str(actual) + ',' + str(predicted1) + "\n")
                        outKNN3.write(str(actual) + ',' + str(predicted3) + "\n")
                        outKNN5.write(str(actual) + ',' + str(predicted5) + "\n")
                        outKNN10.write(str(actual) + ',' + str(predicted10) + "\n")
            k += 1
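# The predicted1/3/5/10 blocks in transferLearning and regularLearning repeat
# the same majority vote at different neighborhood sizes. A sketch of how that
# logic could be factored out (hypothetical helper, not part of the original
# code), e.g. predicted3 = majorityVote(training, knna, 3):
def majorityVote(training, knna, n):
    # Vote over the first n approximate neighbors: label 1 if at least half
    # of them are buggy (class value > 0), else 0.
    votes = sum([1 if training[kl][-1] > 0 else 0 for kl in knna[:n]])
    return 1 if round(votes / float(n)) else 0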
for i in xrange(N):
    # Let's use string-valued keys
    X['%04x' % i] = newpoint()

# Let's make a few distinguished points
X['Alice'] = newpoint()
X['Bob'] = newpoint()
X['Carol'] = newpoint()
print 'done.'

# Construct a tree. Let's use a 2-means tree with spill percentage 0.3
print 'Building tree...'
T = spatialtree(X, rule='2-means', spill=0.3)
print 'done.'

# Show some stats
print '# items in tree : ', len(T)
print 'Dimensionality : ', T.getDimension()
print 'Height of tree : ', T.getHeight()
print 'Spill percentage : ', T.getSpill()
print 'Split rule : ', T.getRule()

# Let's find the nearest neighbors of Bob:
knn_bob = T.k_nearest(X, k=10, index='Bob')
print 'KNN(Bob) : ', knn_bob

# Or of a random vector:
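# A plausible continuation of the truncated demo above: query with a random
# vector of matching dimensionality (assumes numpy is imported at module
# level, as in the other snippets here):
knn_rand = T.k_nearest(X, k=10, vector=numpy.random.randn(T.getDimension()))
print 'KNN(random vector) : ', knn_rand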
# First, create a random data matrix
N = 5000
D = 20
X = numpy.random.randn(N, D)

# Apply a random projection so the data's not totally boring
P = numpy.random.randn(D, D)
X = numpy.dot(X, P)

# Construct a tree. This time, we'll use a random projection tree of height 10
print 'Building tree...'
T = spatialtree(X, rule='rp', height=10)
print 'done.'

# Show some useful information about the tree
print '# items in tree : ', len(T)
print 'Dimensionality : ', T.getDimension()
print 'Height of tree : ', T.getHeight()
print 'Spill percentage : ', T.getSpill()
print 'Split rule : ', T.getRule()

# By default, spatialtree retains index information at every level
# throughout the tree. This facilitates pruning and other dynamic
# modifications to the tree. However, if your data set and tree
# are static (after construction of the tree), then we can make a more
# space-efficient data structure by using an inverted map.
def matrixDemoTestWorker(trials=None, tree_type=None, spill_rate=None,
                         k_neighbors=None, tree_depth=None, files=None):
    run = 0

    # Create testing variables
    tree = tree_type or 'kd'
    spill = spill_rate or .25
    k_near = k_neighbors or 10
    trials = trials or 1
    filename = files

    # Python interprets spill_rate = 0 as False, which would reset spill
    # to .25 above, so restore the explicit zero here.
    if spill_rate == 0:
        spill = 0

    # Read in the data: full rows plus an eigenvector-reduced version
    filedata, reduceddata, nothing = csv_reader2(filename, mini=True)
    if reduceddata == "error" and tree == 'entropic':
        print("insufficient eigenvectors for entropic")
        return

    N = len(filedata)
    D = len(filedata[0])

    # Get only the filename from the path
    displayfile = os.path.basename(filename).split(".")[0]
    index = ('k_' + str(k_near) + '_' + str(spill) + '_' + str(tree_depth)
             + "_" + str(displayfile) + '_' + tree)

    # Create a folder for the output files
    basepath = '.\\datafiles\\'
    if not os.path.exists(basepath):
        os.makedirs(basepath)
    path = basepath + index

    timerfile = open(path + "_times.txt", 'w')

    for runs in range(trials):
        # Shuffle the two views together so rows stay aligned
        c = list(zip(filedata, reduceddata))
        random.shuffle(c)
        filedata, reduceddata = zip(*c)

        # Make a copy of filedata without the class column so nothing
        # gets split on the class label
        runfiledata = deepcopy(filedata)
        for row in runfiledata:
            del row[-1]

        # Divide the data into 5 groups for cross-validation
        validationGroups = [[] for i in range(5)]
        evectValidationGroups = [[] for i in range(5)]
        testingValidationGroups = [[] for i in range(5)]
        for i, item in enumerate(runfiledata):
            validationGroups[i % 5].append(item)
        for i, item in enumerate(filedata):
            testingValidationGroups[i % 5].append(item)
        for i, item in enumerate(reduceddata):
            evectValidationGroups[i % 5].append(item)

        for k in range(len(validationGroups)):
            run += 1
            index = (str(run) + '-k_' + str(k_near) + '_' + str(spill)
                     + '_' + str(tree_depth) + "_" + str(displayfile)
                     + '_' + tree)
            path = basepath + index
            f = open(path + ".txt", 'w')
            f.write(displayfile + "," + index + "\n")

            t = []
            for x in range(len(evectValidationGroups)):
                if x != k:
                    t.extend(evectValidationGroups[x])
            trainingevect = numpy.array(t)

            t2 = []
            for x in range(len(validationGroups)):
                if x != k:
                    t2.extend(validationGroups[x])
            training = numpy.array(t2)

            t3 = []
            for x in range(len(testingValidationGroups)):
                if x != k:
                    t3.extend(testingValidationGroups[x])
            testing = numpy.array(t3)

            # The entropic rule is built over the eigenvector-reduced data
            if tree == 'entropic':
                with timer(timerfile):
                    T = spatialtree(trainingevect, spill=spill, rule=tree,
                                    height=tree_depth)
            else:
                with timer(timerfile):
                    T = spatialtree(training, spill=spill, rule=tree,
                                    height=tree_depth)

            # Generate test points from the held-out set
            for test_point in range(len(validationGroups[k])):
                test = validationGroups[k][test_point]
                testevect = evectValidationGroups[k][test_point]
                # Actual classification
                actual = testingValidationGroups[k][test_point][-1]

                # Find approximate k-NN
                if tree != 'entropic':
                    knn_approx = T.k_nearest(training, k=k_near, vector=test)
                else:
                    knn_approx = T.k_nearest(trainingevect, k=k_near,
                                             vector=testevect)

                # Predicted classification: class labels are 1 or 0, so sum,
                # divide by n, and round to 0 or 1 for a majority vote
                predicted = round(sum(
                    [1 if testing[kl][-1] > 0 else 0
                     for kl in knn_approx]) / float(k_near))

                # Datasets record bug counts; an average over 1/2 means buggy
                if predicted >= 0.5 and actual >= 0.5:
                    predicted = actual = 1
                else:
                    predicted = int(max(min(predicted, 1), 0))
                    actual = int(max(min(actual, 1), 0))

                f.write(str(actual) + "," + str(predicted) + "\n")
            f.close()
    timerfile.close()
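# The `with timer(timerfile):` blocks above rely on a context manager that is
# not shown in this file. A minimal sketch of what it might look like
# (hypothetical helper, built on timeit.default_timer as the other workers
# here use):
from contextlib import contextmanager

@contextmanager
def timer(outfile):
    start = timeit.default_timer()
    yield
    # Append the elapsed build time, comma-separated, to the times file
    outfile.write(str(timeit.default_timer() - start) + ",")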
N = 5000
D = 20
X = numpy.random.randn(N, D)

# Apply a random projection so the data's not totally boring
P = numpy.random.randn(D, D)
X = numpy.dot(X, P)

# Construct a tree. By default, we get a KD-spill-tree with height
# determined automatically, and spill = 25%
print 'Building tree...'
T = spatialtree(X)
print 'done.'

# Show some useful information about the tree
print '# items in tree : ', len(T)
print 'Dimensionality : ', T.getDimension()
print 'Height of tree : ', T.getHeight()
print 'Spill percentage : ', T.getSpill()
print 'Split rule : ', T.getRule()

# If we want to compare accuracy against brute-force search,
# we can make a height=0 tree:
T_root = spatialtree(X, height=0)

# Find the 10 approximate nearest neighbors of the 500th data point;
# the returned list is row #'s of X closest to the query index,
# sorted by increasing distance
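# A plausible completion of the truncated demo above, following the same
# query pattern as the other snippets in this file (the 500th data point
# is row index 499):
knn_a = T.k_nearest(X, k=10, index=499)
print 'KNN approx (index) : ', knn_a

# And the exact neighbors from the brute-force (height=0) tree:
knn_t = T_root.k_nearest(X, k=10, index=499)
print 'KNN true (index) : ', knn_t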
def matrixDemoTestWorker(size=None, dimensions=None, tree_type=None,
                         spill_rate=None, samp=None, k_neighbors=None,
                         tree_depth=None, files=None):
    # Create testing variables
    tree = tree_type or 'kd'
    spill = spill_rate or .25
    samples = samp or 100
    k_near = k_neighbors or 5
    filename = files

    # Python interprets spill_rate = 0 as False, which would reset spill
    # to .25 above, so restore the explicit zero here.
    if spill_rate == 0:
        spill = 0

    # Read in the data: full rows plus an eigenvector-reduced version
    filedata, reduceddata = csv_reader2(filename)
    N = len(filedata)
    D = len(filedata[0])

    # Shuffle the two views together so rows stay aligned
    c = list(zip(filedata, reduceddata))
    random.shuffle(c)
    filedata, reduceddata = zip(*c)

    # Divide the data into 5 groups for cross-validation
    Y = [[] for i in xrange(5)]
    Z = [[] for i in xrange(5)]
    for i, item in enumerate(filedata):
        Y[i % 5].append(item)
    for i, item in enumerate(reduceddata):
        Z[i % 5].append(item)

    # Hold out one group at random; train on the rest
    this = random.randint(0, 4)
    t = []
    for x in xrange(len(Z)):
        if x != this:
            t.extend(Z[x])
    training = numpy.array(t)
    t2 = []
    for x in xrange(len(Y)):
        if x != this:
            t2.extend(Y[x])
    training_real = numpy.array(t2)

    print 'Building tree...'
    start_time = timeit.default_timer()
    T = spatialtree(training, spill=spill, height=tree_depth, rule=tree)
    elapsed = timeit.default_timer() - start_time
    print("mine:\t" + str(elapsed))

    start_time = timeit.default_timer()
    T2 = spatialtree(training_real, spill=spill, height=tree_depth, rule=tree)
    elapsed = timeit.default_timer() - start_time
    print("theirs:\t" + str(elapsed))
    print 'done.'

    # If we want to compare accuracy against brute-force search,
    # we can make a height=0 tree:
    T_root = spatialtree(training_real, height=0)

    # Pick a test point from the held-out group
    test_point = random.randint(0, len(Y[this]) - 1)
    test2 = Y[this][test_point]
    test = Z[this][test_point]
    print(test2)
    print(test)

    # Find the k_near approximate nearest neighbors of the test vector;
    # the returned list is row #'s of the training data closest to the
    # query, sorted by increasing distance
    knn_a = T.k_nearest(training, k=k_near, vector=test)
    print 'KNN approx (reduced) : ', knn_a
    knn_b = T2.k_nearest(training_real, k=k_near, vector=test2)
    print 'KNN approx (full) : ', knn_b

    # Now, get the true nearest neighbors
    knn_t = T_root.k_nearest(training_real, k=2 * k_near, vector=test2)
    print 'KNN true (index) : ', knn_t
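# A small follow-up sketch: the approximate lists above could be scored
# against the exact list the same way the other workers in this file do
# (hypothetical helper):
def recallAgainstExact(knn_approx, knn_true):
    # Fraction of the exact neighbor set recovered by the approximate list
    return len(set(knn_approx) & set(knn_true)) * 1.0 / len(set(knn_true))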
# First, create a random data matrix
N = 5000
D = 20
X = numpy.random.randn(N, D)

# Apply a random projection so the data's not totally boring
P = numpy.random.randn(D, D)
X = numpy.dot(X, P)

# To read data from a file instead, swap in:
#   X = fileio.csv_reader("testdata.csv")
#   D = len(X[0])
#   N = len(X)
#   P = numpy.random.randn(D, D)

# Construct a tree. Here we use a 2-means tree with spill = 1%
# (the default would be a KD-spill-tree with automatic height, spill = 25%)
print 'Building tree...'
T = spatialtree(X, spill=.01, rule='2-means')
print 'done.'

# Show some useful information about the tree
print '# items in tree : ', len(T)
print 'Dimensionality : ', T.getDimension()
print 'Height of tree : ', T.getHeight()
print 'Spill percentage : ', T.getSpill()
print 'Split rule : ', T.getRule()

# If we want to compare accuracy against brute-force search,
# we can make a height=0 tree:
T_root = spatialtree(X, height=0)

# Find the 10 approximate nearest neighbors of the 500th data point;
# the returned list is row #'s of X closest to the query index,
# sorted by increasing distance