Example #1
import random

import numpy
from spatialtree import spatialtree


def matrixDemoTestWorker(size=None,
                         dimensions=None,
                         tree_type=None,
                         spill_rate=None,
                         samp=None):
    this_run = dict()

    # Create random matrix
    N = size or 5000
    D = dimensions or 20
    tree = tree_type or 'kd'
    spill = spill_rate or .25
    samples = samp or 100
    X = numpy.random.randn(N, D)

    # Random projection to liven up the data
    P = numpy.random.randn(D, D)
    X = numpy.dot(X, P)

    # Construct the type of tree specified with spill specified.
    # Defaults are KD-spill-tree with spill = 25%
    print "Building tree...", tree, spill
    T = spatialtree(X, rule=tree, spill=spill)
    print "Done"

    T_root = spatialtree(X, rule=tree, spill=spill, height=0)

    # Test recall on items in tree
    in_tree_count = 0
    in_tree_recall = 0

    for countvar in range(samples):
        rand = random.randint(0, N - 1)  # any point already indexed by the tree
        knn_a = T.k_nearest(X, k=10, index=rand)
        knn_t = T_root.k_nearest(X, k=10, index=rand)
        in_tree_count += 1
        in_tree_recall += (len(set(knn_a) & set(knn_t)) * 1.0 /
                           len(set(knn_t)))

    index = 'in_' + tree + '_' + str(spill) + '_' + str(N) + '_' + str(D)
    this_run[index] = in_tree_recall / in_tree_count

    # We can also search with a new vector not already in the tree
    out_of_tree_count = 0
    out_of_tree_recall = 0

    for countvar in range(samples):
        query = numpy.dot(numpy.random.randn(D), P)
        knn_a = T.k_nearest(X, k=10, vector=query)
        knn_t = T_root.k_nearest(X, k=10, vector=query)
        out_of_tree_count += 1
        out_of_tree_recall += (len(set(knn_a) & set(knn_t)) * 1.0 /
                               len(set(knn_t)))

    index = 'out_' + tree + '_' + str(spill) + '_' + str(N) + '_' + str(D)
    this_run[index] = out_of_tree_recall / out_of_tree_count

    return this_run
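
A minimal invocation sketch (the argument values here are illustrative, not from the original project):

results = matrixDemoTestWorker(size=1000, dimensions=10,
                               tree_type='rp', spill_rate=0.1)
print results  # {'in_rp_0.1_1000_10': ..., 'out_rp_0.1_1000_10': ...}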
Example #2
def split_by_spatial_tree(X, n_anchors):
    '''Data partitioning via spatial trees.
    Dependency: http://cseweb.ucsd.edu/~naverma/SpatialTrees/index.html

    Args:
        X: matrix of data points
        n_anchors: number of partitions (should be a power of 2)

    Returns:
        A: centroid of each spatial partition
    '''
    import numpy as np
    from spatialtree import spatialtree

    height = np.log2(n_anchors)
    height_int = int(height)
    if height_int != height:
        print "number of anchors is not a power of 2"

    T = spatialtree(X, rule='rp', height=height_int, spill=0.0, min_items=1)

    A = np.zeros((n_anchors, X.shape[1]))

    c = 0
    for t in T.traverse():
        if t.isLeaf():
            indices = list(t)  # a leaf node iterates over the row indices it holds
            A[c, :] = np.average(X[indices, :], axis=0)
            c += 1

    return A
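
A short usage sketch, assuming the spatialtree package from the dependency link above is installed:

import numpy as np

X = np.random.randn(1024, 16)
A = split_by_spatial_tree(X, n_anchors=8)  # 8 = 2**3, so a height-3 tree
print A.shape                              # (8, 16): one centroid per leaf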
Example #3
def matrixDemoTestWorker(size=None,
                         dimensions=None,
                         tree_type=None,
                         spill_rate=None,
                         samp=None,
                         k_neighbors=None,
                         tree_depth=None,
                         files=None):
    #this_run = dict()
    this_run = [0 for i in xrange(2)]

    # Create random matrix
    N = size or 5000
    D = dimensions or 20

    # Create testing variables
    tree = tree_type or 'kd'
    spill = spill_rate or .25
    samples = samp or 100
    k_near = k_neighbors or 1
    #k = 5
    max_value = 100
    filename = files

    # Python interprets spill_layer = 0 as False, and so sets spill to .25
    if spill_rate == 0:
        spill = 0

    # read in the data
    X = list(csv_reader(filename))
    N = len(X)
    D = len(X[0])

    # divide the data into 5 groups for cross validation
    i = 0
    random.shuffle(X)

    Y = [[] for i in xrange(5)]
    for item in X:
        Y[i % 5].append(item)
        i += 1
    """
    for i in xrange(5):
        print(len(Y[i]))
    """
    timer = open(tree, 'w')  # timing log, named after the tree rule
    index = tree + '_' + str(spill) + '_' + str(tree_depth)
    # Open the results file once; reopening it with mode 'w' inside the fold
    # loop (as the original did) would overwrite earlier folds' output.
    f = open(index + ".txt", 'w')

    # For each fold, train on the other four groups and test on the remaining one.
    for item in Y:
        t = []
        for x in Y:
            if x != item:
                t.extend(x)
        training = numpy.array(t)
        print "Building tree...", tree, spill, tree_depth
        start_time = timeit.default_timer()
        T = spatialtree(training, rule=tree, spill=spill, height=tree_depth)
        elapsed = timeit.default_timer() - start_time
        timer.write(str(elapsed) + ",")
        #print "Done
        T_root = spatialtree(
            training, rule=tree, spill=spill,
            height=0)  # this should be training and not X, right?
        recall = 0
        for test in item:
            knn_a = T.k_nearest(training, k=k_near, vector=test)
            knn_t = T_root.k_nearest(training, k=k_near, vector=test)
            #true_pos = len(set(knn_a) & set(knn_t))*1.0/len(set(knn_t))
            true_pos = len(set(knn_a) & set(knn_t)) * 1.0
            false_pos = len(set(knn_t)) - true_pos
            true_neg = len(training) - false_pos
            f.write(
                str(true_pos) + '-' + str(false_pos) + '-' + str(true_neg) +
                ',')
            recall += true_pos / len(set(knn_t))
        #print_recall = recall*1.0/len(item)
        #f.write(str(print_recall)+',')
        #print(""+tree+" recall\t", print_recall)
        #results_memo[index] = print_recall

    f.write("\n")
    f.close()
    timer.close()
    return N, D  # number of rows and dimensions, passed on to data postprocessing
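
The `spill_rate or .25` pattern above is why the explicit `== 0` patch is needed: `or` treats a legitimate 0 as "missing". A minimal sketch of the usual `is None` idiom, which makes the patch unnecessary:

spill = .25 if spill_rate is None else spill_rate  # 0 now survives as a real value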
Example #4
def matrixDemoTestWorker(size=None,
                         dimensions=None,
                         tree_type=None,
                         spill_rate=None,
                         samp=None,
                         k_neighbors=None,
                         tree_depth=None):
    #this_run = dict()
    this_run = [0 for i in xrange(2)]
    
    # Create random matrix
    N = size or 5000
    D = dimensions or 20 
    
    # Create testing variables
    tree = tree_type or 'kd'
    spill = spill_rate or .25
    samples = samp or 100
    k_near = k_neighbors or 10
    k = 5
    max_value = 100
    filename = "testdata.csv"
    
    # Python interprets spill_layer = 0 as False, and so sets spill to .25
    if spill_rate == 0:
        spill = 0
    
    
    """
    X = numpy.random.randn(N, D)
    P = numpy.random.randn(D, D)
    X = numpy.dot(X, P)
    """
    
    # Data is read from a file here; to use a random matrix instead, swap
    # this block for the commented-out generator above.
    X = csv_reader(filename)
    D = len(X[0])
    N = len(X)
    P = numpy.random.randn(D, D)
    
    
    # Apply a few random projections so the data's not totally boring
    # Goal: embed k dimensional data in D-space
    """
    for i in xrange(k):
        P = numpy.random.randn(D, D)
        X = numpy.dot(X, P)
    """
    
    # Construct the type of tree specified with spill specified.
    # Defaults are KD-spill-tree with spill = 25%
    print "Building tree...", tree, spill, tree_depth
    T = spatialtree(X, rule=tree, spill=spill, height=tree_depth)
    print "Done"

    T_root = spatialtree(X, rule=tree, spill=spill, height=0)
    
    print("Running tests...")
    # Test recall on items in tree
    in_tree_recall = 0
    out_of_tree_recall = 0
    
    index = '' + tree + '_' + str(spill) + '_' + str(tree_depth)
    f = open(index+".txt", 'w')
    
    for countvar in range(samples):
        rand = random.randint(0, N - 1)
        knn_a = T.k_nearest(X, k=k_near, index=rand)
        knn_t = T_root.k_nearest(X, k=k_near, index=rand)
        value = len(set(knn_a) & set(knn_t))*1.0/len(set(knn_t))
        f.write(str(value))
        if countvar != samples-1:
            f.write(", ")
        in_tree_recall += value
      
        """       
        true_pos += len(set(knn_a) & set(knn_t))
        false_pos += in_tree_count - true_pos
        false_neg += in_tree_count - true_pos
        true_neg += samples - false_neg
        """
        
    f.write("\n")
        
    # We can also search with a new vector not already in the tree
    for countvar in range(samples):
        query = numpy.dot(numpy.random.randn(D), P)
        knn_a = T.k_nearest(X, k=k_near, vector=query)
        knn_t = T_root.k_nearest(X, k=k_near, vector=query)
        value = len(set(knn_a) & set(knn_t))*1.0/len(set(knn_t))
        #f.write(str(format(len(set(knn_a) & set(knn_t)) * 1.0 / len(set(knn_t)), '.4f')))
        f.write(str(value))
        if countvar != samples-1:
            f.write(', ')
        out_of_tree_recall += value
    f.write("\n")
    f.close()    
    
    print "in tree_recall\t", in_tree_recall*1.0/samples 
    print "out of tree_recall\t", out_of_tree_recall*1.0/samples 
    print("Done")
Example #5
def transferLearning(preface,
                     files,
                     outpath,
                     undersample=False,
                     globalLocal=False,
                     knn=False):
    for f1 in files:
        filedata = None
        print("Starting transfer learning for: " + f1)
        for f2 in files:
            if f1 != f2:
                if filedata is None:
                    filedata = csv_reader2(preface + f2, mini=False)
                    # filedata = csv_reader_remove_duplicates_and_normalize(preface+f2, mini=False)
                else:
                    filedata.extend(csv_reader2(preface + f2, mini=False))
                    # filedata.extend(csv_reader_remove_duplicates_and_normalize(preface+f2, mini=False))
        training = deepcopy(filedata)
        label = f1.split('.')[0]

        if undersample or globalLocal:
            bugs = [i for i, x in enumerate(training) if x[-1] > 0.5]
            nonbugs = [i for i, x in enumerate(training) if x[-1] < 0.5]
            if undersample:
                undersampleTraining = [
                    filedata[x] for x in bugs + sample(nonbugs, len(bugs))
                ]
                bugs = [
                    i for i, x in enumerate(undersampleTraining) if x[-1] > 0.5
                ]
                nonbugs = [
                    i for i, x in enumerate(undersampleTraining) if x[-1] < 0.5
                ]
                training = deepcopy(undersampleTraining)
        testing = csv_reader2(preface + f1, mini=False)

        if undersample:
            label += '_Und'
        if globalLocal:
            label += '_GL'
        if not knn:
            outLogReg = open(outpath + label + "_Tran.txt", 'w')
            outGauss = open(outpath + label + "_Tran_Gauss.txt", 'w')
            # outMultiNom = open(outpath + label + "_transfer_Multinom.txt", 'w')
            outLogReg.write("Logistic Transfer,Logistic Transfer\n")
            outGauss.write("GaussianNB Transfer,GaussianNB Transfer\n")
            # outMultiNom.write("MultinomialNB Transfer,MultinomialNB Transfer\n")
        else:
            outKNN1 = open(outpath + label + "_Tran_KNN1.txt", 'w')
            outKNN1.write("KNN, KNN\n")
            outKNN3 = open(outpath + label + "_Tran_KNN3.txt", 'w')
            outKNN3.write("KNN, KNN\n")
            outKNN5 = open(outpath + label + "_Tran_KNN5.txt", 'w')
            outKNN5.write("KNN, KNN\n")
            outKNN10 = open(outpath + label + "_Tran_KNN10.txt", 'w')
            outKNN10.write("KNN, KNN\n")

        if globalLocal:
            runfiledata = deepcopy(training)
            for row in runfiledata:
                del row[-1]
            runfiledata = numpy.array(runfiledata)
            nearest = min(max(len(runfiledata) // 10, 10),
                          len(runfiledata) // 3)
            T = spatialtree(runfiledata, spill=0.25, rule='kd')
            for test in testing:
                miniTraining = [
                    training[k] for k in T.k_nearest_with_both(
                        runfiledata, bugs, k=nearest, vector=test[:-1])
                    if k != -1
                ]
                if len(miniTraining) < 1:
                    continue
                if not knn:
                    naiveBayes(miniTraining, [test], outGauss)
                    logisticRegression(miniTraining, [test], outLogReg)
                else:
                    knna = T.k_nearest(runfiledata, k=10, vector=test[:-1])
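                    # Majority vote over the nearest 1/3/5/10 training labels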
                    predicted1 = 1 if round(
                        sum([
                            1 if training[kl][-1] > 0 else 0 for kl in knna[:1]
                        ])) else 0
                    predicted3 = 1 if round(
                        sum([
                            1 if training[kl][-1] > 0 else 0 for kl in knna[:3]
                        ]) / 3.0) else 0
                    predicted5 = 1 if round(
                        sum([
                            1 if training[kl][-1] > 0 else 0 for kl in knna[:5]
                        ]) / 5.0) else 0
                    predicted10 = 1 if round(
                        sum([1 if training[kl][-1] > 0 else 0
                             for kl in knna]) / 10.0) else 0
                    actual = 1 if test[-1] > 0.5 else 0
                    outKNN1.write(str(actual) + ',' + str(predicted1) + "\n")
                    outKNN3.write(str(actual) + ',' + str(predicted3) + "\n")
                    outKNN5.write(str(actual) + ',' + str(predicted5) + "\n")
                    outKNN10.write(str(actual) + ',' + str(predicted10) + "\n")
        else:
            if not knn:
                naiveBayes(training, testing, outGauss)
                logisticRegression(training, testing, outLogReg)
            else:
                runfiledata = deepcopy(training)
                for row in runfiledata:
                    del row[-1]
                runfiledata = numpy.array(runfiledata)
                if not globalLocal:
                    T = spatialtree(runfiledata, spill=0.25, rule='kd')
                for test in testing:
                    knna = T.k_nearest(runfiledata, k=10, vector=test[:-1])
                    predicted1 = 1 if round(
                        sum([
                            1 if training[kl][-1] > 0 else 0 for kl in knna[:1]
                        ])) else 0
                    predicted3 = 1 if round(
                        sum([
                            1 if training[kl][-1] > 0 else 0 for kl in knna[:3]
                        ]) / 3.0) else 0
                    predicted5 = 1 if round(
                        sum([
                            1 if training[kl][-1] > 0 else 0 for kl in knna[:5]
                        ]) / 5.0) else 0
                    predicted10 = 1 if round(
                        sum([1 if training[kl][-1] > 0 else 0
                             for kl in knna]) / 10.0) else 0
                    actual = 1 if test[-1] > 0.5 else 0
                    outKNN1.write(str(actual) + ',' + str(predicted1) + "\n")
                    outKNN3.write(str(actual) + ',' + str(predicted3) + "\n")
                    outKNN5.write(str(actual) + ',' + str(predicted5) + "\n")
                    outKNN10.write(str(actual) + ',' + str(predicted10) + "\n")
Example #6
def regularLearning(preface,
                    files,
                    outpath,
                    undersample=False,
                    globalLocal=False,
                    knn=False):
    for f1 in files:
        print("Starting regular learning for: " + f1)
        filedata = csv_reader2(preface + f1, mini=False)
        #filedata = csv_reader_remove_duplicates_and_normalize(preface+f1, mini=False)
        retries = 0

        # 25 cross-validation rounds.  The original `k -= 1` inside a range()
        # loop had no effect, so degenerate shuffles were silently skipped;
        # a while loop lets a round be reshuffled and actually retried.
        k = 0
        while k < 25 and retries < 100:  # cap retries so degenerate data cannot loop forever
            shuffle(filedata)
            index = len(filedata) // 5
            training = deepcopy(filedata[index:])
            if undersample or globalLocal:
                bugs = [i for i, x in enumerate(training) if x[-1] > 0.5]
                nonbugs = [i for i, x in enumerate(training) if x[-1] < 0.5]
                if len(bugs) == 0 or len(nonbugs) == 0:
                    retries += 1
                    continue
                if undersample:
                    undersampleTraining = [
                        filedata[x] for x in bugs +
                        sample(nonbugs, min(len(bugs), len(nonbugs)))
                    ]
                    bugs = [
                        i for i, x in enumerate(undersampleTraining)
                        if x[-1] > 0.5
                    ]
                    nonbugs = [
                        i for i, x in enumerate(undersampleTraining)
                        if x[-1] < 0.5
                    ]
                    training = deepcopy(undersampleTraining)
            testing = deepcopy(filedata[:index])
            bugs = [i for i, x in enumerate(training) if x[-1] > 0.5]
            nonbugs = [i for i, x in enumerate(training) if x[-1] < 0.5]
            if len(bugs) == 0 or len(nonbugs) == 0:
                retries += 1
                continue

            label = str(k) + "-" + f1.split('.')[0]

            if not os.path.exists(outpath):
                os.makedirs(outpath)
            if undersample:
                label += '_Und'
            if globalLocal:
                label += '_GL'
            if not knn:
                outLogReg = open(outpath + label + ".txt", 'w')
                outGauss = open(outpath + label + "_Gauss.txt", 'w')
                #outMultiNom = open(outpath + label + "Multinom.txt", 'w')
                outLogReg.write("Logistic,Logistic\n")
                outGauss.write("GaussianNB,GaussianNB\n")
                #outMultiNom.write("MultinomialNB,MultinomialNB\n")
            else:
                outKNN1 = open(outpath + label + "_Tran_KNN1.txt", 'w')
                outKNN1.write("KNN, KNN\n")
                outKNN3 = open(outpath + label + "_Tran_KNN3.txt", 'w')
                outKNN3.write("KNN, KNN\n")
                outKNN5 = open(outpath + label + "_Tran_KNN5.txt", 'w')
                outKNN5.write("KNN, KNN\n")
                outKNN10 = open(outpath + label + "_Tran_KNN10.txt", 'w')
                outKNN10.write("KNN, KNN\n")

            if globalLocal:
                runfiledata = deepcopy(training)
                for row in runfiledata:
                    del row[-1]
                nearest = min(max(len(runfiledata) // 10, 10),
                              len(runfiledata) // 3)
                runfiledata = numpy.array(runfiledata)
                T = spatialtree(runfiledata, spill=0.25, rule='kd', height=5)
                for test in testing:
                    x = T.k_nearest_with_both(runfiledata,
                                              bugs,
                                              k=nearest,
                                              vector=test[:-1])
                    if x != -1:
                        miniTraining = [training[idx] for idx in x]
                    else:
                        continue
                    """
                    miniTraining = [training[k] for k in T.k_nearest_with_both(runfiledata, bugs, k=nearest, vector=test[:-1]) if k != -1]
                    if len(miniTraining) < 1:
                        continue
                    """
                    if not knn:
                        naiveBayes(miniTraining, [test], outGauss)
                        logisticRegression(miniTraining, [test], outLogReg)
                    else:
                        knna = T.k_nearest(runfiledata, k=10, vector=test[:-1])
                        predicted1 = 1 if round(
                            sum([
                                1 if training[kl][-1] > 0 else 0
                                for kl in knna[:1]
                            ])) else 0
                        predicted3 = 1 if round(
                            sum([
                                1 if training[kl][-1] > 0 else 0
                                for kl in knna[:3]
                            ]) / 3.0) else 0
                        predicted5 = 1 if round(
                            sum([
                                1 if training[kl][-1] > 0 else 0
                                for kl in knna[:5]
                            ]) / 5.0) else 0
                        predicted10 = 1 if round(
                            sum([
                                1 if training[kl][-1] > 0 else 0 for kl in knna
                            ]) / 10.0) else 0
                        actual = 1 if test[-1] > 0.5 else 0
                        outKNN1.write(
                            str(actual) + ',' + str(predicted1) + "\n")
                        outKNN3.write(
                            str(actual) + ',' + str(predicted3) + "\n")
                        outKNN5.write(
                            str(actual) + ',' + str(predicted5) + "\n")
                        outKNN10.write(
                            str(actual) + ',' + str(predicted10) + "\n")
            else:
                if not knn:
                    naiveBayes(training, testing, outGauss)
                    logisticRegression(training, testing, outLogReg)
                else:
                    runfiledata = deepcopy(training)
                    for row in runfiledata:
                        del row[-1]
                    runfiledata = numpy.array(runfiledata)
                    if not globalLocal:
                        T = spatialtree(runfiledata, spill=0.25, rule='kd')
                    for test in testing:
                        knna = T.k_nearest(runfiledata, k=10, vector=test[:-1])
                        predicted1 = 1 if round(
                            sum([
                                1 if training[kl][-1] > 0 else 0
                                for kl in knna[:1]
                            ])) else 0
                        predicted3 = 1 if round(
                            sum([
                                1 if training[kl][-1] > 0 else 0
                                for kl in knna[:3]
                            ]) / 3.0) else 0
                        predicted5 = 1 if round(
                            sum([
                                1 if training[kl][-1] > 0 else 0
                                for kl in knna[:5]
                            ]) / 5.0) else 0
                        predicted10 = 1 if round(
                            sum([
                                1 if training[kl][-1] > 0 else 0 for kl in knna
                            ]) / 10.0) else 0
                        actual = 1 if test[-1] > 0.5 else 0
                        outKNN1.write(
                            str(actual) + ',' + str(predicted1) + "\n")
                        outKNN3.write(
                            str(actual) + ',' + str(predicted3) + "\n")
                        outKNN5.write(
                            str(actual) + ',' + str(predicted5) + "\n")
                        outKNN10.write(
                            str(actual) + ',' + str(predicted10) + "\n")

            k += 1  # advance only on a successfully completed round
Example #7
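# Fragment: this excerpt assumes earlier setup defining N (number of points),
# a dict X mapping keys to vectors, and a newpoint() helper that returns a
# random vector.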
for i in xrange(N):
    # Let's use string-valued keys
    X['%04x' % i] = newpoint()

# Let's make a few distinguished points
X['Alice']  = newpoint()
X['Bob']    = newpoint()
X['Carol']  = newpoint()

print 'done.'

# Construct a tree.  Let's use a 2-means tree with spill percentage 0.3
print 'Building tree...'
T = spatialtree(X, rule='2-means', spill=0.3)
print 'done.'


# Show some stats
print '# items in tree    : ', len(T)
print 'Dimensionality     : ', T.getDimension()
print 'Height of tree     : ', T.getHeight()
print 'Spill percentage   : ', T.getSpill()
print 'Split rule         : ', T.getRule()

# Let's find the nearest neighbors of bob:
knn_bob = T.k_nearest(X, k=10, index='Bob')
print 'KNN(Bob)           : ', knn_bob

# Or of a random vector:
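# (the snippet is truncated here; a plausible completion, reusing the assumed
# newpoint() helper, would be:)
knn_rand = T.k_nearest(X, k=10, vector=newpoint())
print 'KNN(random)        : ', knn_rand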
Example #8
import numpy
from spatialtree import spatialtree

# First, create a random data matrix
N = 5000
D = 20

X = numpy.random.randn(N, D)


# Apply a random projection so the data's not totally boring
P = numpy.random.randn(D, D)

X = numpy.dot(X, P)

# Construct a tree.  This time, we'll use a random projection tree of height 10

print 'Building tree...'
T = spatialtree(X, rule='rp', height=10)
print 'done.'

# Show some useful information about the tree
print '# items in tree    : ', len(T)
print 'Dimensionality     : ', T.getDimension()
print 'Height of tree     : ', T.getHeight()
print 'Spill percentage   : ', T.getSpill()
print 'Split rule         : ', T.getRule()

#
# By default, spatialtree retains index information at every level 
# throughout the tree.  This facilitates pruning and other dynamic 
# modifications to the tree.  However, if your data set and tree
# are static (after construction of the tree), then we can make a more
# space-efficient data structure by using an inverted map.
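# (truncated here, and the library call for that conversion is not shown in
# this snippet.  As a rough illustration of the idea using the traversal API
# from Example #2, a hand-built map from item index to its leaf nodes might
# look like:)
inverted = {}
for node in T.traverse():
    if node.isLeaf():
        for i in node:  # a leaf node iterates over the row indices it holds
            inverted.setdefault(i, []).append(node)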
Example #9
def matrixDemoTestWorker(trials=None,
                         tree_type=None,
                         spill_rate=None,
                         k_neighbors=None,
                         tree_depth=None,
                         files=None):
    run = 0
    # Create testing variables
    tree = tree_type or 'kd'
    spill = spill_rate or .25
    k_near = k_neighbors or 10
    depth = tree_depth or 0
    filename = files

    # Python interprets spill_layer = 0 as False, and so sets spill to .25
    if spill_rate == 0:
        spill = 0

    # read in the data
    filedata, reduceddata, _ = csv_reader2(filename, mini=True)
    if reduceddata == "error" and tree == 'entropic':
        print("insufficient eigenvectors for entropic")
        return
    N = len(filedata)
    D = len(filedata[0])

    # get only the filename from the path
    displayfile = os.path.basename(filename).split(".")[0]
    index = 'k_' + str(k_near) + '_' + str(spill) + '_' + str(
        tree_depth) + "_" + str(displayfile) + '_' + tree
    basepath = '.\\datafiles\\'  # Windows-style path; use os.path.join for portability

    #create a folder for it, then put all this stuff in.
    if not os.path.exists(basepath):
        os.makedirs(basepath)

    path = basepath + index

    #f = open(path + ".txt", 'w')
    #f.write(displayfile + "," + index + "\n")
    timerfile = open(path + "_times.txt", 'w')

    for runs in range(trials):
        c = list(zip(filedata, reduceddata))
        random.shuffle(c)
        filedata, reduceddata = zip(*c)

        # Strip the class column so neighbors are not matched on the label itself.
        runfiledata = deepcopy(filedata)
        for row in runfiledata:
            del row[-1]

        # divide the data into 5 groups for cross validation

        validationGroups = [[] for i in range(5)]
        evectValidationGroups = [[] for i in range(5)]
        testingValidationGroups = [[] for i in range(5)]

        i = 0
        for item in runfiledata:
            validationGroups[i % 5].append(item)
            i += 1
        i = 0
        for item in filedata:
            testingValidationGroups[i % 5].append(item)
            i += 1
        i = 0
        for item in reduceddata:
            evectValidationGroups[i % 5].append(item)
            i += 1

        for k in range(len(validationGroups)):
            run += 1
            index = str(run) + '-k_' + str(k_near) + '_' + str(
                spill) + '_' + str(tree_depth) + "_" + str(
                    displayfile) + '_' + tree
            path = basepath + index
            f = open(path + ".txt", 'w')
            f.write(displayfile + "," + index + "\n")

            #print("Building ", tree_type)
            t = []
            for x in range(len(evectValidationGroups)):
                if x != k:
                    t.extend(evectValidationGroups[x])
            trainingevect = numpy.array(t)
            t2 = []
            for x in range(len(validationGroups)):
                if x != k:
                    t2.extend(validationGroups[x])
            training = numpy.array(t2)
            t3 = []
            for x in range(len(testingValidationGroups)):
                if x != k:
                    t3.extend(testingValidationGroups[x])
            testing = numpy.array(t3)
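            # NOTE: despite its name, 'testing' holds the labeled rows of the
            # training folds; indices returned against 'training' align with it.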

            #need to get this part working
            if tree == 'entropic':
                with timer(timerfile):
                    T = spatialtree(trainingevect,
                                    spill=spill,
                                    rule=tree,
                                    height=tree_depth)
            else:
                with timer(timerfile):
                    T = spatialtree(training,
                                    spill=spill,
                                    rule=tree,
                                    height=tree_depth)

            # To compare accuracy against brute force, make a height=0 tree (a linear search for knn)
            #T_root = spatialtree(training, height=0)

            recall = 0

            # Generate test points from the test set
            for test_point in range(len(validationGroups[k])):
                test = validationGroups[k][test_point]
                testevect = evectValidationGroups[k][test_point]

                #actual classification
                actual = testingValidationGroups[k][test_point][-1]

                #find approximate knn
                if tree != 'entropic':
                    knn_approx = T.k_nearest(training, k=k_near, vector=test)
                else:
                    knn_approx = T.k_nearest(trainingevect,
                                             k=k_near,
                                             vector=testevect)

                #predicted classification (class should be 1 or 0.  Sum, divide by n and round to 0 or 1 for majority vote)
                # float() guards against Python 2 integer division
                predicted = round(
                    sum([1 if testing[kl][-1] > 0 else 0
                         for kl in knn_approx]) / float(k_near))

                #first round, merely classifying as buggy or not
                if predicted >= 0.5 and actual >= 0.5:  # datasets are number of bugs, average of over 1/2 means buggy
                    predicted = actual = 1
                else:
                    predicted = int(max(min(predicted, 1), 0))
                    actual = int(max(min(actual, 1), 0))

                # Now, get the true nearest neighbors (want to compare results with this????)
                #knn_t = T_root.k_nearest(training, k=k_near, vector=test2)

                f.write(str(actual) + "," + str(predicted) + "\n")
            f.close()
    timerfile.close()
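
An illustrative call (the CSV path below is hypothetical, and csv_reader2 with mini=True is assumed to return the raw rows, a reduced representation, and a status value):

matrixDemoTestWorker(trials=2, tree_type='kd', spill_rate=0.25,
                     k_neighbors=10, tree_depth=5, files='bugdata.csv')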
Example #10
import numpy
from spatialtree import spatialtree

N = 5000
D = 20

X = numpy.random.randn(N, D)


# Apply a random projection so the data's not totally boring
P = numpy.random.randn(D, D)

X = numpy.dot(X, P)

# Construct a tree.  By default, we get a KD-spill-tree with height
# determined automatically, and spill = 25%

print 'Building tree...'
T = spatialtree(X)
print 'done.'

# Show some useful information about the tree
print '# items in tree    : ', len(T)
print 'Dimensionality     : ', T.getDimension()
print 'Height of tree     : ', T.getHeight()
print 'Spill percentage   : ', T.getSpill()
print 'Split rule         : ', T.getRule()

# If we want to compare accuracy against brute-force search,
# we can make a height=0 tree:
T_root = spatialtree(X, height=0)

# Find the 10 approximate nearest neighbors of the 500th data point;
# the returned list is row #'s of X closest to the query index,
# sorted by increasing distance
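# (a plausible completion of the truncated demo:)
knn_a = T.k_nearest(X, k=10, index=500)
print 'KNN approx (index) : ', knn_a

# Now, get the true nearest neighbors
knn_t = T_root.k_nearest(X, k=10, index=500)
print 'KNN true   (index) : ', knn_t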
Example #11
def matrixDemoTestWorker(size=None,
                         dimensions=None,
                         tree_type=None,
                         spill_rate=None,
                         samp=None,
                         k_neighbors=None,
                         tree_depth=None,
                         files=None):
    #this_run = dict()

    # Create random matrix
    N = size or 5000
    D = dimensions or 20

    # Create testing variables
    tree = tree_type or 'kd'
    spill = spill_rate or .25
    samples = samp or 100
    k_near = k_neighbors or 5
    k = 5
    max_value = 100
    filename = files

    # Python interprets spill_layer = 0 as False, and so sets spill to .25
    if spill_rate == 0:
        spill = 0

    # read in the data
    filedata, reduceddata = csv_reader2(filename)
    N = len(filedata)
    D = len(filedata[0])
    #reducedD = len(reduceddata[0])

    c = list(zip(filedata, reduceddata))
    random.shuffle(c)
    filedata, reduceddata = zip(*c)

    # divide the data into 5 groups for cross validation
    i = 0
    Y = [[] for i in xrange(5)]
    Z = [[] for i in xrange(5)]
    for item in filedata:
        Y[i % 5].append(item)
        i += 1
    i = 0
    for item in reduceddata:
        Z[i % 5].append(item)
        i += 1

    this = random.randint(0, 4)
    t = []
    for x in xrange(len(Z)):
        if x != this:
            t.extend(Z[x])
    training = numpy.array(t)
    t2 = []
    for x in xrange(len(Y)):
        if x != this:
            t2.extend(Y[x])
    training_real = numpy.array(t2)

    print 'Building tree...'
    start_time = timeit.default_timer()
    T = spatialtree(training, spill=spill, height=tree_depth, rule=tree)
    elapsed = timeit.default_timer() - start_time
    print("mine:\t" + str(elapsed))

    start_time = timeit.default_timer()
    T2 = spatialtree(training_real, spill=spill, height=tree_depth, rule=tree)
    elapsed = timeit.default_timer() - start_time
    print("theirs:\t" + str(elapsed))
    print 'done.'

    # If we want to compare accuracy against brute-force search,
    # we can make a height=0 tree:

    T_root = spatialtree(training_real, height=0)

    # Generate test points from the test set
    test_point = random.randint(0, len(Y[this]) - 1)  # index into the held-out group
    test2 = Y[this][test_point]
    test = Z[this][test_point]
    print(test2)
    print(test)

    # Find the k_near approximate nearest neighbors of the held-out test point;
    # the returned list is row #'s closest to the query vector,
    # sorted by increasing distance
    knn_a = T.k_nearest(training, k=k_near, vector=test)
    print 'KNN approx (index) : ', knn_a

    knn_b = T2.k_nearest(training_real, k=k_near, vector=test2)
    print 'KNN approx (index) : ', knn_b

    # Now, get the true nearest neighbors
    knn_t = T_root.k_nearest(training_real, k=2 * k_near, vector=test2)
    print 'KNN true   (index) : ', knn_t
Example #12
import numpy
from spatialtree import spatialtree

N = 5000
D = 20

X = numpy.random.randn(N, D)


# Apply a random projection so the data's not totally boring
P = numpy.random.randn(D, D)

X = numpy.dot(X, P)

# Construct a tree.  By default, we get a KD-spill-tree with height
# determined automatically, and spill = 25%

print('Building tree...')
T = spatialtree(X)
print('done.')

# Show some useful information about the tree
print('# items in tree    : ', len(T))
print('Dimensionality     : ', T.getDimension())
print('Height of tree     : ', T.getHeight())
print('Spill percentage   : ', T.getSpill())
print('Split rule         : ', T.getRule())

# If we want to compare accuracy against brute-force search,
# we can make a height=0 tree:
T_root = spatialtree(X, height=0)

# Find the 10 approximate nearest neighbors of the 500th data point;
# the returned list is row #'s of X closest to the query index,
# sorted by increasing distance
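# (a plausible completion of the truncated demo:)
knn_a = T.k_nearest(X, k=10, index=500)
print('KNN approx (index) : ', knn_a)

# Now, get the true nearest neighbors
knn_t = T_root.k_nearest(X, k=10, index=500)
print('KNN true   (index) : ', knn_t)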
Example #13
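# Fragment: this excerpt assumes numpy, spatialtree, and a data matrix X of
# shape (N, D) are already defined above it.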
# Apply a random projection so the data's not totally boring
P = numpy.random.randn(D, D)

X = numpy.dot(X, P)
"""
X = fileio.csv_reader("testdata.csv")
D = len(X[0])
N = len(X)
P = numpy.random.randn(D,D)
"""

# Construct a tree.  By default, we get a KD-spill-tree with height
# determined automatically, and spill = 25%

print 'Building tree...'
T = spatialtree(X, spill=.01, rule='2-means')
print 'done.'

# Show some useful information about the tree
print '# items in tree    : ', len(T)
print 'Dimensionality     : ', T.getDimension()
print 'Height of tree     : ', T.getHeight()
print 'Spill percentage   : ', T.getSpill()
print 'Split rule         : ', T.getRule()

# If we want to compare accuracy against brute-force search,
# we can make a height=0 tree:
T_root = spatialtree(X, height=0)

# Find the 10 approximate nearest neighbors of the 500th data point;
# the returned list is row #'s of X closest to the query index,
# sorted by increasing distance
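# (a plausible completion of the truncated demo:)
knn_a = T.k_nearest(X, k=10, index=500)
print 'KNN approx (index) : ', knn_a

# Now, get the true nearest neighbors
knn_t = T_root.k_nearest(X, k=10, index=500)
print 'KNN true   (index) : ', knn_t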