Esempio n. 1
0
def _normalize_and_run_ebc(matrix, ebc_args):
    """Normalize ``matrix``, run EBC on it and return ``(objective, iterations)``."""
    matrix.normalize()
    ebc = EBC(matrix, *ebc_args)
    # The cluster assignment returned by run() is not needed by the caller.
    _, objective, iterations = ebc.run()
    return objective, iterations


def compareRandom(num_trials, tensor_dimensions, matrix_data, cluster_dimensions,
                  maxit_ebc, jitter_max_ebc, objective_tolerance):
    """Compare EBC objectives on the original data against a shuffled copy.

    For every trial a fresh ``SparseMatrix(tensor_dimensions)`` is filled with
    ``matrix_data``, a shuffled counterpart is taken from ``M.shuffle()``, and
    EBC is run on each of them after normalization.

    Args:
        num_trials: number of independent trials to run.
        tensor_dimensions: forwarded to the ``SparseMatrix`` constructor.
        matrix_data: forwarded to ``SparseMatrix.read_data``.
        cluster_dimensions: forwarded to ``EBC`` as its second argument.
        maxit_ebc: forwarded to ``EBC``; a run whose reported iteration count
            equals this value is counted as not converged.
        jitter_max_ebc: forwarded to ``EBC``.
        objective_tolerance: forwarded to ``EBC``.

    Returns:
        A tuple ``(deltas, iterations_M, iterations_Mr, noconverge_M,
        noconverge_Mr)`` where ``deltas`` holds, per trial, the objective of
        the original matrix minus the objective of the shuffled one; the
        ``iterations_*`` lists hold iteration counts of the runs that stopped
        before ``maxit_ebc``; and the ``noconverge_*`` counters count the runs
        that did not.
    """
    ebc_args = (cluster_dimensions, maxit_ebc, jitter_max_ebc, objective_tolerance)

    deltas = []
    iterations_M = []
    iterations_Mr = []
    noconverge_M = 0
    noconverge_Mr = 0
    for j in range(num_trials):
        # Single-argument print(...) yields the same output on Python 2 and 3
        # (the original statement printed "Trial " followed by a space and j).
        print("Trial  %s" % (j,))

        M = SparseMatrix(tensor_dimensions)
        M.read_data(matrix_data)
        # Shuffle before normalizing M, as the original code did.
        Mr = M.shuffle()  # could also be M.shuffle_old()

        objective_M, it_M = _normalize_and_run_ebc(M, ebc_args)
        if it_M == maxit_ebc:
            noconverge_M += 1
        else:
            iterations_M.append(it_M)

        objective_Mr, it_Mr = _normalize_and_run_ebc(Mr, ebc_args)
        if it_Mr == maxit_ebc:
            noconverge_Mr += 1
        else:
            iterations_Mr.append(it_Mr)

        deltas.append(objective_M - objective_Mr)
    return deltas, iterations_M, iterations_Mr, noconverge_M, noconverge_Mr
Esempio n. 2
0
def main():
    """Run EBC repeatedly on a tab-separated file and write entity cluster assignments.

    Positional command-line arguments (read from ``sys.argv``):
        1. data_file: path of the tab-separated input file.
        2. ebc_cols: comma-separated column indices selected from each row and
           handed to ``SparseMatrix.read_data``; all but the last selected
           column are treated as dimensions (presumably the last one is the
           cell value -- confirm against ``SparseMatrix.read_data``).
        3. K: comma-separated integers passed to ``EBC`` as its second argument.
        4. N_runs: number of times ``EBC.run()`` is invoked.
        5. output_file: path of the file to write.
        6. jitter_max: float passed to ``EBC``.
        7. max_iterations_ebc: int passed to ``EBC``.
        8. entity_cols: comma-separated column indices identifying an entity;
           each must also appear in ``ebc_cols``.
        9. object_toler: float passed to ``EBC``.

    Each output row holds, tab-separated: the entity's feature ids (comma
    joined), the entity's raw column values (comma joined), and one comma-joined
    cluster assignment per run.
    """
    data_file = sys.argv[1]
    ebc_cols = [int(e) for e in sys.argv[2].split(",")]
    K = [int(e) for e in sys.argv[3].split(",")]
    N_runs = int(sys.argv[4])
    output_file = sys.argv[5]
    jitter_max = float(sys.argv[6])
    max_iterations_ebc = int(sys.argv[7])
    entity_cols = [int(e) for e in sys.argv[8].split(",")]
    object_toler = float(sys.argv[9])

    # get original data
    raw_data = []
    with open(data_file, "r") as reader:
        for line in reader:
            # Drop the line terminator so the last column does not carry a
            # trailing newline into feature keys or into the written names.
            line = line.rstrip("\r\n")
            if line:  # ignore empty lines (e.g. a blank line at end of file)
                raw_data.append(line.split("\t"))
    data = [[d[i] for i in ebc_cols] for d in raw_data]
    data_dimensions = len(data[0]) - 1

    # get axis length for each dimension
    N = [len(set(d[dim] for d in data)) for dim in range(data_dimensions)]
    print(N)

    # set up matrix
    M = SparseMatrix(N)
    M.read_data(data)
    M.normalize()

    # figure out which ebc dimension each entity column corresponds to;
    # entity_dims[p] is the dimension of entity_cols[p]
    entity_dims = [ebc_cols.index(c) for c in entity_cols]

    # set up entity map: tuple of feature ids -> tuple of raw column values
    entity_map = {}
    for d in raw_data:
        entity = tuple(d[c] for c in entity_cols)
        entity_ids = tuple(M.feature_ids[dim][d[c]]
                           for dim, c in zip(entity_dims, entity_cols))
        entity_map[entity_ids] = entity

    # run EBC and get entity cluster assignments
    ebc_M = EBC(M, K, max_iterations_ebc, jitter_max, object_toler)
    clusters = defaultdict(list)
    for t in range(N_runs):
        # Single-argument print(...) gives identical output on Python 2 and 3.
        print("run  %s" % (t,))
        cXY_M, objective_M, it_M = ebc_M.run()
        for entity_ids in entity_map:
            # Pair every id with the dimension it was looked up in. The ids are
            # ordered like entity_cols, so they must not be indexed by the
            # dimension number itself.
            assignment = tuple(cXY_M[dim][feature_id]
                               for dim, feature_id in zip(entity_dims, entity_ids))
            clusters[entity_ids].append(assignment)

    # print assignments
    with open(output_file, "w") as writer:
        for entity_ids in clusters:
            fields = [",".join(str(e) for e in entity_ids),
                      ",".join(entity_map[entity_ids])]
            fields.extend(",".join(str(f) for f in assignment)
                          for assignment in clusters[entity_ids])
            writer.write("\t".join(fields) + "\n")
Esempio n. 3
0
 def testEbcOnSparseMatrix(self):
     """Run EBC on ``self.matrix`` with [30, 125] clusters and check the result sizes.

     Verifies the number of non-zero elements of ``ebc.pXY`` and that every
     requested cluster index is used in both dimensions.
     """
     ebc = EBC(self.matrix, [30, 125], 10, 1e-10, 0.01)
     cXY, objective, it = ebc.run()
     # Single-argument print(...) produces the same output on Python 2 and 3.
     print("objective:  %s" % (objective,))
     print("iterations:  %s" % (it,))
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(len(ebc.pXY.nonzero_elements), 29456)
     self.assertEqual(len(set(ebc.cXY[0])), 30)
     self.assertEqual(len(set(ebc.cXY[1])), 125)
Esempio n. 4
0
    def testEbcOnSparseMatrix(self):
        """Check the approximate distribution EBC derives from a fixed clustering.

        A first run with random initialization is only printed. A second run
        with explicitly assigned clusters is used to rebuild, for every cell,
        the product of the per-dimension ``qXxHat`` factors and the
        ``qXhatYhat`` value of the cell's cluster tuple; that product is
        compared with a table of expected values.
        """
        ebc = EBC(self.matrix, [3, 2], 10, 1e-10, 0.01)
        cXY, objective, it = ebc.run(verbose=False)
        # Single-argument print(...) produces the same output on Python 2 and 3.
        print("--> ebc")
        print("objective:  %s" % (objective,))
        print("iterations:  %s" % (it,))

        ebc = EBC(self.matrix, [3, 2], 10, 1e-10, 0.01)
        ebc.run(assigned_clusters=[[2, 0, 1, 1, 2, 2], [0, 0, 1, 0, 1, 1]], verbose=False)
        indices = [range(N_d) for N_d in ebc.pXY.N]
        index_list = self.cartesian(indices)
        approx_distribution = {}
        for location in index_list:
            q = 1.0
            c_location = []
            for dim, index in enumerate(location):
                c_location.append(ebc.cXY[dim][index])
                q *= ebc.qXxHat[dim][index]
            q *= ebc.qXhatYhat.get(tuple(c_location))
            approx_distribution[tuple(location)] = q

        # expected_rows[r][c] is the expected value of approx_distribution[(r, c)].
        expected_rows = [
            [0.054, 0.054, 0.042, 0.0, 0.0, 0.0],
            [0.054, 0.054, 0.042, 0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0, 0.042, 0.054, 0.054],
            [0.0, 0.0, 0.0, 0.042, 0.054, 0.054],
            [0.036, 0.036, 0.028, 0.028, 0.036, 0.036],
            [0.036, 0.036, 0.028, 0.028, 0.036, 0.036],
        ]
        for r, row in enumerate(expected_rows):
            for c, expected in enumerate(row):
                actual = approx_distribution[(r, c)]
                # assertAlmostEqual replaces the deprecated assertAlmostEquals alias.
                self.assertAlmostEqual(
                    actual, expected,
                    msg="approx_distribution[(%d, %d)] is %r, expected %r"
                        % (r, c, actual, expected))
Esempio n. 5
0
    def testOldMatrix3d(self):
        """Run EBC on the 3-D matrix stored in the resources TSV file.

        Each input row supplies three string keys and a float value. The test
        checks the number of non-zero elements and that every requested
        cluster index is used in each of the three dimensions.
        """
        with open("resources/matrix-ebc-paper-dense-3d.tsv", "r") as f:
            rows = (line.split("\t") for line in f)
            # float() tolerates the trailing newline left on the fourth field.
            data = [[sl[0], sl[1], sl[2], float(sl[3])] for sl in rows]

        matrix = SparseMatrix([756, 996, 1232])
        matrix.read_data(data)
        matrix.normalize()
        ebc = EBC(matrix, [30, 30, 10], 100, 1e-10, 0.01)
        cXY, objective, it = ebc.run()
        # Single-argument print(...) produces the same output on Python 2 and 3.
        print("objective:  %s" % (objective,))
        print("iterations:  %s" % (it,))
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(len(ebc.pXY.nonzero_elements), 10007)
        self.assertEqual(len(set(ebc.cXY[0])), 30)
        self.assertEqual(len(set(ebc.cXY[1])), 30)
        self.assertEqual(len(set(ebc.cXY[2])), 10)
Esempio n. 6
0
    def test3DMatrix(self):
        """Check EBC on a 6x6x6 matrix made of three 2x2x2 blocks of ones.

        Starting from the block-aligned cluster assignment, the run is expected
        to return that assignment unchanged with a zero objective after one
        iteration. Afterwards 100 randomly initialized runs are executed and
        their results printed (no assertions are made on them).
        """
        data = [[0, 0, 0, 1.0],
                [0, 0, 1, 1.0],
                [0, 1, 0, 1.0],
                [0, 1, 1, 1.0],
                [1, 0, 0, 1.0],
                [1, 0, 1, 1.0],
                [1, 1, 0, 1.0],
                [1, 1, 1, 1.0],
                [2, 2, 2, 1.0],
                [2, 2, 3, 1.0],
                [2, 3, 2, 1.0],
                [3, 2, 2, 1.0],
                [2, 3, 3, 1.0],
                [3, 3, 2, 1.0],
                [3, 2, 3, 1.0],
                [3, 3, 3, 1.0],
                [4, 4, 4, 1.0],
                [4, 4, 5, 1.0],
                [4, 5, 4, 1.0],
                [4, 5, 5, 1.0],
                [5, 4, 4, 1.0],
                [5, 4, 5, 1.0],
                [5, 5, 4, 1.0],
                [5, 5, 5, 1.0]]
        matrix = SparseMatrix([6, 6, 6])
        matrix.read_data(data)
        matrix.normalize()
        ebc = EBC(matrix, [3, 3, 3], 10, 1e-10, 0.01)
        assigned_C = [[0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2]]
        cXY, objective, it = ebc.run(assigned_C)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(cXY, assigned_C)
        self.assertAlmostEqual(objective, 0.0)
        self.assertEqual(it, 1)

        for _ in range(100):
            cXY, objective, it = ebc.run()  # random initialization
            # Single-argument print(...) produces the same output on Python 2 and 3.
            print("%s %s %s" % (cXY, objective, it))
Esempio n. 7
0
    def setUp(self):
        """Build a 6x6x6 matrix of three 2x2x2 blocks of ones and sanity-check EBC on it.

        Runs EBC once from the block-aligned assignment and once from a random
        initialization, asserting a zero objective in both cases.

        NOTE(review): ``EBC`` is constructed here with four arguments and
        ``run()`` is unpacked into two values -- confirm this matches the EBC
        version this test class is run against.
        """
        data = [[0, 0, 0, 1.0],
                [0, 0, 1, 1.0],
                [0, 1, 0, 1.0],
                [0, 1, 1, 1.0],
                [1, 0, 0, 1.0],
                [1, 0, 1, 1.0],
                [1, 1, 0, 1.0],
                [1, 1, 1, 1.0],
                [2, 2, 2, 1.0],
                [2, 2, 3, 1.0],
                [2, 3, 2, 1.0],
                [3, 2, 2, 1.0],
                [2, 3, 3, 1.0],
                [3, 3, 2, 1.0],
                [3, 2, 3, 1.0],
                [3, 3, 3, 1.0],
                [4, 4, 4, 1.0],
                [4, 4, 5, 1.0],
                [4, 5, 4, 1.0],
                [4, 5, 5, 1.0],
                [5, 4, 4, 1.0],
                [5, 4, 5, 1.0],
                [5, 5, 4, 1.0],
                [5, 5, 5, 1.0]]
        matrix = SparseMatrix([6, 6, 6])
        matrix.read_data(data)
        matrix.normalize()

        ebc = EBC(matrix, [3, 3, 3], 10, 1e-10)
        assigned_C = [[0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2]]
        cXY, objective = ebc.run(assigned_C)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(cXY, assigned_C)
        self.assertAlmostEqual(objective, 0.0)
        cXY, objective = ebc.run()  # random initialization
        self.assertAlmostEqual(objective, 0.0)