def _run_ebc_trial(matrix, cluster_dimensions, maxit_ebc, jitter_max_ebc, objective_tolerance):
    """Normalize ``matrix``, run EBC on it once, and return ``(objective, iterations)``."""
    matrix.normalize()
    ebc = EBC(matrix, cluster_dimensions, maxit_ebc, jitter_max_ebc, objective_tolerance)
    _, objective, iterations = ebc.run()
    return objective, iterations


def compareRandom(num_trials, tensor_dimensions, matrix_data, cluster_dimensions, maxit_ebc,
                  jitter_max_ebc, objective_tolerance):
    """Run EBC on a matrix and on its shuffled counterpart, ``num_trials`` times.

    Each trial builds a fresh ``SparseMatrix(tensor_dimensions)`` from
    ``matrix_data``, derives ``Mr`` from it with ``M.shuffle()``, and runs
    EBC once on each of the two matrices with identical settings.

    Args:
        num_trials: number of trials to run.
        tensor_dimensions: passed to the ``SparseMatrix`` constructor.
        matrix_data: passed to ``SparseMatrix.read_data``.
        cluster_dimensions: passed to ``EBC`` as its second argument.
        maxit_ebc: passed to ``EBC``; a run whose reported iteration count
            equals this value is counted as not converged.
        jitter_max_ebc: passed to ``EBC`` as its fourth argument.
        objective_tolerance: passed to ``EBC`` as its fifth argument.

    Returns:
        A 5-tuple ``(deltas, iterations_M, iterations_Mr, noconverge_M,
        noconverge_Mr)``:

        * ``deltas`` -- ``objective_M - objective_Mr`` for every trial.
        * ``iterations_M`` / ``iterations_Mr`` -- iteration counts of the runs
          whose count differs from ``maxit_ebc``.
        * ``noconverge_M`` / ``noconverge_Mr`` -- number of runs whose count
          equals ``maxit_ebc``.
    """
    deltas = []
    iterations_M = []
    iterations_Mr = []
    noconverge_M = 0
    noconverge_Mr = 0

    for j in range(num_trials):
        # Two spaces on purpose: this is byte-identical to what the Python 2
        # statement `print "Trial ", j` emitted, but also valid on Python 3.
        print("Trial  %d" % j)

        M = SparseMatrix(tensor_dimensions)
        M.read_data(matrix_data)
        # The shuffled matrix is derived *before* M is normalized; that
        # ordering is kept exactly as in the original code.
        Mr = M.shuffle()  # could also be M.shuffle_old()

        objective_M, it_M = _run_ebc_trial(
            M, cluster_dimensions, maxit_ebc, jitter_max_ebc, objective_tolerance)
        if it_M == maxit_ebc:
            noconverge_M += 1
        else:
            iterations_M.append(it_M)

        objective_Mr, it_Mr = _run_ebc_trial(
            Mr, cluster_dimensions, maxit_ebc, jitter_max_ebc, objective_tolerance)
        if it_Mr == maxit_ebc:
            noconverge_Mr += 1
        else:
            iterations_Mr.append(it_Mr)

        deltas.append(objective_M - objective_Mr)

    return deltas, iterations_M, iterations_Mr, noconverge_M, noconverge_Mr
def _parse_int_list(arg):
    """Parse a comma-separated string such as ``"0,2,5"`` into a list of ints."""
    return [int(e) for e in arg.split(",")]


def main():
    """Cluster a tab-separated data file with EBC and write per-entity cluster assignments.

    Positional command-line arguments (``sys.argv[1:]``):

    1. ``data_file``      -- path of the tab-separated input file.
    2. ``ebc_cols``       -- comma-separated column indices handed to EBC; the
       last selected column is not counted as a dimension
       (``data_dimensions = len(row) - 1``).
    3. ``K``              -- comma-separated ints, passed to ``EBC`` as its
       second argument (presumably the cluster count per axis).
    4. ``N_runs``         -- number of times ``EBC.run()`` is invoked.
    5. ``output_file``    -- path of the file to write.
    6. ``jitter_max``     -- float, passed to ``EBC``.
    7. ``max_iterations`` -- int, passed to ``EBC``.
    8. ``entity_cols``    -- comma-separated column indices identifying an
       entity; every one of them must also appear in ``ebc_cols``.
    9. ``object_toler``   -- float, passed to ``EBC``.

    Each output line holds three tab-separated parts: the entity's feature
    ids (comma-joined), the entity's raw column values (comma-joined), and one
    comma-joined cluster tuple per run.

    Raises:
        IndexError: if fewer than nine arguments are given or a row is too short.
        ValueError: if an entity column is not one of the EBC columns, or a
            numeric argument cannot be parsed.
    """
    data_file = sys.argv[1]
    ebc_cols = _parse_int_list(sys.argv[2])
    K = _parse_int_list(sys.argv[3])
    N_runs = int(sys.argv[4])
    output_file = sys.argv[5]
    jitter_max = float(sys.argv[6])
    max_iterations_ebc = int(sys.argv[7])
    entity_cols = _parse_int_list(sys.argv[8])
    object_toler = float(sys.argv[9])

    # get original data; the context manager closes the handle, the line
    # terminator is removed so it does not leak into the last field, and
    # completely empty lines are skipped
    with open(data_file, "r") as reader:
        raw_data = [line.rstrip("\r\n").split("\t")
                    for line in reader if line.rstrip("\r\n")]
    data = [[d[i] for i in ebc_cols] for d in raw_data]
    data_dimensions = len(data[0]) - 1

    # get axis length for each dimension
    N = [len(set(d[dim] for d in data)) for dim in range(data_dimensions)]
    print(N)

    # set up matrix
    M = SparseMatrix(N)
    M.read_data(data)
    M.normalize()

    # figure out which ebc axis each entity column corresponds to; the list is
    # ordered like entity_cols so it lines up with the entity id tuples below
    entity_axes = [ebc_cols.index(c) for c in entity_cols]

    # set up entity map: tuple of feature ids -> tuple of raw column values
    entity_map = {}
    for d in raw_data:
        entity = tuple(d[c] for c in entity_cols)
        entity_ids = tuple(M.feature_ids[axis][d[c]]
                           for axis, c in zip(entity_axes, entity_cols))
        entity_map[entity_ids] = entity

    # run EBC and get entity cluster assignments
    ebc_M = EBC(M, K, max_iterations_ebc, jitter_max, object_toler)
    clusters = defaultdict(list)
    for t in range(N_runs):
        # Two spaces on purpose: byte-identical to the Python 2 statement
        # `print "run ", t`, but also valid on Python 3.
        print("run  %d" % t)
        cXY_M, _, _ = ebc_M.run()
        for e1 in entity_map:
            # Pair every id with the axis it belongs to. Indexing the id
            # tuple with the axis number (as done previously) is only correct
            # when entity_cols is an aligned prefix of ebc_cols.
            assignment = tuple(cXY_M[axis][eid]
                               for axis, eid in zip(entity_axes, e1))
            clusters[e1].append(assignment)

    # print assignments
    with open(output_file, "w") as writer:
        for k in clusters:
            ids_field = ",".join([str(e) for e in k])
            names_field = ",".join(entity_map[k])
            runs_field = "\t".join([",".join([str(f) for f in e]) for e in clusters[k]])
            writer.write(ids_field + "\t" + names_field + "\t" + runs_field + "\n")
def testEbcOnSparseMatrix(self):
    """Run EBC on ``self.matrix`` with [30, 125] clusters and check the cluster counts.

    Verifies that the input distribution keeps 29456 non-zero elements and
    that every one of the 30 / 125 requested clusters is actually used on
    the respective axis.
    """
    ebc = EBC(self.matrix, [30, 125], 10, 1e-10, 0.01)
    _, objective, it = ebc.run()
    # Single-argument print calls: same output as the former Python 2 print
    # statements (note the two spaces), and valid on Python 3 as well.
    print("objective:  %s" % (objective,))
    print("iterations:  %s" % (it,))
    # assertEqual instead of the deprecated alias assertEquals
    self.assertEqual(len(ebc.pXY.nonzero_elements), 29456)
    self.assertEqual(len(set(ebc.cXY[0])), 30)
    self.assertEqual(len(set(ebc.cXY[1])), 125)
def testEbcOnSparseMatrix(self):
    """Check the approximate distribution EBC builds from a fixed cluster assignment.

    A first EBC run with default initialization is only reported on stdout.
    A second run starts from an explicit cluster assignment; afterwards, for
    every cell ``location`` the value

        product over axes i of ``ebc.qXxHat[i][location[i]]``
        times ``ebc.qXhatYhat.get(<tuple of the cell's cluster ids>)``

    is computed and compared against a table of expected values.
    """
    ebc = EBC(self.matrix, [3, 2], 10, 1e-10, 0.01)
    _, objective, it = ebc.run(verbose=False)
    # Single-argument print calls: same output as the former Python 2 print
    # statements (note the two spaces), and valid on Python 3 as well.
    print("--> ebc")
    print("objective:  %s" % (objective,))
    print("iterations:  %s" % (it,))

    ebc = EBC(self.matrix, [3, 2], 10, 1e-10, 0.01)
    ebc.run(assigned_clusters=[[2, 0, 1, 1, 2, 2], [0, 0, 1, 0, 1, 1]], verbose=False)

    indices = [range(N_d) for N_d in ebc.pXY.N]
    index_list = self.cartesian(indices)
    approx_distribution = {}
    for location in index_list:
        q = 1.0
        c_location = []
        for i, position in enumerate(location):
            c_location.append(ebc.cXY[i][position])
            q *= ebc.qXxHat[i][position]
        q *= ebc.qXhatYhat.get(tuple(c_location))
        approx_distribution[tuple(location)] = q

    # expected[x][y] is the expected value of approx_distribution[(x, y)]
    expected = [
        [0.054, 0.054, 0.042, 0.0, 0.0, 0.0],
        [0.054, 0.054, 0.042, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.042, 0.054, 0.054],
        [0.0, 0.0, 0.0, 0.042, 0.054, 0.054],
        [0.036, 0.036, 0.028, 0.028, 0.036, 0.036],
        [0.036, 0.036, 0.028, 0.028, 0.036, 0.036],
    ]
    for x, row in enumerate(expected):
        for y, value in enumerate(row):
            actual = approx_distribution[(x, y)]
            # assertAlmostEqual instead of the deprecated assertAlmostEquals;
            # the message names the cell because all checks share one line now.
            self.assertAlmostEqual(
                actual, value,
                msg="approx_distribution[(%d, %d)] = %r, expected %r" % (x, y, actual, value))
def testOldMatrix3d(self):
    """Run EBC on the 3-d matrix stored in ``resources/matrix-ebc-paper-dense-3d.tsv``.

    Every line of the file supplies three tab-separated keys followed by a
    float value. The test checks the number of non-zero elements of the
    input distribution (10007) and that all [30, 30, 10] requested clusters
    are in use on their axis.
    """
    with open("resources/matrix-ebc-paper-dense-3d.tsv", "r") as f:
        rows = (line.split("\t") for line in f)
        data = [[sl[0], sl[1], sl[2], float(sl[3])] for sl in rows]

    matrix = SparseMatrix([756, 996, 1232])
    matrix.read_data(data)
    matrix.normalize()

    ebc = EBC(matrix, [30, 30, 10], 100, 1e-10, 0.01)
    _, objective, it = ebc.run()
    # Single-argument print calls: same output as the former Python 2 print
    # statements (note the two spaces), and valid on Python 3 as well.
    print("objective:  %s" % (objective,))
    print("iterations:  %s" % (it,))
    # assertEqual instead of the deprecated alias assertEquals
    self.assertEqual(len(ebc.pXY.nonzero_elements), 10007)
    self.assertEqual(len(set(ebc.cXY[0])), 30)
    self.assertEqual(len(set(ebc.cXY[1])), 30)
    self.assertEqual(len(set(ebc.cXY[2])), 10)
def test3DMatrix(self):
    """Run EBC on a 6x6x6 tensor made of three 2x2x2 blocks of ones.

    Starting from the assignment that matches the blocks, ``run`` is expected
    to return that assignment unchanged, an objective of (almost) 0.0 and an
    iteration count of 1. Afterwards 100 runs with the default
    initialization are executed; their results are only printed, not asserted.
    """
    # Row order is kept exactly as it was; it is not lexicographic in the
    # second block.
    data = [
        [0, 0, 0, 1.0], [0, 0, 1, 1.0], [0, 1, 0, 1.0], [0, 1, 1, 1.0],
        [1, 0, 0, 1.0], [1, 0, 1, 1.0], [1, 1, 0, 1.0], [1, 1, 1, 1.0],
        [2, 2, 2, 1.0], [2, 2, 3, 1.0], [2, 3, 2, 1.0], [3, 2, 2, 1.0],
        [2, 3, 3, 1.0], [3, 3, 2, 1.0], [3, 2, 3, 1.0], [3, 3, 3, 1.0],
        [4, 4, 4, 1.0], [4, 4, 5, 1.0], [4, 5, 4, 1.0], [4, 5, 5, 1.0],
        [5, 4, 4, 1.0], [5, 4, 5, 1.0], [5, 5, 4, 1.0], [5, 5, 5, 1.0],
    ]
    matrix = SparseMatrix([6, 6, 6])
    matrix.read_data(data)
    matrix.normalize()

    ebc = EBC(matrix, [3, 3, 3], 10, 1e-10, 0.01)
    assigned_C = [[0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2]]
    cXY, objective, it = ebc.run(assigned_C)
    # assertEqual instead of the deprecated alias assertEquals
    self.assertEqual(cXY, assigned_C)
    self.assertAlmostEqual(objective, 0.0)
    self.assertEqual(it, 1)

    for _ in range(100):
        cXY, objective, it = ebc.run()  # random initialization
        # Same output as the Python 2 statement `print cXY, objective, it`,
        # written so that it is valid on Python 3 as well.
        print("%s %s %s" % (cXY, objective, it))
def setUp(self):
    """Run EBC on a 6x6x6 tensor made of three 2x2x2 blocks of ones and check the objective.

    With the assignment that matches the blocks, ``run`` is expected to return
    that assignment unchanged and an objective of (almost) 0.0. A second run
    with the default initialization is expected to reach (almost) 0.0 too.

    NOTE(review): although named ``setUp``, this method stores nothing on
    ``self`` and only performs assertions.
    NOTE(review): ``EBC`` is built with four arguments here and ``run()`` is
    unpacked into two values, whereas other call sites pass five arguments
    and unpack three values -- confirm which EBC version this code targets.
    """
    # Row order is kept exactly as it was; it is not lexicographic in the
    # second block.
    data = [
        [0, 0, 0, 1.0], [0, 0, 1, 1.0], [0, 1, 0, 1.0], [0, 1, 1, 1.0],
        [1, 0, 0, 1.0], [1, 0, 1, 1.0], [1, 1, 0, 1.0], [1, 1, 1, 1.0],
        [2, 2, 2, 1.0], [2, 2, 3, 1.0], [2, 3, 2, 1.0], [3, 2, 2, 1.0],
        [2, 3, 3, 1.0], [3, 3, 2, 1.0], [3, 2, 3, 1.0], [3, 3, 3, 1.0],
        [4, 4, 4, 1.0], [4, 4, 5, 1.0], [4, 5, 4, 1.0], [4, 5, 5, 1.0],
        [5, 4, 4, 1.0], [5, 4, 5, 1.0], [5, 5, 4, 1.0], [5, 5, 5, 1.0],
    ]
    matrix = SparseMatrix([6, 6, 6])
    matrix.read_data(data)
    matrix.normalize()

    ebc = EBC(matrix, [3, 3, 3], 10, 1e-10)
    assigned_C = [[0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2]]
    cXY, objective = ebc.run(assigned_C)
    # assertEqual instead of the deprecated alias assertEquals
    self.assertEqual(cXY, assigned_C)
    self.assertAlmostEqual(objective, 0.0)

    cXY, objective = ebc.run()  # random initialization
    self.assertAlmostEqual(objective, 0.0)