Beispiel #1
0
def test(mat_name, l, check_c = True):
	"""
	@param mat_name: name of matrix file
	@param l: number of columns in sketch
	"""
	if ".txt" not in mat_name:
		mat_name += ".txt"
	mat_pname = os.path.join(MATRIX_DIR, mat_name)
	mat = load_matrix(mat_pname)
	print "Original shape: %r, l: %d" %(mat.shape, l)
	f_norm = squaredFrobeniusNorm(mat)
	start = time.time()
	p_sketch = sketch(mat, l)
	p_time = time.time() - start
	print "Sketch shape: ", p_sketch.shape
	p_err = calculateError(mat, p_sketch)
	# calculate bound on error
	err_bound = 2 * squaredFrobeniusNorm(mat) / l 
	if not check_c:
		return err_bound, f_norm, p_err, None

	# run sketch.c on the matrix 
	sketch_pname = os.path.join(MATRIX_DIR, "sketch_" + mat_name)
	subprocess.call(["make", "clean"])
	subprocess.call(["make", "sketch"])
	# need to check output 
	c_start = time.time()
	err = subprocess.check_output([RUN_SKETCH, '-f', mat_pname, '-w', sketch_pname, '-l', str(l)])
	c_time = time.time() - c_start
	print "Sketch output: ", err
	c_sketch = load_matrix(sketch_pname)
	assert (c_sketch.shape == p_sketch.shape)
	c_err = calculateError(mat, c_sketch)
	return err_bound, f_norm, p_err, p_time, c_err, c_time
Beispiel #2
0
def kmeans_experiment(on_sketch=True, on_orig=True):
    #path = "../../data/GoogleNews-vectors-negative300.bin"
    #wmodel = models.Word2Vec.load_word2vec_format(path, binary=True)
    mat = load_matrix('data_batch_1')
    # sketch the transpose 
    mat = mat.T
    #sketch_sizes = [10, 50, 100, 200]
    sketch_sizes = [10, 15, 20, 30, 100]
    sketches = []
    for l in sketch_sizes:
        fname = 'test_matrices/sketches/sk_%d'%l
        if os.path.exists(fname):
            sketches.append(load_matrix(fname))
        else:
            sketch_o = BatchPFDSketch(mat, l, l, 0.2, randomized=True)
            sketch_o.compute_sketch()
            write_matrix(sketch_o.sketch.T, fname)
            sketches.append(sketch_o.sketch.T)
    mat = mat.T
    #print sketche.shape
    print "Mat: ", mat.shape
    #sketch = load_matrix("sketches/w2vec_250.txt")
    clusters = [2, 4, 8, 16, 32]
    #clusters = [10, 20, 30, 40]
    num_processes = 8
    results = {'opt': {}, 'sketch': {}}
    for l in sketch_sizes:
        results['sketch'][l] = {}

    for k in clusters:
        print "Testing ", k
        if on_sketch:
            for sketch, l in zip(sketches, sketch_sizes):
                start_time = time.time()
                cost, cluster_centers, labels = train_kmeans(sketch, k, 
                                                    num_processes=num_processes)
                train_time = time.time() - start_time
                test_cost = compute_cost_labels(mat, labels, k)
                results['sketch'][l][k] = {'time': train_time, 'cost': test_cost}
        init_centers = compute_centers(mat, labels, k)
        if on_orig:
            start_time = time.time()
            cost, cluster_centers, labels = train_kmeans(mat, k, init_centers=init_centers, num_processes=num_processes)
            train_time = time.time() - start_time
            cost = compute_cost_labels(mat, labels, k)
            results['opt'][k] = {'time': train_time, 'cost': cost}
    
    if on_orig:
        with open('experiments/kmeans/cifar/mat_results.p', "wb") as f:
            pickle.dump(results, f)
    if on_sketch:
        with open('experiments/kmeans/cifar/sketch_results.p', "wb") as f:
            pickle.dump(results, f)
Beispiel #3
0
def plot_errors(orig_mat_fname, p_fnames, c_fnames = None):
	# load the original matrix 
	if ".txt" not in orig_mat_fname:
		orig_mat_fname += ".txt"
	mat_pname = os.path.join(MATRIX_DIR, orig_mat_fname)
	mat = load_matrix(mat_pname)
	rows, cols = mat.shape
	ls = []
	errs = []
	bounds = []
	regex = re.compile('\d+')
	if c_fnames:
		for p_fname, c_fname in zip(p_fnames, c_fnames):
			# extract the sketch size
			# calculate the error from each 
			# assert t
			p_l = int(regex.search(p_fname).group(0))
			c_l = int(regex.search(c_fname).group(0))
			assert(p_l == c_l)
			p_sketch = load_matrix(p_fname)
			c_sketch = load_matrix(c_fname)
			assert(p_sketch.shape == c_sketch.shape)
			bound = fd_bound(mat, p_l)
			p_err = calculateError(mat, p_sketch)
			c_err = calculateError(mat, c_sketch)
			print p_err, c_err
			# the optimizations make floating point arithmetic not exact 
			assert(np.isclose(p_err/c_err, 1.0, atol=1e-1))
			ls.append(p_l)
			errs.append(p_err)
			bounds.append(bound)
	else:
		for p_fname in p_fnames:
			p_l = int(regex.search(p_fname).group(0))
			p_sketch = load_matrix(p_fname)
			bound = fd_bound(mat, p_l)
			p_err = calculateError(mat, p_sketch)
			ls.append(p_l)
			errs.append(p_err)
			bounds.append(bound)
	# lets plot this shit 
	plt.plot(ls, errs, '-o', color='b', label='cov err')
	plt.plot(ls, bounds, '-o', color='r', label='upper bound')
	plt.xlabel("Sketch size (l)")
	plt.ylabel("Error")
	title = "Sketch size vs Reconstruction Error: %d X %d" %(mat.shape[0], mat.shape[1])
	plt.title(title)
	plt.grid()
	plt.legend(loc=3)
	plt.yscale('log')
	#plt.xlim(25, 375)
	plt.show() 
	return ls, errs, bounds
Beispiel #4
0
def check_sketch_file(orig_mat_fname, sketch_fname, l):
    """
    Decide whether an existing sketch file can be used for the matrix.

    Returns True only when the sketch file exists, has the same number of
    columns as the original matrix, and its error (calculateError) is
    strictly below fd_bound(mat, l). The original matrix file must exist.
    """
    assert os.path.exists(orig_mat_fname)
    original = load_matrix(orig_mat_fname)
    if not os.path.exists(sketch_fname):
        return False
    cached = load_matrix(sketch_fname)
    same_width = original.shape[1] == cached.shape[1]
    if not same_width:
        return False
    limit = fd_bound(original, l)
    return bool(calculateError(original, cached) < limit)
Beispiel #5
0
def test_svd(mat_name):
	if ".txt" not in mat_name:
		mat_name += ".txt"
	mat_pname = os.path.join(MATRIX_DIR, mat_name)
	mat = load_matrix(mat_pname)
	U, w, Vt = np.linalg.svd(mat, full_matrices=False)
	V = Vt.T
	print "V shape: ", V.shape
	print np.around(V, 2)
	print "Singular values: ", w
def dynamic_experiment(mat_fname=MATRIX, l1=320, l2=350, batch_size=400, plot=True):
    print "Dynamic Experiment"
    mat = load_matrix(mat_fname)
    # changepoints 
    ts = np.arange(1, mat.shape[0], max(mat.shape[0]/10, 1))
    exp_name = "dynamic_exp_" + os.path.splitext(mat_fname)[0]
    dyn_exp = DynamicSketchExperiment(exp_name, mat_fname, l1, l2, batch_size, ts, randomized=False)
    dyn_exp.run_experiment()
    dyn_exp.write_results()
    if plot:
        dyn_exp.plot_results()
Beispiel #7
0
def test_dyn_exp():
    """
    Run a DynamicSketchExperiment on med_svd_mat.txt with fixed settings,
    then plot and write its results.
    """
    mat_fname = "med_svd_mat.txt"
    l1, l2, batch_size = 20, 30, 20
    n_rows = load_matrix(mat_fname).shape[0]
    # changepoints start at 0 here, stepping by a tenth of the rows
    ts = np.arange(0, n_rows, max(n_rows/10, 1))
    dyn_exp = DynamicSketchExperiment("test_dynamic_exp", mat_fname, l1, l2,
                                      batch_size, ts, randomized=False)
    dyn_exp.run_experiment()
    dyn_exp.plot_results()
    dyn_exp.write_results()
Beispiel #8
0
 def __init__(self, exp_name, mat_fname, dependent_vars, dependent_var_name, sparse=False):
     """
     Load the experiment's matrix and make sure its output directory exists.

     @param exp_name: experiment name, used as a directory under EXP_DIR
     @param mat_fname: matrix file name
     @param dependent_vars: not used in the part of the constructor shown here
     @param dependent_var_name: not used in the part of the constructor shown here
     @param sparse: when true, read the matrix from MATRIX_DIR with mmread
         (assumed Matrix Market format) instead of load_matrix
     """
     import errno

     self.exp_name = exp_name
     self.mat_fname = mat_fname
     if sparse:
         # assume Matrix Market format
         self.mat = mmread(os.path.join(MATRIX_DIR, mat_fname))
     else:
         self.mat = load_matrix(self.mat_fname)
     self.exp_dir = os.path.join(EXP_DIR, exp_name, os.path.splitext(mat_fname)[0])
     # make a directory for the experiment if it doesn't exist yet
     try:
         os.makedirs(self.exp_dir)
     except OSError as e:
         # an already-existing directory is fine; re-raise anything else
         if e.errno != errno.EEXIST:
             raise
Beispiel #9
0
def test_rand_exp():
    mat_fname = "small_data_batch_1"
    l = 200
    sketch_sizes = [100, 200, 300, 400, 500]
    batch_sizes =[5, 10, 20]
    alpha = 0.2
    exp_name = 'test_rand_bpfd_experiment'
    mat = load_matrix(mat_fname)
    print mat.shape
    results = {}
    for l in sketch_sizes:
        sk_o = BatchPFDSketch(mat, l, 5000, alpha, randomized=True)
        sk_o.compute_sketch()
        results[l] = sk_o.sketching_time
    with open("experiments/rand_scale/results2.p", "wb") as f:
        pickle.dump(results, f) 
    print "Done"
def sketch_cifar():
    print "Loading model"
    mat = load_matrix("large_cifar_data")
    l = 250
    alpha = 0.2
    batch_size = 5000
    randomized = True
    num_processes = 8
    start_time = time.time()
    sketch = parallel_bpfd_sketch(mat, l, alpha, batch_size,
                                    randomized=randomized, num_processes=num_processes)
    sketching_time = time.time() - start_time
    #print "Writing sketch"
    #write_matrix(sketch, "sketches/w2vec_%d.txt"%l)
    with open("experiments/parallel_results.txt", "a") as f:
            f.write("""Mat: %s, Rand: %r, l: %d, 
                        b: %d, alpha: %f, Processes: %d, Time: %f\n""" %
                                    ("large_cifar", randomized, l, batch_size, 
                                    alpha, num_processes, sketching_time))
def run_code():
    mat = load_matrix(mat_fname)
    print "Mat Shape: ", mat.shape
    l = 100
    alpha = 0.2
    batch_size = 100
    randomized=False
    num_processes=1
    print "Starting"
    start_time = time.time()
    sketch = parallel_bpfd_sketch(mat, l, alpha, batch_size, 
                                                                            randomized=randomized, num_processes=num_processes)
    sketching_time = time.time() - start_time 


    with open("experiments/parallel_results.txt", "a") as f:
            f.write("""Mat: %s, Rand: %r, l: %d, 
                        b: %d, alpha: %f, Processes: %d, Time: %f\n""" %(mat_fname, randomized, l, 
                                    batch_size, alpha, num_processes, sketching_time))
    print calculateError(mat, sketch)
def dynamic_experiment(mat_fname=MATRIX,
                       l1=320,
                       l2=350,
                       batch_size=400,
                       plot=True):
    print "Dynamic Experiment"
    mat = load_matrix(mat_fname)
    # changepoints
    ts = np.arange(1, mat.shape[0], max(mat.shape[0] / 10, 1))
    exp_name = "dynamic_exp_" + os.path.splitext(mat_fname)[0]
    dyn_exp = DynamicSketchExperiment(exp_name,
                                      mat_fname,
                                      l1,
                                      l2,
                                      batch_size,
                                      ts,
                                      randomized=False)
    dyn_exp.run_experiment()
    dyn_exp.write_results()
    if plot:
        dyn_exp.plot_results()
def sketch_cifar():
    print "Loading model"
    mat = load_matrix("large_cifar_data")
    l = 250
    alpha = 0.2
    batch_size = 5000
    randomized = True
    num_processes = 8
    start_time = time.time()
    sketch = parallel_bpfd_sketch(mat,
                                  l,
                                  alpha,
                                  batch_size,
                                  randomized=randomized,
                                  num_processes=num_processes)
    sketching_time = time.time() - start_time
    #print "Writing sketch"
    #write_matrix(sketch, "sketches/w2vec_%d.txt"%l)
    with open("experiments/parallel_results.txt", "a") as f:
        f.write("""Mat: %s, Rand: %r, l: %d, 
                        b: %d, alpha: %f, Processes: %d, Time: %f\n""" %
                ("large_cifar", randomized, l, batch_size, alpha,
                 num_processes, sketching_time))
def run_code():
    mat = load_matrix(mat_fname)
    print "Mat Shape: ", mat.shape
    l = 100
    alpha = 0.2
    batch_size = 100
    randomized = False
    num_processes = 1
    print "Starting"
    start_time = time.time()
    sketch = parallel_bpfd_sketch(mat,
                                  l,
                                  alpha,
                                  batch_size,
                                  randomized=randomized,
                                  num_processes=num_processes)
    sketching_time = time.time() - start_time

    with open("experiments/parallel_results.txt", "a") as f:
        f.write("""Mat: %s, Rand: %r, l: %d, 
                        b: %d, alpha: %f, Processes: %d, Time: %f\n""" %
                (mat_fname, randomized, l, batch_size, alpha, num_processes,
                 sketching_time))
    print calculateError(mat, sketch)
Beispiel #15
0
 def load_sketch(self, sketch_fname):
     """
     Load a sketch from sketch_fname into self.sketch.

     @param sketch_fname: path of the sketch file passed to load_matrix
     """
     # The method previously lacked `self` and read an undefined name
     # (sketch_matrix) instead of its own parameter.
     self.sketch = load_matrix(sketch_fname)
     # the sketch was loaded rather than computed, so the time is set to 0
     self.sketching_time = 0
Beispiel #16
0
def run_fd_sketch(mat_pname, write_pname, l):
    """
    Load the matrix at mat_pname, sketch it with sketch(mat, l), write the
    sketch to write_pname and return it.
    """
    # mirror what the C code does, including the file reads and writes
    result = sketch(load_matrix(mat_pname), l)
    write_matrix(result, write_pname)
    return result
Beispiel #17
0
def construct_sketches(orig_mat_fname, ls, check_c=True, force_comp=False):
	"""
	generates sketch files using input l's 
	@param orig_mat_fname: name of input matrix file inside MATRIX_DIR
	@param ls: List of sketch sizes to construct
	@param check_c: flag whether to construct sketches using custom C implementation
	@param force_comp: disregard cached sketch, recompute (required to determine runtime)
	"""
	# TOOD: check if we've already constructed it. 
	# we can check for the right filename and then some
	# maybe don'r run again? 
	p_fnames = []
	p_times = []
	if check_c:
		c_fnames = []
		c_times = []
	if ".txt" not in orig_mat_fname:
		orig_mat_fname += ".txt"
	mat_pname = os.path.join(MATRIX_DIR, orig_mat_fname)
	mat = load_matrix(mat_pname)
	rows, cols = mat.shape
	# create C binary 
	if check_c:
		subprocess.call(["make", "clean"])
		subprocess.call(["make", "sketch"])

	for l in ls:
		# generate p_sketch
		assert(l <= cols)
		p_sketch_pname = os.path.join(MATRIX_DIR, 
										"p_sketch_%d_%s" %(l, orig_mat_fname))
		if not(check_sketch_file(mat_pname, p_sketch_pname, l)) or force_comp:
			start = time.time()
			run_fd_sketch(mat_pname, p_sketch_pname, l)
			p_time = time.time() - start
		else:
			p_time = 0.0
		p_fnames.append(p_sketch_pname)
		p_times.append(p_time)
		if check_c:
			c_sketch_pname = os.path.join(MATRIX_DIR, 
											"c_sketch_%d_%s" %(l, orig_mat_fname))
			
			#
			if not(check_sketch_file(mat_pname, c_sketch_pname, l)) or force_comp:
				start = time.time()
				# not a fair comparison because of shit
				c_output = subprocess.check_output([RUN_SKETCH, '-f', mat_pname, '-w', c_sketch_pname, '-l', str(l)])
				c_time = time.time() - start				
				# so we should get the time from c_output, hopefully just one float shows up
				#num_matches = re.findall("\d+\.\d+", c_output)
				#c_time = float(num_matches[0])
				print"C output on: ", l,  c_output
				print "Python time: ", c_time
			else:
				c_time = 0.0
			c_fnames.append(c_sketch_pname)
			c_times.append(c_time)
	if check_c:
		return p_fnames, p_times, c_fnames, c_times
	else:
		return p_fnames, p_times 
Beispiel #18
0
def test_largest_product_in_grid():
    '''Check largest_product_in_grid(grid, 4) against the expected value.'''
    expected = 70600674
    actual = largest_product_in_grid(load_matrix(data_file_path), 4)
    assert expected == actual
Beispiel #19
0
def main():
    '''Load the grid from data_file_path and print the solution's result.'''
    result = largest_product_in_grid(load_matrix(data_file_path), 4)
    print(result)
Beispiel #20
0
def main():
    '''Load data/018.txt (resolved via build_path) and print max_path_sum of it.'''
    data_path = build_path(__file__, 'data/018.txt')
    print(max_path_sum(load_matrix(data_path)))