import unittest

import numpy as np

# RowMatrix is provided by the project; its import (omitted in this snippet)
# comes from the repo's matrix module.


class MatrixMultiplicationTestCase(unittest.TestCase):
    """Tests distributed matrix products against dense NumPy ground truth.

    The globals A and A2 hold dense copies of the data; matrix_rdd and
    matrix_rdd2 hold the corresponding row RDDs.
    """

    def setUp(self):
        self.matrix_A = RowMatrix(matrix_rdd, 'test_data', 1000, 100)
        self.matrix_A2 = RowMatrix(matrix_rdd2, 'test_data', 100, 1000)

    def test_mat_rtimes(self):
        mat = np.random.rand(100, 50)
        p = self.matrix_A.rtimes(mat)
        p_true = np.dot(A, mat)
        self.assertTrue(np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_mat_ltimes(self):
        mat = np.random.rand(100, 1000)
        p = self.matrix_A.ltimes(mat)
        p_true = np.dot(mat, A)
        self.assertTrue(np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_atamat(self):
        mat = np.random.rand(100, 20)
        p = self.matrix_A.atamat(mat)
        p_true = np.dot(A.T, np.dot(A, mat))
        self.assertTrue(np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_mat_rtimes2(self):
        mat = np.random.rand(1000, 50)
        p = self.matrix_A2.rtimes(mat)
        p_true = np.dot(A2, mat)
        self.assertTrue(np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_mat_ltimes2(self):
        mat = np.random.rand(50, 100)
        p = self.matrix_A2.ltimes(mat)
        p_true = np.dot(mat, A2)
        self.assertTrue(np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_atamat2(self):
        mat = np.random.rand(1000, 20)
        p = self.matrix_A2.atamat(mat)
        p_true = np.dot(A2.T, np.dot(A2, mat))
        self.assertTrue(np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_mat_rtimes_sub(self):
        # Restrict the product to columns 0..98 of A.
        mat = np.random.rand(99, 50)
        p = self.matrix_A.rtimes(mat, (0, 98))
        p_true = np.dot(A[:, :-1], mat)
        self.assertTrue(np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_mat_ltimes_sub(self):
        mat = np.random.rand(100, 1000)
        p = self.matrix_A.ltimes(mat, (0, 98))
        p_true = np.dot(mat, A[:, :-1])
        self.assertTrue(np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_atamat_sub(self):
        mat = np.random.rand(99, 50)
        p = self.matrix_A.atamat(mat, (0, 98))
        p_true = np.dot(A[:, :-1].T, np.dot(A[:, :-1], mat))
        self.assertTrue(np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)
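# The class above references module-level fixtures (matrix_rdd, matrix_rdd2, A,
# A2) that are defined elsewhere in the test module. A minimal sketch of how
# they could be built, assuming a local SparkContext; the names and shapes
# follow the setUp calls above, but this is illustrative, not the project's
# actual fixture code.
from pyspark import SparkContext

sc = SparkContext(appName='rowmatrix_tests')
A = np.random.rand(1000, 100)    # dense ground truth for matrix_A
A2 = np.random.rand(100, 1000)   # dense ground truth for matrix_A2
matrix_rdd = sc.parallelize(A.tolist())
matrix_rdd2 = sc.parallelize(A2.tolist())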
# From a separate test module: exercises matrix-vector products and extraction
# of b from the augmented matrix Ab.
class MatrixMultiplicationTestCase(unittest.TestCase):
    def setUp(self):
        self.matrix_Ab = RowMatrix(matrix_rdd, 'test_data', 1000, 10)

    def test_mat_rtimes(self):
        vec = np.random.rand(10)
        p = self.matrix_Ab.rtimes_vec(vec)
        p_true = np.dot(A, vec)
        self.assertTrue(np.linalg.norm(p - p_true) < 1e-5)

    def test_mat_ltimes(self):
        vec = np.random.rand(1000)
        p = self.matrix_Ab.ltimes_vec(vec)
        p_true = np.dot(vec, A)
        self.assertTrue(np.linalg.norm(p - p_true) < 1e-5)

    def test_get_b(self):
        b = self.matrix_Ab.get_b()
        self.assertTrue(np.linalg.norm(b - Ab[:, -1]) < 1e-5)
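# Likewise, the fixtures for the class above live elsewhere in its module. A
# plausible construction (assumed, reusing the SparkContext sketched earlier),
# with Ab holding the augmented system [A | b]:
Ab = np.random.rand(1000, 11)    # A is 1000 x 10; b is the last column
A = Ab[:, :-1]
matrix_rdd = sc.parallelize(Ab.tolist())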
# run_cx: driver script for the CX decomposition experiments (prog='run_cx.sh').
import sys
import time
import argparse
import logging
import logging.config

import numpy as np
import scipy.stats
from pyspark import SparkConf, SparkContext

# Project-local helpers (RowMatrix, SparseRowMatrix, CX, OptionError, to_sparse,
# print_params) are imported from elsewhere in this repo.


def main(argv):
    logging.config.fileConfig('logging.conf', disable_existing_loggers=False)
    logger = logging.getLogger('')  # using the root logger

    parser = argparse.ArgumentParser(description='Getting parameters.',
                                     prog='run_cx.sh')
    parser.add_argument(
        'dataset', type=str,
        help='dataset.txt stores the input matrix to run CX on; '
             'dataset_U.txt stores left singular vectors of the input matrix (only needed for -t); '
             'dataset_D.txt stores singular values of the input matrix (only needed for -t)')
    parser.add_argument('--dims', metavar=('m', 'n'), type=int, nargs=2,
                        required=True, help='size of the input matrix')
    parser.add_argument('--sparse', dest='sparse', action='store_true',
                        help='whether the data is sparse')
    parser.add_argument('--hdfs', dest='file_source', default='local',
                        action='store_const', const='hdfs',
                        help='load dataset from HDFS')
    parser.add_argument('-k', '--rank', metavar='targetRank', dest='k',
                        default=5, type=int,
                        help='target rank parameter in the definition of leverage '
                             'scores; this value should not be greater than m or n')
    parser.add_argument('-r', metavar='numRowsToSelect', default=20, type=int,
                        help='number of rows to select in CX')
    parser.add_argument('-q', '--niters', metavar='numIters', dest='q',
                        default=2, type=int,
                        help='number of iterations to run in the approximation '
                             'of leverage scores')
    parser.add_argument('--deterministic', dest='scheme', default='randomized',
                        action='store_const', const='deterministic',
                        help='use the deterministic scheme instead of the '
                             'randomized one when selecting rows')
    parser.add_argument('-c', '--cache', action='store_true',
                        help='cache the dataset in Spark')
    parser.add_argument('-t', '--test', action='store_true',
                        help='compute accuracies of the returned solutions')
    parser.add_argument('-s', '--save_logs', action='store_true',
                        help='save Spark logs')
    parser.add_argument('--nrepetitions', metavar='numRepetitions', default=1,
                        type=int,
                        help='number of times to stack the matrix vertically '
                             'in order to generate large matrices')
    parser.add_argument('--npartitions', metavar='numPartitions', default=280,
                        type=int, help='number of partitions in Spark')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--row', dest='axis', default=0, action='store_const',
                       const=0, help='compute row leverage scores')
    group.add_argument('--column', dest='axis', default=0, action='store_const',
                       const=1, help='compute column leverage scores')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--leverage-scores-only', dest='stage', default='full',
                       action='store_const', const='leverage',
                       help='return approximate leverage scores only')
    group.add_argument('--indices-only', dest='stage', default='full',
                       action='store_const', const='indices',
                       help='return approximate leverage scores and selected '
                            'row indices only')

    if len(argv) > 0 and argv[0] == 'print_help':
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args(argv)
    (m, n) = args.dims

    # validating
    if args.k > m or args.k > n:
        raise ValueError(
            'Rank parameter({0}) should not be greater than m({1}) or n({2})'
            .format(args.k, m, n))

    if args.npartitions > m or args.npartitions > n:
        args.npartitions = min(m, n)

    if args.test and args.nrepetitions > 1:
        raise OptionError(
            'Do not use the test mode(-t) on replicated data(numRepetitions>1)!')

    if args.axis == 0:
        # requires the transpose of A, which is not implemented yet (see below)
        raise OptionError('Need to implement transpose first!')

    if args.sparse and args.file_source == 'hdfs':
        raise OptionError('Not yet!')

    # printing parameters
    print_params(args, logger)

    # TODO: move these into a configuration file
    dire = '../data/'
    hdfs_dire = 'data/'
    logs_dire = 'file:///home/jiyan/cx_logs'

    # instantiating a Spark instance
    if args.save_logs:
        conf = SparkConf().set('spark.eventLog.enabled', 'true').set(
            'spark.eventLog.dir', logs_dire)
    else:
        conf = SparkConf()
    sc = SparkContext(appName="cx_exp", conf=conf)

    # loading data
    if args.file_source == 'hdfs':
        # loading dataset from HDFS
        A_rdd = sc.textFile(hdfs_dire + args.dataset + '.txt', args.npartitions)
    else:
        # loading dataset from local disk
        A = np.loadtxt(dire + args.dataset + '.txt')
        if args.sparse:
            sA = to_sparse(A)
            A_rdd = sc.parallelize(sA, args.npartitions)
        else:
            A_rdd = sc.parallelize(A.tolist(), args.npartitions)

    if args.axis == 0:
        pass  # TODO: get the RDD from the transpose of A

    t = time.time()
    if args.sparse:
        # creating a SparseRowMatrix instance
        matrix_A = SparseRowMatrix(A_rdd, args.dataset, m, n, args.cache)
    else:
        # creating a RowMatrix instance
        matrix_A = RowMatrix(A_rdd, args.dataset, m, n, args.cache,
                             repnum=args.nrepetitions)

    cx = CX(matrix_A)

    # getting the approximate row leverage scores; lev has one entry per row
    lev, p = cx.get_lev(args.k, q=args.q)

    if args.test:
        if args.file_source != 'local':
            A = np.loadtxt(dire + args.dataset + '.txt')
        U, D, V = np.linalg.svd(A, 0)
        if args.axis == 0:
            lev_exact = np.sum(U[:, :args.k]**2, axis=1)
        else:
            lev_exact = np.sum(V.T[:, :args.k]**2, axis=1)
        p_exact = lev_exact / args.k
        logger.info('KL divergence between the estimation of leverage scores '
                    'and the exact one is {0}'.format(
                        scipy.stats.entropy(p_exact, p)))

    logger.info('finished stage 1')
    logger.info('----------------------------------------------')

    if args.stage == 'indices' or args.stage == 'full':
        # choosing rows based on the leverage scores
        idx = cx.comp_idx(args.scheme, args.r)
        # maybe store the indices to a file here
        logger.info('finished stage 2')
        logger.info('----------------------------------------------')

    if args.stage == 'full':
        # getting back the selected rows based on the idx computed above
        # (this might give different results if the above is rerun)
        rows = cx.get_rows()
        if args.test:
            diff = cx.comp_err()  # computing the error ||A - CX||
            logger.info('relative error ||A-CX||/||A|| is {0}'.format(
                diff / np.linalg.norm(A, 'fro')))
            logger.info('relative error of the best rank-{0} approximation '
                        'is {1}'.format(
                            args.k,
                            np.sqrt(np.sum(D[args.k:]**2)) /
                            np.sqrt(np.sum(D**2))))
        logger.info('finished stage 3')

    rtime = time.time() - t
    logger.info('time elapsed: {0} second'.format(rtime))
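# The script's entry point is not included above; run_cx.sh presumably forwards
# the command-line arguments (e.g. via spark-submit). A minimal, assumed hook:
if __name__ == '__main__':
    main(sys.argv[1:])

# Example invocation (flags as defined by the parser above):
#   run_cx.sh dataset --dims 1000 100 -k 5 -r 20 -q 2 -t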
# run_ls: driver script for the randomized least-squares experiments
# (prog='run_ls.sh').
import os
import sys
import logging
import logging.config
import argparse
import ConfigParser

import numpy as np
from pyspark import SparkConf, SparkContext

# Project-local helpers (RowMatrix, RandLeastSquares, print_params,
# pickle_write) are imported from elsewhere in this repo.


def main(argv):
    parser = argparse.ArgumentParser(description='Getting parameters.',
                                     prog='run_ls.sh')
    parser.add_argument(
        'dataset', type=str,
        help='dataset_Ab.txt stores the input matrix to run LS on; '
             'dataset.txt stores the original matrix (only needed for -t);')
    parser.add_argument('--dims', metavar=('m', 'n'), type=int, nargs=2,
                        required=True, help='size of the input matrix')
    parser.add_argument('--nrepetitions', metavar='numRepetitions', default=1,
                        type=int,
                        help='number of times to stack the matrix vertically '
                             'in order to generate large matrices')
    parser.add_argument('--stack', metavar='stackType', dest='stack_type',
                        type=int, default=1, help='stack type')
    parser.add_argument('--npartitions', metavar='numPartitions', default=280,
                        type=int, help='number of partitions in Spark')
    parser.add_argument('--setting_filename', metavar='settingConfFilename',
                        default='conf/settings.cfg', type=str,
                        help='name of the configuration file storing the settings')
    parser.add_argument('--logging_filename', metavar='loggingConfFilename',
                        default='conf/logging.cfg', type=str,
                        help='configuration file for Python logging')
    parser.add_argument('-c', '--cache', action='store_true',
                        help='cache the dataset in Spark')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--hdfs', dest='file_source', default='local',
                       action='store_const', const='hdfs',
                       help='load dataset from HDFS (default: loading files from local)')
    group.add_argument('--s3', dest='file_source', default='local',
                       action='store_const', const='s3',
                       help='load dataset from Amazon S3 (default: loading files from local)')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--low-precision', dest='solver_type',
                       default='low_precision', action='store_const',
                       const='low_precision', help='use the low-precision solver')
    group.add_argument('--high_precision', dest='solver_type',
                       default='low_precision', action='store_const',
                       const='high_precision', help='use the high-precision solver')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--projection', dest='sketch_type', action='store_const',
                       const='projection', help='compute the sketch by projection')
    group.add_argument('--sampling', dest='sketch_type', action='store_const',
                       const='sampling', help='compute the sketch by sampling')

    parser.add_argument('-p', dest='projection_type', default='gaussian',
                        choices=('cw', 'gaussian', 'rademacher', 'srdht'),
                        help='underlying projection type')
    parser.add_argument('-r', metavar='projectionSize', type=int,
                        help='sketch size')
    parser.add_argument('-s', metavar='samplingSize', type=int,
                        help='sampling size (for the sampling sketch only)')
    parser.add_argument('-q', '--niters', metavar='numIters', dest='q',
                        type=int, help='number of iterations in LSQR')
    parser.add_argument('-k', '--ntrials', metavar='numTrials', dest='k',
                        default=1, type=int,
                        help='number of independent trials to run')
    parser.add_argument('-t', '--test', action='store_true',
                        help='compute accuracies of the returned solutions')
    parser.add_argument('--save_logs', action='store_true',
                        help='save Spark logs')
    parser.add_argument('--output_filename', metavar='outputFilename',
                        default='ls.out',
                        help='filename of the output file (default: ls.out)')
    parser.add_argument('--load_N', action='store_true', help='load N')
    parser.add_argument('--save_N', action='store_true', help='save N')
    parser.add_argument('--debug', action='store_true', help='debug mode')

    if len(argv) > 0 and argv[0] == 'print_help':
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args(argv)
    (m, n) = args.dims

    # validating
    if m < n:
        raise ValueError(
            'Number of rows({0}) should be greater than number of columns({1})'
            .format(m, n))
    if args.sketch_type == 'sampling' and args.s is None:
        raise ValueError('Please enter a sampling size!')
    if args.solver_type == 'high_precision' and args.q is None:
        raise ValueError('Please enter the number of iterations!')
    if args.solver_type == 'low_precision' and args.sketch_type is None:
        raise ValueError(
            'Please specify a sketch method for the low-precision solver!')
    if args.sketch_type and args.r is None:
        raise ValueError('Please enter a projection size!')

    # loading the configuration file
    config = ConfigParser.RawConfigParser()
    config.read(args.setting_filename)
    data_dir = config.get('local_directories', 'data_dir')
    spark_logs_dir = 'file://' + os.path.dirname(
        os.path.abspath(__file__)) + '/' + config.get('local_directories',
                                                      'spark_logs_dir')

    # setting up the logger
    logging.config.fileConfig(args.logging_filename,
                              disable_existing_loggers=False)
    logger = logging.getLogger('')  # using the root logger

    # printing parameters
    print_params(args, logger)

    # instantiating a Spark instance
    if args.save_logs:
        conf = SparkConf().set('spark.eventLog.enabled', 'true').set(
            'spark.eventLog.dir', spark_logs_dir).set(
                'spark.driver.maxResultSize', '20g')
    else:
        conf = SparkConf().set('spark.driver.maxResultSize', '20g')
    sc = SparkContext(appName="ls_exp", conf=conf)

    # loading data
    if args.file_source == 'hdfs':
        hdfs_dir = config.get('hdfs', 'hdfs_dir')
        # loading dataset from HDFS
        Ab_rdd = sc.textFile(hdfs_dir + args.dataset + '.txt', args.npartitions)
    elif args.file_source == 's3':
        s3_dir = config.get('s3', 's3_dir')
        key_id = config.get('s3', 'key_id')
        secret_key = config.get('s3', 'secret_key')
        Ab_rdd = sc.textFile(
            's3n://' + key_id + ':' + secret_key + '@' + s3_dir +
            args.dataset + '.txt', args.npartitions)
    else:
        # loading dataset from local disk
        A = np.loadtxt(data_dir + args.dataset + '.txt')
        Ab_rdd = sc.parallelize(A.tolist(), args.npartitions)

    # creating a RowMatrix instance
    matrix_Ab = RowMatrix(Ab_rdd, args.dataset, m, n, args.cache,
                          stack_type=args.stack_type,
                          repnum=args.nrepetitions)

    ls = RandLeastSquares(matrix_Ab, solver_type=args.solver_type,
                          sketch_type=args.sketch_type,
                          projection_type=args.projection_type,
                          c=args.r, s=args.s, num_iters=args.q, k=args.k)

    ls.fit(args.load_N, args.save_N, args.debug)  # solving the problem

    result = {'time': ls.time, 'x': ls.x}
    pickle_write('../result/' + args.output_filename, result)  # saving results

    logger.info('Total time elapsed:{0}'.format(ls.time))

    if args.test:
        # only need to load these in the test mode
        if os.path.isfile(data_dir + args.dataset + '_x_opt.txt'):
            logger.info('Found precomputed optimal solutions!')
            x_opt = np.loadtxt(data_dir + args.dataset + '_x_opt.txt')
            f_opt = np.loadtxt(data_dir + args.dataset + '_f_opt.txt')
        else:
            logger.info('Computing optimal solutions!')
            # note: the collected row order might not be preserved; needs checking
            Ab = np.array(matrix_Ab.rdd_original.values().collect())
            A = Ab[:, :-1]
            b = Ab[:, -1]
            x_opt = np.linalg.lstsq(A, b)[0]
            f_opt = np.linalg.norm(np.dot(A, x_opt) - b)

        rx, rf = ls.comp_relerr(x_opt, f_opt)
        logger.info('Median of the relative error on the solution vector '
                    'is:{0}'.format(rx))
        logger.info('Median of the relative error on the objective value '
                    'is:{0}'.format(rf))
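# pickle_write is a project helper whose definition is not shown here; a minimal
# sketch consistent with how it is called above (hypothetical, for illustration):
import pickle

def pickle_write(filename, obj):
    """Serialize obj (here a dict with keys 'time' and 'x') to filename."""
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)

# Assumed entry point, mirroring run_cx:
if __name__ == '__main__':
    main(sys.argv[1:])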
# setUp from another test module (least-squares solver tests):
def setUp(self):
    self.matrix_Ab = RowMatrix(matrix_rdd, 'test_data', 1000, 10)
    self.N_dire = 'N/'  # directory for loading/saving N (cf. --load_N/--save_N in run_ls)
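# Not shown in the original: tests that exercise saving N presumably write into
# self.N_dire, so the directory should exist beforehand. A defensive sketch
# (assumption, not the project's actual code):
import os

if not os.path.isdir('N/'):
    os.makedirs('N/')  # create the cache directory for N on first run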