def run_stage3(params_dict):
    input_matrix = params_dict.get('inputmatrix')
    on_rows = params_dict.get('on_rows', False)
    sc = params_dict.get('sc')
    if on_rows:
        # swap row and column indices (work on the transpose)
        rows_assigned = sc.textFile(input_matrix) \
                          .map(lambda x: x.split(',')) \
                          .map(lambda x: (int(x[1]), int(x[0]), float(x[2])))
        leverage_scores_file = params_dict.get('rowleveragescores')
        p_score_file = params_dict.get('rowpscores')
    else:
        rows_assigned = sc.textFile(input_matrix) \
                          .map(lambda x: x.split(',')) \
                          .map(lambda x: (int(x[0]), int(x[1]), float(x[2])))
        leverage_scores_file = params_dict.get('columnleveragescores')
        p_score_file = params_dict.get('columnpscores')

    row_shape = rows_assigned.map(lambda x: x[0]).max() + 1
    column_shape = rows_assigned.map(lambda x: x[1]).max() + 1
    matrix_A = SparseRowMatrix(rows_assigned, 'output', row_shape, column_shape, True)

    start = time.time()
    cx = CX(matrix_A)
    k = 5
    q = 3
    lev, p = cx.get_lev(k, axis=0, q=q)
    end = time.time()

    np.savetxt(leverage_scores_file, np.array(lev))
    np.savetxt(p_score_file, np.array(p))
    print 'lev score ', lev, len(lev)
    print 'p is ', p, len(p)
    print 'time ', end - start
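# A minimal driver sketch for run_stage3. The dictionary keys match those
# read inside the function; the paths and app name here are placeholders,
# not the project's actual experiment configuration.
from pyspark import SparkContext

sc = SparkContext(appName='cx_stage3')
params = {
    'sc': sc,
    'inputmatrix': '/path/to/coo_matrix.csv',       # lines of "row,col,value"
    'on_rows': True,                                 # flip indices before computing scores
    'rowleveragescores': '/path/to/row_lev.txt',
    'rowpscores': '/path/to/row_p.txt',
    'columnleveragescores': '/path/to/col_lev.txt',
    'columnpscores': '/path/to/col_p.txt',
}
run_stage3(params)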
def test_col_lev2(self):
    cx = CX(self.matrix_A2)
    lev, p = cx.get_lev(10, q=6)
    # A2 is the dense copy of matrix_A2 defined elsewhere in the test module
    lev_exact, p_exact = compLevExact(A2, 10, axis=1)
    print scipy.stats.entropy(p_exact, p)
    self.assertEqual(len(lev), 1000)
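# For reference: the exact leverage scores that compLevExact is compared
# against can be computed directly from a thin SVD. This is a standalone
# sketch (the name comp_lev_exact_sketch is ours, not the project's API),
# mirroring the test-mode computation in main() further below.
import numpy as np

def comp_lev_exact_sketch(A, k, axis=0):
    # rank-k leverage scores: squared row norms of the top-k left (axis=0)
    # or right (axis=1) singular vectors; dividing by k gives a distribution
    U, D, V = np.linalg.svd(A, full_matrices=False)
    if axis == 0:
        lev = np.sum(U[:, :k] ** 2, axis=1)
    else:
        lev = np.sum(V.T[:, :k] ** 2, axis=1)
    return lev, lev / k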
def _indexed(grouped_list):
    indexed, values = [], []
    for tup in grouped_list:
        indexed.append(tup[0])
        values.append(tup[1])
    return np.array(indexed), np.array(values)

filename = "/global/u2/m/msingh/sc_paper/new_version/sc-2015/cx_spark/data/movielens/ml-10M100K/ratings.dat"
#filename = '/global/u2/m/msingh/sc_paper/new_version/sc-2015/cx_spark/data/ml-100k/u.data'
data = sc.textFile(filename).map(lambda x: parse(x))
row_shape = data.map(lambda x: x[0]).max() + 1
column_shape = data.map(lambda x: x[1]).max() + 1

# group each row's (column, rating) pairs by row id and pack them into
# parallel numpy arrays of column indices and values
drdd = data.map(lambda x: (x[0], (x[1], x[2]))) \
           .groupByKey() \
           .map(lambda x: (x[0], list(x[1]))) \
           .map(lambda x: (x[0], _indexed(x[1])))
print drdd.take(1)
#prep_rdd = prepare_matrix(data)

matrix_A = SparseRowMatrix(drdd, 'output', row_shape, column_shape, True)
cx = CX(matrix_A)
k = 2
q = 2
lev, p = cx.get_lev(k, axis=0, q=q)
#end = time.time()

leverage_scores_file = '/scratch1/scratchdirs/msingh/sc_paper/experiments/striped_data/movielens_leverage_scores_full1'
p_score_file = '/scratch1/scratchdirs/msingh/sc_paper/experiments/striped_data/movielens_p_scores_full1'
np.savetxt(leverage_scores_file, np.array(lev))
np.savetxt(p_score_file, np.array(p))
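# For clarity: each element of drdd above is a pair
# (row_id, (column_indices, values)). An illustrative, made-up record turned
# into a 1 x column_shape sparse row, assuming scipy is available:
from scipy.sparse import csr_matrix

example_row_id, (example_idx, example_vals) = (0, (np.array([3, 17, 42]),
                                                   np.array([5.0, 3.0, 4.0])))
example_sparse_row = csr_matrix(
    (example_vals, (np.zeros_like(example_idx), example_idx)),
    shape=(1, 100))  # 100 stands in for column_shape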
def parse(s):
    val = ast.literal_eval(s)
    return val[0], (np.array(val[1][0]), np.array(val[1][1]))

data = sc.textFile('/scratch1/scratchdirs/msingh/sc_paper/experiments/striped_data/ncolumns_matrix').map(lambda x: parse(x))
#row_shape = 131048
#column_shape = 8258911
#131047 8258910
row_shape = 8258911
column_shape = 131048
#column_shape += 20
print data.take(1)

matrix_A = SparseRowMatrix(data, 'output', row_shape, column_shape, False)
cx = CX(matrix_A)
k = 2
q = 2
lev, p = cx.get_lev(k, axis=0, q=q)
#end = time.time()

leverage_scores_file = '/scratch1/scratchdirs/msingh/sc_paper/experiments/striped_data/columns_row_leverage_scores_logged'
p_score_file = '/scratch1/scratchdirs/msingh/sc_paper/experiments/striped_data/columns_p_scores_logged'
np.savetxt(leverage_scores_file, np.array(lev))
np.savetxt(p_score_file, np.array(p))

"""
def parse_func(x):
    stringed = str(x)
    chunks = stringed.split(",")
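# A hedged illustration of the serialized line format that parse() above
# expects: a Python-literal tuple of (row_id, (column_indices, values)).
# The concrete numbers are made up for the example.
example_line = "(7, ([0, 12, 99], [0.5, 1.25, -3.0]))"
example_row_id, (example_idx, example_vals) = parse(example_line)
# example_row_id == 7; example_idx and example_vals come back as numpy arrays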
def main(argv):
    logging.config.fileConfig('logging.conf', disable_existing_loggers=False)
    logger = logging.getLogger('')  # using root

    parser = argparse.ArgumentParser(description='Getting parameters.', prog='run_cx.sh')
    parser.add_argument('dataset', type=str, help='dataset.txt stores the input matrix to run CX on; \
        dataset_U.txt stores left-singular vectors of the input matrix (only needed for -t); \
        dataset_D.txt stores singular values of the input matrix (only needed for -t)')
    parser.add_argument('--dims', metavar=('m', 'n'), type=int, nargs=2, required=True, help='size of the input matrix')
    parser.add_argument('--sparse', dest='sparse', action='store_true', help='whether the data is sparse')
    parser.add_argument('--hdfs', dest='file_source', default='local', action='store_const', const='hdfs', help='load dataset from HDFS')
    parser.add_argument('-k', '--rank', metavar='targetRank', dest='k', default=5, type=int,
                        help='target rank parameter in the definition of leverage scores; this value should not be greater than m or n')
    parser.add_argument('-r', metavar='numRowsToSelect', default=20, type=int, help='number of rows to select in CX')
    parser.add_argument('-q', '--niters', metavar='numIters', dest='q', default=2, type=int,
                        help='number of iterations to run in approximation of leverage scores')
    parser.add_argument('--deterministic', dest='scheme', default='randomized', action='store_const', const='deterministic',
                        help='use deterministic scheme instead of randomized when selecting rows')
    parser.add_argument('-c', '--cache', action='store_true', help='cache the dataset in Spark')
    parser.add_argument('-t', '--test', action='store_true', help='compute accuracies of the returned solutions')
    parser.add_argument('-s', '--save_logs', action='store_true', help='save Spark logs')
    parser.add_argument('--nrepetitions', metavar='numRepetitions', default=1, type=int,
                        help='number of times to stack matrix vertically in order to generate large matrices')
    parser.add_argument('--npartitions', metavar='numPartitions', default=280, type=int, help='number of partitions in Spark')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--row', dest='axis', default=0, action='store_const', const=0, help='compute row leverage scores')
    group.add_argument('--column', dest='axis', default=0, action='store_const', const=1, help='compute column leverage scores')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--leverage-scores-only', dest='stage', default='full', action='store_const', const='leverage',
                       help='return approximate leverage scores only')
    group.add_argument('--indices-only', dest='stage', default='full', action='store_const', const='indices',
                       help='return approximate leverage scores and selected row indices only')

    if len(argv) > 0 and argv[0] == 'print_help':
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args(argv)
    (m, n) = args.dims

    # validating
    if args.k > m or args.k > n:
        raise ValueError('Rank parameter({0}) should not be greater than m({1}) or n({2})'.format(args.k, m, n))
    if args.npartitions > m or args.npartitions > n:
        args.npartitions = min(m, n)
    if args.test and args.nrepetitions > 1:
        raise OptionError('Do not use the test mode(-t) on replicated data(numRepetitions>1)!')
    if args.axis == 0:
        raise OptionError('Need to implement transpose first!')
    if args.sparse and args.file_source == 'hdfs':
        raise OptionError('Not yet!')

    # print parameters
    print_params(args, logger)

    # TO-DO: move these into a configuration file
    dire = '../data/'
    hdfs_dire = 'data/'
    logs_dire = 'file:///home/jiyan/cx_logs'

    # instantiating a Spark instance
    if args.save_logs:
        conf = SparkConf().set('spark.eventLog.enabled', 'true').set('spark.eventLog.dir', logs_dire)
    else:
        conf = SparkConf()
    sc = SparkContext(appName="cx_exp", conf=conf)

    # loading data
    if args.file_source == 'hdfs':
        A_rdd = sc.textFile(hdfs_dire + args.dataset + '.txt', args.npartitions)  # loading dataset from HDFS
    else:
        A = np.loadtxt(dire + args.dataset + '.txt')  # loading dataset from local disk
        if args.sparse:
            sA = to_sparse(A)
            A_rdd = sc.parallelize(sA, args.npartitions)
        else:
            A_rdd = sc.parallelize(A.tolist(), args.npartitions)

    if args.axis == 0:
        pass  # get rdd from the transpose of A

    t = time.time()
    if args.sparse:
        matrix_A = SparseRowMatrix(A_rdd, args.dataset, m, n, args.cache)  # creating a SparseRowMatrix instance
    else:
        matrix_A = RowMatrix(A_rdd, args.dataset, m, n, args.cache, repnum=args.nrepetitions)  # creating a RowMatrix instance

    cx = CX(matrix_A)
    lev, p = cx.get_lev(args.k, q=args.q)  # approximate row leverage scores; same length as the number of rows

    if args.test:
        if args.file_source != 'local':
            A = np.loadtxt(dire + args.dataset + '.txt')
        U, D, V = np.linalg.svd(A, 0)
        if args.axis == 0:
            lev_exact = np.sum(U[:, :args.k]**2, axis=1)
        else:
            lev_exact = np.sum(V.T[:, :args.k]**2, axis=1)
        p_exact = lev_exact / args.k
        logger.info('KL divergence between the estimation of leverage scores and the exact one is {0}'.format(scipy.stats.entropy(p_exact, p)))
    logger.info('finished stage 1')
    logger.info('----------------------------------------------')

    if args.stage == 'indices' or args.stage == 'full':
        idx = cx.comp_idx(args.scheme, args.r)  # choosing rows based on the leverage scores
        # maybe store the indices to file
        logger.info('finished stage 2')
        logger.info('----------------------------------------------')

    if args.stage == 'full':
        rows = cx.get_rows()  # the selected rows based on the idx computed above (rerunning may give different results)
        if args.test:
            diff = cx.comp_err()  # computing the relative error
            logger.info('relative error ||A-CX||/||A|| is {0}'.format(diff / np.linalg.norm(A, 'fro')))
            logger.info('relative error of the best rank-{0} approximation is {1}'.format(
                args.k, np.sqrt(np.sum(D[args.k:]**2)) / np.sqrt(np.sum(D**2))))
        logger.info('finished stage 3')

    rtime = time.time() - t
    logger.info('time elapsed: {0} seconds'.format(rtime))
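# A hypothetical invocation of main(); 'climate' is a placeholder dataset
# name (so '../data/climate.txt' would have to exist for the local path),
# and the flags correspond to the argparse options defined above.
main(['climate', '--dims', '1000', '100',
      '-k', '5', '-q', '2', '-r', '20',
      '--column',      # column leverage scores (row scores still raise OptionError above)
      '--cache', '--test'])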
def test_col_lev2(self):
    cx = CX(self.matrix_A2)
    lev, p = cx.get_lev(5, q=10)
    self.assertEqual(len(lev), 1000)
from utils import prepare_matrix
import os
import logging.config

logging.config.fileConfig('logging.conf', disable_existing_loggers=False)
logger = logging.getLogger(__name__)

sc = SparkContext()
logger.info("job_cx starting with appId=" + sc._jsc.sc().applicationId())

prefix = 'hdfs:///sc-2015/'
name = 'Lewis_Dalisay_Peltatum_20131115_hexandrum_1_1-masked'
logger.info("job_cx loading RDD from %s" % name)
#dataset = MSIDataset.load(sc, 'meta/' + name, prefix + name).cache()
#msimat = MSIMatrix.from_dataset(sc, dataset)
#msimat.save(prefix, 'meta', name)
msimat = MSIMatrix.load(sc, prefix, 'meta', name)
logger.info("shape: %s" % (msimat.shape, ))

mat = prepare_matrix(msimat.nonzeros).cache()
mat = SparseRowMatrix(mat, "msimat", msimat.shape[0], msimat.shape[1], cache=False)
cx = CX(mat)
k = 32
q = 5
lev, p = cx.get_lev(k, axis=0, q=q)

with open('dump.pkl', 'w') as outf:
    import cPickle as pickle
    data = {'lev': lev, 'p': p}
    pickle.dump(data, outf)
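# Illustrative follow-up, not part of the job itself: reload the dumped
# scores and sample r row indices with probability proportional to p, the
# same idea the randomized selection scheme in CX.comp_idx is built on.
import cPickle as pickle
import numpy as np

with open('dump.pkl') as inf:
    dumped = pickle.load(inf)

p = np.asarray(dumped['p'])
r = 20  # number of rows to select; arbitrary for the example
idx = np.random.choice(len(p), size=r, replace=True, p=p / p.sum())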