def start_gpdb(port, nproc):
    """Start the Greenplum cluster and return a connection to it.

    With no explicit port (or the default '24'-process layout) the stock
    cluster is started; otherwise the segment master directory for the
    requested process count is used and the connection targets `port`.
    """
    use_default_cluster = (port is None) or (nproc == '24')
    if use_default_cluster:
        os.system('yes | gpstart')
        return SQLCxn(username='******', db='ubuntu', timeout=10000)
    startup_cmd = 'yes | gpstart -d /gpsegs/gpdb-{}/master/gpseg-1'.format(nproc)
    os.system(startup_cmd)
    return SQLCxn(username='******', db='ubuntu', timeout=10000, port=port)
def main(kwargs):
    """Time one MADlib algorithm (logit/gnmf/reg/robust) on the adclick data.

    kwargs keys: opType, savestub, nodes, xTableName, yTableName.
    Appends one row of five run times to
    ../output/madlib_adclick_<opType><nodes>.txt
    """
    opType = kwargs.get('opType')
    savestub = kwargs.get('savestub')
    nodes = kwargs.get('nodes')
    x_table_name = kwargs.get('xTableName')
    y_table_name = kwargs.get('yTableName')

    savestub = '' if (savestub is None) else savestub
    print('Evaluating: {}'.format(opType))

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)
    colnames = ['nodes', 'rows', 'cols',
                'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    shape = cxn.get_shape(x_table_name)
    # Evaluation environment handed to utils.timeOp -- every name used in
    # a `call` string below must be present here.
    env = {'x_table_name': x_table_name,
           'y_table_name': y_table_name,
           'do_logit': do_logit,
           'do_reg': do_reg,
           'do_gnmf': do_gnmf,
           'do_robust': do_robust,
           'shape': shape,
           'cxn': cxn}
    cleanup = None
    if opType == 'logit':
        call = 'do_logit(x_table_name, y_table_name, shape, cxn)'
    elif opType == 'gnmf':
        call = 'do_gnmf(x_table_name, shape, 10, cxn)'
    elif opType == 'reg':
        call = 'do_reg(x_table_name, y_table_name, cxn)'
        cleanup = ("map(lambda x: cxn.execute("
                   "'DROP TABLE {}'.format(x)), ['XTX','XTY','XTX_INV','B'])")
    elif opType == 'robust':
        # Synthesize a random residual vector R2 for the robust-SE run.
        cxn.execute('DROP TABLE IF EXISTS R2 CASCADE')
        cxn.execute(
            "SELECT MADLIB.matrix_random({},1,NULL,'uniform','R2',NULL)".
            format(shape[0]))
        cxn.execute('ALTER TABLE R2 RENAME COLUMN ROW TO ROW_NUM')
        call = 'do_robust(x_table_name, cxn)'
    else:
        # BUG FIX: an unknown opType previously fell through and crashed
        # later with a NameError on `call`; fail loudly instead
        # (matches the NotImplementedError convention in doMatrixOp).
        raise NotImplementedError('Invalid Operation: {}'.format(opType))

    rows = shape[0]
    cols = shape[1]

    path = '../output/madlib_adclick_{}{}.txt'.format(opType, int(nodes))

    # .loc/.iloc instead of the long-deprecated .ix indexer
    runTimes.loc[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    res = utils.timeOp(call, env, cleanup)
    print(res)
    runTimes.iloc[:, 3:] = res
    writeHeader = False if (os.path.exists(path)) else True
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
def doMatrixOp(kwargs):
    """Time the MADlib SVD of table M<tableStub> and append timings to disk.

    kwargs keys: opType (only 'SVD' is supported), mattype, tableStub,
    nodes, outdir. Writes one row to
    ../output/<outdir>/madlib_<mattype>_<opType><nodes>.txt
    Returns early (without writing) when the call times out.
    """
    opType = kwargs.get('opType')
    mattype = kwargs.get('mattype')
    tableStub = kwargs.get('tableStub')
    nodes = kwargs.get('nodes')
    outdir = kwargs.get('outdir')
    try:
        tableStub = int(tableStub)
    except ValueError:
        pass

    Mname = 'M{}'.format(tableStub)
    print('Evaluating: {}'.format(opType))

    colnames = ['nodes', 'rows', 'cols',
                'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    cxn = SQLCxn(username='******', db='ubuntu', timeout=2000)
    shape = cxn.get_shape_dense(Mname)

    cleanup = []
    if (opType == 'SVD'):
        # madlib.svd(source, output_prefix, row_id, k, n_iter, summary).
        # BUG FIX: the format string has a single placeholder, but the
        # original also passed shape[1] as a dead second argument.
        call = "svd('{}','svd','row_num',10, 10,'svd_summary')".format(Mname)
        cleanup.extend(['svd_s', 'svd_u', 'svd_v', 'svd_summary'])
    else:
        raise NotImplementedError('Invalid Operation')

    # Drop any leftovers from a previous run before timing.
    for obj in cleanup:
        cxn.execute('DROP TABLE IF EXISTS {}'.format(obj))

    sql_call = 'SELECT madlib.{}'.format(call)
    rows = shape[0]
    cols = shape[1]
    path = '../output/{}/madlib_{}_{}{}.txt'.format(
        outdir, mattype, opType, int(nodes))
    runTimes.loc[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    madlib_timeout = ('../temp/madlib_punked_out.json', opType)
    res = cxn.time(sql_call, cleanup, madlib_timeout)
    if (res is None):
        print('Timed Out')
        return
    runTimes.iloc[:, 3:] = res
    writeHeader = False if (os.path.exists(path)) else True
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
# --- script fragment: generate sparse benchmark matrices of varying
# sparsity. NOTE(review): this chunk starts mid-script (`parser`, `data`,
# and `start_make_logging` are defined outside this view) and appears
# truncated at the end of the loop body.
helpstr = 'Approximate fraction nonzero values. Delimited by spaces. Default "0.0001 0.001 0.01 0.1"'
parser.add_argument('--sparsity', default='0.0001 0.001 0.01 0.1',
                    type=str, help=helpstr)
args = parser.parse_args()

# Ensure the manifest file exists before any generation step appends to it.
if not os.path.exists('manifest.txt'):
    with open('manifest.txt', 'w') as fh:
        fh.write('')

# start logging
start_make_logging()

os.system('hdfs dfs -mkdir -p /scratch')
cxn = SQLCxn(timeout=None, username='******', db='ubuntu')

sparse_gb = int(args.msize_sparse)
sparsity = map(lambda x: float(x), args.sparsity.split(' '))
for sr in sparsity:
    # e.g. sr=0.001 -> stub '_001'; used in table and file names
    stub = '{}'.format(sr).replace('0.', '_')
    gb_stub = sparse_gb
    fmt = (stub, gb_stub)
    # rows so that k x 100 doubles occupy ~sparse_gb GB when dense
    k = int(np.ceil((sparse_gb * 1e9) / float(8 * 100)))
    mpath_tall = os.path.abspath(
        '../output/M{}{}_sparse_tall.mtx'.format(*fmt))
    mpath_wide = os.path.abspath(
        '../output/M{}{}_sparse_wide.mtx'.format(*fmt))
    data.gen_data_sparse(k, 100, sr, 'M{}{}_sparse_tall'.format(*fmt),
                         mpath_tall)
    # NOTE(review): mpath_wide is unused within this view -- the wide-matrix
    # generation presumably follows outside the visible chunk.
# --- script fragment: generate dense benchmark matrices of varying size.
# NOTE(review): starts mid-script (`parser` and `start_make_logging` live
# outside this view) and the inner loop is truncated at the end.
parser.add_argument(
    '--msize', default='2 4 8 16', type=str,
    help='Approximate size of matrix (GB in memory) to generate')
args = parser.parse_args()

# Ensure the manifest file exists before any generation step appends to it.
if not os.path.exists('manifest.txt'):
    with open('manifest.txt', 'w') as fh:
        fh.write('')

# start logging
start_make_logging()

os.system('hdfs dfs -mkdir -p /scratch')
cxn = SQLCxn(timeout=None, username='******', db='ubuntu')

# Always include a tiny .001 GB matrix as a smoke-test size.
approx_gb = [.001]
if len(args.msize) > 0:
    approx_gb += [int(x) for x in args.msize.split()]
mtypes = ['tall', 'wide']
for mtype in mtypes:
    for gb in approx_gb:
        print mtype
        # name stub: '_test' for the smoke-test size, '_small' below 1GB,
        # otherwise the GB count itself
        if gb < .1:
            gb_stub = '_test'
        else:
            gb_stub = '_small' if gb < 1 else gb
        if mtype == 'square':
            # k = m = sqrt(bytes/8) so k*m doubles occupy ~gb GB
            k = int(np.ceil(np.sqrt((gb * 1e9) / float(8))))
            m = k
            # NOTE(review): chunk truncated here; the 'tall'/'wide'
            # branches continue outside this view.
def doMatrixOp(kwargs):
    """Time a single MADlib sparse-matrix primitive and log the run times.

    kwargs keys: opType (TRANS|NORM|GMM|MVM|TSM|ADD), mattype, tableStub,
    nodes, outdir, sr (sparsity ratio, recorded in the output row).
    Appends one row to ../output/<outdir>/madlib_<mattype>_<opType><nodes>.txt
    Returns early (without writing) when the timed call times out.
    """
    opType = kwargs.get('opType')
    mattype = kwargs.get('mattype')
    tableStub = kwargs.get('tableStub')
    nodes = kwargs.get('nodes')
    outdir = kwargs.get('outdir')
    sr = kwargs.get('sr')
    try:
        tableStub = int(tableStub)
    except ValueError:
        pass

    Mname = 'M{}'.format(tableStub)
    # Second operand defaults to the same table; GMM multiplies the wide
    # matrix by its tall counterpart so the inner dimensions conform.
    Nname = 'M{}'.format(tableStub)
    if opType == 'GMM':
        Nname = Mname.replace('wide', 'tall')
    print('Evaluating: {}'.format(opType))

    colnames = ['nodes', 'sr', 'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames
    # object dtype so non-numeric node labels / sparsity survive assignment
    runTimes['nodes'] = runTimes['nodes'].astype('O')
    runTimes['sr'] = runTimes['sr'].astype('O')

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)

    cleanup = []
    if (opType == 'TRANS'):
        call = "matrix_trans('{}',NULL,'Mt',NULL)".format(Mname)
        cleanup.append('Mt')
    elif (opType == 'NORM'):
        call = "matrix_norm('{}',NULL,'fro')".format(Mname)
    elif (opType == 'GMM'):
        call = "matrix_mult('{}',NULL,'{}',NULL,'MN',NULL)".format(Mname, Nname)
        cleanup.append('MN')
    elif (opType == 'MVM'):
        # Random dense vector built server-side; matrix_vec_mult returns an
        # array, so there is no result table to clean up. (The original
        # registered a nonexistent 'Mw' table here -- dead code, removed.)
        array_call = 'SELECT array_agg(random()) FROM generate_series(1,100)'
        call = "matrix_vec_mult('{}',NULL,({}))".format(Mname, array_call)
    elif (opType == 'TSM'):
        call = "matrix_mult('{0}','trans=True','{0}',NULL,'MtM',NULL)".format(Mname)
        cleanup.append('MtM')
    elif (opType == 'ADD'):
        call = "matrix_add('{}',NULL,'{}',NULL,'M_N',NULL)".format(Mname, Nname)
        cleanup.append('M_N')
    else:
        raise NotImplementedError('Invalid Operation')

    # Drop any leftovers from a previous run before timing.
    for obj in cleanup:
        cxn.execute('DROP TABLE IF EXISTS {}'.format(obj))

    sql_call = 'SELECT madlib.{}'.format(call)
    fmt = (outdir, mattype, opType, nodes)
    path = '../output/{}/madlib_{}_{}{}.txt'.format(*fmt)
    res = cxn.time(sql_call, cleanup)
    if (res is None):
        print('Timed Out')
        return
    runTimes.loc[:, 'nodes'] = nodes
    runTimes.loc[:, 'sr'] = sr
    runTimes.iloc[:, 2:] = res
    writeHeader = False if (os.path.exists(path)) else True
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
def main():
    """Materialize the adclick working tables used by the MADlib benchmarks.

    Each table is created only when it does not already exist:
      * adclick_clean_vectors_split  -- label y plus NUMERIC[] feature vector
      * adclick_clean_indepvars_long -- features in (row, col, val) format
      * adclick_clean_y              -- labels as a (row, 1, y) matrix
    """
    stub = sys.argv[1]
    cxn = SQLCxn(timeout=None, username='******', db='ubuntu')

    if not cxn.table_exists('adclick_clean_vectors_split'):
        ncols = cxn.get_shape('adclick_clean{}_dense'.format(stub))[1]
        ddl = """
            CREATE TABLE adclick_clean_vectors_split AS (
                SELECT row_num, val[1]::INTEGER y,
                       val[2:{}]::NUMERIC[] indep_vars
                  FROM adclick_clean{}_dense
            ) DISTRIBUTED BY (row_num)
        """.format(ncols, stub)
        cxn.execute(ddl)

    if not cxn.table_exists('adclick_clean_indepvars_long'):
        ddl = """
            CREATE TABLE adclick_clean_indepvars_long AS (
                SELECT row_num, ix AS col_num, indep_vars[ix] AS val
                  FROM (
                      SELECT *, GENERATE_SUBSCRIPTS(indep_vars, 1) AS ix
                        FROM adclick_clean_vectors_split
                  ) tmp
            ) DISTRIBUTED BY (row_num, col_num)
        """
        cxn.execute(ddl)

    if not cxn.table_exists('adclick_clean_y'):
        ddl = """
            CREATE TABLE adclick_clean_y AS (
                SELECT row_num, 1 AS col_num, y AS val
                  FROM adclick_clean_vectors_split
            ) DISTRIBUTED BY (row_num)
        """
        cxn.execute(ddl)
def main(kwargs):
    """Time a MADlib model (logit/reg/pca) on the dense adclick data.

    kwargs keys: opType, nodes, stub. Writes the five run times to
    ../output/madlib_<opType><nodes>_dense.txt
    """
    op_type = kwargs['opType']
    nodes = kwargs['nodes']
    stub = kwargs['stub']

    colnames = ['nodes', 'rows', 'cols',
                'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    cxn = SQLCxn(username='******', db='ubuntu')
    shape = cxn.get_shape_dense('adclick_clean{}_dense'.format(stub))
    # need to do a bit of preprocessing
    if not cxn.table_exists('adclick_clean_vectors_split'):
        stmt = """
            CREATE TABLE adclick_clean_vectors_split AS (
                SELECT row_num, val[1]::INTEGER y,
                       val[2:{}]::NUMERIC[] indep_vars
                  FROM adclick_clean{}_dense
            ) DISTRIBUTED BY (row_num)
        """.format(shape[1], stub)
        cxn.execute(stmt)

    if op_type == 'logit':
        cxn.execute('DROP TABLE IF EXISTS adclick_logit_summary')
        cxn.execute('DROP TABLE IF EXISTS adclick_logit')
        call = """
            SELECT madlib.logregr_train('adclick_clean_vectors_split',
                'adclick_logit', 'y', 'indep_vars', NULL, 3, 'igd', .000001)
        """
        cleanup = ['adclick_logit_summary', 'adclick_logit']
    elif op_type == 'reg':
        cxn.execute('DROP TABLE IF EXISTS adclick_reg_summary')
        cxn.execute('DROP TABLE IF EXISTS adclick_reg')
        call = """
            SELECT madlib.linregr_train('adclick_clean_vectors_split',
                'adclick_reg', 'y', 'indep_vars')
        """
        cleanup = ['adclick_reg_summary', 'adclick_reg']
    elif op_type == 'pca':
        cxn.execute('DROP TABLE IF EXISTS result_table')
        cxn.execute('DROP TABLE IF EXISTS result_table_mean')
        cxn.execute('DROP TABLE IF EXISTS residual_table')
        cxn.execute('DROP TABLE IF EXISTS result_summary_table')
        # BUG FIX: was 'adlick_prj' (typo), so a stale projection table
        # from a previous run was never dropped and pca_project could fail.
        cxn.execute('DROP TABLE IF EXISTS adclick_prj')
        stmt = """
            CREATE TABLE adclick_clean_depvars AS (
                SELECT row_num, val[2:{}]::NUMERIC[] val
                  FROM adclick_clean{}_dense
            ) DISTRIBUTED BY (row_num)
        """.format(shape[1], stub)
        if not cxn.table_exists('adclick_clean_depvars'):
            cxn.execute(stmt)
        call = """
            SELECT madlib.pca_train('adclick_clean_depvars',
                                    'result_table',
                                    'row_num', 5);
            SELECT madlib.pca_project('adclick_clean_depvars',
                                      'result_table',
                                      'adclick_prj',
                                      'row_num',
                                      'residual_table',
                                      'result_summary_table')
        """
        cleanup = ['result_table', 'result_table_mean', 'residual_table',
                   'result_summary_table', 'adclick_prj']
    else:
        # BUG FIX: an unknown opType previously fell through to a
        # NameError on `call`; fail loudly instead.
        raise NotImplementedError('Invalid Operation: {}'.format(op_type))

    runTimes.loc[:, ['rows', 'cols']] = shape
    path = '../output/madlib_{}{}_dense.txt'.format(op_type, int(nodes))
    runTimes.loc[:, 'nodes'] = nodes
    res = cxn.time(call, cleanup)
    runTimes.iloc[:, 3:] = res
    runTimes.to_csv(path, index=False)
def main(argv):
    """Coalesce the raw benchmark files and load them through MADlib."""
    connection = SQLCxn(timeout=None, username='******', db='ubuntu')
    coalesce_files(argv.stub, argv.sparse)
    process_madlib(argv.stub, argv.sparse, connection)
def main(kwargs):
    """Time one MADlib algorithm (logit/gnmf/reg/robust) on the adclick data.

    kwargs keys: opType, savestub, nodes, xTableName, yTableName.
    Appends one row of five run times to
    ../output/madlib_adclick_<opType><nodes>.txt
    """
    opType = kwargs.get('opType')
    savestub = kwargs.get('savestub')
    nodes = kwargs.get('nodes')
    x_table_name = kwargs.get('xTableName')
    y_table_name = kwargs.get('yTableName')

    savestub = '' if (savestub is None) else savestub
    print('Evaluating: {}'.format(opType))

    cxn = SQLCxn(username='******', db='ubuntu', timeout=2000)
    colnames = ['nodes', 'rows', 'cols',
                'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    shape = cxn.get_shape(x_table_name)
    # BUG FIX: do_gnmf and do_robust were missing from the eval
    # environment even though the 'gnmf'/'robust' branches build call
    # strings referencing them, so utils.timeOp raised NameError for
    # those ops (the sibling drivers include both).
    env = {'x_table_name': x_table_name,
           'y_table_name': y_table_name,
           'do_logit': do_logit,
           'do_reg': do_reg,
           'do_gnmf': do_gnmf,
           'do_robust': do_robust,
           'shape': shape,
           'cxn': cxn}
    cleanup = None
    if opType == 'logit':
        call = 'do_logit(x_table_name, y_table_name, shape, cxn)'
    elif opType == 'gnmf':
        call = 'do_gnmf(x_table_name, shape, 10, cxn)'
    elif opType == 'reg':
        call = 'do_reg(x_table_name, y_table_name, cxn)'
        cleanup = ("map(lambda x: cxn.execute("
                   "'DROP TABLE {}'.format(x)), ['XTX','XTY','XTX_INV','B'])")
    elif opType == 'robust':
        # Fit OLS first, then materialize squared residuals R2 for the
        # robust standard-error computation.
        do_reg(x_table_name, y_table_name, cxn)
        preproc = """
            DROP TABLE IF EXISTS Y_HAT;
            SELECT madlib.matrix_mult('{X}',NULL,'B',NULL,'Y_HAT');
            CREATE TABLE R2 AS (
                SELECT {y}.row_num, ARRAY[POW({y}.val[1]-y_hat.val[1],2)] val
                  FROM {y}
                 INNER JOIN y_hat ON {y}.row_num = y_hat.row_num
            ) DISTRIBUTED BY (row_num)
        """.format(X=x_table_name, y=y_table_name)
        cxn.execute(preproc)
        call = 'do_robust(x_table_name, cxn)'
    elif opType == 'pca':
        print('Not Implemented')
        return
    else:
        # BUG FIX: an unknown opType previously fell through to a
        # NameError on `call`; fail loudly instead.
        raise NotImplementedError('Invalid Operation: {}'.format(opType))

    rows = shape[0]
    cols = shape[1]

    path = '../output/madlib_adclick_{}{}.txt'.format(opType, int(nodes))

    runTimes.loc[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    res = utils.timeOp(call, env, cleanup)
    print(res)
    runTimes.iloc[:, 3:] = res
    writeHeader = False if (os.path.exists(path)) else True
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
def main(kwargs):
    """Time one MADlib algorithm (logit/gnmf/reg/robust) on tall synthetic data.

    kwargs keys: opType, savestub, nodes, xTableName, yTableName.
    Appends one row of five run times to
    ../output/madlib_tall_<opType><nodes>.txt
    """
    opType = kwargs.get('opType')
    savestub = kwargs.get('savestub')
    nodes = kwargs.get('nodes')
    x_table_name = kwargs.get('xTableName')
    y_table_name = kwargs.get('yTableName')

    savestub = '' if (savestub is None) else savestub
    print('Evaluating: {}'.format(opType))

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)
    colnames = ['nodes', 'rows', 'cols',
                'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    shape = cxn.get_shape(x_table_name)
    env = {'x_table_name': x_table_name,
           'y_table_name': y_table_name,
           'do_logit': do_logit,
           'do_gnmf': do_gnmf,
           'do_reg': do_reg,
           'do_robust': do_robust,
           'shape': shape,
           'cxn': cxn}
    cleanup = None
    if opType == 'logit':
        call = 'do_logit(x_table_name, y_table_name, shape, cxn)'
    elif opType == 'gnmf':
        call = 'do_gnmf(x_table_name, shape, 10, cxn)'
    elif opType == 'reg':
        call = 'do_reg(x_table_name, y_table_name, cxn)'
        cleanup = ("map(lambda x: cxn.execute("
                   "'DROP TABLE {}'.format(x)), ['XTX','XTY','XTX_INV','B'])")
    elif opType == 'robust':
        # Only the timing matters here, so generate a random residual
        # vector on the fly instead of fitting a regression first.
        cxn.execute('DROP TABLE IF EXISTS R2')
        cxn.randomMatrix(shape[0], 1, 'R2')
        call = 'do_robust(x_table_name, cxn)'
    else:
        # BUG FIX: an unknown opType previously fell through to a
        # NameError on `call`; fail loudly instead.
        raise NotImplementedError('Invalid Operation: {}'.format(opType))

    rows = shape[0]
    cols = shape[1]

    path = '../output/madlib_tall_{}{}.txt'.format(opType, int(nodes))

    runTimes.loc[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    res = utils.timeOp(call, env, cleanup)
    # BUG FIX: was a bare `res` expression statement (a no-op); the
    # sibling drivers print the timing result.
    print(res)
    runTimes.iloc[:, 3:] = res
    writeHeader = False if (os.path.exists(path)) else True
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
def doMatrixOp(kwargs):
    """Sweep matrix sizes and time one MADlib dense primitive at each size.

    kwargs keys: opType (TRANS|NORM|GMM|MVM|TSM|ADD), mattype,
    fixedAxis (the dimension held constant), nrows (space-delimited sweep
    values), nproc (optional; triggers CPU-scaling mode against a locally
    started cluster). Appends one timing row per size to the output CSV.
    """
    opType = kwargs.get('opType')
    mattype = kwargs.get('mattype')
    fixedAxis = int(kwargs.get('fixedAxis'))
    nrow_scale = [int(token) for token in kwargs['nrows'].split(' ')]
    nproc = kwargs.get('nproc')
    port = GPDB_PORT_MAP[nproc] if nproc is not None else None

    # CPU-scaling runs boot their own cluster and tear it down at exit.
    if nproc is not None:
        cxn = start_gpdb(port, nproc)
        cxn.execute('DROP TABLE IF EXISTS M16_tall')
        atexit.register(stop_gpdb, nproc, cxn)
    else:
        cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)

    colnames = ['rows', 'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    if nproc is None:
        path = os.path.join('..', 'output',
                            'madlib_{}_{}.txt'.format(mattype, opType))
    else:
        path = os.path.join('..', 'output',
                            'madlib_cpu_{}_scale.txt'.format(opType))

    for nr in nrow_scale:
        # GMM sweeps the column count; every other op sweeps the rows.
        nrow = fixedAxis if opType == 'GMM' else nr
        ncol = nr if opType == 'GMM' else fixedAxis
        print(nrow)
        print(ncol)

        Mname = 'M{}{}'.format(nrow, ncol)
        if not cxn.table_exists('M{}{}'.format(nrow, ncol)):
            cxn.randomMatrix(nrow, ncol, 'M{}{}'.format(nrow, ncol))
        if (opType == 'GMM'):
            # Second operand is transposed in shape so the product conforms.
            if not cxn.table_exists('N{}{}'.format(ncol, nrow)):
                cxn.randomMatrix(ncol, nrow, 'N{}{}'.format(ncol, nrow))
            Nname = 'N{}{}'.format(ncol, nrow)
        elif (opType == 'ADD'):
            if not cxn.table_exists('N{}{}'.format(nrow, ncol)):
                cxn.randomMatrix(nrow, ncol, 'N{}{}'.format(nrow, ncol))
            Nname = 'N{}{}'.format(nrow, ncol)

        cleanup = []
        if (opType == 'TRANS'):
            call = "matrix_trans('{}',NULL,'Mt',NULL)".format(Mname)
            cleanup.append('Mt')
        elif (opType == 'NORM'):
            call = "matrix_norm('{}',NULL,'fro')".format(Mname)
        elif (opType == 'GMM'):
            call = "matrix_mult('{}',NULL,'{}',NULL,'MN',NULL)".format(
                Mname, Nname)
            cleanup.append('MN')
        elif (opType == 'MVM'):
            # Random dense vector generated server-side for the product.
            array_call = ('SELECT array_agg(random()) '
                          'FROM generate_series(1,{})'.format(ncol))
            call = "matrix_vec_mult('{}',NULL,({}))".format(Mname, array_call)
        elif (opType == 'TSM'):
            call = "matrix_mult('{0}','trans=True','{0}',NULL,'MtM',NULL)".format(
                Mname)
            cleanup.append('MtM')
        elif (opType == 'ADD'):
            call = "matrix_add('{}',NULL,'{}',NULL,'M_N',NULL)".format(
                Mname, Nname)
            cleanup.append('M_N')
        else:
            raise NotImplementedError('Invalid Operation')

        sql_call = 'SELECT madlib.{}'.format(call)
        # In CPU-scaling mode the 'rows' column records the process count.
        runTimes.loc[:, 'rows'] = nr if nproc is None else nproc
        runTimes.iloc[:, 1:] = cxn.time(sql_call, cleanup)
        writeHeader = not os.path.exists(path)
        runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
def main(kwargs):
    """Time a MADlib model (logit/reg/pca) on the sparse adclick data.

    kwargs keys: opType, nodes, stub. Writes the five run times to
    ../output/madlib_<opType><nodes>_sparse.txt
    """
    op_type = kwargs['opType']
    nodes = kwargs['nodes']
    stub = kwargs['stub']

    colnames = ['nodes', 'rows', 'cols',
                'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)
    shape = cxn.get_shape('adclick_clean_1_sparse')
    # Build the svec feature table once: aggregate the long-format sparse
    # features into one sparse vector per row, joined with the label.
    if not cxn.table_exists('adclick_clean_1_vectors_sparse'):
        stmt = """
            CREATE TABLE adclick_clean_1_vectors_sparse AS (
                SELECT x.row_num,
                       madlib.svec_cast_positions_float8arr(
                           ARRAY_AGG(x.col_num), ARRAY_AGG(x.val), {}, 0.0
                       ) AS indep_vars,
                       y.val AS y
                  FROM adclick_clean_1_sparse x
                 INNER JOIN adclick_clean_y y ON x.row_num = y.row_num
                 GROUP BY x.row_num, y.val
            ) DISTRIBUTED BY (row_num)
        """.format(shape[1])
        cxn.execute(stmt)

    if op_type == 'logit':
        cxn.execute('DROP TABLE IF EXISTS adclick_logit_summary')
        cxn.execute('DROP TABLE IF EXISTS adclick_logit')
        call = """
            SELECT madlib.logregr_train('adclick_clean_1_vectors_sparse',
                'adclick_logit', 'y', 'indep_vars', NULL, 3, 'igd', .000001)
        """
        cleanup = ['adclick_logit_summary', 'adclick_logit']
    elif op_type == 'reg':
        cxn.execute('DROP TABLE IF EXISTS adclick_reg_summary')
        cxn.execute('DROP TABLE IF EXISTS adclick_reg')
        call = """
            SELECT madlib.linregr_train('adclick_clean_1_vectors_sparse',
                'adclick_reg', 'y', 'indep_vars')
        """
        cleanup = ['adclick_reg_summary', 'adclick_reg']
    elif op_type == 'pca':
        cxn.execute('DROP TABLE IF EXISTS result_table')
        cxn.execute('DROP TABLE IF EXISTS result_table_mean')
        cxn.execute('DROP TABLE IF EXISTS residual_table')
        cxn.execute('DROP TABLE IF EXISTS result_summary_table')
        # BUG FIX: was 'adlick_prj' (typo), so a stale projection table
        # from a previous run was never dropped and pca_sparse_project
        # could fail.
        cxn.execute('DROP TABLE IF EXISTS adclick_prj')
        call = """
            SELECT madlib.pca_sparse_train('adclick_clean_1_sparse',
                                           'result_table',
                                           'row_num',
                                           'col_num',
                                           'val', '{0}', '{1}', 5);
            SELECT madlib.pca_sparse_project('adclick_clean_1_sparse',
                                             'result_table',
                                             'adclick_prj',
                                             'row_num',
                                             'col_num',
                                             'val', '{0}', '{1}',
                                             'residual_table',
                                             'result_summary_table')
        """.format(*shape)
        cleanup = ['result_table', 'result_table_mean', 'residual_table',
                   'result_summary_table', 'adclick_prj']
    else:
        # BUG FIX: an unknown opType previously fell through to a
        # NameError on `call`; fail loudly instead.
        raise NotImplementedError('Invalid Operation: {}'.format(op_type))

    runTimes.loc[:, ['rows', 'cols']] = shape
    path = '../output/madlib_{}{}_sparse.txt'.format(op_type, int(nodes))
    runTimes.loc[:, 'nodes'] = nodes
    res = cxn.time(call, cleanup)
    runTimes.iloc[:, 3:] = res
    runTimes.to_csv(path, index=False)