def main():
    """Build the derived ad-click tables, creating only those that are missing.

    The table-name suffix is taken from the first command-line argument and
    selects the source table ``adclick_clean{suffix}_dense``.  Three tables
    are derived from it:

    * ``adclick_clean_vectors_split`` -- per row, ``y`` (first element of
      ``val``) and ``indep_vars`` (the remaining elements);
    * ``adclick_clean_indepvars_long`` -- one ``(row_num, col_num, val)`` row
      per element of ``indep_vars``;
    * ``adclick_clean_y`` -- ``y`` exposed as ``val`` with a constant
      ``col_num`` of 1.
    """
    suffix = sys.argv[1]
    cxn = SQLCxn(timeout=None, username='******', db='ubuntu')

    if not cxn.table_exists('adclick_clean_vectors_split'):
        # The source shape is only looked up when the split table has to be
        # built; its second entry bounds the array slice for indep_vars.
        dims = cxn.get_shape('adclick_clean{}_dense'.format(suffix))
        split_sql = """
            CREATE TABLE adclick_clean_vectors_split AS (
                SELECT row_num, val[1]::INTEGER y, val[2:{}]::NUMERIC[] indep_vars
                  FROM adclick_clean{}_dense
            ) DISTRIBUTED BY (row_num)
        """.format(dims[1], suffix)
        cxn.execute(split_sql)

    # Both remaining tables are derived from adclick_clean_vectors_split and
    # need no formatting, so they share one "create if absent" loop.
    derived_tables = (
        ('adclick_clean_indepvars_long', """
            CREATE TABLE adclick_clean_indepvars_long AS (
                SELECT row_num, ix AS col_num, indep_vars[ix] AS val
                  FROM (
                    SELECT *, GENERATE_SUBSCRIPTS(indep_vars, 1) AS ix
                      FROM adclick_clean_vectors_split
                ) tmp
            ) DISTRIBUTED BY (row_num, col_num)
        """),
        ('adclick_clean_y', """
            CREATE TABLE adclick_clean_y AS (
                SELECT row_num, 1 AS col_num, y AS val
                  FROM adclick_clean_vectors_split
            ) DISTRIBUTED BY (row_num)
        """),
    )
    for table_name, create_sql in derived_tables:
        if not cxn.table_exists(table_name):
            cxn.execute(create_sql)
mpath_tall = os.path.abspath( '../output/M{}{}_sparse_tall.mtx'.format(*fmt)) mpath_wide = os.path.abspath( '../output/M{}{}_sparse_wide.mtx'.format(*fmt)) data.gen_data_sparse(k, 100, sr, 'M{}{}_sparse_tall'.format(*fmt), mpath_tall) data.gen_data_sparse(100, k, sr, 'M{}{}_sparse_wide'.format(*fmt), mpath_wide) data.gen_data_disk('../output/y{}_sparse.csv'.format(sparse_gb), k, 1, k, True) stmt = """ CREATE VIEW N{0}{1}_sparse_tall AS ( SELECT * FROM M{0}{1}_sparse_tall ) """.format(*fmt) if not cxn.table_exists('N{}{}_sparse_tall'.format(*fmt)): cxn.execute(stmt) utils.link_if_not('../output/M{}{}_sparse_tall.mtx'.format(*fmt), '../output/N{}{}_sparse_tall.mtx'.format(*fmt)) cxn.load_dense_matrix('../output/y{}_sparse.csv'.format(sparse_gb), 'y{}_sparse'.format(sparse_gb)) paths = os.listdir('../output') paths = filter( lambda x: (x != '.gitignore') and ('.log' not in x) and ('.mtd' not in x), paths) paths = map(lambda x: os.path.join('../output', x), paths) with open('manifest.txt') as fh:
m = k rows = 2**12 elif mtype == 'tall': k = int(np.ceil((gb * 1e9) / float(8 * 100))) m = 100 rows = 2**14 elif mtype == 'wide': k = 100 m = int(np.ceil((gb * 1e9) / float(8 * 100))) rows = 1 stub = '_' + mtype fmt = (gb_stub, stub) data.gen_data_disk('../output/M{}{}.csv'.format(*fmt), k, m, rows) if ((mtype == 'wide') and (not cxn.table_exists('M{}{}'.format(*fmt)))): print 'CREATING MATRIX: M{}{}'.format(*fmt) cxn.randomMatrix(k, m, 'M{}{}'.format(*fmt)) if mtype != 'tall': continue mpath = os.path.abspath('../output/M{}{}_sparse.mtx'.format(*fmt)) data.gen_data_disk('../output/y{}{}.csv'.format(*fmt), k, 1, rows, True) utils.link_if_not('../output/M{}{}.csv'.format(*fmt), '../output/N{}{}.csv'.format(*fmt)) utils.link_if_not('../output/M{}{}.csv.mtd'.format(*fmt), '../output/N{}{}.csv.mtd'.format(*fmt)) paths = os.listdir('../output') paths = filter( lambda x: (x != '.gitignore') and ('.log' not in x) and ('.mtd' not in x),
def main(kwargs):
    """Time one MADlib routine on the dense ad-click table and save the result.

    Args:
        kwargs: mapping with the keys
            ``opType`` -- ``'logit'``, ``'reg'`` or ``'pca'``;
            ``nodes`` -- stored in the ``nodes`` column and, via ``int()``,
            embedded in the output file name;
            ``stub`` -- suffix of the source table
            ``adclick_clean{stub}_dense``.

    Raises:
        NotImplementedError: if ``opType`` is not one of the three values
            above (previously this surfaced later as a ``NameError``).

    Side effects:
        Creates ``adclick_clean_vectors_split`` (and, for PCA,
        ``adclick_clean_depvars``) when missing, drops stale output tables of
        the chosen routine, and overwrites
        ``../output/madlib_{opType}{nodes}_dense.txt``.
    """
    op_type = kwargs['opType']
    nodes = kwargs['nodes']
    stub = kwargs['stub']

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))), columns=colnames)

    cxn = SQLCxn(username='******', db='ubuntu')
    shape = cxn.get_shape_dense('adclick_clean{}_dense'.format(stub))

    # Preprocessing: split the dense array into a label and a feature vector.
    if not cxn.table_exists('adclick_clean_vectors_split'):
        stmt = """
            CREATE TABLE adclick_clean_vectors_split AS (
                SELECT row_num, val[1]::INTEGER y, val[2:{}]::NUMERIC[] indep_vars
                  FROM adclick_clean{}_dense
            ) DISTRIBUTED BY (row_num)
        """.format(shape[1], stub)
        cxn.execute(stmt)

    if op_type == 'logit':
        call = """
            SELECT madlib.logregr_train('adclick_clean_vectors_split',
                                        'adclick_logit',
                                        'y',
                                        'indep_vars',
                                        NULL,
                                        3,
                                        'igd',
                                        .000001)
        """
        cleanup = ['adclick_logit_summary', 'adclick_logit']
    elif op_type == 'reg':
        call = """
            SELECT madlib.linregr_train('adclick_clean_vectors_split',
                                        'adclick_reg',
                                        'y',
                                        'indep_vars')
        """
        cleanup = ['adclick_reg_summary', 'adclick_reg']
    elif op_type == 'pca':
        if not cxn.table_exists('adclick_clean_depvars'):
            stmt = """
                CREATE TABLE adclick_clean_depvars AS (
                    SELECT row_num, val[2:{}]::NUMERIC[] val
                      FROM adclick_clean{}_dense
                ) DISTRIBUTED BY (row_num)
            """.format(shape[1], stub)
            cxn.execute(stmt)
        call = """
            SELECT madlib.pca_train('adclick_clean_depvars',
                                    'result_table',
                                    'row_num',
                                    5);
            SELECT madlib.pca_project('adclick_clean_depvars',
                                      'result_table',
                                      'adclick_prj',
                                      'row_num',
                                      'residual_table',
                                      'result_summary_table')
        """
        cleanup = [
            'result_table', 'result_table_mean', 'residual_table',
            'result_summary_table', 'adclick_prj'
        ]
    else:
        raise NotImplementedError('Invalid Operation')

    # Drop leftovers from earlier runs.  Driving this from ``cleanup`` keeps
    # the dropped names and the tables the call writes in sync; the old
    # hand-written list dropped a misspelled 'adlick_prj' and left a stale
    # 'adclick_prj' in place.
    for table in cleanup:
        cxn.execute('DROP TABLE IF EXISTS {}'.format(table))

    # .loc/.iloc replace DataFrame.ix, which pandas removed in 1.0.
    runTimes.loc[:, ['rows', 'cols']] = shape
    runTimes.loc[:, 'nodes'] = nodes

    path = '../output/madlib_{}{}_dense.txt'.format(op_type, int(nodes))
    # presumably cxn.time returns one timing per time1..time5 column --
    # TODO confirm against SQLCxn.time
    res = cxn.time(call, cleanup)
    runTimes.iloc[:, 3:] = res
    runTimes.to_csv(path, index=False)
def doMatrixOp(kwargs):
    """Time one MADlib matrix operation over a range of matrix sizes.

    Args:
        kwargs: mapping with the keys
            ``opType`` -- ``'TRANS'``, ``'NORM'``, ``'GMM'``, ``'MVM'``,
            ``'TSM'`` or ``'ADD'``;
            ``mattype`` -- used only in the output file name when ``nproc``
            is absent;
            ``fixedAxis`` -- size of the axis that is not scaled (converted
            with ``int()``);
            ``nrows`` -- space-separated integers, the sizes of the scaled
            axis;
            ``nproc`` -- optional; when given, a GPDB instance is started on
            ``GPDB_PORT_MAP[nproc]`` and stopped at interpreter exit.

    Raises:
        NotImplementedError: if ``opType`` is not one of the values above.

    Side effects:
        Creates the random input matrices when missing and appends one CSV
        row per size to a file under ``../output``.
    """
    opType = kwargs.get('opType')
    mattype = kwargs.get('mattype')
    fixedAxis = int(kwargs.get('fixedAxis'))
    nrow_scale = [int(x) for x in kwargs['nrows'].split(' ')]
    nproc = kwargs.get('nproc')

    if nproc is not None:
        port = GPDB_PORT_MAP[nproc]
        cxn = start_gpdb(port, nproc)
        cxn.execute('DROP TABLE IF EXISTS M16_tall')
        atexit.register(stop_gpdb, nproc, cxn)
        path = os.path.join('..', 'output',
                            'madlib_cpu_{}_scale.txt'.format(opType))
    else:
        cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)
        path = os.path.join('..', 'output',
                            'madlib_{}_{}.txt'.format(mattype, opType))

    colnames = ['rows', 'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))), columns=colnames)

    for nr in nrow_scale:
        # For GMM the scaled dimension is the column count of M; every other
        # operation scales the row count.
        nrow = fixedAxis if opType == 'GMM' else nr
        ncol = nr if opType == 'GMM' else fixedAxis
        # Parenthesised single-argument form prints the same on Python 2 and
        # also parses on Python 3.
        print(nrow)
        print(ncol)

        Mname = 'M{}{}'.format(nrow, ncol)
        if not cxn.table_exists(Mname):
            cxn.randomMatrix(nrow, ncol, Mname)

        # Only GMM and ADD need a second operand.
        if opType == 'GMM':
            Nname = 'N{}{}'.format(ncol, nrow)
            if not cxn.table_exists(Nname):
                cxn.randomMatrix(ncol, nrow, Nname)
        elif opType == 'ADD':
            Nname = 'N{}{}'.format(nrow, ncol)
            if not cxn.table_exists(Nname):
                cxn.randomMatrix(nrow, ncol, Nname)

        # Output tables written by the call; handed to cxn.time with it.
        cleanup = []
        if opType == 'TRANS':
            call = "matrix_trans('{}',NULL,'Mt',NULL)".format(Mname)
            cleanup.append('Mt')
        elif opType == 'NORM':
            call = "matrix_norm('{}',NULL,'fro')".format(Mname)
        elif opType == 'GMM':
            call = "matrix_mult('{}',NULL,'{}',NULL,'MN',NULL)".format(
                Mname, Nname)
            cleanup.append('MN')
        elif opType == 'MVM':
            array_call = (
                'SELECT array_agg(random()) FROM generate_series(1,{})'
            ).format(ncol)
            call = "matrix_vec_mult('{}',NULL,({}))".format(Mname, array_call)
        elif opType == 'TSM':
            call = (
                "matrix_mult('{0}','trans=True','{0}',NULL,'MtM',NULL)"
            ).format(Mname)
            cleanup.append('MtM')
        elif opType == 'ADD':
            call = "matrix_add('{}',NULL,'{}',NULL,'M_N',NULL)".format(
                Mname, Nname)
            cleanup.append('M_N')
        else:
            raise NotImplementedError('Invalid Operation')

        sql_call = 'SELECT madlib.{}'.format(call)
        # .loc/.iloc replace DataFrame.ix, which pandas removed in 1.0.
        # The 'rows' column holds nproc instead of the size when nproc is set.
        runTimes.loc[:, 'rows'] = nr if nproc is None else nproc
        runTimes.iloc[:, 1:] = cxn.time(sql_call, cleanup)

        # Append one row per size; emit the header only when creating the file.
        writeHeader = not os.path.exists(path)
        runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
def main(kwargs):
    """Time one MADlib routine on the sparse ad-click table and save the result.

    Args:
        kwargs: mapping with the keys
            ``opType`` -- ``'logit'``, ``'reg'`` or ``'pca'``;
            ``nodes`` -- stored in the ``nodes`` column and, via ``int()``,
            embedded in the output file name;
            ``stub`` -- required but unused here: the table names below are
            hard-coded to the ``_1`` variant.

    Raises:
        NotImplementedError: if ``opType`` is not one of the three values
            above (previously this surfaced later as a ``NameError``).

    Side effects:
        Creates ``adclick_clean_1_vectors_sparse`` when missing, drops stale
        output tables of the chosen routine, and overwrites
        ``../output/madlib_{opType}{nodes}_sparse.txt``.
    """
    op_type = kwargs['opType']
    nodes = kwargs['nodes']
    # Kept so that a missing 'stub' key still fails exactly as before.
    stub = kwargs['stub']

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))), columns=colnames)

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)
    shape = cxn.get_shape('adclick_clean_1_sparse')

    # Pack each row's (col_num, val) pairs into one sparse vector of
    # dimension shape[1] and join in the label from adclick_clean_y.
    if not cxn.table_exists('adclick_clean_1_vectors_sparse'):
        stmt = """
            CREATE TABLE adclick_clean_1_vectors_sparse AS (
                SELECT x.row_num,
                       madlib.svec_cast_positions_float8arr(
                           ARRAY_AGG(x.col_num), ARRAY_AGG(x.val), {}, 0.0
                       ) AS indep_vars,
                       y.val AS y
                  FROM adclick_clean_1_sparse x
                 INNER JOIN adclick_clean_y y ON x.row_num = y.row_num
                 GROUP BY x.row_num, y.val
            ) DISTRIBUTED BY (row_num)
        """.format(shape[1])
        cxn.execute(stmt)

    if op_type == 'logit':
        call = """
            SELECT madlib.logregr_train('adclick_clean_1_vectors_sparse',
                                        'adclick_logit',
                                        'y',
                                        'indep_vars',
                                        NULL,
                                        3,
                                        'igd',
                                        .000001)
        """
        cleanup = ['adclick_logit_summary', 'adclick_logit']
    elif op_type == 'reg':
        call = """
            SELECT madlib.linregr_train('adclick_clean_1_vectors_sparse',
                                        'adclick_reg',
                                        'y',
                                        'indep_vars')
        """
        cleanup = ['adclick_reg_summary', 'adclick_reg']
    elif op_type == 'pca':
        call = """
            SELECT madlib.pca_sparse_train('adclick_clean_1_sparse',
                                           'result_table',
                                           'row_num',
                                           'col_num',
                                           'val',
                                           '{0}',
                                           '{1}',
                                           5);
            SELECT madlib.pca_sparse_project('adclick_clean_1_sparse',
                                             'result_table',
                                             'adclick_prj',
                                             'row_num',
                                             'col_num',
                                             'val',
                                             '{0}',
                                             '{1}',
                                             'residual_table',
                                             'result_summary_table')
        """.format(*shape)
        cleanup = [
            'result_table', 'result_table_mean', 'residual_table',
            'result_summary_table', 'adclick_prj'
        ]
    else:
        raise NotImplementedError('Invalid Operation')

    # Drop leftovers from earlier runs.  Driving this from ``cleanup`` keeps
    # the dropped names and the tables the call writes in sync; the old
    # hand-written list dropped a misspelled 'adlick_prj' and left a stale
    # 'adclick_prj' in place.
    for table in cleanup:
        cxn.execute('DROP TABLE IF EXISTS {}'.format(table))

    # .loc/.iloc replace DataFrame.ix, which pandas removed in 1.0.
    runTimes.loc[:, ['rows', 'cols']] = shape
    runTimes.loc[:, 'nodes'] = nodes

    path = '../output/madlib_{}{}_sparse.txt'.format(op_type, int(nodes))
    # presumably cxn.time returns one timing per time1..time5 column --
    # TODO confirm against SQLCxn.time
    res = cxn.time(call, cleanup)
    runTimes.iloc[:, 3:] = res
    runTimes.to_csv(path, index=False)