def main(kwargs):
    """Time one MADlib workload on a tall matrix and append the timings to a CSV.

    Args:
        kwargs: dict of run options. Keys read here:
            'opType'     -- workload to run: 'logit', 'gnmf', 'reg' or
                            'robust'.
            'nodes'      -- node count; stored in the 'nodes' column and,
                            via ``int()``, used in the output file name.
            'xTableName' -- name of the input matrix table.
            'yTableName' -- name of the response table (used by 'logit'
                            and 'reg').
            'savestub' may be supplied but is not used by this function.

    Raises:
        NotImplementedError: if 'opType' is not one of the supported
            workloads. Raised before any database connection is opened.

    Side effects:
        Appends one row to ``../output/madlib_tall_<opType><nodes>.txt``,
        writing the header only when the file does not exist yet. For
        'robust', the table ``R2`` is dropped and regenerated first.
    """
    opType = kwargs.get('opType')
    nodes = kwargs.get('nodes')
    x_table_name = kwargs.get('xTableName')
    y_table_name = kwargs.get('yTableName')

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print('Evaluating: {}'.format(opType))

    # Fail fast: previously an unknown opType left `call` unbound and only
    # surfaced as a NameError after the connection had been opened.
    if opType not in ('logit', 'gnmf', 'reg', 'robust'):
        raise NotImplementedError('Invalid Operation')

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)

    colnames = ['nodes', 'rows', 'cols',
                'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    shape = cxn.get_shape(x_table_name)

    # Names made available to the `call` / `cleanup` strings handed to
    # utils.timeOp (presumably evaluated there -- confirm against utils).
    env = {
        'x_table_name': x_table_name,
        'y_table_name': y_table_name,
        'do_logit': do_logit,
        'do_gnmf': do_gnmf,
        'do_reg': do_reg,
        'do_robust': do_robust,
        'shape': shape,
        'cxn': cxn,
    }

    cleanup = None
    if opType == 'logit':
        call = 'do_logit(x_table_name, y_table_name, shape, cxn)'
    elif opType == 'gnmf':
        call = 'do_gnmf(x_table_name, shape, 10, cxn)'
    elif opType == 'reg':
        call = 'do_reg(x_table_name, y_table_name, cxn)'
        # Drop the intermediate tables between timed runs.
        cleanup = ("map(lambda x: cxn.execute("
                   "'DROP TABLE {}'.format(x)), ['XTX','XTY','XTX_INV','B'])")
    else:  # 'robust' -- guaranteed by the validation above
        # We can just generate a vector of residuals on the fly
        # rather than computing them explicitly.
        cxn.execute('DROP TABLE IF EXISTS R2')
        cxn.randomMatrix(shape[0], 1, 'R2')
        call = 'do_robust(x_table_name, cxn)'

    rows = shape[0]
    cols = shape[1]
    path = '../output/madlib_tall_{}{}.txt'.format(opType, int(nodes))

    # .loc / .iloc replace DataFrame.ix, which was removed in pandas 1.0.
    runTimes.loc[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    res = utils.timeOp(call, env, cleanup)
    runTimes.iloc[:, 3:] = res

    writeHeader = not os.path.exists(path)
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
elif mtype == 'tall': k = int(np.ceil((gb * 1e9) / float(8 * 100))) m = 100 rows = 2**14 elif mtype == 'wide': k = 100 m = int(np.ceil((gb * 1e9) / float(8 * 100))) rows = 1 stub = '_' + mtype fmt = (gb_stub, stub) data.gen_data_disk('../output/M{}{}.csv'.format(*fmt), k, m, rows) if ((mtype == 'wide') and (not cxn.table_exists('M{}{}'.format(*fmt)))): print 'CREATING MATRIX: M{}{}'.format(*fmt) cxn.randomMatrix(k, m, 'M{}{}'.format(*fmt)) if mtype != 'tall': continue mpath = os.path.abspath('../output/M{}{}_sparse.mtx'.format(*fmt)) data.gen_data_disk('../output/y{}{}.csv'.format(*fmt), k, 1, rows, True) utils.link_if_not('../output/M{}{}.csv'.format(*fmt), '../output/N{}{}.csv'.format(*fmt)) utils.link_if_not('../output/M{}{}.csv.mtd'.format(*fmt), '../output/N{}{}.csv.mtd'.format(*fmt)) paths = os.listdir('../output') paths = filter( lambda x: (x != '.gitignore') and ('.log' not in x) and ('.mtd' not in x), paths) paths = map(lambda x: os.path.join('../output', x), paths)
def doMatrixOp(kwargs):
    """Time a MADlib matrix primitive over several matrix sizes.

    For every size in ``kwargs['nrows']`` the required random matrices are
    created if missing, the operation is timed through ``cxn.time`` and one
    row of timings is appended to a CSV file.

    Args:
        kwargs: dict of run options. Keys read here:
            'opType'    -- 'TRANS', 'NORM', 'GMM', 'MVM', 'TSM' or 'ADD'.
            'mattype'   -- label used in the output file name when
                           'nproc' is absent.
            'fixedAxis' -- size of the axis that is held fixed
                           (converted with ``int()``).
            'nrows'     -- space-separated list of sizes for the varying
                           axis.
            'nproc'     -- optional; when given, a database is started via
                           ``start_gpdb`` on ``GPDB_PORT_MAP[nproc]`` and
                           ``stop_gpdb`` is registered with ``atexit``.

    Raises:
        NotImplementedError: if 'opType' is not a supported operation.
            Raised before any database is started or table is created.

    Side effects:
        Appends to ``../output/madlib_<mattype>_<opType>.txt`` or, when
        'nproc' is set, to ``../output/madlib_cpu_<opType>_scale.txt``.
    """
    opType = kwargs.get('opType')
    # Fail fast: previously an invalid opType was only rejected inside the
    # loop, after the database had been started and matrices generated.
    if opType not in ('TRANS', 'NORM', 'GMM', 'MVM', 'TSM', 'ADD'):
        raise NotImplementedError('Invalid Operation')

    mattype = kwargs.get('mattype')
    fixedAxis = int(kwargs.get('fixedAxis'))
    nrow_scale = [int(x) for x in kwargs['nrows'].split(' ')]
    nproc = kwargs.get('nproc')

    if nproc is not None:
        port = GPDB_PORT_MAP[nproc]
        cxn = start_gpdb(port, nproc)
        cxn.execute('DROP TABLE IF EXISTS M16_tall')
        atexit.register(stop_gpdb, nproc, cxn)
    else:
        cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)

    colnames = ['rows', 'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    if nproc is None:
        path = os.path.join('..', 'output',
                            'madlib_{}_{}.txt'.format(mattype, opType))
    else:
        path = os.path.join('..', 'output',
                            'madlib_cpu_{}_scale.txt'.format(opType))

    for nr in nrow_scale:
        # GMM varies the column count; every other op varies the row count.
        if opType == 'GMM':
            nrow, ncol = fixedAxis, nr
        else:
            nrow, ncol = nr, fixedAxis
        print(nrow)
        print(ncol)

        Mname = 'M{}{}'.format(nrow, ncol)
        if not cxn.table_exists(Mname):
            cxn.randomMatrix(nrow, ncol, Mname)

        # Binary operations need a second operand of a compatible shape.
        Nname = None
        if opType == 'GMM':
            Nname = 'N{}{}'.format(ncol, nrow)
            if not cxn.table_exists(Nname):
                cxn.randomMatrix(ncol, nrow, Nname)
        elif opType == 'ADD':
            Nname = 'N{}{}'.format(nrow, ncol)
            if not cxn.table_exists(Nname):
                cxn.randomMatrix(nrow, ncol, Nname)

        # Output tables named in `cleanup` are passed to cxn.time together
        # with the query (presumably dropped between runs -- confirm there).
        cleanup = []
        if opType == 'TRANS':
            call = "matrix_trans('{}',NULL,'Mt',NULL)".format(Mname)
            cleanup.append('Mt')
        elif opType == 'NORM':
            call = "matrix_norm('{}',NULL,'fro')".format(Mname)
        elif opType == 'GMM':
            call = "matrix_mult('{}',NULL,'{}',NULL,'MN',NULL)".format(
                Mname, Nname)
            cleanup.append('MN')
        elif opType == 'MVM':
            array_call = (
                'SELECT array_agg(random()) FROM generate_series(1,{})'
                .format(ncol))
            call = "matrix_vec_mult('{}',NULL,({}))".format(Mname, array_call)
        elif opType == 'TSM':
            call = (
                "matrix_mult('{0}','trans=True','{0}',NULL,'MtM',NULL)"
                .format(Mname))
            cleanup.append('MtM')
        else:  # 'ADD' -- guaranteed by the validation at the top
            call = "matrix_add('{}',NULL,'{}',NULL,'M_N',NULL)".format(
                Mname, Nname)
            cleanup.append('M_N')

        sql_call = 'SELECT madlib.{}'.format(call)

        # .loc / .iloc replace DataFrame.ix, which was removed in pandas 1.0.
        runTimes.loc[:, 'rows'] = nr if nproc is None else nproc
        runTimes.iloc[:, 1:] = cxn.time(sql_call, cleanup)

        # One row is appended per size; the header is written only once.
        writeHeader = not os.path.exists(path)
        runTimes.to_csv(path, index=False, header=writeHeader, mode='a')