Beispiel #1
0
def main(kwargs):
    """Benchmark one MADlib operation on the ad-click tables and log timings.

    Configuration is read from ``kwargs``:

    * ``opType`` -- ``'logit'``, ``'gnmf'``, ``'reg'`` or ``'robust'``.
    * ``nodes`` -- recorded in the output row and, converted with ``int``,
      used in the output file name.
    * ``xTableName`` / ``yTableName`` -- table names handed to the benchmarked
      routines.
    * ``savestub`` -- accepted for compatibility; it is not used.

    One row (``nodes, rows, cols, time1..time5``) is appended to
    ``../output/madlib_adclick_<opType><nodes>.txt``. The CSV header is only
    written when that file does not exist yet.

    Raises:
        NotImplementedError: if ``opType`` is not a supported operation.
    """
    opType = kwargs.get('opType')
    nodes = kwargs.get('nodes')
    x_table_name = kwargs.get('xTableName')
    y_table_name = kwargs.get('yTableName')

    print('Evaluating: {}'.format(opType))

    # Expression strings handed to utils.timeOp together with ``env``.
    calls = {
        'logit': 'do_logit(x_table_name, y_table_name, shape, cxn)',
        'gnmf': 'do_gnmf(x_table_name, shape, 10, cxn)',
        'reg': 'do_reg(x_table_name, y_table_name, cxn)',
        'robust': 'do_robust(x_table_name, cxn)',
    }
    # Fail fast: previously an unknown opType surfaced only later as an
    # UnboundLocalError on ``call``, after the connection had been opened.
    if opType not in calls:
        raise NotImplementedError('Invalid Operation')
    call = calls[opType]

    path = '../output/madlib_adclick_{}{}.txt'.format(opType, int(nodes))

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)
    shape = cxn.get_shape(x_table_name)
    rows, cols = shape[0], shape[1]

    env = {
        'x_table_name': x_table_name,
        'y_table_name': y_table_name,
        'do_logit': do_logit,
        'do_reg': do_reg,
        'do_gnmf': do_gnmf,
        'do_robust': do_robust,
        'shape': shape,
        'cxn': cxn
    }

    cleanup = None
    if opType == 'reg':
        # NOTE(review): this expression only drops the tables when map() is
        # eager (Python 2); confirm how utils.timeOp evaluates it before
        # porting to Python 3.
        cleanup = ("map(lambda x: cxn.execute("
                   "'DROP TABLE {}'.format(x)), ['XTX','XTY','XTX_INV','B'])")
    elif opType == 'robust':
        # Build a fresh single-column random table R2 with one row per row
        # of the feature table before timing do_robust.
        cxn.execute('DROP TABLE IF EXISTS R2 CASCADE')
        cxn.execute(
            "SELECT MADLIB.matrix_random({},1,NULL,'uniform','R2',NULL)".
            format(rows))
        cxn.execute('ALTER TABLE R2 RENAME COLUMN ROW TO ROW_NUM')

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))), columns=colnames)
    # .loc/.iloc replace the removed .ix indexer: labels for the metadata
    # columns, positions for the trailing timing columns.
    runTimes.loc[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    res = utils.timeOp(call, env, cleanup)
    print(res)
    runTimes.iloc[:, 3:] = res

    writeHeader = not os.path.exists(path)
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
Beispiel #2
0
def doMatrixOp(kwargs):
    """Time one NumPy matrix operation over a range of matrix sizes.

    Configuration is read from ``kwargs``:

    * ``opType`` -- ``'TRANS'``, ``'NORM'``, ``'GMM'``, ``'MVM'``, ``'TSM'``,
      ``'ADD'`` or ``'SM'``.
    * ``mattype`` -- only used in the output file name when ``nproc`` is
      not given.
    * ``fixedAxis`` -- size of the axis that is held constant.
    * ``nrows`` -- whitespace-separated integers; one timing run is made for
      each value.
    * ``nproc`` -- optional; when given it is recorded in the ``rows`` column
      instead of the size and selects the ``np_cpu_<opType>_scale.txt`` file.

    For every size one row (``rows, time1..time5``) is appended to the output
    file under ``../output``; the CSV header is written only when the file
    does not exist yet.

    Raises:
        NotImplementedError: if ``opType`` is not a supported operation.
        ValueError: if ``nrows`` contains no integers or a non-integer token.
    """
    opType = kwargs.get('opType')
    fixedAxis = int(kwargs.get('fixedAxis'))
    # split() without an argument tolerates repeated/leading whitespace,
    # which split(' ') turned into empty tokens that int() rejected.
    nrow_scale = [int(tok) for tok in kwargs['nrows'].split()]
    if not nrow_scale:
        raise ValueError('nrows must contain at least one integer')
    nproc = kwargs.get('nproc')

    colnames = ['rows', 'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))), columns=colnames)

    # Expression strings handed to utils.timeOp together with ``env``.
    calls = {
        'TRANS': 'M.T',
        'NORM': 'alg.norm(M)',
        'GMM': 'M.dot(N)',
        'MVM': 'M.dot(w)',
        'TSM': 'M.T.dot(M)',
        'ADD': 'M+X',
        'SM': 'np.multiply(10,M)',
    }
    if opType not in calls:
        raise NotImplementedError('Invalid Operation')
    call = calls[opType]

    if nproc is None:
        mattype = kwargs.get('mattype')
        fname = 'np_{}_{}.txt'.format(mattype, opType)
    else:
        fname = 'np_cpu_{}_scale.txt'.format(opType)
    path = os.path.join('..', 'output', fname)

    RNG = np.random
    for nr in nrow_scale:
        # For GMM the scaled value is the column count; otherwise the rows.
        if opType == 'GMM':
            nrow, ncol = fixedAxis, nr
        else:
            nrow, ncol = nr, fixedAxis

        env = {'alg': alg, 'np': np}
        env['M'] = utils.allocMatrix(nrow, ncol, RNG)

        # Allocate the second operand only for the operations that need one.
        if opType == 'GMM':
            env['N'] = utils.allocMatrix(ncol, nrow, RNG)
        elif opType == 'MVM':
            env['w'] = utils.allocMatrix(ncol, 1, RNG)
        elif opType == 'ADD':
            env['X'] = utils.allocMatrix(nrow, ncol, RNG)

        # .loc/.iloc replace the removed .ix indexer.
        runTimes.loc[:, 'rows'] = nr if nproc is None else nproc
        runTimes.iloc[:, 1:] = utils.timeOp(call, env)
        writeHeader = not os.path.exists(path)
        runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
Beispiel #3
0
def main(kwargs):
    """Time one TensorFlow-based routine on random data and log the timings.

    Configuration is read from ``kwargs``:

    * ``opType`` -- ``'logit'``, ``'reg'``, ``'gnmf'`` or ``'robust'``.
    * ``nrow`` / ``ncol`` -- shape of the random float32 feature matrix.
    * ``nproc`` -- recorded in the output row.
    * ``mattype`` -- accepted for compatibility; it is not used.

    One row (``nproc, time1..time5``) is appended to
    ``../output/tf_<opType>.txt``; the CSV header is written only when the
    file does not exist yet.

    Raises:
        NotImplementedError: if ``opType`` is not a supported operation.
    """
    opType = kwargs['opType']
    nrow = int(kwargs['nrow'])
    ncol = int(kwargs['ncol'])
    nproc = int(kwargs['nproc'])

    # Expression strings handed to utils.timeOp together with ``env``.
    calls = {
        'logit': 'logit_reg(X,y)',
        'reg': 'reg(X,y)',
        'gnmf': 'gnmf(X, 10)',
        'robust': 'robust_se(X, eps)',
    }
    # Validate before allocating data. NotImplementedError exists on both
    # Python 2 and 3 and is a StandardError subclass on Python 2, so handlers
    # written for the previous StandardError still catch it.
    if opType not in calls:
        raise NotImplementedError('Invalid Operation')
    call = calls[opType]

    path = '../output/tf_{}.txt'.format(opType)
    colnames = ['nproc', 'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))), columns=colnames)

    X = np.random.rand(nrow, ncol).astype(np.float32)
    if opType == 'gnmf':
        y = None
    else:
        # Binary label column: 1 where the uniform draw is >= 0.80.
        y = (np.random.rand(nrow, 1) >= 0.80).astype(np.int64)

    env = {
        'np': np,
        'tf': tf,
        'logit_reg': logit_reg,
        'reg': reg,
        'gnmf': gnmf,
        'robust_se': robust_se,
        'X': X,
        'y': y
    }
    if opType == 'robust':
        # Untimed setup: fit once and pass the squared fitted values as eps.
        b = reg(X, y)
        y_hat = X.dot(b)
        env['eps'] = np.power(y_hat, 2).ravel()

    # .loc/.iloc replace the removed .ix indexer.
    runTimes.loc[:, 'nproc'] = nproc
    runTimes.iloc[:, 1:] = utils.timeOp(call, env)
    writeHeader = not os.path.exists(path)
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
Beispiel #4
0
def main(kwargs):
    """Time one SciDB routine on matrices sized after an existing table.

    Configuration is read from ``kwargs``:

    * ``opType`` -- ``'reg'``, ``'logit'``, ``'gnmf'`` or ``'robust'``.
    * ``nodes`` -- recorded in the output row and used in the file name.
    * ``xTableName`` -- passed to ``get_dims`` to obtain ``(nrow, ncol)``.

    One row (``nodes, rows, cols, time1..time5``) is appended to
    ``../output/scidb_<opType><nodes>.txt``; the CSV header is written only
    when the file does not exist yet.

    Raises:
        NotImplementedError: if ``opType`` is not a supported operation.
    """
    op_type = kwargs.get('opType')
    nodes = kwargs.get('nodes')
    x_table_name = kwargs.get('xTableName')

    # Call templates; {0}/{1} are filled with the row and column counts.
    call_templates = {
        'reg': "reg('X{0}{1}', 'y{0}{1}', cxn)",
        'logit': "logit('X{0}{1}', 'y{0}{1}b', cxn)",
        'gnmf': "gnmf('X{0}{1}', 10, cxn)",
        'robust': "robust_se('X{0}{1}', 'r2{0}{1}', cxn)",
    }
    # Fail fast: previously an unknown opType surfaced only later as an
    # UnboundLocalError on ``call``, after the arrays had been allocated.
    if op_type not in call_templates:
        raise NotImplementedError('Invalid Operation')

    nrow, ncol = get_dims(x_table_name)
    call = call_templates[op_type].format(nrow, ncol)

    path = '../output/scidb_{}{}.txt'.format(op_type, nodes)
    colnames = ['nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5']
    run_times = pd.DataFrame(np.zeros((1, len(colnames))), columns=colnames)

    cxn = scidbpy.connect()
    cxn.iquery("load_library('dense_linear_algebra')")

    env = {
        'cxn': cxn,
        'reg': reg,
        'logit': logit,
        'gnmf': gnmf,
        'robust_se': robust_se
    }

    # X and y are allocated for every operation; overwrite=False is passed so
    # existing arrays are presumably reused (see alloc_matrix).
    alloc_matrix(nrow, ncol, 'X{}{}'.format(nrow, ncol), cxn, overwrite=False)
    alloc_matrix(nrow, 0, 'y{}{}'.format(nrow, ncol), cxn,
                 overwrite=False, binary=True)
    if op_type == 'logit':
        alloc_matrix(nrow, 0, 'y{}{}b'.format(nrow, ncol),
                     cxn, overwrite=False, binary=True)
    elif op_type == 'robust':
        alloc_matrix(nrow, 0, 'r2{}{}'.format(nrow, ncol),
                     cxn, overwrite=True, val_name='residuals')

    run_times.loc[:, ['nodes', 'rows', 'cols']] = (nodes, nrow, ncol)
    # The timing columns are addressed by position, so this must be .iloc:
    # .loc[:, 3:] is a label slice and raises TypeError on string columns.
    run_times.iloc[:, 3:] = utils.timeOp(call, env)
    write_header = not os.path.exists(path)
    run_times.to_csv(path, index=False, header=write_header, mode='a')
Beispiel #5
0
def main(kwargs):
    """Start SciDB with ``nproc`` instances, time one routine, log timings.

    Configuration is read from ``kwargs``:

    * ``opType`` -- ``'reg'``, ``'logit'``, ``'gnmf'`` or ``'robust'``.
    * ``nrow`` / ``ncol`` -- dimensions of the allocated matrices.
    * ``nproc`` -- passed to ``init_scidb`` and recorded in the output row.
    * ``mattype`` -- accepted for compatibility; it is not used.

    One row (``nproc, time1..time5``) is appended to
    ``../output/scidb_<opType>.txt``; the CSV header is written only when the
    file does not exist yet. The process and streams returned by
    ``init_scidb`` are released even when the benchmark raises.

    Raises:
        NotImplementedError: if ``opType`` is not a supported operation.
    """
    op_type = kwargs['opType']
    nrow = int(kwargs['nrow'])
    ncol = int(kwargs['ncol'])
    nproc = int(kwargs['nproc'])

    # Call templates; {0}/{1} are filled with the row and column counts.
    call_templates = {
        'reg': "reg('X{0}{1}', 'y{0}{1}', cxn)",
        'logit': "logit('X{0}{1}', 'y{0}{1}b', cxn)",
        'gnmf': "gnmf('X{0}{1}', 10, cxn)",
        'robust': "robust_se('X{0}{1}', 'r2{0}{1}', cxn)",
    }
    # Fail fast, before SciDB is started: previously an unknown opType
    # surfaced only later as an UnboundLocalError on ``call``.
    if op_type not in call_templates:
        raise NotImplementedError('Invalid Operation')
    call = call_templates[op_type].format(nrow, ncol)

    path = '../output/scidb_{}.txt'.format(op_type)
    colnames = ['nproc', 'time1', 'time2', 'time3', 'time4', 'time5']
    run_times = pd.DataFrame(np.zeros((1, len(colnames))), columns=colnames)

    atexit.register(terminate_scidb)
    P, stdout, stderr = init_scidb(nproc, debug=True)
    try:
        cxn = scidbpy.connect()
        cxn.iquery("load_library('dense_linear_algebra')")
        print(cxn.iquery("list('instances')", fetch=True))

        env = {
            'cxn': cxn,
            'reg': reg,
            'logit': logit,
            'gnmf': gnmf,
            'robust_se': robust_se
        }

        # X and y are allocated for every operation.
        alloc_matrix(nrow, ncol, 'X{}{}'.format(nrow, ncol), cxn,
                     overwrite=False)
        alloc_matrix(nrow, 0, 'y{}{}'.format(nrow, ncol), cxn,
                     overwrite=False, binary=True)
        if op_type == 'logit':
            alloc_matrix(nrow, 0, 'y{}{}b'.format(nrow, ncol), cxn,
                         overwrite=False, binary=True)
        elif op_type == 'robust':
            alloc_matrix(nrow, 0, 'r2{}{}'.format(nrow, ncol), cxn,
                         overwrite=True, val_name='residuals')

        run_times.loc[:, 'nproc'] = nproc
        # The timing columns are addressed by position, so this must be
        # .iloc: .loc[:, 1:] is a label slice and raises TypeError on string
        # columns.
        run_times.iloc[:, 1:] = utils.timeOp(call, env)
        write_header = not os.path.exists(path)
        run_times.to_csv(path, index=False, header=write_header, mode='a')
    finally:
        # Release the launched process and its streams on every exit path.
        P.terminate()
        stdout.close()
        stderr.close()
Beispiel #6
0
def main(kwargs):
    """Start Greenplum for ``nproc``, time one MADlib routine, log timings.

    Configuration is read from ``kwargs``:

    * ``opType`` -- ``'logit'``, ``'gnmf'``, ``'reg'`` or ``'robust'``.
    * ``nrow`` / ``ncol`` -- dimensions of the feature table ``X<rows><cols>``.
    * ``nproc`` -- key into ``GPDB_PORT_MAP``; also recorded in the output.
    * ``mattype`` -- accepted for compatibility; it is not used.

    The feature table and (except for ``'gnmf'``) the label table
    ``y_<rows>`` are created when they do not exist. One row
    (``nproc, time1..time5``) is appended to
    ``../output/madlib_tall_<opType>.txt``; the CSV header is written only
    when the file does not exist yet.

    Raises:
        NotImplementedError: if ``opType`` is not a supported operation.
    """
    opType = kwargs.get('opType')
    rows = int(kwargs.get('nrow'))
    cols = int(kwargs.get('ncol'))
    nproc = kwargs.get('nproc')

    print('Evaluating: {}'.format(opType))

    # Expression strings handed to utils.timeOp together with ``env``.
    calls = {
        'logit': 'do_logit(x_table_name, y_table_name, shape, cxn)',
        'gnmf': 'do_gnmf(x_table_name, shape, 10, cxn)',
        'reg': 'do_reg(x_table_name, y_table_name, cxn)',
        'robust': 'do_robust(x_table_name, cxn)',
    }
    # Fail fast, before the database is started: previously an unknown
    # opType surfaced only later as an UnboundLocalError on ``call``.
    if opType not in calls:
        raise NotImplementedError('Invalid Operation')
    call = calls[opType]

    port = GPDB_PORT_MAP[nproc]
    cxn = start_gpdb(port, nproc)
    # Register the shutdown hook before running any statement so the
    # database is stopped even if the first query fails.
    atexit.register(stop_gpdb, nproc, cxn)
    cxn.execute('DROP TABLE IF EXISTS M16_tall')

    shape = (rows, cols)
    x_table_name = 'X{}{}'.format(rows, cols)
    y_table_name = 'y_{}'.format(rows)

    if not cxn.table_exists(x_table_name):
        cxn.randomMatrix(rows, cols, x_table_name)
    # Check for the table that is actually created and used below (y_<rows>);
    # the earlier check looked for 'y<rows>', which never matched.
    if opType != 'gnmf' and not cxn.table_exists(y_table_name):
        stmt = """
            CREATE TABLE y_{0} AS (
                SELECT ix as row_num, ARRAY[(RANDOM() > 0.80)::INTEGER] AS val
                  FROM GENERATE_SERIES(1,{0}) ix
            ) DISTRIBUTED BY (row_num)
        """.format(rows)
        try:
            cxn.execute(stmt)
        except Exception as err:
            # Creation stays best-effort, but the failure is now reported
            # instead of being silently discarded.
            print('Warning: could not create {}: {}'.format(y_table_name, err))

    colnames = ['nproc', 'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))), columns=colnames)

    env = {
        'cxn': cxn, 'shape': shape,
        'do_logit': do_logit,
        'do_reg': do_reg,
        'do_gnmf': do_gnmf,
        'do_robust': do_robust,
        'x_table_name': x_table_name,
        'y_table_name': y_table_name
    }

    cleanup = None
    if opType == 'reg':
        # NOTE(review): this expression only drops the tables when map() is
        # eager (Python 2); confirm how utils.timeOp evaluates it before
        # porting to Python 3.
        cleanup = ("map(lambda x: cxn.execute("
                   "'DROP TABLE {}'.format(x)), ['XTX','XTY','XTX_INV','B'])")
    elif opType == 'robust':
        # A random single-column table R2 is generated before timing
        # do_robust; no regression is run to produce it.
        cxn.execute('DROP TABLE IF EXISTS R2')
        cxn.randomMatrix(rows, 1, 'R2')

    path = '../output/madlib_tall_{}.txt'.format(opType)
    # .loc/.iloc replace the removed .ix indexer.
    runTimes.loc[:, 'nproc'] = nproc
    res = utils.timeOp(call, env, cleanup)
    runTimes.iloc[:, 1:] = res
    writeHeader = not os.path.exists(path)
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
Beispiel #7
0
def main(kwargs):
    """Benchmark one MADlib operation on the ad-click tables and log timings.

    Configuration is read from ``kwargs``:

    * ``opType`` -- ``'logit'``, ``'gnmf'``, ``'reg'`` or ``'robust'``;
      ``'pca'`` prints ``Not Implemented`` and returns without doing work.
    * ``nodes`` -- recorded in the output row and, converted with ``int``,
      used in the output file name.
    * ``xTableName`` / ``yTableName`` -- table names handed to the benchmarked
      routines.
    * ``savestub`` -- accepted for compatibility; it is not used.

    For ``'robust'`` a regression is run first and table ``R2`` is rebuilt
    from the squared differences between the labels and ``Y_HAT``. One row
    (``nodes, rows, cols, time1..time5``) is appended to
    ``../output/madlib_adclick_<opType><nodes>.txt``; the CSV header is only
    written when that file does not exist yet.

    Raises:
        NotImplementedError: if ``opType`` is not a recognised operation.
    """
    opType = kwargs.get('opType')
    nodes = kwargs.get('nodes')
    x_table_name = kwargs.get('xTableName')
    y_table_name = kwargs.get('yTableName')

    print('Evaluating: {}'.format(opType))

    if opType == 'pca':
        print('Not Implemented')
        return

    # Expression strings handed to utils.timeOp together with ``env``.
    calls = {
        'logit': 'do_logit(x_table_name, y_table_name, shape, cxn)',
        'gnmf': 'do_gnmf(x_table_name, shape, 10, cxn)',
        'reg': 'do_reg(x_table_name, y_table_name, cxn)',
        'robust': 'do_robust(x_table_name, cxn)',
    }
    # Fail fast: previously an unknown opType surfaced only later as an
    # UnboundLocalError on ``call``, after the connection had been opened.
    if opType not in calls:
        raise NotImplementedError('Invalid Operation')
    call = calls[opType]

    path = '../output/madlib_adclick_{}{}.txt'.format(opType, int(nodes))

    cxn = SQLCxn(username='******', db='ubuntu', timeout=2000)
    shape = cxn.get_shape(x_table_name)
    rows, cols = shape[0], shape[1]

    env = {
        'x_table_name': x_table_name,
        'y_table_name': y_table_name,
        'do_logit': do_logit,
        'do_reg': do_reg,
        'shape': shape,
        'cxn': cxn
    }

    cleanup = None
    if opType == 'gnmf':
        # The call string references do_gnmf, which env did not provide.
        env['do_gnmf'] = do_gnmf
    elif opType == 'reg':
        # NOTE(review): this expression only drops the tables when map() is
        # eager (Python 2); confirm how utils.timeOp evaluates it before
        # porting to Python 3.
        cleanup = ("map(lambda x: cxn.execute("
                   "'DROP TABLE {}'.format(x)), ['XTX','XTY','XTX_INV','B'])")
    elif opType == 'robust':
        # The call string references do_robust, which env did not provide.
        env['do_robust'] = do_robust
        # Untimed setup: the SQL below multiplies X by table 'B', so the
        # regression is run first (presumably do_reg is what creates 'B').
        do_reg(x_table_name, y_table_name, cxn)
        # R2 is dropped first so that a rerun does not fail on CREATE TABLE.
        preproc = """
            DROP TABLE IF EXISTS Y_HAT;
            SELECT madlib.matrix_mult('{X}',NULL,'B',NULL,'Y_HAT');
            DROP TABLE IF EXISTS R2;
            CREATE TABLE R2 AS (
                SELECT {y}.row_num, ARRAY[POW({y}.val[1]-y_hat.val[1],2)] val
                  FROM {y}
                 INNER JOIN y_hat ON {y}.row_num = y_hat.row_num
            ) DISTRIBUTED BY (row_num)
        """.format(X=x_table_name, y=y_table_name)
        cxn.execute(preproc)

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))), columns=colnames)
    # .loc/.iloc replace the removed .ix indexer: labels for the metadata
    # columns, positions for the trailing timing columns.
    runTimes.loc[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    res = utils.timeOp(call, env, cleanup)
    print(res)
    runTimes.iloc[:, 3:] = res

    writeHeader = not os.path.exists(path)
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
Beispiel #8
0
def main(kwargs):
    """Benchmark one MADlib operation on a tall table and log the timings.

    Configuration is read from ``kwargs``:

    * ``opType`` -- ``'logit'``, ``'gnmf'``, ``'reg'`` or ``'robust'``.
    * ``nodes`` -- recorded in the output row and, converted with ``int``,
      used in the output file name.
    * ``xTableName`` / ``yTableName`` -- table names handed to the benchmarked
      routines.
    * ``savestub`` -- accepted for compatibility; it is not used.

    One row (``nodes, rows, cols, time1..time5``) is appended to
    ``../output/madlib_tall_<opType><nodes>.txt``; the CSV header is only
    written when that file does not exist yet.

    Raises:
        NotImplementedError: if ``opType`` is not a supported operation.
    """
    opType = kwargs.get('opType')
    nodes = kwargs.get('nodes')
    x_table_name = kwargs.get('xTableName')
    y_table_name = kwargs.get('yTableName')

    print('Evaluating: {}'.format(opType))

    # Expression strings handed to utils.timeOp together with ``env``.
    calls = {
        'logit': 'do_logit(x_table_name, y_table_name, shape, cxn)',
        'gnmf': 'do_gnmf(x_table_name, shape, 10, cxn)',
        'reg': 'do_reg(x_table_name, y_table_name, cxn)',
        'robust': 'do_robust(x_table_name, cxn)',
    }
    # Fail fast: previously an unknown opType surfaced only later as an
    # UnboundLocalError on ``call``, after the connection had been opened.
    if opType not in calls:
        raise NotImplementedError('Invalid Operation')
    call = calls[opType]

    path = '../output/madlib_tall_{}{}.txt'.format(opType, int(nodes))

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)
    shape = cxn.get_shape(x_table_name)
    rows, cols = shape[0], shape[1]

    env = {'x_table_name': x_table_name,
           'y_table_name': y_table_name,
           'do_logit': do_logit,
           'do_gnmf': do_gnmf,
           'do_reg': do_reg,
           'do_robust': do_robust,
           'shape': shape,
           'cxn': cxn}

    cleanup = None
    if opType == 'reg':
        # NOTE(review): this expression only drops the tables when map() is
        # eager (Python 2); confirm how utils.timeOp evaluates it before
        # porting to Python 3.
        cleanup = ("map(lambda x: cxn.execute("
                   "'DROP TABLE {}'.format(x)), ['XTX','XTY','XTX_INV','B'])")
    elif opType == 'robust':
        # We can just generate a vector of residuals on the fly
        # rather than computing them explicitly.
        cxn.execute('DROP TABLE IF EXISTS R2')
        cxn.randomMatrix(rows, 1, 'R2')

    colnames = ['nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))), columns=colnames)
    # .loc/.iloc replace the removed .ix indexer: labels for the metadata
    # columns, positions for the trailing timing columns.
    runTimes.loc[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    res = utils.timeOp(call, env, cleanup)
    runTimes.iloc[:, 3:] = res

    writeHeader = not os.path.exists(path)
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')