Example #1
def doMatrixOp(kwargs):
    opType = kwargs.get('opType')
    mattype = kwargs.get('mattype')
    tableStub = kwargs.get('tableStub')
    savestub = kwargs.get('savestub')
    nodes = kwargs.get('nodes')
    outdir = kwargs.get('outdir')

    savestub = '' if (savestub is None) else savestub
    try:
        tableStub = int(tableStub)
    except ValueError:
        pass

    Mname = 'M{}'.format(tableStub)
    Nname = 'N{}'.format(tableStub)
    wname = 'w{}'.format(tableStub)

    print('Evaluating: {}'.format(opType))

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    cxn = SQLCxn(username='******', db='ubuntu', timeout=2000)
    shape = cxn.get_shape_dense('M{}'.format(tableStub))

    cleanup = []
    if (opType == 'SVD'):
        # rank k and the iteration count are both fixed at 10 in this call
        call = "svd('{}','svd','row_num',10, 10,'svd_summary')".format(Mname)
        cleanup.append('svd_s')
        cleanup.append('svd_u')
        cleanup.append('svd_v')
        cleanup.append('svd_summary')
    else:
        raise NotImplementedError('Invalid Operation')

    for obj in cleanup:
        cxn.execute('DROP TABLE IF EXISTS {}'.format(obj))

    sql_call = 'SELECT madlib.{}'.format(call)
    rows = shape[0]
    cols = shape[1]
    path = '../output/{}/madlib_{}_{}{}.txt'.format(outdir, mattype, opType,
                                                    int(nodes))
    runTimes.loc[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    madlib_timeout = ('../temp/madlib_punked_out.json', opType)
    res = cxn.time(sql_call, cleanup, madlib_timeout)
    if res is None:
        print('Timed Out')
        return
    runTimes.iloc[:, 3:] = res
    writeHeader = not os.path.exists(path)
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
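A minimal invocation sketch for the SVD variant above, assuming doMatrixOp is imported from this module, SQLCxn can reach the Greenplum/MADlib database, and a dense table M8 exists; every value below is a placeholder.

# Hypothetical driver call for the SVD benchmark; all values are placeholders.
args = {
    'opType': 'SVD',          # only 'SVD' is implemented in this variant
    'mattype': 'tall',        # used only to name the output file
    'tableStub': '8',         # coerced to int, selects table M8
    'savestub': None,         # defaults to '' inside the function
    'nodes': 2,               # recorded in the 'nodes' column and the file name
    'outdir': 'scale_nodes',  # results land in ../output/scale_nodes/
}
doMatrixOp(args)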
Example #2
def doMatrixOp(kwargs):
    opType  = kwargs.get('opType')
    mattype = kwargs.get('mattype')
    tableStub = kwargs.get('tableStub')
    savestub = kwargs.get('savestub')
    nodes = kwargs.get('nodes')
    outdir = kwargs.get('outdir')
    sr = kwargs.get('sr')

    try:
        tableStub = int(tableStub)
    except ValueError:
        pass

    Mname = 'M{}'.format(tableStub)
    # Nname points at the same M table here; only the GMM branch swaps in the tall variant
    Nname = 'M{}'.format(tableStub)
    wname = 'w{}'.format(tableStub)
    if opType == 'GMM':
        Nname = Mname.replace('wide','tall')

    print('Evaluating: {}'.format(opType))

    colnames = ['nodes','sr','time1','time2','time3','time4','time5']
    runTimes = pd.DataFrame(np.zeros((1,len(colnames))))
    runTimes.columns = colnames
    runTimes['nodes'] = runTimes['nodes'].astype('O')
    runTimes['sr'] = runTimes['sr'].astype('O')

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)
    
    cleanup = []
    if (opType == 'TRANS'):
        call = "matrix_trans('{}',NULL,'Mt',NULL)".format(Mname)
        cleanup.append('Mt')
    elif (opType == 'NORM'):
        call = "matrix_norm('{}',NULL,'fro')".format(Mname)
    elif (opType == 'GMM'):
        call = "matrix_mult('{}',NULL,'{}',NULL,'MN',NULL)".format(Mname,Nname)
        cleanup.append('MN')
    elif (opType == 'MVM'):
        # the random vector is fixed at length 100, which must match Mname's column count
        array_call = 'SELECT array_agg(random()) FROM generate_series(1,100)'
        call = "matrix_vec_mult('{}',NULL,({}))".format(Mname, array_call)
        cleanup.append('Mw')
    elif (opType == 'TSM'):
        call = "matrix_mult('{0}','trans=True','{0}',NULL,'MtM',NULL)".format(Mname)
        cleanup.append('MtM')
    elif (opType == 'ADD'):
        call = "matrix_add('{}',NULL,'{}',NULL,'M_N',NULL)".format(Mname, Nname)
        cleanup.append('M_N')
    else:
        raise NotImplementedError('Invalid Operation')

    for obj in cleanup:
        cxn.execute('DROP TABLE IF EXISTS {}'.format(obj))

    sql_call = 'SELECT madlib.{}'.format(call)
    fmt = (outdir, mattype, opType, nodes)
    path = '../output/{}/madlib_{}_{}{}.txt'.format(*fmt)
    res = cxn.time(sql_call, cleanup)
    if res is None:
        print('Timed Out')
        return

    runTimes.loc[:, 'nodes'] = nodes
    runTimes.loc[:, 'sr'] = sr
    runTimes.iloc[:, 2:] = res
    writeHeader = not os.path.exists(path)
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
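The same calling pattern drives the other operators in this variant. A hedged sketch for the TSM (transpose-self-multiply) branch; the table stub, node count, and sparsity ratio are placeholders.

# Hypothetical call for the TSM branch; assumes a table named M0125_wide exists.
args = {
    'opType': 'TSM',           # one of TRANS / NORM / GMM / MVM / TSM / ADD
    'mattype': 'wide',
    'tableStub': '0125_wide',  # non-numeric stubs pass through unchanged -> M0125_wide
    'savestub': None,
    'nodes': 8,
    'outdir': 'scale_mat',
    'sr': 0.0125,              # recorded in the 'sr' column of the output CSV
}
doMatrixOp(args)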
Example #3
def doMatrixOp(kwargs):
    opType = kwargs.get('opType')
    mattype = kwargs.get('mattype')
    fixedAxis = int(kwargs.get('fixedAxis'))
    nrow_scale = [int(x) for x in kwargs['nrows'].split(' ')]
    nproc = kwargs.get('nproc')

    port = GPDB_PORT_MAP[nproc] if nproc is not None else None

    if nproc is not None:
        cxn = start_gpdb(port, nproc)
        cxn.execute('DROP TABLE IF EXISTS M16_tall')
        atexit.register(stop_gpdb, nproc, cxn)
    else:
        cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)

    colnames = ['rows', 'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    if nproc is None:
        path = os.path.join('..', 'output',
                            'madlib_{}_{}.txt'.format(mattype, opType))
    else:
        path = os.path.join('..', 'output',
                            'madlib_cpu_{}_scale.txt'.format(opType))
    for nr in nrow_scale:
        nrow = fixedAxis if opType == 'GMM' else nr
        ncol = nr if opType == 'GMM' else fixedAxis
        print(nrow)
        print(ncol)
        Mname = 'M{}{}'.format(nrow, ncol)
        if not cxn.table_exists(Mname):
            cxn.randomMatrix(nrow, ncol, Mname)
        if (opType == 'GMM'):
            if not cxn.table_exists('N{}{}'.format(ncol, nrow)):
                cxn.randomMatrix(ncol, nrow, 'N{}{}'.format(ncol, nrow))
            Nname = 'N{}{}'.format(ncol, nrow)
        elif (opType == 'ADD'):
            if not cxn.table_exists('N{}{}'.format(nrow, ncol)):
                cxn.randomMatrix(nrow, ncol, 'N{}{}'.format(nrow, ncol))
            Nname = 'N{}{}'.format(nrow, ncol)

        cleanup = []
        if (opType == 'TRANS'):
            call = "matrix_trans('{}',NULL,'Mt',NULL)".format(Mname)
            cleanup.append('Mt')
        elif (opType == 'NORM'):
            call = "matrix_norm('{}',NULL,'fro')".format(Mname)
        elif (opType == 'GMM'):
            call = "matrix_mult('{}',NULL,'{}',NULL,'MN',NULL)".format(
                Mname, Nname)
            cleanup.append('MN')
        elif (opType == 'MVM'):
            array_call = 'SELECT array_agg(random()) FROM generate_series(1,{})'.format(
                ncol)
            call = "matrix_vec_mult('{}',NULL,({}))".format(Mname, array_call)
        elif (opType == 'TSM'):
            call = "matrix_mult('{0}','trans=True','{0}',NULL,'MtM',NULL)".format(
                Mname)
            cleanup.append('MtM')
        elif (opType == 'ADD'):
            call = "matrix_add('{}',NULL,'{}',NULL,'M_N',NULL)".format(
                Mname, Nname)
            cleanup.append('M_N')
        else:
            raise NotImplementedError('Invalid Operation')

        sql_call = 'SELECT madlib.{}'.format(call)
        runTimes.loc[:, 'rows'] = nr if nproc is None else nproc
        runTimes.iloc[:, 1:] = cxn.time(sql_call, cleanup)
        writeHeader = not os.path.exists(path)
        runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
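This variant sweeps matrix sizes (and, with nproc set, Greenplum segment counts). A sketch of a row-scaling run with illustrative sizes; nproc=None reuses the already-running database.

# Hypothetical scaling run; tables M<nrow><ncol> are created on demand via randomMatrix.
args = {
    'opType': 'TSM',
    'mattype': 'tall',
    'fixedAxis': '100',                   # fixed column count (row count for GMM)
    'nrows': '1000000 2000000 4000000',   # space-separated row counts to sweep
    'nproc': None,                        # non-None picks a port from GPDB_PORT_MAP and starts GPDB via start_gpdb
}
doMatrixOp(args)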
Example #4
def main(kwargs):
    op_type = kwargs['opType']
    nodes = kwargs['nodes']
    stub = kwargs['stub']

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    cxn = SQLCxn(username='******', db='ubuntu')
    shape = cxn.get_shape_dense('adclick_clean{}_dense'.format(stub))
    if not cxn.table_exists('adclick_clean_vectors_split'):
        stmt = """
            CREATE TABLE adclick_clean_vectors_split AS (
                SELECT row_num, val[1]::INTEGER y, val[2:{}]::NUMERIC[] indep_vars
                  FROM adclick_clean{}_dense
            ) DISTRIBUTED BY (row_num)
        """.format(shape[1], stub)
        cxn.execute(stmt)

    # need to do a bit of preprocessing
    if op_type == 'logit':
        cxn.execute('DROP TABLE IF EXISTS adclick_logit_summary')
        cxn.execute('DROP TABLE IF EXISTS adclick_logit')
        call = """
            SELECT madlib.logregr_train('adclick_clean_vectors_split',
                                        'adclick_logit',
                                        'y', 'indep_vars', NULL,
                                        3, 'igd', .000001)
        """
        cleanup = ['adclick_logit_summary', 'adclick_logit']
    elif op_type == 'reg':
        cxn.execute('DROP TABLE IF EXISTS adclick_reg_summary')
        cxn.execute('DROP TABLE IF EXISTS adclick_reg')
        call = """
            SELECT madlib.linregr_train('adclick_clean_vectors_split',
                                        'adclick_reg', 'y', 'indep_vars')
        """
        cleanup = ['adclick_reg_summary', 'adclick_reg']
    elif op_type == 'pca':
        cxn.execute('DROP TABLE IF EXISTS result_table')
        cxn.execute('DROP TABLE IF EXISTS result_table_mean')
        cxn.execute('DROP TABLE IF EXISTS residual_table')
        cxn.execute('DROP TABLE IF EXISTS result_summary_table')
        cxn.execute('DROP TABLE IF EXISTS adclick_prj')
        stmt = """
            CREATE TABLE adclick_clean_depvars AS (
                SELECT row_num, val[2:{}]::NUMERIC[] val
                  FROM adclick_clean{}_dense
            ) DISTRIBUTED BY (row_num)
        """.format(shape[1], stub)
        if not cxn.table_exists('adclick_clean_depvars'):
            cxn.execute(stmt)
        call = """
            SELECT madlib.pca_train('adclick_clean_depvars',
                                    'result_table',
                                    'row_num',
                                    5);
            SELECT madlib.pca_project('adclick_clean_depvars',
                                      'result_table',
                                      'adclick_prj',
                                      'row_num',
                                      'residual_table',
                                      'result_summary_table')
        """
        cleanup = [
            'result_table', 'result_table_mean', 'residual_table',
            'result_summary_table', 'adclick_prj'
        ]
    else:
        raise NotImplementedError('Invalid Operation')

    #shape = cxn.get_shape_dense('adclick_clean{}_dense'.format(stub))
    runTimes.loc[:, ['rows', 'cols']] = shape

    path = '../output/madlib_{}{}_dense.txt'.format(op_type, int(nodes))
    runTimes.loc[:, 'nodes'] = nodes
    res = cxn.time(call, cleanup)
    runTimes.iloc[:, 3:] = res
    runTimes.to_csv(path, index=False)
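A hedged driver sketch for the dense pipeline benchmark above, here for the logistic-regression branch; the stub and node count are placeholders, and the adclick_clean<stub>_dense table is assumed to be loaded.

# Hypothetical call; op_type may be 'logit', 'reg', or 'pca'.
main({'opType': 'logit', 'nodes': 2, 'stub': '_1'})   # reads table adclick_clean_1_dense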
Example #5
def main(kwargs):
    op_type = kwargs['opType']
    nodes = kwargs['nodes']
    stub = kwargs['stub']

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)

    shape = cxn.get_shape('adclick_clean_1_sparse')
    if not cxn.table_exists('adclick_clean_1_vectors_sparse'):
        stmt = """
        CREATE TABLE adclick_clean_1_vectors_sparse AS (
            SELECT x.row_num, madlib.svec_cast_positions_float8arr(
               ARRAY_AGG(x.col_num), ARRAY_AGG(x.val), {}, 0.0
               ) AS indep_vars, y.val AS y
             FROM adclick_clean_1_sparse x
            INNER JOIN adclick_clean_y y ON x.row_num = y.row_num
            GROUP BY x.row_num, y.val
        ) DISTRIBUTED BY (row_num)
        """.format(shape[1])
        cxn.execute(stmt)

    if op_type == 'logit':
        cxn.execute('DROP TABLE IF EXISTS adclick_logit_summary')
        cxn.execute('DROP TABLE IF EXISTS adclick_logit')
        call = """
            SELECT madlib.logregr_train('adclick_clean_1_vectors_sparse',
                                        'adclick_logit',
                                        'y', 'indep_vars', NULL,
                                        3, 'igd', .000001)
        """
        cleanup = ['adclick_logit_summary', 'adclick_logit']
    elif op_type == 'reg':
        cxn.execute('DROP TABLE IF EXISTS adclick_reg_summary')
        cxn.execute('DROP TABLE IF EXISTS adclick_reg')
        call = """
            SELECT madlib.linregr_train('adclick_clean_1_vectors_sparse',
                                        'adclick_reg', 'y', 'indep_vars')
        """
        cleanup = ['adclick_reg_summary', 'adclick_reg']
    elif op_type == 'pca':
        cxn.execute('DROP TABLE IF EXISTS result_table')
        cxn.execute('DROP TABLE IF EXISTS result_table_mean')
        cxn.execute('DROP TABLE IF EXISTS residual_table')
        cxn.execute('DROP TABLE IF EXISTS result_summary_table')
        cxn.execute('DROP TABLE IF EXISTS adclick_prj')
        call = """
            SELECT madlib.pca_sparse_train('adclick_clean_1_sparse',
                                           'result_table',
                                           'row_num',
                                           'col_num',
                                           'val',
                                           '{0}',
                                           '{1}',
                                           5);
            SELECT madlib.pca_sparse_project('adclick_clean_1_sparse',
                                      'result_table',
                                      'adclick_prj',
                                      'row_num',
                                      'col_num',
                                      'val',
                                      '{0}',
                                      '{1}',
                                      'residual_table',
                                      'result_summary_table')
        """.format(*shape)
        cleanup = [
            'result_table', 'result_table_mean', 'residual_table',
            'result_summary_table', 'adclick_prj'
        ]
    else:
        raise NotImplementedError('Invalid Operation')

    runTimes.loc[:, ['rows', 'cols']] = shape

    path = '../output/madlib_{}{}_sparse.txt'.format(op_type, int(nodes))
    runTimes.loc[:, 'nodes'] = nodes
    res = cxn.time(call, cleanup)
    runTimes.iloc[:, 3:] = res
    runTimes.to_csv(path, index=False)
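And the sparse counterpart, here exercising the PCA branch. Note that the sparse table names are hard-coded inside the function, so 'stub' matters only because the function reads the key.

# Hypothetical call; assumes adclick_clean_1_sparse and adclick_clean_y are loaded.
main({'opType': 'pca', 'nodes': 8, 'stub': '_1'})   # 'stub' is read but the sparse table names are fixed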