Beispiel #1
0
def main():
    """Fetch day 1 of the Criteo sample data and hand it to ``hdfs_put``.

    The archive is downloaded with ``wget`` to ``../temp/day_1.gz`` unless
    that file, or the same path with ``.gz`` removed (presumably an already
    decompressed copy), exists. Afterwards ``/scratch`` is created via
    ``hdfs dfs -mkdir`` and the archive path is passed to ``hdfs_put``.

    Raises:
        RuntimeError: if ``wget`` exits with a non-zero status.
    """
    stub = 'azuremlsampleexperiments.blob.core.windows.net/criteo/day_1.gz'
    url = 'http://{}'.format(stub)

    path = '../temp/day_1.gz'
    already_present = (os.path.exists(path)
                       or os.path.exists(path.replace('.gz', '')))
    if not already_present:
        rc = os.system('wget {} -O {}'.format(url, path))
        if rc != 0:
            # StandardError was removed in Python 3 and would raise NameError
            # there. RuntimeError subclasses StandardError on Python 2, so
            # existing ``except StandardError`` handlers keep working.
            raise RuntimeError('Could not fetch data')

    # The exit status of mkdir is not checked (presumably because the
    # directory may already exist).
    os.system('hdfs dfs -mkdir /scratch')
    # NOTE(review): when only the un-suffixed file exists the download is
    # skipped, yet the '.gz' path is still passed here - confirm that
    # hdfs_put copes with a missing archive.
    rc = hdfs_put(path)
Beispiel #2
0
# Load the y<sparse_gb>_sparse CSV through cxn under the matching name.
cxn.load_dense_matrix('../output/y{}_sparse.csv'.format(sparse_gb),
                      'y{}_sparse'.format(sparse_gb))

# Every entry of ../output except .gitignore and anything whose name contains
# '.log' or '.mtd', joined back onto the directory.
paths = [os.path.join('../output', name)
         for name in os.listdir('../output')
         if name != '.gitignore' and '.log' not in name and '.mtd' not in name]

# manifest.txt holds one path per line for files that were already uploaded.
with open('manifest.txt') as fh:
    manifest = fh.read().split('\n')
# Set copy for constant-time membership tests inside the loop.
uploaded = set(manifest)

# The context manager closes the manifest even if write_sparse_meta or
# hdfs_put raises part-way through the loop.
with open('manifest.txt', 'a') as fh:
    for path in paths:
        # Expects exactly one '.' in the file name; anything else raises
        # ValueError on unpacking.
        dest, ext = path.replace('../output/', '').split('.')

        # Metadata is written for every file, including ones already listed
        # in the manifest.
        data.write_sparse_meta(dest, path, cxn)
        if path in uploaded:
            continue
        utils.hdfs_put(path)
        # Record and flush immediately so an interrupted run does not
        # forget files it has already uploaded.
        fh.write(path + '\n')
        fh.flush()

# make sure git ignores these files
with open('../output/.gitignore', 'w') as fh:
    fh.write('*.csv\n*.mtd\n*.mtx')

# stop logging
end_make_logging()
Beispiel #3
0
#all_files = os.listdir('../output/scale_nodes')
#for s in systems:
#    for op in ops:
#        relevant_files = filter(
#            lambda x: (s in x) and (op in x) and (nodes in x), all_files)
#        map(lambda x:  os.unlink('../output/scale_nodes/{}'.format(x)),
#             relevant_files)

# Space-separated key=value template; the {placeholders} are presumably
# filled in later with str.format.
cmd_args = ' '.join([
    'opType={opType}',
    'mattype={mattype}',
    'Mpath={Mpath}',
    'Npath={Npath}',
    'wPath={wPath}',
    'tableStub={tableStub}',
    'nodes={nodes}',
    'passPath=/scratch/pass.csv',
    'outdir=scale_nodes',
])

# Generate the small pass.csv file locally, then upload it via hdfs_put.
_pass_csv = '../temp/pass.csv'
data.gen_data_disk(_pass_csv, 2, 2, 2 ** 12)
utils.hdfs_put(_pass_csv)
for op in ops:
    # GMM uses the 'wide' variant for M and w; every other op uses 'tall'.
    mattype_m = 'tall' if op != 'GMM' else 'wide'
    mattype_n = 'tall'

    # Local CSV inputs for this op, named by matrix size and shape.
    Mpath_disk = '../external/disk_data/M{}_{}.csv'.format(matsize, mattype_m)
    wPath_disk = '../external/disk_data/w{}_{}.csv'.format(matsize, mattype_m)
    Npath_disk = '../external/disk_data/N{}_{}.csv'.format(matsize, mattype_n)
    if op == 'GMM':
        # NOTE(review): this binds `NPath_disk` (capital P), not the
        # `Npath_disk` assigned above, so Npath_hdfs below is still derived
        # from the N file. Looks like a typo - confirm whether GMM is meant
        # to use M{}_tall.csv as its N input.
        NPath_disk = '../external/disk_data/M{}_tall.csv'.format(matsize)

    # Same file names re-rooted under /scratch (presumably the HDFS copies).
    Mpath_hdfs = Mpath_disk.replace('../external/disk_data', '/scratch')
    wPath_hdfs = wPath_disk.replace('../external/disk_data', '/scratch')
    Npath_hdfs = Npath_disk.replace('../external/disk_data', '/scratch')

    cmd_params_disk = {