Example #1
# Module-level imports assumed from the original blaze catalog module
# (the original likely uses relative imports):
from os import path
from dynd import nd, ndt
from blaze import array

def load_json_file_array(root, array_name):
    # Load the datashape
    dsfile = root + ".datashape"
    if not path.isfile(dsfile):
        # Fall back to a datashape file named after the containing directory
        dsfile = path.dirname(root) + ".datashape"
        if not path.isfile(dsfile):
            raise Exception("No datashape file found for array %s" % array_name)
    with open(dsfile) as f:
        dt = ndt.type(f.read())

    # Load the JSON
    # TODO: Add stream support to parse_json for compressed JSON, etc.
    arr = nd.parse_json(dt, nd.memmap(root + ".json"))
    return array(arr)
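A minimal usage sketch for this loader, assuming a hypothetical side-by-side layout of a JSON file and its datashape description (all file names and the datashape literal below are made up):

# Hypothetical layout:
#   /catalog/mydata.json       -> [[1, 2], [3, 4]]
#   /catalog/mydata.datashape  -> "2 * 2 * int32"
arr = load_json_file_array('/catalog/mydata', 'mydata')
print(arr.dshape)  # expected to match the datashape file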
Example #2
# Module-level imports assumed from the original blaze catalog module:
import glob
from os import path
from dynd import nd, ndt
from blaze import array

def load_json_directory_array(root, array_name):
    # Load the datashape
    dsfile = root + ".datashape"
    if not path.isfile(dsfile):
        raise Exception("No datashape file found for array %s" % array_name)
    with open(dsfile) as f:
        dt = ndt.type(f.read())

    # Scan for JSON files, assuming they're named #.json,
    # and sort them numerically by that number
    files = sorted((int(path.splitext(path.basename(x))[0]), x)
                   for x in glob.glob(path.join(root, "*.json")))
    files = [x[1] for x in files]
    # Make an array with an extra fixed dimension, then
    # read a JSON file into each element of that array
    dt = ndt.make_fixed_dim(len(files), dt)
    arr = nd.empty(dt)
    for i, fname in enumerate(files):
        nd.parse_json(arr[i], nd.memmap(fname))
    arr.flag_as_immutable()
    return array(arr)
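A sketch of the directory convention this loader expects, with hypothetical names: one datashape file describing a single chunk next to a directory of numbered chunk files; the loader prepends a fixed dimension of len(files) and parses one file into each element:

# Hypothetical layout:
#   /catalog/chunks.datashape  -> "3 * int32"   (datashape of ONE chunk)
#   /catalog/chunks/0.json     -> [1, 2, 3]
#   /catalog/chunks/1.json     -> [4, 5, 6]
arr = load_json_directory_array('/catalog/chunks', 'chunks')
print(arr.dshape)  # e.g. 2 * 3 * int32 with the fixed dimension prepended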
Example #3
# Module-level imports assumed from the original blaze catalog module
# (the original likely uses relative imports; matches_datashape_pattern
# is a helper defined elsewhere in that module):
from os import path
import csv
import yaml
import blaze
import datashape
from dynd import nd, ndt
from blaze import py2help

def load_blaze_array(conf, dir):
    """Loads a blaze array from the catalog configuration and catalog path"""
    # This is a temporary hack, need to transition to using the
    # deferred data descriptors for various formats.
    fsdir = conf.get_fsdir(dir)
    if not path.isfile(fsdir + '.array'):
        raise RuntimeError('Could not find blaze array description file %r'
                           % (fsdir + '.array'))
    with open(fsdir + '.array') as f:
        # safe_load: plain yaml.load requires an explicit Loader in modern PyYAML
        arrmeta = yaml.safe_load(f)
    tp = arrmeta['type']
    imp = arrmeta['import']
    ds_str = arrmeta.get('datashape')  # optional; HDF5 does not need it

    if tp == 'csv':
        with open(fsdir + '.csv', 'r') as f:
            rd = csv.reader(f)
            if imp.get('headers', False):
                # Skip the header line
                next(rd)
            dat = list(rd)
        arr = nd.array(dat, ndt.type(ds_str))[:]
        return blaze.array(arr)
    elif tp == 'json':
        arr = nd.parse_json(ds_str, nd.memmap(fsdir + '.json'))
        return blaze.array(arr)
    elif tp == 'hdf5':
        import tables as tb
        from blaze.datadescriptor import HDF5_DDesc
        fname = fsdir + '.h5'   # XXX .h5 assumed for HDF5
        with tb.open_file(fname, 'r') as f:
            dp = imp.get('datapath')  # specifies a path in HDF5
            try:
                # Fetch the node only to verify that the dataset exists
                f.get_node(f.root, dp, 'Leaf')
            except tb.NoSuchNodeError:
                raise RuntimeError(
                    'HDF5 file does not have a dataset in %r' % dp)
            dd = HDF5_DDesc(fname, dp)
        return blaze.array(dd)
    elif tp == 'npy':
        import numpy as np
        use_memmap = imp.get('memmap', False)
        if use_memmap:
            arr = np.load(fsdir + '.npy', mmap_mode='r')
        else:
            arr = np.load(fsdir + '.npy')
        arr = nd.array(arr)
        arr = blaze.array(arr)
        ds = datashape.dshape(ds_str)
        if not matches_datashape_pattern(arr.dshape, ds):
            raise RuntimeError(('NPY file for blaze catalog path %r ' +
                                'has the wrong datashape (%r instead of ' +
                                '%r)') % (dir, arr.dshape, ds))
        return arr
    elif tp == 'py':
        ds = datashape.dshape(ds_str)
        # The script is run with the following globals,
        # and should put the loaded array in a global
        # called 'result'.
        gbl = {'catconf': conf,  # Catalog configuration object
               'impdata': imp,   # Import data from the .array file
               'catpath': dir,   # Catalog path
               'fspath': fsdir,  # Equivalent filesystem path
               'dshape': ds      # Datashape the result should have
               }
        if py2help.PY2:
            execfile(fsdir + '.py', gbl, gbl)
        else:
            with open(fsdir + '.py') as f:
                code = compile(f.read(), fsdir + '.py', 'exec')
                exec(code, gbl, gbl)
        arr = gbl.get('result', None)
        if arr is None:
            raise RuntimeError(('Script for blaze catalog path %r did not ' +
                                'return anything in "result" variable')
                               % (dir))
        elif not isinstance(arr, blaze.Array):
            raise RuntimeError(('Script for blaze catalog path %r returned ' +
                                'wrong type of object (%r instead of ' +
                                'blaze.Array)') % (dir, type(arr)))
        if not matches_datashape_pattern(arr.dshape, ds):
            raise RuntimeError(('Script for blaze catalog path %r returned ' +
                                'array with wrong datashape (%r instead of ' +
                                '%r)') % (dir, arr.dshape, ds))
        return arr
    else:
        raise ValueError(('Unsupported array type %r from ' +
                          'blaze catalog entry %r')
                         % (tp, dir))
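For reference, a hypothetical .array description file that the 'csv' branch above would accept; the keys (type, datashape, import, headers) come straight from the code, while the values are invented:

# /catalog/population.array (YAML read by load_blaze_array)
type: csv
datashape: "var * {name: string, count: int32}"
import:
    headers: true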
Example #4
# Module-level imports assumed, as in Example #3 (compatible_array_dshape
# is a helper defined elsewhere in the same module):
from os import path
import csv
import yaml
import blaze
import datashape
from dynd import nd, ndt
from blaze import py2help

def load_blaze_array(conf, dir):
    """Loads a blaze array from the catalog configuration and catalog path"""
    # This is a temporary hack, need to transition to using the
    # deferred data descriptors for various formats.
    fsdir = conf.get_fsdir(dir)
    if not path.isfile(fsdir + '.array'):
        raise RuntimeError('Could not find blaze array description file %r' %
                           (fsdir + '.array'))
    with open(fsdir + '.array') as f:
        # safe_load: plain yaml.load requires an explicit Loader in modern PyYAML
        arrmeta = yaml.safe_load(f)
    tp = arrmeta['type']
    imp = arrmeta['import']
    ds_str = arrmeta.get('datashape')  # optional; HDF5 does not need it

    if tp == 'csv':
        with open(fsdir + '.csv', 'r') as f:
            rd = csv.reader(f)
            if imp.get('headers', False):
                # Skip the header line
                next(rd)
            dat = list(rd)
        arr = nd.array(dat, ndt.type(ds_str))[:]
        return blaze.array(arr)
    elif tp == 'json':
        arr = nd.parse_json(ds_str, nd.memmap(fsdir + '.json'))
        return blaze.array(arr)
    elif tp == 'hdf5':
        import tables as tb
        from blaze.datadescriptor import HDF5DataDescriptor
        fname = fsdir + '.h5'  # XXX .h5 assumed for HDF5
        with tb.open_file(fname, 'r') as f:
            dp = imp.get('datapath')  # specifies a path in HDF5
            try:
                # Fetch the node only to verify that the dataset exists
                f.get_node(f.root, dp, 'Leaf')
            except tb.NoSuchNodeError:
                raise RuntimeError('HDF5 file does not have a dataset in %r' %
                                   dp)
            dd = HDF5DataDescriptor(fname, dp)
        return blaze.array(dd)
    elif tp == 'npy':
        import numpy as np
        use_memmap = imp.get('memmap', False)
        if use_memmap:
            arr = np.load(fsdir + '.npy', mmap_mode='r')
        else:
            arr = np.load(fsdir + '.npy')
        arr = nd.array(arr)
        arr = blaze.array(arr)
        ds = datashape.dshape(ds_str)
        if not compatible_array_dshape(arr, ds):
            raise RuntimeError(
                ('NPY file for blaze catalog path %r ' +
                 'has the wrong datashape (%r instead of %r)') %
                (dir, arr.dshape, ds))
        return arr
    elif tp == 'py':
        ds = datashape.dshape(ds_str)
        # The script is run with the following globals,
        # and should put the loaded array in a global
        # called 'result'.
        gbl = {
            'catconf': conf,  # Catalog configuration object
            'impdata': imp,  # Import data from the .array file
            'catpath': dir,  # Catalog path
            'fspath': fsdir,  # Equivalent filesystem path
            'dshape': ds  # Datashape the result should have
        }
        if py2help.PY2:
            execfile(fsdir + '.py', gbl, gbl)
        else:
            with open(fsdir + '.py') as f:
                code = compile(f.read(), fsdir + '.py', 'exec')
                exec(code, gbl, gbl)
        arr = gbl.get('result', None)
        if arr is None:
            raise RuntimeError(
                ('Script for blaze catalog path %r did not ' +
                 'return anything in "result" variable') % (dir))
        elif not isinstance(arr, blaze.Array):
            raise RuntimeError(
                ('Script for blaze catalog path %r returned ' +
                 'wrong type of object (%r instead of blaze.Array)') %
                (dir, type(arr)))
        if not compatible_array_dshape(arr, ds):
            raise RuntimeError(
                ('Script for blaze catalog path %r returned ' +
                 'array with wrong datashape (%r instead of %r)') %
                (dir, arr.dshape, ds))
        return arr
    else:
        raise ValueError(
            ('Unsupported array type %r from ' + 'blaze catalog entry %r') %
            (tp, dir))
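A usage sketch, assuming a hypothetical catalog configuration object whose get_fsdir maps a catalog path to a filesystem prefix:

# conf.get_fsdir('/population') -> '/catalog/population', so the loader reads
# /catalog/population.array and, for type: csv, /catalog/population.csv.
arr = load_blaze_array(conf, '/population')
print(arr.dshape)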