Example #1
def frombinary(path, ext='bin', conf='conf.json', dtype=None, shape=None, skip=0, index=None, labels=None, engine=None, credentials=None):
    """
    Load series data from flat binary files.

    Parameters
    ----------
    path : string URI or local filesystem path
        Location to load from; can be a URI string with scheme
        (e.g. 'file://', 's3n://', or 'gs://'), a single file,
        a directory, or a directory with a single wildcard character.

    ext : str, optional, default = 'bin'
        Optional file extension specifier.

    conf : str, optional, default = 'conf.json'
        Name of conf file with type and size information.

    dtype : dtype or dtype specifier, optional, default = None
        Numerical type of the binary data; if not provided,
        will be taken from the conf file.

    shape : tuple or list, optional, default = None
        Shape of the data if known; if not provided,
        will be taken from the conf file.

    skip : int, optional, default = 0
        Number of items in each record to skip.

    index : array, optional, default = None
        Index for records; if not provided, (0, 1, ...) will be used.

    labels : array, optional, default = None
        Labels for records. If provided, should have shape equal to shape[:-1].

    engine : object, default = None
        Computational engine (e.g. a SparkContext for Spark)

    credentials : dict, default = None
        Credentials for remote storage (e.g. S3) in the form {access: ***, secret: ***}
    """
    shape, dtype = _binaryconfig(path, conf, dtype, shape, credentials)

    from thunder.readers import normalize_scheme, get_parallel_reader
    path = normalize_scheme(path, ext)

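    # each fixed-length record stores `skip` leading items followed by shape[-1] values,
    # so its size in bytes is itemsize * (shape[-1] + skip)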
    from numpy import dtype as dtype_func
    nelements = shape[-1] + skip
    recordsize = dtype_func(dtype).itemsize * nelements

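    # Spark: binaryRecords splits the input into fixed-size byte records; each record is
    # parsed with frombuffer and its first `skip` items dropped (buffer() is Python 2's byte view)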
    if spark and isinstance(engine, spark):
        lines = engine.binaryRecords(path, recordsize)
        raw = lines.map(lambda x: frombuffer(buffer(x), offset=0, count=nelements, dtype=dtype)[skip:])

        def switch(record):
            ary, idx = record
            return (idx,), ary

        rdd = raw.zipWithIndex().map(switch)

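        # for multidimensional series, turn the flat record index into an n-d key over shape[:-1]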
        if shape and len(shape) > 2:
            expand = lambda k: unravel_index(k[0], shape[0:-1])
            rdd = rdd.map(lambda kv: (expand(kv[0]), kv[1]))

        if not index:
            index = arange(shape[-1])

        return fromrdd(rdd, dtype=dtype, shape=shape, index=index)

    else:
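        # local mode: read whole files, then walk each byte buffer in recordsize steps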
        reader = get_parallel_reader(path)(engine, credentials=credentials)
        data = reader.read(path, ext=ext)

        values = []
        for record in data:
            buf = record[1]
            offset = 0
            while offset < len(buf):
                v = frombuffer(buffer(buf), offset=offset, count=nelements, dtype=dtype)
                values.append(v[skip:])
                offset += recordsize

        if not len(values) == prod(shape[0:-1]):
            raise ValueError('Unexpected shape, got %g records but expected %g'
                             % (len(values), prod(shape[0:-1])))

        values = asarray(values, dtype=dtype)

        if shape:
            values = values.reshape(shape)

        return fromarray(values, index=index, labels=labels)
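
A minimal local-mode sketch of calling frombinary (engine=None, so the non-Spark branch above runs). The directory name, file contents, and the (4, 3) shape are illustrative assumptions; the call is written as td.series.frombinary, the import path used in the Thunder documentation.

import os
import numpy as np
import thunder as td

# illustrative directory with one flat binary file: 4 records of 3 float64 values each
os.makedirs('series_bin', exist_ok=True)
np.arange(12, dtype='float64').tofile(os.path.join('series_bin', 'records.bin'))

# dtype and shape are passed explicitly here; a conf.json in the directory
# could supply them instead (see the conf parameter above)
data = td.series.frombinary('series_bin', dtype='float64', shape=(4, 3))

print(data.shape)      # (4, 3)
print(data.toarray())  # values 0..11 as a 4 x 3 array
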
Example #2
def fromtext(path, ext='txt', dtype='float64', skip=0, shape=None, index=None, labels=None, npartitions=None, engine=None, credentials=None):
    """
    Load series data from text files.

    Assumes data are formatted as rows, where each record is a row
    of numbers separated by spaces, e.g. 'v v v v v'. You can
    optionally skip a fixed number of initial items in each row.

    Parameters
    ----------
    path : string
        Location to load from; can be a URI string with scheme
        (e.g. 'file://', 's3n://', or 'gs://'), a single file,
        a directory, or a directory with a single wildcard character.

    ext : str, optional, default = 'txt'
        File extension.

    dtype : dtype or dtype specifier, default 'float64'
        Numerical type to use for data after converting from text.

    skip : int, optional, default = 0
        Number of items in each record to skip.

    shape : tuple or list, optional, default = None
        Shape of data if known, will be inferred otherwise.

    index : array, optional, default = None
        Index for records; if not provided, (0, 1, ...) will be used.

    labels : array, optional, default = None
        Labels for records. If provided, should have length equal to number of rows.

    npartitions : int, default = None
        Number of partitions for parallelization (Spark only)

    engine : object, default = None
        Computational engine (e.g. a SparkContext for Spark)

    credentials : dict, default = None
        Credentials for remote storage (e.g. S3) in the form {access: ***, secret: ***}
    """
    from thunder.readers import normalize_scheme, get_parallel_reader
    path = normalize_scheme(path, ext)

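    # Spark: parse each line into a float vector (dropping the first `skip` items) and key records by line index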
    if spark and isinstance(engine, spark):

        def parse(line, skip):
            vec = [float(x) for x in line.split(' ')]
            return array(vec[skip:], dtype=dtype)

        lines = engine.textFile(path, npartitions)
        data = lines.map(lambda x: parse(x, skip))

        def switch(record):
            ary, idx = record
            return (idx,), ary

        rdd = data.zipWithIndex().map(switch)
        return fromrdd(rdd, dtype=str(dtype), shape=shape, index=index)

    else:
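        # local mode: read whole files, parse them line by line, then drop the first `skip` columns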
        reader = get_parallel_reader(path)(engine, credentials=credentials)
        data = reader.read(path, ext=ext)

        values = []
        for kv in data:
            for line in str(kv[1].decode('utf-8')).split('\n')[:-1]:
                values.append(fromstring(line, sep=' '))
        values = asarray(values)

        if skip > 0:
            values = values[:, skip:]

        if shape:
            values = values.reshape(shape)

        return fromarray(values, index=index, labels=labels)
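
A matching local-mode sketch for fromtext. The directory 'series_txt' and its contents are illustrative assumptions; with no shape argument, the shape is inferred from the parsed rows.

import os
import thunder as td

# illustrative text file: one record per line, values separated by single spaces
os.makedirs('series_txt', exist_ok=True)
with open(os.path.join('series_txt', 'records.txt'), 'w') as f:
    f.write('1 2 3\n4 5 6\n')

data = td.series.fromtext('series_txt')

print(data.shape)      # (2, 3)
print(data.toarray())  # [[1. 2. 3.] [4. 5. 6.]]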