def frombinary(path, ext='bin', conf='conf.json', dtype=None, shape=None, skip=0, index=None, labels=None, engine=None, credentials=None):
    """
    Load series data from flat binary files.

    Parameters
    ----------
    path : string URI or local filesystem path
        Directory to load from, can be a URI string with scheme
        (e.g. 'file://', 's3n://', or 'gs://'), or a single file,
        or a directory, or a directory with a single wildcard character.

    ext : str, optional, default = 'bin'
        Optional file extension specifier.

    conf : str, optional, default = 'conf.json'
        Name of conf file with type and size information.

    dtype : dtype or dtype specifier, optional, default = None
        Numerical type of the stored values. Resolved together with
        ``shape`` by ``_binaryconfig`` before any data are read.

    shape : tuple or list, optional, default = None
        Shape of data if known. Resolved together with ``dtype`` by
        ``_binaryconfig``; the last entry is the number of values kept
        per record.

    skip : int, optional, default = 0
        Number of leading items in each record to skip. Every record on
        disk is read as ``shape[-1] + skip`` items.

    index : array, optional, default = None
        Index for records. With a Spark engine a missing index defaults
        to ``arange(shape[-1])``; otherwise it is passed through as given.

    labels : array, optional, default = None
        Labels for records. If provided, should have shape of shape[:-1].
        Only forwarded on the non-Spark path.

    engine : object, default = None
        Computational engine (e.g. a SparkContext for Spark)

    credentials : dict, default = None
        Credentials for remote storage (e.g. S3) in the form {access: ***, secret: ***}

    Returns
    -------
    The result of ``fromrdd`` (Spark engine) or ``fromarray`` (otherwise).

    Raises
    ------
    ValueError
        On the non-Spark path, if the number of records read differs
        from ``prod(shape[:-1])``.
    """
    shape, dtype = _binaryconfig(path, conf, dtype, shape, credentials)

    from thunder.readers import normalize_scheme, get_parallel_reader
    path = normalize_scheme(path, ext)

    from numpy import dtype as dtype_func
    # Each on-disk record holds the kept values plus the skipped prefix.
    nelements = shape[-1] + skip
    recordsize = dtype_func(dtype).itemsize * nelements

    if spark and isinstance(engine, spark):
        lines = engine.binaryRecords(path, recordsize)
        raw = lines.map(lambda x: frombuffer(buffer(x), offset=0, count=nelements, dtype=dtype)[skip:])

        def switch(record):
            # zipWithIndex yields (value, position); re-key as ((position,), value).
            ary, idx = record
            return (idx,), ary

        rdd = raw.zipWithIndex().map(switch)

        if shape and len(shape) > 2:
            # Turn the flat record position into a multi-dimensional key.
            expand = lambda k: unravel_index(k[0], shape[0:-1])
            rdd = rdd.map(lambda kv: (expand(kv[0]), kv[1]))

        # Compare against None explicitly: truth-testing a NumPy array with
        # more than one element raises ValueError, and a one-element array
        # holding 0 would wrongly be treated as "no index given".
        if index is None:
            index = arange(shape[-1])

        return fromrdd(rdd, dtype=dtype, shape=shape, index=index)

    else:
        reader = get_parallel_reader(path)(engine, credentials=credentials)
        data = reader.read(path, ext=ext)

        values = []
        for record in data:
            buf = record[1]
            view = buffer(buf)
            offset = 0
            # Walk the file contents one fixed-size record at a time.
            while offset < len(buf):
                v = frombuffer(view, offset=offset, count=nelements, dtype=dtype)
                values.append(v[skip:])
                offset += recordsize

        expected = prod(shape[0:-1])
        if not len(values) == expected:
            raise ValueError('Unexpected shape, got %g records but expected %g' % (len(values), expected))

        values = asarray(values, dtype=dtype)

        if shape:
            values = values.reshape(shape)

        return fromarray(values, index=index, labels=labels)
def fromtext(path, ext='txt', dtype='float64', skip=0, shape=None, index=None, labels=None, npartitions=None, engine=None, credentials=None):
    """
    Loads series data from text files.

    Assumes data are formatted as rows, where each record is a row
    of numbers separated by spaces e.g. 'v v v v v'. You can
    optionally specify a fixed number of initial items per row to skip / discard.

    Parameters
    ----------
    path : string
        Directory to load from, can be a URI string with scheme
        (e.g. 'file://', 's3n://', or 'gs://'), or a single file,
        or a directory, or a directory with a single wildcard character.

    ext : str, optional, default = 'txt'
        File extension.

    dtype : dtype or dtype specifier, default 'float64'
        Numerical type to use for data after converting from text.

    skip : int, optional, default = 0
        Number of leading items in each record to skip.

    shape : tuple or list, optional, default = None
        Shape of data if known, will be inferred otherwise.

    index : array, optional, default = None
        Index for records, if not provided will use (0, 1, ...)

    labels : array, optional, default = None
        Labels for records. If provided, should have length equal to number of rows.
        Only forwarded on the non-Spark path.

    npartitions : int, default = None
        Number of partitions for parallelization (Spark only)

    engine : object, default = None
        Computational engine (e.g. a SparkContext for Spark)

    credentials : dict, default = None
        Credentials for remote storage (e.g. S3) in the form {access: ***, secret: ***}

    Returns
    -------
    The result of ``fromrdd`` (Spark engine) or ``fromarray`` (otherwise).
    """
    from thunder.readers import normalize_scheme, get_parallel_reader
    path = normalize_scheme(path, ext)

    if spark and isinstance(engine, spark):

        def parse(line, skip):
            vec = [float(x) for x in line.split(' ')]
            return array(vec[skip:], dtype=dtype)

        lines = engine.textFile(path, npartitions)
        data = lines.map(lambda x: parse(x, skip))

        def switch(record):
            # zipWithIndex yields (value, position); re-key as ((position,), value).
            ary, idx = record
            return (idx,), ary

        rdd = data.zipWithIndex().map(switch)
        return fromrdd(rdd, dtype=str(dtype), shape=shape, index=index)

    else:
        reader = get_parallel_reader(path)(engine, credentials=credentials)
        data = reader.read(path, ext=ext)

        values = []
        for kv in data:
            text = str(kv[1].decode('utf-8'))
            for line in text.split('\n'):
                # Skip blank lines rather than unconditionally dropping the
                # final chunk: a file without a trailing newline would
                # otherwise silently lose its last record.
                if line.strip():
                    values.append(fromstring(line, sep=' '))
        # Honour the requested dtype here as well, as the Spark path does.
        values = asarray(values, dtype=dtype)

        if skip > 0:
            values = values[:, skip:]

        if shape:
            values = values.reshape(shape)

        return fromarray(values, index=index, labels=labels)
def frompath(path, accessor=None, ext=None, start=None, stop=None, recursive=False, npartitions=None, dims=None, dtype=None, labels=None, recount=False, engine=None, credentials=None):
    """
    Load images from a path using the given accessor.

    Supports both local and remote filesystems.

    Parameters
    ----------
    path : str
        Location handed to the parallel reader.

    accessor : function
        Apply to each item after loading to yield an image.

    ext : str, optional, default=None
        File extension.

    npartitions : int, optional, default=None
        Number of partitions for computational engine,
        if None will use default for engine.

    dims : tuple, optional, default=None
        Dimensions of images (forwarded on the Spark path only).

    dtype : str, optional, default=None
        Numerical type of images (forwarded on the Spark path only).

    labels : array, optional, default = None
        Labels for records. If provided, should be one-dimensional.

    start, stop : nonnegative int, optional, default=None
        Indices of files to load, interpreted using Python slicing conventions.

    recursive : boolean, optional, default=False
        If true, will recursively descend directories from path, loading all files
        with an extension matching 'ext'.

    recount : boolean, optional, default=False
        Force subsequent record counting.

    engine : object, optional, default=None
        Computational engine; passed to the reader constructor.

    credentials : dict, optional, default=None
        Credentials passed to the reader constructor.
    """
    # NOTE(review): a second `frompath` with the same body appears later in
    # this file and would rebind this name at import time.
    from thunder.readers import get_parallel_reader

    reader_cls = get_parallel_reader(path)
    reader = reader_cls(engine, credentials=credentials)
    data = reader.read(
        path,
        ext=ext,
        start=start,
        stop=stop,
        recursive=recursive,
        npartitions=npartitions,
    )

    distributed = spark and isinstance(engine, spark)

    if not distributed:
        # Local path: apply the accessor eagerly, flatten, keep the values.
        if accessor:
            data = [accessor(d) for d in data]
        values = [kv[1] for kv in itertools.chain(*data)]
        return fromarray(values, labels=labels)

    if accessor:
        data = data.flatMap(accessor)

    if not recount:
        nrecords = reader.nfiles
    else:
        # Leave the count unknown and re-key every record by its position.
        nrecords = None

        def rekey(pair):
            image, position = pair
            return (position,), image

        data = data.values().zipWithIndex().map(rekey)

    return fromrdd(data, nrecords=nrecords, dims=dims, dtype=dtype, labels=labels, ordered=True)
def frompath(path, accessor=None, ext=None, start=None, stop=None, recursive=False, npartitions=None, dims=None, dtype=None, labels=None, recount=False, engine=None, credentials=None):
    """
    Load images from a path using the given accessor.

    Supports both local and remote filesystems.

    Parameters
    ----------
    path : str
        Location handed to the parallel reader.

    accessor : function
        Apply to each item after loading to yield an image.

    ext : str, optional, default=None
        File extension.

    npartitions : int, optional, default=None
        Number of partitions for computational engine,
        if None will use default for engine.

    dims : tuple, optional, default=None
        Dimensions of images (forwarded on the Spark path only).

    dtype : str, optional, default=None
        Numerical type of images (forwarded on the Spark path only).

    labels : array, optional, default = None
        Labels for records. If provided, should be one-dimensional.

    start, stop : nonnegative int, optional, default=None
        Indices of files to load, interpreted using Python slicing conventions.

    recursive : boolean, optional, default=False
        If true, will recursively descend directories from path, loading all files
        with an extension matching 'ext'.

    recount : boolean, optional, default=False
        Force subsequent record counting.

    engine : object, optional, default=None
        Computational engine; passed to the reader constructor.

    credentials : dict, optional, default=None
        Credentials passed to the reader constructor.
    """
    # NOTE(review): an earlier `frompath` with the same body appears in this
    # file; this definition would rebind that name at import time.
    from thunder.readers import get_parallel_reader

    reader = get_parallel_reader(path)(engine, credentials=credentials)
    read_options = dict(ext=ext, start=start, stop=stop,
                        recursive=recursive, npartitions=npartitions)
    data = reader.read(path, **read_options)

    if spark and isinstance(engine, spark):
        images = data.flatMap(accessor) if accessor else data

        if recount:
            nrecords = None

            def by_position(entry):
                # zipWithIndex yields (value, position); key by (position,).
                value, position = entry
                return (position,), value

            indexed = images.values().zipWithIndex()
            images = indexed.map(by_position)
        else:
            nrecords = reader.nfiles

        return fromrdd(images, nrecords=nrecords, dims=dims, dtype=dtype, labels=labels, ordered=True)

    items = [accessor(item) for item in data] if accessor else data
    pairs = list(itertools.chain(*items))
    values = [pair[1] for pair in pairs]
    return fromarray(values, labels=labels)
IMG_PATH ='/images/b2' MODEL_PATH='/k_model' FEATURE_PATH = '/xtract_feature' SERVER = "127.0.0.1" U_NAME = "***********" PASSWORD = "******" LOCAL_PATH = "/home/amit/A1/b2" sc = spark.sparkContext sqlContext = SQLContext(sc) #data=images.frompng('/home/amit/A1',npartitions=8, engine=sc) from thunder.readers import get_parallel_reader, FileNotFoundError reader = get_parallel_reader(IMG_PATH)(sc) #data = reader.read(IMG_PATH, recursive=True, npartitions=8) from scipy.misc import imread ssh = paramiko.SSHClient() ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) ssh.connect(SERVER, username=U_NAME, password=PASSWORD) ftp = ssh.open_sftp() def readlocal(path, offset=None, size=-1): """ Wrapper around open(path, 'rb') that returns the contents of the file as a string. Will rethrow FileNotFoundError if it receives an IOError. """ #print(path)