def load_autodetect(cls, path, dtype):
    """
    Load from the given path.

    This can be anything that Spark will read from: a local file or an HDFS file.
    It can also be a directory, and Spark will read and concatenate them all.
    """
    # Read the file as string
    # Examine the first 100 lines, and cast if necessary to int or float
    cls._entry(path, dtype)
    # If the path is a directory, then look for sarray-data file in the directory.
    # If the path is a file, look for that file
    # Use type inference to determine the element type.
    # Passed-in dtype is always str and is ignored.
    sc = spark_context()
    if os.path.isdir(path):
        res = XRdd(sc.pickleFile(path))
        metadata_path = os.path.join(path, '_metadata')
        with open(metadata_path) as f:
            dtype = pickle.load(f)
    else:
        res = XRdd(sc.textFile(path, use_unicode=False))
        dtype = infer_type(res)

    if dtype != str:
        if dtype in (list, dict):
            res = res.map(lambda x: ast.literal_eval(x))
        else:
            res = res.map(lambda x: dtype(x))
    cls._exit()
    return cls(res, dtype)
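# --- Illustrative usage sketch (not part of the original source) ---
# Assuming these loaders live on XArrayImpl (the class constructed below) and
# that spark_context() returns a live SparkContext, load_autodetect might be
# driven like this (the path is hypothetical):
#
#     impl = XArrayImpl.load_autodetect('hdfs:///data/values.txt', dtype=str)
#
# For a plain text file the element type is inferred from the rows, so a file
# of lines '1', '2', '3' would come back with dtype int; a directory written
# by a previous save is read with sc.pickleFile, and its pickled '_metadata'
# file supplies the dtype directly, so no inference is performed.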
def load_from_iterable(cls, values, dtype, ignore_cast_failure):
    """
    Load RDD from values given by an iterable.

    Note
    ----
    Values must not only be iterable, but must also support len and __getitem__.

    Modifies the existing RDD: does not return a new XArray.
    """
    cls._entry(values, dtype, ignore_cast_failure)
    dtype = dtype or None
    sc = spark_context()
    try:
        if len(values) == 0:
            cls._exit()
            return XArrayImpl(XRdd(sc.parallelize([])), dtype)
        dtype = dtype or infer_type_of_list(values[0:100])
    except TypeError:
        # get here if values does not support len or __getitem__
        pass

    if dtype is None:
        # try iterating and see if we get something
        cpy = copy.copy(values)
        for val in cpy:
            dtype = infer_type_of_list([val])
            break
    if dtype is None:
        raise TypeError('Cannot determine types.')

    def do_cast(x, dtype, ignore_cast_failure):
        if is_missing(x):
            return x
        if type(x) == dtype:
            return x
        try:
            return dtype(x)
        except (ValueError, TypeError):
            # TODO: this does not seem to catch as it should
            return None if ignore_cast_failure else ValueError

    raw_rdd = XRdd(sc.parallelize(values))
    rdd = raw_rdd.map(lambda x: do_cast(x, dtype, ignore_cast_failure))
    if not ignore_cast_failure:
        errs = len(rdd.filter(lambda x: x is ValueError).take(1)) == 1
        if errs:
            raise ValueError
    cls._exit()
    return cls(rdd, dtype)
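# --- Illustrative usage sketch (not part of the original source) ---
# Assuming XArrayImpl exposes load_from_iterable as shown above, casting
# behaves roughly like this:
#
#     impl = XArrayImpl.load_from_iterable([1, 2, 3], dtype=float,
#                                          ignore_cast_failure=False)
#     # each element is cast with float(x) -> 1.0, 2.0, 3.0
#
#     XArrayImpl.load_from_iterable(['a', 'b'], dtype=int,
#                                   ignore_cast_failure=False)
#     # int('a') raises ValueError inside do_cast; the ValueError *class* is
#     # returned as a sentinel, spotted by the `x is ValueError` filter, and
#     # the call re-raises ValueError.
#
# With ignore_cast_failure=True, failed casts become None (missing values)
# instead of raising.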