コード例 #1
0
    def load_autodetect(cls, path, dtype):
        """
        Load from the given path, detecting the element type automatically.

        If `path` is a directory, its contents are read with `pickleFile` and the
        element type is unpickled from the `_metadata` file inside that directory.
        Otherwise `path` is read as a text file and the element type is inferred
        from the data by `infer_type`.

        Parameters
        ----------
        path : str
            Directory or file to load from.
            NOTE(review): the directory test and the metadata read use local
            filesystem calls (`os.path.isdir`, `open`) -- confirm whether
            non-local paths (e.g. HDFS directories) are expected to work here.

        dtype : type
            Ignored.  It is always replaced by the stored or the inferred type.

        Returns
        -------
        out : instance of `cls`
            Built from the loaded (and, if needed, converted) data and the
            detected element type.
        """
        cls._entry(path, dtype)
        sc = spark_context()
        if os.path.isdir(path):
            res = XRdd(sc.pickleFile(path))
            metadata_path = os.path.join(path, '_metadata')
            # Pickle data is binary: open in 'rb' so that the load does not depend
            # on text-mode decoding or newline translation.
            # NOTE: unpickling can execute arbitrary code -- only load trusted data.
            with open(metadata_path, 'rb') as f:
                dtype = pickle.load(f)
        else:
            res = XRdd(sc.textFile(path, use_unicode=False))
            dtype = infer_type(res)

        # Elements of any type other than str are converted: containers are
        # parsed as python literals, everything else goes through the type itself.
        if dtype != str:
            if dtype in (list, dict):
                res = res.map(lambda x: ast.literal_eval(x))
            else:
                res = res.map(lambda x: dtype(x))
        cls._exit()
        return cls(res, dtype)
コード例 #2
0
    def load_from_iterable(cls, values, dtype, ignore_cast_failure):
        """
        Create a new instance from the values given by an iterable.

        Parameters
        ----------
        values : iterable
            The values to load.  If it supports `len` and slicing, a missing
            `dtype` is inferred from the first 100 values.  Otherwise (or if that
            attempt raises TypeError) it is inferred from the first value obtained
            by iterating over a shallow copy of `values`.

        dtype : type or None
            The element type to cast to.  A falsy value means "infer it".

        ignore_cast_failure : bool
            If true, values that cannot be cast to `dtype` are replaced by None.
            If false, any such value causes ValueError to be raised.

        Returns
        -------
        out : instance of `cls`
            A new instance holding the cast values.  For empty `values` with a
            working `len`, an `XArrayImpl` over an empty RDD is returned, with
            `dtype` as given (possibly None).

        Raises
        ------
        TypeError
            If the element type cannot be determined.
        ValueError
            If a value cannot be cast and `ignore_cast_failure` is false.
        """
        cls._entry(values, dtype, ignore_cast_failure)
        dtype = dtype or None
        sc = spark_context()
        try:
            if len(values) == 0:
                cls._exit()
                return XArrayImpl(XRdd(sc.parallelize([])), dtype)
            # Infer the type from a leading sample.  This statement must sit
            # outside the empty-input branch above: placed after its return it
            # could never execute.
            dtype = dtype or infer_type_of_list(values[0:100])
        except TypeError:
            # get here if values does not support len or __getitem__
            pass

        if dtype is None:
            # try iterating and see if we get something
            cpy = copy.copy(values)
            for val in cpy:
                dtype = infer_type_of_list([val])
                break

        if dtype is None:
            raise TypeError('Cannot determine types.')

        def do_cast(x, dtype, ignore_cast_failure):
            # Missing values pass through unchanged.
            if is_missing(x):
                return x
            # Exact type match only, so a subclass instance (e.g. a bool when
            # dtype is int) still goes through the conversion below.
            if type(x) == dtype:
                return x
            try:
                return dtype(x)
            except (ValueError, TypeError):
                # The ValueError class itself serves as a failure marker that
                # the check after the map looks for.
                # TODO: this does not seem to catch as it should
                return None if ignore_cast_failure else ValueError

        raw_rdd = XRdd(sc.parallelize(values))
        rdd = raw_rdd.map(lambda x: do_cast(x, dtype, ignore_cast_failure))
        if not ignore_cast_failure:
            # A single marker is enough to fail, so fetch at most one.
            errs = len(rdd.filter(lambda x: x is ValueError).take(1)) == 1
            if errs:
                raise ValueError

        cls._exit()
        return cls(rdd, dtype)