Code example #1
    def load_from_iterable(cls, values, dtype, ignore_cast_failure):
        """
        Load RDD from values given by iterable.

        Note
        ----
        Values must not only be iterable, but must also support len() and __getitem__.

        Returns a new XArrayImpl built from the given values.
        """
        cls._entry(dtype=dtype, ignore_cast_failure=ignore_cast_failure)
        dtype = dtype or None
        sc = CommonSparkContext.spark_context()
        try:
            if len(values) == 0:
                dtype = dtype or infer_type_of_list(values[0:100])
                return XArrayImpl(XRdd(sc.parallelize([])), dtype)
        except TypeError:
            # get here if values does not support len() or __getitem__
            pass

        if dtype is None:
            # try iterating and see if we get something
            cpy = copy.copy(values)
            for val in cpy:
                dtype = infer_type_of_list([val])
                break

        if dtype is None:
            raise TypeError('Cannot determine types.')

        # noinspection PyShadowingNames
        def do_cast(x, dtype, ignore_cast_failure):
            if is_missing(x):
                return x
            if isinstance(x, str) and dtype is datetime.datetime:
                return date_parser.parse(x)
            if isinstance(x, dtype):
                return x
            try:
                return dtype(x)
            except (ValueError, TypeError):
                # TODO: this does not seem to catch as it should
                return None if ignore_cast_failure else ValueError

        raw_rdd = XRdd(sc.parallelize(values))
        rdd = raw_rdd.map(lambda x: do_cast(x, dtype, ignore_cast_failure))
        if not ignore_cast_failure:
            errs = len(rdd.filter(lambda x: x is ValueError).take(1)) == 1
            if errs:
                raise ValueError

        return cls(rdd, dtype, Lineage.init_array_lineage(Lineage.PROGRAM))
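
As a rough, Spark-free illustration of the cast step above, the sketch below applies the same cast-or-fail rules to a plain Python list. The helper name cast_values is invented for this sketch, the missing-value test is simplified to a None check, and the datetime branch is omitted; it is not the library's code.

def cast_values(values, dtype, ignore_cast_failure):
    # Cast each element to dtype; missing values pass through, and failed casts
    # either become None or raise, depending on ignore_cast_failure.
    out = []
    for x in values:
        if x is None or isinstance(x, dtype):
            out.append(x)
        else:
            try:
                out.append(dtype(x))
            except (ValueError, TypeError):
                if not ignore_cast_failure:
                    raise ValueError('cannot cast {!r} to {}'.format(x, dtype))
                out.append(None)
    return out

# cast_values(['1', '2', 'x'], int, ignore_cast_failure=True)   -> [1, 2, None]
# cast_values(['1', 'x'], int, ignore_cast_failure=False)       -> raises ValueError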
Code example #2
    def __init__(self):
        """
        Create a Spark context.

        The Spark configuration is taken from xframes/config.ini and from
        the values set in SparkInitContext.set(), if that has been called.
        """

        # The import is placed here because importing at module level causes an error on a Spark worker.
        from pyspark import SparkConf, SparkContext, SQLContext, HiveContext
        # This reads from default.ini and then xframes/config.ini
        # if they exist.
        self._env = Environment.create()
        context = create_spark_config(self._env)
        verbose = self._env.get_config('xframes', 'verbose',
                                       'false').lower() == 'true'
        hdfs_user_name = self._env.get_config('webhdfs', 'user', 'hdfs')
        os.environ['HADOOP_USER_NAME'] = hdfs_user_name
        config_pairs = [(k, v) for k, v in context.iteritems()]
        self._config = (SparkConf().setAll(config_pairs))
        if verbose:
            print 'Spark Config: {}'.format(config_pairs)

        self._sc = SparkContext(conf=self._config)
        self._sqlc = SQLContext(self._sc)
        self._hivec = HiveContext(self._sc)
        self.zip_path = []
        version = [int(n) for n in self._sc.version.split('.')]
        self.status_tracker = self._sc.statusTracker()
        if cmp(version, [1, 4, 1]) >= 0:
            self.application_id = self._sc.applicationId
        else:
            self.application_id = None

        if verbose:
            print 'Spark Version: {}'.format(self._sc.version)
            if self.application_id:
                print 'Application Id: {}'.format(self.application_id)

        if not context['spark.master'].startswith('local'):
            zip_path = self.build_zip(get_xframes_home())
            if zip_path:
                self._sc.addPyFile(zip_path)
                self.zip_path.append(zip_path)

        trace_flag = self._env.get_config('xframes', 'rdd-trace',
                                          'false').lower() == 'true'
        XRdd.set_trace(trace_flag)
        atexit.register(self.close_context)
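
A purely illustrative note on the version gate above: Python compares lists and tuples element by element, so the Python 2 cmp(version, [1, 4, 1]) >= 0 test can be read as a plain ordering comparison. The helper below is a sketch with an invented name, not part of the library.

def spark_version_at_least(version_string, minimum=(1, 4, 1)):
    # '1.6.2' -> (1, 6, 2); tuple comparison is lexicographic, matching cmp()
    parts = tuple(int(n) for n in version_string.split('.'))
    return parts >= minimum

# spark_version_at_least('1.6.2')  -> True
# spark_version_at_least('1.3.1')  -> False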
Code example #3
def _wrap_rdd(rdd):
    """Wrap a bare pyspark RDD in an XRdd; pass None and XRdd instances through unchanged."""
    if rdd is None:
        return None
    if isinstance(rdd, RDD):
        return XRdd(rdd)
    if isinstance(rdd, XRdd):
        return rdd
    raise TypeError('Type is not RDD')
Code example #4
    def load_from_const(cls, value, size):
        """
        Load an RDD containing size copies of a constant value.
        """
        cls._entry(value=value, size=size)
        values = [value for _ in xrange(0, size)]
        sc = CommonSparkContext.spark_context()
        return cls(XRdd(sc.parallelize(values)), type(value),
                   Lineage.init_array_lineage(Lineage.CONST))
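
For reference, the values being parallelized above are simply size copies of the constant; a minimal Spark-free sketch (function name invented for illustration):

def constant_values(value, size):
    # The list handed to sc.parallelize above: size copies of the same value.
    return [value for _ in range(size)]

# constant_values(7, 3) -> [7, 7, 7]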
Code example #5
def print_perf():
    """Print any accumulated performance counters for XRdd, XArray, and XFrame to stderr."""
    perf = XRdd.get_perf_count()
    if perf:
        print >> stderr, 'XRDD'
        pprint(perf, stream=stderr)
    perf = XArrayImpl.get_perf_count()
    if perf:
        print >> stderr, 'XArray'
        pprint(perf, stream=stderr)
    perf = XFrameImpl.get_perf_count()
    if perf:
        print >> stderr, 'XFrame'
        pprint(perf, stream=stderr)
Code example #6
def create_sequential_xarray(size, start, reverse):
    """
    Create an RDD of sequential integer values with the given size and starting position.
    """
    if not reverse:
        stop = start + size
        step = 1
    else:
        stop = start - size
        step = -1
    sc = CommonSparkContext.spark_context()
    rdd = XRdd(sc.parallelize(range(start, stop, step)))
    return XArrayImpl(rdd, int, Lineage.init_array_lineage(Lineage.RANGE))
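
The reverse handling above only changes the stop and step handed to range(); a Spark-free sketch of the generated values (function name invented for illustration):

def sequential_values(size, start, reverse=False):
    # Forward: start, start+1, ...; reverse: start, start-1, ... (size elements either way).
    if not reverse:
        return list(range(start, start + size, 1))
    return list(range(start, start - size, -1))

# sequential_values(4, 10)               -> [10, 11, 12, 13]
# sequential_values(4, 10, reverse=True) -> [10, 9, 8, 7]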
Code example #7
    def __init__(self, rdd=None, elem_type=None, lineage=None):
        # The RDD holds all the data for the XArray.
        # The rows must be of a single type.
        # Types permitted include int, long, float, string, list, and dict.
        # We record the element type here.
        self._entry(elem_type=elem_type)
        if rdd is None:
            sc = CommonSparkContext.spark_context()
            rdd = XRdd(sc.parallelize([]))
        super(XArrayImpl, self).__init__(rdd)
        self.elem_type = elem_type
        self.lineage = lineage or Lineage.init_array_lineage(Lineage.EMPTY)
        self.materialized = False
        self.iter_pos = 0
Code example #8
    def load_autodetect(cls, path, dtype):
        """
        Load from the given path.

        The path can be anything that Spark can read: a local file or an HDFS file.
        It can also be a directory, in which case Spark reads and concatenates all the files in it.
        """
        # Read the file as strings.
        # Examine the first 100 lines, and cast if necessary to int, float, or datetime.
        cls._entry(path=path, dtype=dtype)
        # If the path is a directory, look for the sarray-data file in the directory.
        # If the path is a file, look for that file.
        # Use type inference to determine the element type.
        # The passed-in dtype is always str and is ignored.
        lineage = Lineage.init_array_lineage(path)
        sc = CommonSparkContext.spark_context()
        if os.path.isdir(path):
            res = XRdd(sc.pickleFile(path))
            metadata_path = os.path.join(path, '_metadata')
            with fileio.open_file(metadata_path) as f:
                dtype = pickle.load(f)
            lineage_path = os.path.join(path, '_lineage')
            if fileio.exists(lineage_path):
                lineage = Lineage.load(lineage_path)
        else:
            res = XRdd(sc.textFile(path, use_unicode=False))
            dtype = infer_type(res)

        if dtype != str:
            if dtype in (list, dict):
                res = res.map(lambda x: ast.literal_eval(x))
            elif dtype is datetime.datetime:
                res = res.map(lambda x: date_parser.parse(x))
            else:
                res = res.map(lambda x: dtype(x))
        return cls(res, dtype, lineage)
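
As a sketch of the per-element cast applied after type inference above (not the library's code): list and dict values are stored as text and recovered with ast.literal_eval, datetimes are parsed from strings, and every other type is cast with its constructor. The stdlib strptime call below stands in for the dateutil-style parser the snippet uses, and cast_line is an invented name.

import ast
import datetime

def cast_line(line, dtype):
    # Convert one text line to the inferred element type.
    if dtype is str:
        return line
    if dtype in (list, dict):
        return ast.literal_eval(line)
    if dtype is datetime.datetime:
        return datetime.datetime.strptime(line, '%Y-%m-%d')
    return dtype(line)

# cast_line('42', int)                          -> 42
# cast_line('[1, 2, 3]', list)                  -> [1, 2, 3]
# cast_line('2015-06-01', datetime.datetime)    -> datetime.datetime(2015, 6, 1, 0, 0)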
Code example #9
def xrdd_track(enable=True):
    """Enable or disable performance counting in XRdd."""
    XRdd.set_perf_count(enable)