@classmethod
def load_from_iterable(cls, values, dtype, ignore_cast_failure):
    """
    Load RDD from values given by an iterable.

    Note
    ----
    Values must not only be iterable, but must also support len and __getitem__.

    Returns a new XArrayImpl; the input values are not modified.
    """
    cls._entry(dtype=dtype, ignore_cast_failure=ignore_cast_failure)
    dtype = dtype or None
    sc = CommonSparkContext.spark_context()
    try:
        if len(values) == 0:
            dtype = dtype or infer_type_of_list(values[0:100])
            return XArrayImpl(XRdd(sc.parallelize([])), dtype)
    except TypeError:
        # Get here if values does not support len or __getitem__.
        pass

    if dtype is None:
        # Try iterating and infer the type from the first element.
        cpy = copy.copy(values)
        for val in cpy:
            dtype = infer_type_of_list([val])
            break

    if dtype is None:
        raise TypeError('Cannot determine types.')

    # noinspection PyShadowingNames
    def do_cast(x, dtype, ignore_cast_failure):
        if is_missing(x):
            return x
        if isinstance(x, str) and dtype is datetime.datetime:
            return date_parser.parse(x)
        if isinstance(x, dtype):
            return x
        try:
            return dtype(x)
        except (ValueError, TypeError):
            # TODO: this does not seem to catch as it should
            # The ValueError class itself is used as a failed-cast sentinel.
            return None if ignore_cast_failure else ValueError

    raw_rdd = XRdd(sc.parallelize(values))
    rdd = raw_rdd.map(lambda x: do_cast(x, dtype, ignore_cast_failure))
    if not ignore_cast_failure:
        errs = len(rdd.filter(lambda x: x is ValueError).take(1)) == 1
        if errs:
            raise ValueError
    return cls(rdd, dtype, Lineage.init_array_lineage(Lineage.PROGRAM))
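# A minimal, standalone sketch of the cast-and-sentinel pattern used by
# do_cast above (plain lists stand in for the RDD, no Spark needed).  A
# failed cast yields the ValueError class itself, so a later pass can either
# drop failures or raise.  cast_or_sentinel is an illustrative name, not
# part of the library.
def cast_or_sentinel(x, dtype, ignore_cast_failure):
    if x is None:                    # missing values pass through untouched
        return x
    if isinstance(x, dtype):
        return x
    try:
        return dtype(x)
    except (ValueError, TypeError):
        return None if ignore_cast_failure else ValueError

strict = [cast_or_sentinel(v, int, False) for v in ['1', '2', 'oops', None]]
assert any(v is ValueError for v in strict)       # 'oops' could not be cast
lenient = [cast_or_sentinel(v, int, True) for v in ['1', '2', 'oops', None]]
assert lenient == [1, 2, None, None]              # failures become missing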
def __init__(self):
    """
    Create a spark context.

    The spark configuration is taken from xframes/config.ini and from
    the values set in SparkInitContext.set() if this has been called.
    """

    # This is placed here because otherwise it causes an error when used in a spark slave.
    from pyspark import SparkConf, SparkContext, SQLContext, HiveContext
    # This reads from default.ini and then xframes/config.ini, if they exist.
    self._env = Environment.create()
    context = create_spark_config(self._env)
    verbose = self._env.get_config('xframes', 'verbose', 'false').lower() == 'true'
    hdfs_user_name = self._env.get_config('webhdfs', 'user', 'hdfs')
    os.environ['HADOOP_USER_NAME'] = hdfs_user_name
    config_pairs = [(k, v) for k, v in context.iteritems()]
    self._config = SparkConf().setAll(config_pairs)
    if verbose:
        print 'Spark Config: {}'.format(config_pairs)

    self._sc = SparkContext(conf=self._config)
    self._sqlc = SQLContext(self._sc)
    self._hivec = HiveContext(self._sc)
    self.zip_path = []
    version = [int(n) for n in self._sc.version.split('.')]
    self.status_tracker = self._sc.statusTracker()
    # applicationId is only available in Spark 1.4.1 and later.
    if cmp(version, [1, 4, 1]) >= 0:
        self.application_id = self._sc.applicationId
    else:
        self.application_id = None

    if verbose:
        print 'Spark Version: {}'.format(self._sc.version)
        if self.application_id:
            print 'Application Id: {}'.format(self.application_id)

    # When not running against a local master, ship the xframes library
    # to the workers as a zip file.
    if not context['spark.master'].startswith('local'):
        zip_path = self.build_zip(get_xframes_home())
        if zip_path:
            self._sc.addPyFile(zip_path)
            self.zip_path.append(zip_path)

    trace_flag = self._env.get_config('xframes', 'rdd-trace', 'false').lower() == 'true'
    XRdd.set_trace(trace_flag)
    atexit.register(self.close_context)
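# Hedged sketch of the SparkConf pattern used above: key/value pairs from a
# config source are applied in one call with setAll().  Assumes a working
# pyspark installation; the keys and values shown are illustrative, not
# necessarily what create_spark_config() produces.
from pyspark import SparkConf

config_pairs = [('spark.master', 'local[*]'),
                ('spark.app.name', 'xframes-example'),
                ('spark.executor.memory', '2g')]
conf = SparkConf().setAll(config_pairs)
assert conf.get('spark.master') == 'local[*]'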
def _wrap_rdd(rdd):
    if rdd is None:
        return None
    if isinstance(rdd, RDD):
        return XRdd(rdd)
    if isinstance(rdd, XRdd):
        return rdd
    raise TypeError('Type is not RDD')
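# Standalone sketch of the same normalize-or-raise idiom with plain types,
# so it runs without Spark: pass None through, return already-wrapped values
# unchanged, wrap raw values, and reject anything else.  Wrapper and _wrap
# are illustrative names only.
class Wrapper(object):
    def __init__(self, raw):
        self.raw = raw

def _wrap(value, raw_type=list):
    if value is None:
        return None
    if isinstance(value, Wrapper):
        return value
    if isinstance(value, raw_type):
        return Wrapper(value)
    raise TypeError('Type is not {}'.format(raw_type.__name__))

assert _wrap(None) is None
assert _wrap([1, 2]).raw == [1, 2]
assert isinstance(_wrap(Wrapper([3])), Wrapper)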
@classmethod
def load_from_const(cls, value, size):
    """
    Load RDD from a constant value, repeated `size` times.
    """
    cls._entry(value=value, size=size)
    values = [value for _ in xrange(0, size)]
    sc = CommonSparkContext.spark_context()
    return cls(XRdd(sc.parallelize(values)), type(value),
               Lineage.init_array_lineage(Lineage.CONST))
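# Standalone sketch of what load_from_const materializes before handing the
# data to Spark: `size` copies of the value, with the element type taken
# from the value itself (Python 2 xrange, matching the code above).
value, size = 3.14, 4
values = [value for _ in xrange(0, size)]
assert values == [3.14, 3.14, 3.14, 3.14]
assert type(value) is float     # becomes the XArray element type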
def print_perf():
    perf = XRdd.get_perf_count()
    if perf:
        print >> stderr, 'XRDD'
        pprint(perf, stream=stderr)
    perf = XArrayImpl.get_perf_count()
    if perf:
        print >> stderr, 'XArray'
        pprint(perf, stream=stderr)
    perf = XFrameImpl.get_perf_count()
    if perf:
        print >> stderr, 'XFrame'
        pprint(perf, stream=stderr)
def create_sequential_xarray(size, start, reverse):
    """
    Create RDD with sequential integer values of given size and starting pos.
    """
    if not reverse:
        stop = start + size
        step = 1
    else:
        stop = start - size
        step = -1
    sc = CommonSparkContext.spark_context()
    rdd = XRdd(sc.parallelize(range(start, stop, step)))
    return XArrayImpl(rdd, int, Lineage.init_array_lineage(Lineage.RANGE))
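# Standalone sketch of the start/stop/step arithmetic used above (Python 2
# range returns a list, matching the code).  The numbers are illustrative.
size, start = 4, 10
assert range(start, start + size, 1) == [10, 11, 12, 13]    # reverse=False
assert range(start, start - size, -1) == [10, 9, 8, 7]      # reverse=True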
def __init__(self, rdd=None, elem_type=None, lineage=None):
    # The RDD holds all the data for the XArray.
    # The rows must be of a single type.
    # Types permitted include int, long, float, string, list, and dict.
    # We record the element type here.
    self._entry(elem_type=elem_type)
    if rdd is None:
        sc = CommonSparkContext.spark_context()
        rdd = XRdd(sc.parallelize([]))
    super(XArrayImpl, self).__init__(rdd)
    self.elem_type = elem_type
    self.lineage = lineage or Lineage.init_array_lineage(Lineage.EMPTY)
    self.materialized = False
    self.iter_pos = 0
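# Hedged usage sketch of the constructor defaults above (requires a running
# Spark context; how XArrayImpl is imported is not shown in this listing):
#
#     impl = XArrayImpl()            # empty RDD, elem_type None, EMPTY lineage
#     impl = XArrayImpl(rdd, int)    # wrap an existing RDD of ints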
@classmethod
def load_autodetect(cls, path, dtype):
    """
    Load from the given path.

    This can be anything that spark will read from: local file or HDFS file.
    It can also be a directory, and spark will read and concatenate them all.
    """
    # Read the file as string.
    # Examine the first 100 lines, and cast if necessary to int, float, or datetime.
    cls._entry(path=path, dtype=dtype)
    # If the path is a directory, then look for the sarray-data file in the directory.
    # If the path is a file, look for that file.
    # Use type inference to determine the element type.
    # Passed-in dtype is always str and is ignored.
    lineage = Lineage.init_array_lineage(path)
    sc = CommonSparkContext.spark_context()
    if os.path.isdir(path):
        # Saved array: pickled data plus metadata and (optionally) lineage.
        res = XRdd(sc.pickleFile(path))
        metadata_path = os.path.join(path, '_metadata')
        with fileio.open_file(metadata_path) as f:
            dtype = pickle.load(f)
        lineage_path = os.path.join(path, '_lineage')
        if fileio.exists(lineage_path):
            lineage = Lineage.load(lineage_path)
    else:
        # Plain text: read as strings, infer the element type, and cast.
        res = XRdd(sc.textFile(path, use_unicode=False))
        dtype = infer_type(res)
        if dtype != str:
            if dtype in (list, dict):
                res = res.map(lambda x: ast.literal_eval(x))
            elif dtype is datetime.datetime:
                res = res.map(lambda x: date_parser.parse(x))
            else:
                res = res.map(lambda x: dtype(x))
    return cls(res, dtype, lineage)
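# Standalone sketch of the per-type casts applied to text input above, using
# plain strings instead of an RDD.  dateutil's parser is assumed to be the
# date_parser module used in this file.
import ast
import datetime
from dateutil import parser as date_parser

assert ast.literal_eval('[1, 2, 3]') == [1, 2, 3]             # dtype is list
assert ast.literal_eval("{'a': 1}") == {'a': 1}               # dtype is dict
assert date_parser.parse('2015-06-01') == datetime.datetime(2015, 6, 1)
assert int('42') == 42                                        # any other dtype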
def xrdd_track(enable=True):
    XRdd.set_perf_count(enable)
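# Hedged usage sketch combining xrdd_track and print_perf above: enable the
# counters, run a workload, then dump the per-class operation counts to
# stderr.  Assumes both helpers are importable from the same module.
#
#     xrdd_track(True)      # start counting XRdd operations
#     ...                   # run the workload of interest
#     print_perf()          # per-class counts, written to stderr
#     xrdd_track(False)     # stop counting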