@classmethod
def read_from_text(cls, path, delimiter, nrows, verbose):
    """ Load RDD from a text file. """
    # TODO handle nrows, verbose
    cls._entry(path=path, delimiter=delimiter, nrows=nrows)
    sc = CommonSparkContext.spark_context()
    if delimiter is None:
        # Default record separator: one record per newline.
        rdd = sc.textFile(path)
        res = rdd.map(lambda line: line.encode('utf-8'))
    else:
        # Custom record separator: use the new-API Hadoop text input format,
        # which honors 'textinputformat.record.delimiter'.
        conf = {'textinputformat.record.delimiter': delimiter}
        rdd = sc.newAPIHadoopFile(
            path,
            'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
            'org.apache.hadoop.io.Text',
            'org.apache.hadoop.io.Text',
            conf=conf)

        def fixup_line(line):
            # Records may span newlines; fold each into a single trimmed line.
            return str(line).replace('\n', ' ').strip()

        res = rdd.values().map(fixup_line)
    lineage = Lineage.init_array_lineage(path)
    return XArrayImpl(res, str, lineage)

@classmethod
def load(cls, path):
    """
    Load a model that was saved previously.

    Parameters
    ----------
    path : str
        The path where the model files are stored.
        This is the same path that was passed to ``save``.
        There are three files/directories based on this path, with
        extensions '.model', '.ratings', and '.metadata'.

    Returns
    -------
    out : MatrixFactorizationModel
        A model that can be used to predict ratings.
    """
    sc = CommonSparkContext.Instance().sc()
    model_path, ratings_path, metadata_path = cls._file_paths(path)
    # load model
    model = recommendation.MatrixFactorizationModel.load(sc, model_path)
    # load ratings
    ratings = XFrame.load(ratings_path)
    # load metadata (use fileio.open_file, as save does, so both directions
    # handle non-local paths the same way)
    with fileio.open_file(metadata_path) as f:
        user_col, item_col, rating_col = pickle.load(f)
    return cls(model, ratings, user_col, item_col, rating_col)

def save(self, path):
    """
    Save a model.

    The model can be saved, then reloaded later to provide recommendations.

    Parameters
    ----------
    path : str
        The path where the model will be saved.
        This should refer to a file, not to a directory.
        Three items will be stored here: the underlying model parameters,
        the original ratings, and the column names.  These are stored with
        suffix '.model', '.ratings', and '.metadata'.
    """
    sc = CommonSparkContext.Instance().sc()
    delete_file_or_dir(path)
    os.makedirs(path)
    model_path, ratings_path, metadata_path = self._file_paths(path)
    # save model
    self.model.save(sc, model_path)
    # save ratings
    self.ratings.save(ratings_path)
    # save metadata
    metadata = [self.user_col, self.item_col, self.rating_col]
    with fileio.open_file(metadata_path, 'w') as f:
        # TODO detect filesystem errors
        pickle.dump(metadata, f)

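# A minimal round-trip sketch for the save/load pair above. The names here
# are illustrative: 'model' stands for an instance of whatever recommender
# class these methods belong to, and the path is a placeholder.
def _save_load_roundtrip(model, path='/tmp/reco_model'):
    # Persist parameters, ratings, and column names under 'path' ...
    model.save(path)
    # ... then rebuild an equivalent model from the same path.
    return model.__class__.load(path)
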
@classmethod
def load_from_const(cls, value, size):
    """ Load RDD from const value. """
    cls._entry(value=value, size=size)
    values = [value for _ in xrange(0, size)]
    sc = CommonSparkContext.spark_context()
    return cls(XRdd(sc.parallelize(values)),
               type(value),
               Lineage.init_array_lineage(Lineage.CONST))

@classmethod
def load_from_iterable(cls, values, dtype, ignore_cast_failure):
    """
    Load RDD from values given by iterable.

    Note
    ----
    Values must not only be iterable, but must also support len and __getitem__.

    Modifies the existing RDD: does not return a new XArray.
    """
    cls._entry(dtype=dtype, ignore_cast_failure=ignore_cast_failure)
    dtype = dtype or None
    sc = CommonSparkContext.spark_context()
    try:
        if len(values) == 0:
            # Nothing to infer from: return an empty array with the given dtype.
            return XArrayImpl(XRdd(sc.parallelize([])), dtype)
        # Infer the element type from (up to) the first 100 values if not given.
        dtype = dtype or infer_type_of_list(values[0:100])
    except TypeError:
        # get here if values does not support len or __getitem__
        pass

    if dtype is None:
        # try iterating and see if we get something
        cpy = copy.copy(values)
        for val in cpy:
            dtype = infer_type_of_list([val])
            break

    if dtype is None:
        raise TypeError('Cannot determine types.')

    # noinspection PyShadowingNames
    def do_cast(x, dtype, ignore_cast_failure):
        if is_missing(x):
            return x
        if isinstance(x, str) and dtype is datetime.datetime:
            return date_parser.parse(x)
        if isinstance(x, dtype):
            return x
        try:
            return dtype(x)
        except (ValueError, TypeError):
            # TODO: this does not seem to catch as it should
            # The ValueError class itself serves as a cast-failure sentinel.
            return None if ignore_cast_failure else ValueError

    raw_rdd = XRdd(sc.parallelize(values))
    rdd = raw_rdd.map(lambda x: do_cast(x, dtype, ignore_cast_failure))
    if not ignore_cast_failure:
        # Fail fast if any element carries the failure sentinel.
        errs = len(rdd.filter(lambda x: x is ValueError).take(1)) == 1
        if errs:
            raise ValueError
    return cls(rdd, dtype, Lineage.init_array_lineage(Lineage.PROGRAM))

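# A standalone sketch of the casting rule do_cast applies, assuming
# date_parser is dateutil's parser module (as this code appears to import it).
import datetime
from dateutil import parser as date_parser_demo

def _cast_rule_demo():
    # Strings targeted at datetime go through the date parser ...
    assert isinstance(date_parser_demo.parse('2015-06-01'), datetime.datetime)
    # ... anything else goes through the target type's constructor, so
    # int('12') succeeds while int('abc') raises ValueError, which
    # load_from_iterable turns into None or the ValueError sentinel.
    assert int('12') == 12
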
def create_sequential_xarray(size, start, reverse):
    """
    Create RDD with sequential integer values of given size and starting pos.
    """
    if not reverse:
        stop = start + size
        step = 1
    else:
        stop = start - size
        step = -1
    sc = CommonSparkContext.spark_context()
    rdd = XRdd(sc.parallelize(range(start, stop, step)))
    return XArrayImpl(rdd, int, Lineage.init_array_lineage(Lineage.RANGE))

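# The range arithmetic above, illustrated: forward covers [start, start+size),
# reverse counts down from start over size values.
assert list(range(5, 5 + 3, 1)) == [5, 6, 7]
assert list(range(5, 5 - 3, -1)) == [5, 4, 3]
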
def __init__(self, rdd=None, elem_type=None, lineage=None):
    # The RDD holds all the data for the XArray.
    # The rows must be of a single type.
    # Types permitted include int, long, float, string, list, and dict.
    # We record the element type here.
    self._entry(elem_type=elem_type)
    if rdd is None:
        # No data given: start with an empty RDD.
        sc = CommonSparkContext.spark_context()
        rdd = XRdd(sc.parallelize([]))
    super(XArrayImpl, self).__init__(rdd)
    self.elem_type = elem_type
    self.lineage = lineage or Lineage.init_array_lineage(Lineage.EMPTY)
    self.materialized = False
    self.iter_pos = 0

@classmethod
def load_autodetect(cls, path, dtype):
    """
    Load from the given path.

    This can be anything that spark will read from: local file or HDFS file.
    It can also be a directory, and spark will read and concatenate them all.
    """
    # Read the file as string.
    # Examine the first 100 lines, and cast if necessary to int, float, or datetime.
    cls._entry(path=path, dtype=dtype)
    # If the path is a directory, then look for the sarray-data file in the directory.
    # If the path is a file, look for that file.
    # Use type inference to determine the element type.
    # Passed-in dtype is always str and is ignored.
    lineage = Lineage.init_array_lineage(path)
    sc = CommonSparkContext.spark_context()
    if os.path.isdir(path):
        # A directory holds a previously saved array: pickled data plus
        # metadata (element type) and, optionally, lineage.
        res = XRdd(sc.pickleFile(path))
        metadata_path = os.path.join(path, '_metadata')
        with fileio.open_file(metadata_path) as f:
            dtype = pickle.load(f)
        lineage_path = os.path.join(path, '_lineage')
        if fileio.exists(lineage_path):
            lineage = Lineage.load(lineage_path)
    else:
        # A plain text file: infer the element type, then cast each line.
        res = XRdd(sc.textFile(path, use_unicode=False))
        dtype = infer_type(res)
        if dtype != str:
            if dtype in (list, dict):
                res = res.map(lambda x: ast.literal_eval(x))
            elif dtype is datetime.datetime:
                res = res.map(lambda x: date_parser.parse(x))
            else:
                res = res.map(lambda x: dtype(x))
    return cls(res, dtype, lineage)

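# The post-inference casts above, illustrated: lines holding list or dict
# literals are rebuilt with ast.literal_eval, date strings go through the
# date parser, and everything else through the type constructor.
import ast
assert ast.literal_eval('[1, 2, 3]') == [1, 2, 3]
assert ast.literal_eval("{'a': 1}") == {'a': 1}
assert float('2.5') == 2.5
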
def hive_context():
    return CommonSparkContext.hive_context()


def spark_sql_context():
    return CommonSparkContext.spark_sql_context()

def make_internal_url_simple(url):
    """
    Takes a user input url string and translates it into a url relative to the
    server process.

    - URL to a local location begins with "local://" or has no "*://" modifier.
      If the server is local, returns the absolute path of the url.
      For example: "local:///tmp/foo" -> "/tmp/foo" and
      "./foo" -> os.path.abspath("./foo").
      If the server is not local, raises ValueError.

    - URL to a server location begins with "remote://".
      Returns the absolute path after the "remote://" modifier.
      For example: "remote:///tmp/foo" -> "/tmp/foo".

    - URL to an s3 location begins with "s3://":
      Returns the s3 URL with credentials filled in using
      xframes.aws.get_aws_credential().
      For example: "s3://mybucket/foo" ->
      "s3://$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY:mybucket/foo".

    - URL to other remote locations, e.g. http://, will remain as is.

    - Expands ~ to $HOME.

    Parameters
    ----------
    url : str
        A URL (as described above).

    Returns
    -------
    out : str
        Translated url.

    Raises
    ------
    ValueError
        If a bad url is provided.
    """
    if not url:
        raise ValueError('Invalid url: {}'.format(url))

    # Try to split the url into (protocol, path).
    urlsplit = url.split("://")
    if len(urlsplit) == 2:
        protocol, path = urlsplit
        if not path:
            raise ValueError('Invalid url: {}'.format(url))
        if protocol in ['http', 'https']:
            # protocol is a remote url not on server, just return
            return url
        elif protocol == 'hdfs':
            if not fileio.has_hdfs():
                raise ValueError('HDFS URL is not supported because Hadoop was not found. '
                                 'Please make hadoop available from PATH or set the environment '
                                 'variable HADOOP_HOME and try again.')
            else:
                return url
        elif protocol == 's3':
            if len(path.split(":")) == 3:
                # s3 url already contains secret key/id pair, just return
                return url
            else:
                # s3 url does not contain secret key/id pair, query the environment variables
                # k, v = get_credentials()
                # return 's3n://' + k + ':' + v + '@' + path
                return 's3n://' + path
        elif protocol == 'remote':
            # url for files on the server
            path_on_server = path
        elif protocol == 'local' or protocol == 'file':
            # url for files on the local client: check whether we are connected
            # to a local server by asking the spark context about its master
            sc = CommonSparkContext.spark_context()
            if sc.master.startswith('local'):
                path_on_server = path
            else:
                raise ValueError('Cannot use local URL when connecting to a remote server.')
        else:
            raise ValueError('Invalid url protocol {}. Supported url protocols are: '
                             'remote://, local://, file://, s3://, https:// and hdfs://'.format(protocol))
    elif len(urlsplit) == 1:
        # expand ~ to $HOME
        url = os.path.expanduser(url)
        # A bare path refers to a file on the local client; the check against
        # a remote server has been stubbed out here, so it is always accepted.
        path_on_server = url
    else:
        raise ValueError('Invalid url: {}.'.format(url))

    if path_on_server:
        return os.path.abspath(os.path.expanduser(path_on_server))
    else:
        raise ValueError('Invalid url: {}.'.format(url))

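# The protocol split that drives make_internal_url_simple, illustrated with
# plain string operations (no Spark needed):
assert 'hdfs://host/tmp/foo'.split('://') == ['hdfs', 'host/tmp/foo']
assert './foo'.split('://') == ['./foo']  # no protocol: treated as a local path
# An s3 url already carrying credentials splits into three ':'-separated parts.
assert len('KEY:SECRET:mybucket/foo'.split(':')) == 3
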
def train(self):
    # sc is not used directly; fetching it ensures the shared SparkContext exists.
    sc = CommonSparkContext.Instance().sc()
    rdd = self.corpus.to_spark_rdd()
    model = Word2Vec().setVectorSize(self.vector_size).setSeed(self.seed).fit(rdd)
    return TextModel(model)
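
# A minimal sketch of driving the MLlib Word2Vec trainer directly, assuming a
# live SparkContext 'sc'; the toy corpus and parameter values are illustrative.
from pyspark.mllib.feature import Word2Vec

def _word2vec_demo(sc):
    # fit() expects an RDD of token lists, one list per sentence.
    sentences = sc.parallelize([['spark', 'rdd', 'xframe'],
                                ['spark', 'context', 'rdd']])
    # minCount=1 keeps every token in this tiny vocabulary.
    model = Word2Vec().setVectorSize(10).setSeed(42).setMinCount(1).fit(sentences)
    return model.findSynonyms('spark', 1)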