Example #1
    def read_from_text(cls, path, delimiter, nrows, verbose):
        """
        Load RDD from a text file
        """
        # TODO handle nrows, verbose
        cls._entry(path=path, delimiter=delimiter, nrows=nrows)
        sc = CommonSparkContext.spark_context()
        if delimiter is None:
            rdd = sc.textFile(path)
            res = rdd.map(lambda line: line.encode('utf-8'))
        else:
            conf = {'textinputformat.record.delimiter': delimiter}
            rdd = sc.newAPIHadoopFile(
                path,
                "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
                "org.apache.hadoop.io.Text",
                "org.apache.hadoop.io.Text",
                conf=conf)

            def fixup_line(line):
                return str(line).replace('\n', ' ').strip()

            res = rdd.values().map(lambda line: fixup_line(line))
        lineage = Lineage.init_array_lineage(path)
        return XArrayImpl(res, str, lineage)
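The custom-delimiter branch above relies on Hadoop's textinputformat.record.delimiter setting instead of Spark's default newline splitting. A minimal standalone sketch of the same idea in plain PySpark, with an illustrative path and delimiter; TextInputFormat yields (byte offset, text) pairs, which is why only the values are kept:

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
conf = {'textinputformat.record.delimiter': '|'}       # illustrative delimiter
pairs = sc.newAPIHadoopFile(
    'data/records.txt',                                # hypothetical path
    'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    'org.apache.hadoop.io.LongWritable',               # key: byte offset of each record
    'org.apache.hadoop.io.Text',                       # value: the record text
    conf=conf)
records = pairs.values().map(lambda rec: str(rec).replace('\n', ' ').strip())
print(records.take(5))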
Example #2
    def load(cls, path):
        """
        Load a model that was saved previously.

        Parameters
        ----------
        path : str
            The path where the model files are stored.
            This is the same path that was passed to ``save``.
            There are three files/directories based on this path, with
            extensions '.model', '.ratings', and '.metadata'.

        Returns
        -------
        out : MatrixFactorizationModel
            A model that can be used to predict ratings.
        """
        sc = CommonSparkContext.Instance().sc()
        model_path, ratings_path, metadata_path = cls._file_paths(path)
        # load model
        model = recommendation.MatrixFactorizationModel.load(sc, model_path)
        # load ratings
        ratings = XFrame.load(ratings_path)
        # load metadata
        with open(metadata_path) as f:
            user_col, item_col, rating_col = pickle.load(f)

        return cls(model, ratings, user_col, item_col, rating_col)
Example #3
    def save(self, path):
        """
        Save a model.

        The model can be saved, then reloaded later to provide recommendations.

        Parameters
        ----------
        path : str
            The path where the model will be saved.
            This should refer to a file, not to a directory.
            Three items will be stored here: the underlying model parameters, the original ratings,
            and the column names.  These are stored with suffix '.model', '.ratings', and
            '.metadata'.
        """
        sc = CommonSparkContext.Instance().sc()
        delete_file_or_dir(path)
        os.makedirs(path)
        model_path, ratings_path, metadata_path = self._file_paths(path)
        # save model
        self.model.save(sc, model_path)
        # save ratings
        self.ratings.save(ratings_path)
        # save metadata
        metadata = [self.user_col, self.item_col, self.rating_col]
        with fileio.open_file(metadata_path, 'w') as f:
            # TODO detect filesystem errors
            pickle.dump(metadata, f)
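Taken together, save and load give a simple round trip. A hedged usage sketch, where RecommenderModel stands in for the enclosing class (which is not named in this excerpt) and the path is illustrative:

# Persist the trained recommender, then restore it in a later session.
# 'models/als_recommender' gains the '.model', '.ratings', and '.metadata'
# pieces described in the docstrings above.
model.save('models/als_recommender')
restored = RecommenderModel.load('models/als_recommender')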
Example #4
 def load_from_const(cls, value, size):
     """
     Load RDD from const value.
     """
     cls._entry(value=value, size=size)
     values = [value for _ in xrange(0, size)]
     sc = CommonSparkContext.spark_context()
     return cls(XRdd(sc.parallelize(values)), type(value), Lineage.init_array_lineage(Lineage.CONST))
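The constant array is just a parallelized Python list; an equivalent plain-PySpark sketch with an illustrative value and size:

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
value, size = 1.0, 4                      # illustrative
rdd = sc.parallelize([value] * size)      # the same data load_from_const builds
print(rdd.collect())                      # [1.0, 1.0, 1.0, 1.0]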
Example #6
    def load_from_iterable(cls, values, dtype, ignore_cast_failure):
        """
        Load RDD from values given by iterable.

        Note
        ----
        Values must not only be iterable, but must also support len and __getitem__

        Modifies the existing RDD: does not return a new XArray.
        """
        cls._entry(dtype=dtype, ignore_cast_failure=ignore_cast_failure)
        dtype = dtype or None
        sc = CommonSparkContext.spark_context()
        try:
            if len(values) == 0:
                dtype = dtype or infer_type_of_list(values[0:100])
                return XArrayImpl(XRdd(sc.parallelize([])), dtype)
        except TypeError:
            # get here if values does not support len or __getitem
            pass

        if dtype is None:
            # try iterating and see if we get something
            cpy = copy.copy(values)
            for val in cpy:
                dtype = infer_type_of_list([val])
                break

        if dtype is None:
            raise TypeError('Cannot determine types.')

        # noinspection PyShadowingNames
        def do_cast(x, dtype, ignore_cast_failure):
            if is_missing(x):
                return x
            if isinstance(x, str) and dtype is datetime.datetime:
                return date_parser.parse(x)
            if isinstance(x, dtype):
                return x
            try:
                return dtype(x)
            except (ValueError, TypeError):
                # TODO: this does not seem to catch as it should
                return None if ignore_cast_failure else ValueError

        raw_rdd = XRdd(sc.parallelize(values))
        rdd = raw_rdd.map(lambda x: do_cast(x, dtype, ignore_cast_failure))
        if not ignore_cast_failure:
            errs = len(rdd.filter(lambda x: x is ValueError).take(1)) == 1
            if errs:
                raise ValueError

        return cls(rdd, dtype, Lineage.init_array_lineage(Lineage.PROGRAM))
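The cast step uses the ValueError class itself as a failure sentinel so that a single filter/take(1) pass can detect bad values without collecting the whole RDD. A standalone sketch of that pattern with illustrative data:

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
values = ['1', '2', 'three', '4']         # illustrative input

def cast_or_flag(x, dtype=int):
    try:
        return dtype(x)
    except (ValueError, TypeError):
        return ValueError                 # the class object marks a failed cast

casted = sc.parallelize(values).map(cast_or_flag)
if len(casted.filter(lambda x: x is ValueError).take(1)) == 1:
    print('at least one value could not be cast to int')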
Example #8
 def create_sequential_xarray(size, start, reverse):
     """
     Create RDD with sequential integer values of given size and starting pos.
     """
     if not reverse:
         stop = start + size
         step = 1
     else:
         stop = start - size
         step = -1
     sc = CommonSparkContext.spark_context()
     rdd = XRdd(sc.parallelize(range(start, stop, step)))
     return XArrayImpl(rdd, int, Lineage.init_array_lineage(Lineage.RANGE))
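For reference, the forward and reverse cases reduce to ordinary range arithmetic (size and start are illustrative):

print(list(range(10, 10 + 4, 1)))     # forward: [10, 11, 12, 13]
print(list(range(10, 10 - 4, -1)))    # reverse: [10, 9, 8, 7]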
Example #10
 def __init__(self, rdd=None, elem_type=None, lineage=None):
     # The RDD holds all the data for the XArray.
     # The rows must be of a single type.
     # Types permitted include int, long, float, string, list, and dict.
     # We record the element type here.
     self._entry(elem_type=elem_type)
     if rdd is None:
         sc = CommonSparkContext.spark_context()
         rdd = XRdd(sc.parallelize([]))
     super(XArrayImpl, self).__init__(rdd)
     self.elem_type = elem_type
     self.lineage = lineage or Lineage.init_array_lineage(Lineage.EMPTY)
     self.materialized = False
     self.iter_pos = 0
Example #13
    def load_autodetect(cls, path, dtype):
        """
        Load from the given path.

        This can be anything that spark will read from: local file or HDFS file.
        It can also be a directory, and spark will read and concatenate them all.
        """
        # Read the file as string
        # Examine the first 100 lines, and cast if necessary to int, float, or datetime
        cls._entry(path=path, dtype=dtype)
        # If the path is a directory, then look for sarray-data file in the directory.
        # If the path is a file, look for that file
        # Use type inference to determine the element type.
        # Passed-in dtype is always str and is ignored.
        lineage = Lineage.init_array_lineage(path)
        sc = CommonSparkContext.spark_context()
        if os.path.isdir(path):
            res = XRdd(sc.pickleFile(path))
            metadata_path = os.path.join(path, '_metadata')
            with fileio.open_file(metadata_path) as f:
                dtype = pickle.load(f)
            lineage_path = os.path.join(path, '_lineage')
            if fileio.exists(lineage_path):
                lineage = Lineage.load(lineage_path)
        else:
            res = XRdd(sc.textFile(path, use_unicode=False))
            dtype = infer_type(res)

        if dtype != str:
            if dtype in (list, dict):
                res = res.map(lambda x: ast.literal_eval(x))
            elif dtype is datetime.datetime:
                res = res.map(lambda x: date_parser.parse(x))
            else:
                res = res.map(lambda x: dtype(x))
        return cls(res, dtype, lineage)
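A standalone sketch of the per-type parsing applied after the text read above, with illustrative values; date_parser is assumed to be dateutil's parser, as the call pattern suggests:

import ast
from dateutil import parser as date_parser

raw = ['[1, 2, 3]', "{'a': 1}", '2015-03-01 12:00:00', '42']
print(ast.literal_eval(raw[0]))       # [1, 2, 3]  (list)
print(ast.literal_eval(raw[1]))       # {'a': 1}   (dict)
print(date_parser.parse(raw[2]))      # datetime.datetime(2015, 3, 1, 12, 0)
print(int(raw[3]))                    # 42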
Example #15
 def hive_context():
     return CommonSparkContext.hive_context()
Example #16
 def spark_sql_context():
     return CommonSparkContext.spark_sql_context()
Example #17
def make_internal_url_simple(url):
    """
    Takes a user-input URL string and translates it into a URL relative to the server process.
    - URL to a local location begins with "local://" or has no "*://" modifier.
      If the server is local, returns the absolute path of the url.
      For example: "local:///tmp/foo" -> "/tmp/foo" and "./foo" -> os.path.abspath("./foo").
      If the server is not local, raise NotImplementedError.
    - URL to a server location begins with "remote://".
      Returns the absolute path after the "remote://" modifier.
      For example: "remote:///tmp/foo" -> "/tmp/foo".
    - URL to an s3 location begins with "s3://":
      Returns the s3 URL with credentials filled in using xframes.aws.get_aws_credential().
      For example: "s3://mybucket/foo" -> "s3://$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY:mybucket/foo".
    - URL to other remote locations, e.g. http://, will remain as is.
    - Expands ~ to $HOME

    Parameters
    ----------
    url : str
        A URL (as described above).

    Returns
    -------
    out : str
        Translated url.

    Raises
    ------
    ValueError
        If a bad url is provided.
    """
    if not url:
        raise ValueError('Invalid url: {}'.format(url))

    # Try to split the url into (protocol, path).
    urlsplit = url.split("://")
    if len(urlsplit) == 2:
        protocol, path = urlsplit
        if not path:
            raise ValueError('Invalid url: {}'.format(url))
        if protocol in ['http', 'https']:
            # protocol is a remote url not on server, just return
            return url
        elif protocol == 'hdfs':
            if not fileio.has_hdfs():
                raise ValueError(
                    'HDFS URL is not supported because Hadoop not found. '
                    'Please make hadoop available from PATH or set the environment variable '
                    'HADOOP_HOME and try again.')
            else:
                return url
        elif protocol == 's3':
            if len(path.split(":")) == 3:
                # s3 url already contains secret key/id pairs, just return
                return url
            else:
                # s3 url does not contain secret key/id pair, query the environment variables
                # k, v = get_credentials()
                # return 's3n://' + k + ':' + v + '@' + path
                return 's3n://' + path
        elif protocol == 'remote':
            # url for files on the server
            path_on_server = path
        elif protocol == 'local' or protocol == 'file':
            # url for files on local client, check if we are connecting to local server
            #
            # get spark context, get master, see if it starts with local
            sc = CommonSparkContext.spark_context()
            if sc.master.startswith('local'):
                path_on_server = path
            else:
                raise ValueError(
                    'Cannot use local URL when connecting to a remote server.')
        else:
            raise ValueError(
                'Invalid url protocol {}. Supported url protocols are: '
                'remote://, local://, file://, s3://, http://, https:// and hdfs://'.
                format(protocol))
    elif len(urlsplit) == 1:
        # expand ~ to $HOME
        url = os.path.expanduser(url)
        # url for files on the local client; treat it as a path on the server
        path_on_server = url
    else:
        raise ValueError('Invalid url: {}.'.format(url))

    if path_on_server:
        return os.path.abspath(os.path.expanduser(path_on_server))
    else:
        raise ValueError('Invalid url: {}.'.format(url))
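Illustrative translations, following the docstring and branches above:

print(make_internal_url_simple('~/data/foo.csv'))      # absolute path under $HOME
print(make_internal_url_simple('remote:///tmp/foo'))   # '/tmp/foo'
print(make_internal_url_simple('s3://mybucket/foo'))   # 's3n://mybucket/foo'
print(make_internal_url_simple('https://host/file'))   # returned unchanged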
Example #18
 def train(self):
     sc = CommonSparkContext.Instance().sc()
     rdd = self.corpus.to_spark_rdd()
     model = Word2Vec().setVectorSize(self.vector_size).setSeed(
         self.seed).fit(rdd)
     return TextModel(model)
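The underlying MLlib call can be exercised on its own. A minimal sketch with an illustrative corpus, where each element is a list of tokens and setMinCount(1) keeps the tiny vocabulary from being filtered out:

from pyspark import SparkContext
from pyspark.mllib.feature import Word2Vec

sc = SparkContext.getOrCreate()
corpus = sc.parallelize([['spark', 'rdd', 'xframe'],
                         ['spark', 'dataframe', 'xframe']])
model = Word2Vec().setVectorSize(10).setSeed(42).setMinCount(1).fit(corpus)
print(model.findSynonyms('spark', 1))     # nearest token and its similarity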