Beispiel #1
0
 def _checkpointFile(self, name, input_deserializer):
     jrdd = self._jsc.checkpointFile(name)
     return RDD(jrdd, self, input_deserializer)
Beispiel #2
0
def import_csv_raw(path, delimiter=",", header=False, tc=TkContext.implicit):
    """
    Creates a frame by importing the data as strings from the specified csv file.  If the csv file has a header row,
    those values will be used as column names.  Otherwise, columns will be named generically, like 'C0', 'C1', 'C2', etc.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (str) A string which indicates the separation of data fields.  This is usually a single character
                      and could be a non-visible character, such as a tab. The default delimiter is a comma (,).
    :param header: (bool) Boolean value indicating if the first line of the file will be used to name columns, and not
                   be included in the data.  The default value is false.
    :return: (Frame) Frame that contains the data from the csv file

    Examples
    --------

    Import raw data from a csv file by specifying the path to the file, delimiter, and header option.  All data will
    be brought in the frame as strings, and columns will be named according to the header row, if there was one.

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv_raw(file_path, delimiter="|", header=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]  1     Portland     609456           583776           4.40%   Multnomah
        [1]  2     Salem        160614           154637           3.87%   Marion
        [2]  3     Eugene       159190           156185           1.92%   Lane
        [3]  4     Gresham      109397           105594           3.60%   Multnomah
        [4]  5     Hillsboro    97368            91611            6.28%   Washington
        [5]  6     Beaverton    93542            89803            4.16%   Washington
        [6]  15    Grants Pass  35076            34533            1.57%   Josephine
        [7]  16    Oregon City  34622            31859            8.67%   Clackamas
        [8]  17    McMinnville  33131            32187            2.93%   Yamhill
        [9]  18    Redmond      27427            26215            4.62%   Deschutes

        >>> frame.schema
        [('rank', <type 'str'>), ('city', <type 'str'>), ('population_2013', <type 'str'>), ('population_2010', <type 'str'>), ('change', <type 'str'>), ('county', <type 'str'>)]


    """
    TkContext.validate(tc)
    require_type.non_empty_str(path, "path")
    require_type.non_empty_str(delimiter, "delimiter")
    require_type(bool, header, "header")

    df = tc.sql_context.read.format(
        "com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
            delimiter=delimiter,
            header=str(header).lower(),
            inferschema="false").load(path, schema=None)

    df_schema = []

    for column in df.schema.fields:
        try:
            datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(
                type(column.dataType))
        except ValueError:
            raise TypeError(
                "Unsupported data type ({0}) for column {1}.".format(
                    str(column.dataType), column.name))
        df_schema.append((column.name, datatype))

    jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(
        df._jdf.rdd())
    rdd = RDD(jrdd, tc.sc)

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, rdd, df_schema)
Beispiel #3
0
    def emptyRDD(self):
        """
        Create an RDD that has no partitions or elements.
		创建一个没有分区或元素的RDD。
        """
        return RDD(self._jsc.emptyRDD(), self, NoOpSerializer())
Beispiel #4
0
def import_csv(path,
               delimiter=",",
               header=False,
               inferschema=True,
               schema=None,
               tc=implicit):
    """
    Creates a frame with data from a csv file.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (Optional[str]) A string which indicates the separation of data fields.  This is usually a
                      single character and could be a non-visible character, such as a tab. The default delimiter
                      is a comma (,).
    :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns,
                   and not be included in the data.  The default value is false.
    :param inferschema:(Optional[bool]) Boolean value indicating if the column types will be automatically inferred.
                       It requires one extra pass over the data and is false by default.
    :param: schema: (Optional[List[tuple(str, type)]]) Optionally specify the schema for the dataset.  Number of
                    columns specified in the schema must match the number of columns in the csv file provided.
    :return: (Frame) Frame that contains the data from the csv file

    Examples
    --------
    Load a frame from a csv file by specifying the path to the file, delimiter, and options that specify that
    there is a header and to infer the schema based on the data.

        >>> file_path = "../integration-tests/datasets/cities.csv"

        >>> frame = tc.load_frame_from_csv(file_path, "|", header=True, inferschema=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]    15  Grants Pass            35076            34533  1.57%   Josephine
        [7]    16  Oregon City            34622            31859  8.67%   Clackamas
        [8]    17  McMinnville            33131            32187  2.93%   Yamhill
        [9]    18  Redmond                27427            26215  4.62%   Deschutes

        >>> frame.schema
        [('rank', int),
         ('city', str),
         ('population_2013', int),
         ('population_2010', int),
         ('change', str),
         ('county', str)]
    """

    if schema is not None:
        inferschema = False  # if a custom schema is provided, don't waste time inferring the schema during load
    if not isinstance(header, bool):
        raise ValueError(
            "header parameter must be a boolean, but is {0}.".format(
                type(header)))
    if not isinstance(inferschema, bool):
        raise ValueError(
            "inferschema parameter must be a boolean, but is {0}.".format(
                type(inferschema)))
    if tc is implicit:
        implicit.error('tc')
    if not isinstance(tc, TkContext):
        raise ValueError("tc must be type TkContext, received %s" % type(tc))

    header_str = str(header).lower()
    inferschema_str = str(inferschema).lower()
    pyspark_schema = None

    if (not inferschema) and (schema is not None):
        fields = []
        for column in schema:
            if dtypes._data_type_to_pyspark_type_table.has_key(column[1]):
                fields.append(
                    StructField(
                        column[0],
                        dtypes._data_type_to_pyspark_type_table[column[1]],
                        True))
            else:
                raise TypeError(
                    "Unsupported type {0} in schema for column {1}.".format(
                        column[1], column[0]))
        pyspark_schema = StructType(fields)

    sqlContext = SQLContext(tc.sc)
    df = sqlContext.read.format("com.databricks.spark.csv").options(
        delimiter=delimiter,
        header=header_str,
        dateformat="yyyy-MM-dd'T'HH:mm:ss.SSSX",
        inferschema=inferschema_str).load(path, schema=pyspark_schema)

    df_schema = []

    if schema is None:
        for column in df.schema.fields:
            try:
                datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(
                    type(column.dataType))
            except ValueError:
                raise TypeError(
                    "Unsupported data type ({0}) for column {1}.".format(
                        str(column.dataType), column.name))
            df_schema.append((column.name, datatype))
    else:
        df_column_count = len(df.schema.fields)
        custom_column_count = len(schema)
        if (df_column_count != custom_column_count):
            raise ValueError(
                "Bad schema value.  The number of columns in the custom schema ({0}) must match the"
                "number of columns in the csv file data ({1}).".format(
                    custom_column_count, df_column_count))
        df_schema = schema

    def cast_datetime(row):
        """
        The spark data frame gives uses datetime objects.  Convert them to long (ms since epoch) for our frame.
        """
        data = []
        for column_index in xrange(0, len(df_schema)):
            if df_schema[column_index][1] == dtypes.datetime and isinstance(
                    row[column_index], datetime):
                data.append(long(dtypes.datetime_to_ms(row[column_index])))
            else:
                data.append(row[column_index])
        return data

    rdd = df.rdd

    if any(c[1] == dtypes.datetime for c in df_schema):
        # If any columns are date/time we must do this map
        rdd = df.rdd.map(cast_datetime)

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(
        df._jdf.rdd())
    rdd = RDD(jrdd, tc.sc)
    return Frame(tc, rdd, df_schema)
Beispiel #5
0
 def _checkpointFile(self, name):
     jrdd = self._jsc.checkpointFile(name)
     return RDD(jrdd, self)
Beispiel #6
0
    def createRDD(sc,
                  kafkaParams,
                  offsetRanges,
                  leaders=None,
                  keyDecoder=utf8_decoder,
                  valueDecoder=utf8_decoder,
                  messageHandler=None):
        """
        .. note:: Experimental

        Create a RDD from Kafka using offset ranges for each topic and partition.

        :param sc:  SparkContext object
        :param kafkaParams: Additional params for Kafka
        :param offsetRanges:  list of offsetRange to specify topic:partition:[start, end) to consume
        :param leaders: Kafka brokers for each TopicAndPartition in offsetRanges.  May be an empty
            map, in which case leaders will be looked up on the driver.
        :param keyDecoder:  A function used to decode key (default is utf8_decoder)
        :param valueDecoder:  A function used to decode value (default is utf8_decoder)
        :param messageHandler: A function used to convert KafkaMessageAndMetadata. You can assess
                               meta using messageHandler (default is None).
        :return: A RDD object
        """
        if leaders is None:
            leaders = dict()
        if not isinstance(kafkaParams, dict):
            raise TypeError("kafkaParams should be dict")
        if not isinstance(offsetRanges, list):
            raise TypeError("offsetRanges should be list")

        def funcWithoutMessageHandler(k_v):
            return (keyDecoder(k_v[0]), valueDecoder(k_v[1]))

        def funcWithMessageHandler(m):
            m._set_key_decoder(keyDecoder)
            m._set_value_decoder(valueDecoder)
            return messageHandler(m)

        try:
            helperClass = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
                .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper")
            helper = helperClass.newInstance()
            joffsetRanges = [o._jOffsetRange(helper) for o in offsetRanges]
            jleaders = dict([(k._jTopicAndPartition(helper),
                              v._jBroker(helper))
                             for (k, v) in leaders.items()])
            if messageHandler is None:
                jrdd = helper.createRDDWithoutMessageHandler(
                    sc._jsc, kafkaParams, joffsetRanges, jleaders)
                ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
                rdd = RDD(jrdd, sc, ser).map(funcWithoutMessageHandler)
            else:
                jrdd = helper.createRDDWithMessageHandler(
                    sc._jsc, kafkaParams, joffsetRanges, jleaders)
                rdd = RDD(jrdd, sc).map(funcWithMessageHandler)
        except Py4JJavaError as e:
            if 'ClassNotFoundException' in str(e.java_exception):
                KafkaUtils._printErrorMsg(sc)
            raise e

        return KafkaRDD(rdd._jrdd, sc, rdd._jrdd_deserializer)
 def func(sc, *a, **kw):
     jrdd = f(sc, *a, **kw)
     return RDD(sc._jvm.PythonRDD.javaToPython(jrdd), sc,
                BatchedSerializer(PickleSerializer(), 1024))
Beispiel #8
0
def import_csv(path,
               delimiter=",",
               header=False,
               schema=None,
               datetime_format="yyyy-MM-dd'T'HH:mm:ss.SSSX",
               tc=TkContext.implicit):
    """
    Creates a frame with data from a csv file.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (Optional[str]) A string which indicates the separation of data fields.  This is usually a
                      single character and could be a non-visible character, such as a tab. The default delimiter
                      is a comma (,).
    :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns
                   (unless a schema is provided), and not be included in the data.  The default value is false.
    :param schema: (Optional(list[tuple(str, type)] or list[str])) The are different options for specifying a schema:

    * Provide the full schema for the frame as a list of tuples (string column name and data type)
    * Provide the column names as a list of strings.  Column data types will be inferred, based on the data.  The column names specified will override column names that are found in the header row.
    * None, where the schema is automatically inferred based on the data.  Columns are named based on the header, or will be named generically ("C0", "C1", "C2", etc).

    :param datetime_format: (str) String specifying how date/time columns are formatted, using the java.text.SimpleDateFormat
                        specified at https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html
    :return: (Frame) Frame that contains the data from the csv file

    Examples
    --------

    Load a frame from a csv file by specifying the path to the file, delimiter

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv(file_path, "|", header=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]    15  Grants Pass            35076            34533  1.57%   Josephine
        [7]    16  Oregon City            34622            31859  8.67%   Clackamas
        [8]    17  McMinnville            33131            32187  2.93%   Yamhill
        [9]    18  Redmond                27427            26215  4.62%   Deschutes

        >>> frame.schema
        [('rank', <type 'int'>), ('city', <type 'str'>), ('population_2013', <type 'int'>), ('population_2010', <type 'int'>), ('change', <type 'str'>), ('county', <type 'str'>)]

    The schema parameter can be used to specify a custom schema (column names and data types) or column names (and the
    data types are inferred based on the data).  Here, we will specify the column names, which will override the
    header from the csv file.

        >>> column_names = ["Rank", "City", "2013", "2010", "Percent_Change", "County"]
        >>> frame = tc.frame.import_csv(file_path, "|", header=True, schema=column_names)
        -etc-

        >>> frame.schema
        [('Rank', <type 'int'>), ('City', <type 'str'>), ('2013', <type 'int'>), ('2010', <type 'int'>), ('Percent_Change', <type 'str'>), ('County', <type 'str'>)]

        <hide>
        >>> file_path = "../datasets/unicode.csv"
        >>> schema = [("a", unicode),("b", unicode),("c",unicode)]
        >>> frame = tc.frame.import_csv(file_path, schema=schema, header=False)
        -etc-

        >>> frame.inspect()
        [#]  a  b  c
        ============
        [0]  à  ë  ñ
        [1]  ã  ê  ü

        </hide>

    """
    TkContext.validate(tc)
    require_type.non_empty_str(path, "path")
    require_type.non_empty_str(delimiter, "delimiter")
    require_type(bool, header, "header")
    require_type(str, datetime_format, "datetime_format")

    infer_schema = True
    column_names = []  # custom column names

    if schema is not None:
        if not isinstance(schema, list):
            raise TypeError("Unsupported type %s for schema parameter." %
                            type(schema))
        elif all(isinstance(item, basestring) for item in schema):
            # schema is just column names
            column_names = schema
            schema = None
        else:
            infer_schema = False  # if a custom schema is provided, don't waste time inferring the schema during load
            sparktk_schema.validate(schema)

    header_str = str(header).lower()
    infer_schema_str = str(infer_schema).lower()
    pyspark_schema = None

    if schema is not None:
        fields = []
        for column in schema:
            if dtypes._data_type_to_pyspark_type_table.has_key(column[1]):
                fields.append(
                    StructField(
                        column[0],
                        dtypes._data_type_to_pyspark_type_table[column[1]],
                        True))
            else:
                raise TypeError(
                    "Unsupported type {0} in schema for column {1}.".format(
                        column[1], column[0]))
        pyspark_schema = StructType(fields)

    df = tc.sql_context.read.format(
        "com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
            delimiter=delimiter,
            header=header_str,
            dateformat=datetime_format,
            inferschema=infer_schema_str).load(path, schema=pyspark_schema)

    df_schema = []

    if schema is None:
        for i, column in enumerate(df.schema.fields):
            try:
                datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(
                    type(column.dataType))
            except ValueError:
                raise TypeError(
                    "Unsupported data type ({0}) for column {1}.".format(
                        str(column.dataType), column.name))
            column_name = column_names[i] if (
                i < len(column_names)) else column.name
            df_schema.append((column_name, datatype))
    else:
        df_column_count = len(df.schema.fields)
        custom_column_count = len(schema)
        if (df_column_count != custom_column_count):
            raise ValueError(
                "Bad schema value.  The number of columns in the custom schema ({0}) must match the"
                "number of columns in the csv file data ({1}).".format(
                    custom_column_count, df_column_count))
        df_schema = schema

    def cast_datetime(row):
        """
        The spark data frame gives uses datetime objects.  Convert them to long (ms since epoch) for our frame.
        """
        data = []
        for column_index in xrange(0, len(df_schema)):
            if df_schema[column_index][1] == dtypes.datetime and isinstance(
                    row[column_index], datetime):
                data.append(long(dtypes.datetime_to_ms(row[column_index])))
            else:
                data.append(row[column_index])
        return data

    jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(
        df._jdf.rdd())
    rdd = RDD(jrdd, tc.sc)

    if any(c[1] == dtypes.datetime for c in df_schema):
        # If any columns are date/time we must do this map
        rdd = df.rdd.map(cast_datetime)

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, rdd, df_schema)