Exemple #1
0
def schema_to_scala(sc, python_schema):
    list_of_list_of_str_schema = map(lambda t: [t[0], dtypes.to_string(t[1])], python_schema)  # convert dtypes to strings
    return jvm_scala_schema(sc).pythonToScala(list_of_list_of_str_schema)
Exemple #2
0
def schema_to_scala(sc, python_schema):
    list_of_list_of_str_schema = map(
        lambda t: [t[0], dtypes.to_string(t[1])],
        python_schema)  # convert dtypes to strings
    return jvm_scala_schema(sc).pythonToScala(list_of_list_of_str_schema)
def import_hbase(table_name,
                 schema,
                 start_tag=None,
                 end_tag=None,
                 tc=TkContext.implicit):
    """
    Import data from hbase table into frame

    :param table_name: (str) hbase table name
    :param schema: (list[list[str, str, type]]) hbase schema as a List of List(string) (columnFamily, columnName,
                   dataType for cell value)
    :param start_tag: (Optional(str)) optional start tag for filtering
    :param end_tag: (Optional(str)) optional end tag for filtering
    :return: (Frame) frame with data from hbase table

    Example
    ---------
    Load data into frame from a hbase table

    <skip>
        >>> frame = tc.frame.import_hbase("demo_test_hbase", [["test_family", "a", int],["test_family", "b", float], ["test_family", "c", int],["test_family", "d", int]])
        -etc-
        >>> frame.inspect()
        [#]  test_family_a  test_family_b  test_family_c  test_family_d
        ===============================================================
        [0]              1            0.2             -2              5
        [1]              2            0.4             -1              6
        [2]              3            0.6              0              7
        [3]              4            0.8              1              8

        Use of start_tag and end_tag. (Hbase creates a unique row id for data in hbase tables)
        start_tag: It is the unique row id from where row scan should start
        end_tag: It is the unique row id where row scan should end

        Assuming you already have data on hbase table "test_startendtag" under "startendtag" family name with single column named "number".
        data: column contains values from 1 to 99. Here rowid is generated by hbase.

        Sample hbase data. Few rows from hbase table looks as below.
        hbase(main):002:0> scan "test_startendtag"
        ROW             COLUMN+CELL
         0          column=startendtag:number, timestamp=1465342524846, value=1
         1          column=startendtag:number, timestamp=1465342524846, value=25
         10         column=startendtag:number, timestamp=1465342524847, value=51
         103        column=startendtag:number, timestamp=1465342524851, value=98
         107        column=startendtag:number, timestamp=1465342524851, value=99
         11         column=startendtag:number, timestamp=1465342524851, value=75
         12         column=startendtag:number, timestamp=1465342524846, value=4
         13         column=startendtag:number, timestamp=1465342524846, value=28
         14         column=startendtag:number, timestamp=1465342524847, value=52
         15         column=startendtag:number, timestamp=1465342524851, value=76
         16         column=startendtag:number, timestamp=1465342524846, value=5
         17         column=startendtag:number, timestamp=1465342524846, value=29
         18         column=startendtag:number, timestamp=1465342524847, value=53
         19         column=startendtag:number, timestamp=1465342524851, value=77
         2          column=startendtag:number, timestamp=1465342524847, value=49
         20         column=startendtag:number, timestamp=1465342524846, value=6
         21         column=startendtag:number, timestamp=1465342524846, value=30

        >>> frame = tc.frame.import_hbase("test_startendtag", [["startendtag", "number", int]], start_tag="20", end_tag="50")
        -etc-
        >>> frame.count()
        33
        >>> frame.inspect(33)
        [##]  startendtag_number
        ========================
        [0]                    6
        [1]                   30
        [2]                   54
        [3]                   78
        [4]                    7
        [5]                   31
        [6]                   55
        [7]                   79
        [8]                    8
        [9]                   32
        [10]                  73
        [11]                  56
        [12]                  80
        [13]                   9
        [14]                  33
        [15]                  57
        [16]                  81
        [17]                  10
        [18]                  34
        [19]                  58


        [##]  startendtag_number
        ========================
        [20]                  82
        [21]                   2
        [22]                  11
        [23]                  35
        [24]                  59
        [25]                  83
        [26]                  12
        [27]                  36
        [28]                  60
        [29]                  84
        [30]                  13
        [31]                  37
        [32]                  26

    </skip>

    """

    if not isinstance(table_name, basestring):
        raise ValueError(
            "table name parameter must be a string, but is {0}.".format(
                type(table_name)))
    if not isinstance(schema, list):
        raise ValueError("schema parameter must be a list, but is {0}.".format(
            type(table_name)))
    TkContext.validate(tc)

    inner_lists = [
        tc._jutils.convert.to_scala_list(
            [item[0], item[1], dtypes.to_string(item[2])]) for item in schema
    ]
    scala_final_schema = tc.jutils.convert.to_scala_list(inner_lists)

    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importHbase(
        tc.jutils.get_scala_sc(), table_name, scala_final_schema,
        tc._jutils.convert.to_scala_option(start_tag),
        tc._jutils.convert.to_scala_option(end_tag))

    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)
def import_hbase(table_name, schema, start_tag=None, end_tag=None, tc=TkContext.implicit):
    """
    Import data from hbase table into frame

    :param table_name: (str) hbase table name
    :param schema: (list[list[str, str, type]]) hbase schema as a List of List(string) (columnFamily, columnName,
                   dataType for cell value)
    :param start_tag: (Optional(str)) optional start tag for filtering
    :param end_tag: (Optional(str)) optional end tag for filtering
    :return: (Frame) frame with data from hbase table

    Example
    ---------
    Load data into frame from a hbase table

    <skip>
        >>> frame = tc.frame.import_hbase("demo_test_hbase", [["test_family", "a", int],["test_family", "b", float], ["test_family", "c", int],["test_family", "d", int]])
        -etc-
        >>> frame.inspect()
        [#]  test_family_a  test_family_b  test_family_c  test_family_d
        ===============================================================
        [0]              1            0.2             -2              5
        [1]              2            0.4             -1              6
        [2]              3            0.6              0              7
        [3]              4            0.8              1              8

        Use of start_tag and end_tag. (Hbase creates a unique row id for data in hbase tables)
        start_tag: It is the unique row id from where row scan should start
        end_tag: It is the unique row id where row scan should end

        Assuming you already have data on hbase table "test_startendtag" under "startendtag" family name with single column named "number".
        data: column contains values from 1 to 99. Here rowid is generated by hbase.

        Sample hbase data. Few rows from hbase table looks as below.
        hbase(main):002:0> scan "test_startendtag"
        ROW             COLUMN+CELL
         0          column=startendtag:number, timestamp=1465342524846, value=1
         1          column=startendtag:number, timestamp=1465342524846, value=25
         10         column=startendtag:number, timestamp=1465342524847, value=51
         103        column=startendtag:number, timestamp=1465342524851, value=98
         107        column=startendtag:number, timestamp=1465342524851, value=99
         11         column=startendtag:number, timestamp=1465342524851, value=75
         12         column=startendtag:number, timestamp=1465342524846, value=4
         13         column=startendtag:number, timestamp=1465342524846, value=28
         14         column=startendtag:number, timestamp=1465342524847, value=52
         15         column=startendtag:number, timestamp=1465342524851, value=76
         16         column=startendtag:number, timestamp=1465342524846, value=5
         17         column=startendtag:number, timestamp=1465342524846, value=29
         18         column=startendtag:number, timestamp=1465342524847, value=53
         19         column=startendtag:number, timestamp=1465342524851, value=77
         2          column=startendtag:number, timestamp=1465342524847, value=49
         20         column=startendtag:number, timestamp=1465342524846, value=6
         21         column=startendtag:number, timestamp=1465342524846, value=30

        >>> frame = tc.frame.import_hbase("test_startendtag", [["startendtag", "number", int]], start_tag="20", end_tag="50")
        -etc-
        >>> frame.count()
        33
        >>> frame.inspect(33)
        [##]  startendtag_number
        ========================
        [0]                    6
        [1]                   30
        [2]                   54
        [3]                   78
        [4]                    7
        [5]                   31
        [6]                   55
        [7]                   79
        [8]                    8
        [9]                   32
        [10]                  73
        [11]                  56
        [12]                  80
        [13]                   9
        [14]                  33
        [15]                  57
        [16]                  81
        [17]                  10
        [18]                  34
        [19]                  58


        [##]  startendtag_number
        ========================
        [20]                  82
        [21]                   2
        [22]                  11
        [23]                  35
        [24]                  59
        [25]                  83
        [26]                  12
        [27]                  36
        [28]                  60
        [29]                  84
        [30]                  13
        [31]                  37
        [32]                  26

    </skip>

    """

    if not isinstance(table_name, basestring):
        raise ValueError("table name parameter must be a string, but is {0}.".format(type(table_name)))
    if not isinstance(schema, list):
        raise ValueError("schema parameter must be a list, but is {0}.".format(type(table_name)))
    TkContext.validate(tc)

    inner_lists=[tc._jutils.convert.to_scala_list([item[0], item[1], dtypes.to_string(item[2])]) for item in schema]
    scala_final_schema = tc.jutils.convert.to_scala_list(inner_lists)

    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importHbase(tc.jutils.get_scala_sc(),
                                                                                                         table_name, scala_final_schema,
                                                                                                         tc._jutils.convert.to_scala_option(start_tag),
                                                                                                         tc._jutils.convert.to_scala_option(end_tag))

    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)