def import_dcm(dicom_dir_path, tc=TkContext.implicit): """ Creates a dicom object with metadataFrame and pixeldataFrame from a dcm file(s) Parameters ---------- :param dicom_dir_path: (str) Local/HDFS path of the dcm file(s) :return: (Dicom) returns a dicom object with metadata and pixeldata frames Examples -------- #Path can be local/hdfs to dcm file(s) >>> dicom_path = "../datasets/dicom_uncompressed" #use import_dcm available inside dicom module to create a dicom object from given dicom_path >>> dicom = tc.dicom.import_dcm(dicom_path) #Type of dicom object created >>> type(dicom) <class 'sparktk.dicom.dicom.Dicom'> #Inspect metadata property to see dicom metadata xml content <skip> >>> dicom.metadata.inspect(truncate=30) [#] id metadata ======================================= [0] 0 <?xml version="1.0" encodin... [1] 1 <?xml version="1.0" encodin... [2] 2 <?xml version="1.0" encodin... </skip> #pixeldata property is sparktk frame >>> pixeldata = dicom.pixeldata.take(1) <skip> >>> pixeldata [[0L, array([[ 0., 0., 0., ..., 0., 0., 0.], [ 0., 7., 5., ..., 5., 7., 8.], [ 0., 7., 6., ..., 5., 6., 7.], ..., [ 0., 6., 7., ..., 5., 5., 6.], [ 0., 2., 5., ..., 5., 5., 4.], [ 1., 1., 3., ..., 1., 1., 0.]])]] </skip> """ if not isinstance(dicom_dir_path, basestring): raise ValueError( "dicom_dir_path parameter must be a string, but is {0}.".format( type(dicom_dir_path))) TkContext.validate(tc) scala_dicom = tc.sc._jvm.org.trustedanalytics.sparktk.dicom.internal.constructors.Import.importDcm( tc.jutils.get_scala_sc(), dicom_dir_path) from sparktk.dicom.dicom import Dicom return Dicom._from_scala(tc, scala_dicom)
def create(source_or_vertices_frame, edges_frame=None, tc=TkContext.implicit):
    """
    Create a sparktk Graph from two sparktk Frames (or some other source)

    Parameters
    ----------

    :param source_or_vertices_frame: a graph source or a vertices frame.  Valid sources include a Python or
                                     Spark GraphFrame, or a Scala Graph.  Otherwise, if a vertices frame is
                                     provided, the edges_frame argument must also be supplied.  A vertices
                                     frame defines the vertices for the graph and must have a schema with a
                                     column named "id" which provides a unique vertex ID.  All other columns
                                     are treated as vertex properties.  If a column named "vertex_type" is
                                     also found, it will be used as a special label to denote the type of
                                     vertex, for example, when interfacing with logic (such as a graph DB)
                                     which expects a specific vertex type.
    :param edges_frame: (valid only if the source_or_vertices_frame arg is a vertices frame)  An edges frame
                        defines the edges of the graph; its schema must have columns named "src" and "dst"
                        which provide the vertex ids of the edge.  All other columns are treated as edge
                        properties.  If a column named "edge_type" is also found, it will be used as a
                        special label to denote the type of edge, for example, when interfacing with logic
                        (such as a graph DB) which expects a specific edge type.
    """
    TkContext.validate(tc)
    from sparktk.graph.graph import Graph
    return Graph(tc, source_or_vertices_frame, edges_frame)
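# A minimal usage sketch (added for illustration, not from the original source): the helper
# name `_example_create_graph` and the sample rows are hypothetical, and a TkContext `tc` is
# assumed.  It shows a vertices frame carrying the required "id" column and an edges frame
# carrying the required "src"/"dst" columns, mirroring the pattern used in the
# import_orientdb_graph examples later in this module.
def _example_create_graph(tc):
    # vertices: "id" is required; other columns become vertex properties
    v = tc.frame.create([("a", "Alice", 34),
                         ("b", "Bob", 36)],
                        ["id", "name", "age"])
    # edges: "src" and "dst" are required; other columns become edge properties
    e = tc.frame.create([("a", "b", "friend")],
                        ["src", "dst", "relationship"])
    return tc.graph.create(v, e)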
def import_dcm(dicom_dir_path, min_partitions=2, tc=TkContext.implicit): """ Creates a dicom object with metadataFrame and pixeldataFrame from a dcm file(s) Parameters ---------- :param dicom_dir_path: (str) Local/HDFS path of the dcm file(s) :param min_partitions: (int) Minimum no.of HDFS partitions to use for import dcm :return: (Dicom) returns a dicom object with metadata and pixeldata frames Examples -------- #Path can be local/hdfs to dcm file(s) >>> dicom_path = "../datasets/dicom_uncompressed" #use import_dcm available inside dicom module to create a dicom object from given dicom_path >>> dicom = tc.dicom.import_dcm(dicom_path) #Type of dicom object created >>> type(dicom) <class 'sparktk.dicom.dicom.Dicom'> #Inspect metadata property to see dicom metadata xml content <skip> >>> dicom.metadata.inspect(truncate=30) [#] id metadata ======================================= [0] 0 <?xml version="1.0" encodin... [1] 1 <?xml version="1.0" encodin... [2] 2 <?xml version="1.0" encodin... </skip> #pixeldata property is sparktk frame >>> pixeldata = dicom.pixeldata.take(1) <skip> >>> pixeldata [[0L, array([[ 0., 0., 0., ..., 0., 0., 0.], [ 0., 7., 5., ..., 5., 7., 8.], [ 0., 7., 6., ..., 5., 6., 7.], ..., [ 0., 6., 7., ..., 5., 5., 6.], [ 0., 2., 5., ..., 5., 5., 4.], [ 1., 1., 3., ..., 1., 1., 0.]])]] </skip> """ require_type.non_empty_str(dicom_dir_path, "dicom_dir_path") require_type.non_negative_int(min_partitions, "min_partitions") TkContext.validate(tc) scala_dicom = tc.sc._jvm.org.trustedanalytics.sparktk.dicom.internal.constructors.Import.importDcm( tc.jutils.get_scala_sc(), dicom_dir_path, min_partitions) from sparktk.dicom.dicom import Dicom return Dicom._from_scala(tc, scala_dicom)
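# A hedged usage sketch (not from the original docstring): the helper name and the partition
# count are illustrative; the dataset path comes from the example above.  Passing
# min_partitions controls the minimum number of HDFS partitions used while reading .dcm files.
def _example_import_dcm_with_partitions(tc):
    return tc.dicom.import_dcm("../datasets/dicom_uncompressed", min_partitions=4)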
def create_orientdb_conf(hostname, port_number, db_user_name, db_password, root_password, tc=TkContext.implicit):
    """
    Create OrientDB connection settings to be passed to export_to_orientdb and import_orientdb_graph APIs.

    Parameters
    ----------

    :param hostname: (str) OrientDB server hostname
    :param port_number: (str) OrientDB server port number
    :param db_user_name: (str) OrientDB database user name
    :param db_password: (str) the database password
    :param root_password: (str) OrientDB server root password
    :return: (OrientdbConf) OrientDB connection settings

    Example
    -------

        >>> hostname = "localhost"

        >>> port_number = "2424"

        >>> root_password = "******"

        >>> orient_conf = tc.graph.create_orientdb_conf(hostname,
        ...                                             port_number,
        ...                                             "admin",
        ...                                             "admin",
        ...                                             root_password)

        >>> orient_conf
        db_password = admin
        db_user_name = admin
        hostname = localhost
        port_number = 2424
        root_password = root

    """
    TkContext.validate(tc)
    scala_obj = tc.sc._jvm.org.trustedanalytics.sparktk.graph.internal.ops.orientdb.OrientdbConnection
    return OrientdbConf(tc,
                        scala_obj.createOrientdbConf(hostname,
                                                     port_number,
                                                     db_user_name,
                                                     db_password,
                                                     root_password))
def set_orientdb_configurations(hostname, port_number, db_user_name, db_password, root_password, db_properties=None, batch_size=1000, tc=TkContext.implicit):
    """
    Set OrientDB configurations to be passed to export_to_orientdb and import_orientdb_graph APIs.

    Parameters
    ----------

    :param hostname: (str) OrientDB server hostname
    :param port_number: (str) OrientDB server port number
    :param db_user_name: (str) OrientDB database user name
    :param db_password: (str) the database password
    :param root_password: (str) OrientDB server root password
    :param db_properties: (Optional(dict(str,any))) additional properties for the OrientDB database
    :param batch_size: (int) batch size for graph ETL to the OrientDB database
    :return: (OrientConf) OrientDB configurations

    Example
    -------

        >>> hostname = "localhost"

        >>> port_number = "2424"

        >>> root_password = "******"

        >>> orient_conf = tc.graph.set_orientdb_configurations(hostname, port_number, "admin", "admin", root_password)

        >>> orient_conf
        batch_size = 1000
        db_password = admin
        db_properties = None
        db_user_name = admin
        hostname = localhost
        port_number = 2424
        root_password = root

    """
    TkContext.validate(tc)
    scala_obj = tc.sc._jvm.org.trustedanalytics.sparktk.graph.internal.ops.orientdb.ExportToOrientdb
    return OrientConf(tc,
                      scala_obj.setOrientdbConfigurations(hostname,
                                                          port_number,
                                                          db_user_name,
                                                          db_password,
                                                          root_password,
                                                          tc.jutils.convert.to_scala_option_map(db_properties),
                                                          batch_size))
def import_jdbc(connection_url, table_name, tc=TkContext.implicit): """ Import data from jdbc table into frame. Parameters ---------- :param connection_url: (str) JDBC connection url to database server :param table_name: (str) JDBC table name :return: (Frame) returns frame with jdbc table data Examples -------- Load a frame from a jdbc table specifying the connection url to the database server. <skip> >>> url = "jdbc:postgresql://localhost/postgres" >>> tb_name = "demo_test" >>> frame = tc.frame.import_jdbc(url, tb_name) -etc- >>> frame.inspect() [#] a b c d ================== [0] 1 0.2 -2 5 [1] 2 0.4 -1 6 [2] 3 0.6 0 7 [3] 4 0.8 1 8 >>> frame.schema [(u'a', int), (u'b', float), (u'c', int), (u'd', int)] </skip> """ if not isinstance(connection_url, basestring): raise ValueError( "connection url parameter must be a string, but is {0}.".format( type(connection_url))) if not isinstance(table_name, basestring): raise ValueError( "table name parameter must be a string, but is {0}.".format( type(table_name))) TkContext.validate(tc) scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importJdbc( tc.jutils.get_scala_sc(), connection_url, table_name) from sparktk.frame.frame import Frame return Frame(tc, scala_frame)
def import_orientdb_graph(db_url, user_name, password, root_password, tc=TkContext.implicit):
    """
    Import a graph from OrientDB into spark-tk as a spark-tk graph (Spark GraphFrame)

    Parameters
    ----------

    :param db_url: (str) OrientDB URI
    :param user_name: (str) the database username
    :param password: (str) the database password
    :param root_password: (str) OrientDB server root password
    """
    TkContext.validate(tc)
    scala_graph = tc.sc._jvm.org.trustedanalytics.sparktk.graph.internal.constructors.fromorientdb.ImportFromOrientdb.importOrientdbGraph(
        tc.jutils.get_scala_sc(), db_url, user_name, password, root_password)
    from sparktk.graph.graph import Graph
    return Graph(tc, scala_graph)
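# A hedged usage sketch (the helper name and connection values are placeholders): importing a
# graph from a remote OrientDB database using the signature documented above.  The
# "remote:hostname:2424/<db>" URI form follows the export_to_orientdb example later in this
# module.
def _example_import_orientdb(tc):
    return tc.graph.import_orientdb_graph("remote:hostname:2424/test_db",
                                          user_name="admin",
                                          password="admin",
                                          root_password="root")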
def import_hbase(table_name, schema, start_tag=None, end_tag=None, tc=TkContext.implicit):
    """
    Import data from hbase table into frame

    :param table_name: (str) hbase table name
    :param schema: (list[list[str, str, type]]) hbase schema as a list of lists (columnFamily, columnName,
                   dataType for the cell value)
    :param start_tag: (Optional(str)) optional start tag for filtering
    :param end_tag: (Optional(str)) optional end tag for filtering
    :return: (Frame) frame with data from hbase table

    Example
    ---------

    Load data into frame from a hbase table

        <skip>
        >>> frame = tc.frame.import_hbase("demo_test_hbase", [["test_family", "a", int],["test_family", "b", float], ["test_family", "c", int],["test_family", "d", int]])
        -etc-

        >>> frame.inspect()
        [#]  test_family_a  test_family_b  test_family_c  test_family_d
        ===============================================================
        [0]              1            0.2             -2              5
        [1]              2            0.4             -1              6
        [2]              3            0.6              0              7
        [3]              4            0.8              1              8

    Use of start_tag and end_tag (hbase creates a unique row id for the data in hbase tables):

        start_tag: the unique row id from which the row scan should start
        end_tag: the unique row id at which the row scan should end

    Assume the hbase table "test_startendtag" has a single column named "number" under the "startendtag"
    column family, containing values from 1 to 99, with row ids generated by hbase.  A few sample rows
    from that table look as below:

        hbase(main):002:0> scan "test_startendtag"
        ROW             COLUMN+CELL
        0               column=startendtag:number, timestamp=1465342524846, value=1
        1               column=startendtag:number, timestamp=1465342524846, value=25
        10              column=startendtag:number, timestamp=1465342524847, value=51
        103             column=startendtag:number, timestamp=1465342524851, value=98
        107             column=startendtag:number, timestamp=1465342524851, value=99
        11              column=startendtag:number, timestamp=1465342524851, value=75
        12              column=startendtag:number, timestamp=1465342524846, value=4
        13              column=startendtag:number, timestamp=1465342524846, value=28
        14              column=startendtag:number, timestamp=1465342524847, value=52
        15              column=startendtag:number, timestamp=1465342524851, value=76
        16              column=startendtag:number, timestamp=1465342524846, value=5
        17              column=startendtag:number, timestamp=1465342524846, value=29
        18              column=startendtag:number, timestamp=1465342524847, value=53
        19              column=startendtag:number, timestamp=1465342524851, value=77
        2               column=startendtag:number, timestamp=1465342524847, value=49
        20              column=startendtag:number, timestamp=1465342524846, value=6
        21              column=startendtag:number, timestamp=1465342524846, value=30

        >>> frame = tc.frame.import_hbase("test_startendtag", [["startendtag", "number", int]], start_tag="20", end_tag="50")
        -etc-

        >>> frame.count()
        33

        >>> frame.inspect(33)
        [##]  startendtag_number
        ========================
        [0]                    6
        [1]                   30
        [2]                   54
        [3]                   78
        [4]                    7
        [5]                   31
        [6]                   55
        [7]                   79
        [8]                    8
        [9]                   32
        [10]                  73
        [11]                  56
        [12]                  80
        [13]                   9
        [14]                  33
        [15]                  57
        [16]                  81
        [17]                  10
        [18]                  34
        [19]                  58

        [##]  startendtag_number
        ========================
        [20]                  82
        [21]                   2
        [22]                  11
        [23]                  35
        [24]                  59
        [25]                  83
        [26]                  12
        [27]                  36
        [28]                  60
        [29]                  84
        [30]                  13
        [31]                  37
        [32]                  26
        </skip>

    """
    if not isinstance(table_name, basestring):
        raise ValueError("table name parameter must be a string, but is {0}.".format(type(table_name)))
    if not isinstance(schema, list):
        raise ValueError("schema parameter must be a list, but is {0}.".format(type(schema)))
    TkContext.validate(tc)
    inner_lists = [tc._jutils.convert.to_scala_list([item[0], item[1], dtypes.to_string(item[2])])
                   for item in schema]
    scala_final_schema = tc.jutils.convert.to_scala_list(inner_lists)

    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importHbase(
        tc.jutils.get_scala_sc(), table_name, scala_final_schema,
        tc._jutils.convert.to_scala_option(start_tag), tc._jutils.convert.to_scala_option(end_tag))
    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)
def import_csv_raw(path, delimiter=",", header=False, tc=TkContext.implicit): """ Creates a frame by importing the data as strings from the specified csv file. If the csv file has a header row, those values will be used as column names. Otherwise, columns will be named generically, like 'C0', 'C1', 'C2', etc. Parameters ---------- :param path: (str) Full path to the csv file :param delimiter: (str) A string which indicates the separation of data fields. This is usually a single character and could be a non-visible character, such as a tab. The default delimiter is a comma (,). :param header: (bool) Boolean value indicating if the first line of the file will be used to name columns, and not be included in the data. The default value is false. :return: (Frame) Frame that contains the data from the csv file Examples -------- Import raw data from a csv file by specifying the path to the file, delimiter, and header option. All data will be brought in the frame as strings, and columns will be named according to the header row, if there was one. >>> file_path = "../datasets/cities.csv" >>> frame = tc.frame.import_csv_raw(file_path, delimiter="|", header=True) -etc- >>> frame.inspect() [#] rank city population_2013 population_2010 change county ============================================================================ [0] 1 Portland 609456 583776 4.40% Multnomah [1] 2 Salem 160614 154637 3.87% Marion [2] 3 Eugene 159190 156185 1.92% Lane [3] 4 Gresham 109397 105594 3.60% Multnomah [4] 5 Hillsboro 97368 91611 6.28% Washington [5] 6 Beaverton 93542 89803 4.16% Washington [6] 15 Grants Pass 35076 34533 1.57% Josephine [7] 16 Oregon City 34622 31859 8.67% Clackamas [8] 17 McMinnville 33131 32187 2.93% Yamhill [9] 18 Redmond 27427 26215 4.62% Deschutes >>> frame.schema [('rank', <type 'str'>), ('city', <type 'str'>), ('population_2013', <type 'str'>), ('population_2010', <type 'str'>), ('change', <type 'str'>), ('county', <type 'str'>)] """ TkContext.validate(tc) require_type.non_empty_str(path, "path") require_type.non_empty_str(delimiter, "delimiter") require_type(bool, header, "header") df = tc.sql_context.read.format( "com.databricks.spark.csv.org.trustedanalytics.sparktk").options( delimiter=delimiter, header=str(header).lower(), inferschema="false").load(path, schema=None) df_schema = [] for column in df.schema.fields: try: datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type( type(column.dataType)) except ValueError: raise TypeError( "Unsupported data type ({0}) for column {1}.".format( str(column.dataType), column.name)) df_schema.append((column.name, datatype)) jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython( df._jdf.rdd()) rdd = RDD(jrdd, tc.sc) from sparktk.frame.frame import Frame # circular dependency, so import late return Frame(tc, rdd, df_schema)
def import_csv(path, delimiter=",", header=False, infer_schema=True, schema=None, tc=TkContext.implicit): """ Creates a frame with data from a csv file. Parameters ---------- :param path: (str) Full path to the csv file :param delimiter: (Optional[str]) A string which indicates the separation of data fields. This is usually a single character and could be a non-visible character, such as a tab. The default delimiter is a comma (,). :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns, and not be included in the data. The default value is false. :param infer_schema:(Optional[bool]) Boolean value indicating if the column types will be automatically inferred. It requires one extra pass over the data and is false by default. :param: schema: (Optional[List[tuple(str, type)]]) Optionally specify the schema for the dataset. Number of columns specified in the schema must match the number of columns in the csv file provided. :return: (Frame) Frame that contains the data from the csv file Examples -------- Load a frame from a csv file by specifying the path to the file, delimiter, and options that specify that there is a header and to infer the schema based on the data. >>> file_path = "../integration-tests/datasets/cities.csv" >>> frame = tc.load_frame_from_csv(file_path, "|", header=True, infer_schema=True) -etc- >>> frame.inspect() [#] rank city population_2013 population_2010 change county ============================================================================ [0] 1 Portland 609456 583776 4.40% Multnomah [1] 2 Salem 160614 154637 3.87% Marion [2] 3 Eugene 159190 156185 1.92% Lane [3] 4 Gresham 109397 105594 3.60% Multnomah [4] 5 Hillsboro 97368 91611 6.28% Washington [5] 6 Beaverton 93542 89803 4.16% Washington [6] 15 Grants Pass 35076 34533 1.57% Josephine [7] 16 Oregon City 34622 31859 8.67% Clackamas [8] 17 McMinnville 33131 32187 2.93% Yamhill [9] 18 Redmond 27427 26215 4.62% Deschutes >>> frame.schema [('rank', int), ('city', str), ('population_2013', int), ('population_2010', int), ('change', str), ('county', str)] """ if schema is not None: infer_schema = False # if a custom schema is provided, don't waste time inferring the schema during load if not isinstance(header, bool): raise ValueError( "header parameter must be a boolean, but is {0}.".format( type(header))) if not isinstance(infer_schema, bool): raise ValueError( "infer_schema parameter must be a boolean, but is {0}.".format( type(infer_schema))) TkContext.validate(tc) header_str = str(header).lower() infer_schema_str = str(infer_schema).lower() pyspark_schema = None if (not infer_schema) and (schema is not None): fields = [] for column in schema: if dtypes._data_type_to_pyspark_type_table.has_key(column[1]): fields.append( StructField( column[0], dtypes._data_type_to_pyspark_type_table[column[1]], True)) else: raise TypeError( "Unsupported type {0} in schema for column {1}.".format( column[1], column[0])) pyspark_schema = StructType(fields) df = tc.sql_context.read.format( "com.databricks.spark.csv.org.trustedanalytics.sparktk").options( delimiter=delimiter, header=header_str, dateformat="yyyy-MM-dd'T'HH:mm:ss.SSSX", inferschema=infer_schema_str).load(path, schema=pyspark_schema) df_schema = [] if schema is None: for column in df.schema.fields: try: datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type( type(column.dataType)) except ValueError: raise TypeError( "Unsupported data type ({0}) for column {1}.".format( str(column.dataType), column.name)) 
df_schema.append((column.name, datatype)) else: df_column_count = len(df.schema.fields) custom_column_count = len(schema) if (df_column_count != custom_column_count): raise ValueError( "Bad schema value. The number of columns in the custom schema ({0}) must match the" "number of columns in the csv file data ({1}).".format( custom_column_count, df_column_count)) df_schema = schema def cast_datetime(row): """ The spark data frame gives uses datetime objects. Convert them to long (ms since epoch) for our frame. """ data = [] for column_index in xrange(0, len(df_schema)): if df_schema[column_index][1] == dtypes.datetime and isinstance( row[column_index], datetime): data.append(long(dtypes.datetime_to_ms(row[column_index]))) else: data.append(row[column_index]) return data jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython( df._jdf.rdd()) rdd = RDD(jrdd, tc.sc) if any(c[1] == dtypes.datetime for c in df_schema): # If any columns are date/time we must do this map rdd = df.rdd.map(cast_datetime) from sparktk.frame.frame import Frame # circular dependency, so import late return Frame(tc, rdd, df_schema)
def import_hive(hive_query, tc=TkContext.implicit):
    """
    Import data from hive table into frame.

    Define the sql query to retrieve the data from a hive table.

    Only a subset of Hive data types are supported:

        DataType    Support
        ----------  ------------------------------------
        boolean     cast to int
        bigint      native support
        int         native support
        tinyint     cast to int
        smallint    cast to int
        decimal     cast to double, may lose precision
        double      native support
        float       native support
        date        cast to string
        string      native support
        timestamp   cast to string
        varchar     cast to string
        arrays      not supported
        binary      not supported
        char        not supported
        maps        not supported
        structs     not supported
        union       not supported

    Parameters
    ----------

    :param hive_query: (str) hive query to fetch data from table
    :param tc: (TkContext) TK context
    :return: (Frame) returns frame with hive table data

    Examples
    --------

    Load data into frame from a hive table based on hive query

        <skip>
        >>> h_query = "select * from demo_test"
        >>> frame = tc.frame.import_hive(h_query)
        -etc-

        >>> frame.inspect()
        [#]  number  strformat
        ======================
        [0]       1  one
        [1]       2  two
        [2]       3  three
        [3]       4  four
        </skip>
    """
    if not isinstance(hive_query, basestring):
        raise ValueError("hive query parameter must be a string, but is {0}.".format(type(hive_query)))
    TkContext.validate(tc)
    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importHive(tc.jutils.get_scala_sc(), hive_query)
    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)
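# A hedged sketch (the helper name, table, and column names are hypothetical): Hive types
# that import_hive does not support (e.g. char, binary, arrays) can often be cast to a
# supported type inside the query itself before the frame is created.
def _example_import_hive_with_cast(tc):
    query = "SELECT id, CAST(code AS STRING) AS code FROM demo_table"
    return tc.frame.import_hive(query)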
def import_csv(path, delimiter=",", header=False, schema=None, datetime_format="yyyy-MM-dd'T'HH:mm:ss.SSSX", tc=TkContext.implicit): """ Creates a frame with data from a csv file. Parameters ---------- :param path: (str) Full path to the csv file :param delimiter: (Optional[str]) A string which indicates the separation of data fields. This is usually a single character and could be a non-visible character, such as a tab. The default delimiter is a comma (,). :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns (unless a schema is provided), and not be included in the data. The default value is false. :param schema: (Optional(list[tuple(str, type)] or list[str])) The are different options for specifying a schema: * Provide the full schema for the frame as a list of tuples (string column name and data type) * Provide the column names as a list of strings. Column data types will be inferred, based on the data. The column names specified will override column names that are found in the header row. * None, where the schema is automatically inferred based on the data. Columns are named based on the header, or will be named generically ("C0", "C1", "C2", etc). :param datetime_format: (str) String specifying how date/time columns are formatted, using the java.text.SimpleDateFormat specified at https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html :return: (Frame) Frame that contains the data from the csv file Examples -------- Load a frame from a csv file by specifying the path to the file, delimiter >>> file_path = "../datasets/cities.csv" >>> frame = tc.frame.import_csv(file_path, "|", header=True) -etc- >>> frame.inspect() [#] rank city population_2013 population_2010 change county ============================================================================ [0] 1 Portland 609456 583776 4.40% Multnomah [1] 2 Salem 160614 154637 3.87% Marion [2] 3 Eugene 159190 156185 1.92% Lane [3] 4 Gresham 109397 105594 3.60% Multnomah [4] 5 Hillsboro 97368 91611 6.28% Washington [5] 6 Beaverton 93542 89803 4.16% Washington [6] 15 Grants Pass 35076 34533 1.57% Josephine [7] 16 Oregon City 34622 31859 8.67% Clackamas [8] 17 McMinnville 33131 32187 2.93% Yamhill [9] 18 Redmond 27427 26215 4.62% Deschutes >>> frame.schema [('rank', <type 'int'>), ('city', <type 'str'>), ('population_2013', <type 'int'>), ('population_2010', <type 'int'>), ('change', <type 'str'>), ('county', <type 'str'>)] The schema parameter can be used to specify a custom schema (column names and data types) or column names (and the data types are inferred based on the data). Here, we will specify the column names, which will override the header from the csv file. 
>>> column_names = ["Rank", "City", "2013", "2010", "Percent_Change", "County"] >>> frame = tc.frame.import_csv(file_path, "|", header=True, schema=column_names) -etc- >>> frame.schema [('Rank', <type 'int'>), ('City', <type 'str'>), ('2013', <type 'int'>), ('2010', <type 'int'>), ('Percent_Change', <type 'str'>), ('County', <type 'str'>)] <hide> >>> file_path = "../datasets/unicode.csv" >>> schema = [("a", unicode),("b", unicode),("c",unicode)] >>> frame = tc.frame.import_csv(file_path, schema=schema, header=False) -etc- >>> print unicode(frame.get_inspect()).encode('utf-8') # because this file is UT8 and this docstring is str [#] a b c ============ [0] à ë ñ [1] ã ê ü </hide> """ TkContext.validate(tc) require_type.non_empty_str(path, "path") require_type.non_empty_str(delimiter, "delimiter") require_type(bool, header, "header") require_type(str, datetime_format, "datetime_format") infer_schema = True column_names = [] # custom column names if schema is not None: if not isinstance(schema, list): raise TypeError("Unsupported type %s for schema parameter." % type(schema)) elif all(isinstance(item, basestring) for item in schema): # schema is just column names column_names = schema schema = None else: infer_schema = False # if a custom schema is provided, don't waste time inferring the schema during load sparktk_schema.validate(schema) header_str = str(header).lower() infer_schema_str = str(infer_schema).lower() pyspark_schema = None if schema is not None: fields = [] for column in schema: if dtypes._data_type_to_pyspark_type_table.has_key(column[1]): fields.append(StructField(column[0], dtypes._data_type_to_pyspark_type_table[column[1]], True)) else: raise TypeError("Unsupported type {0} in schema for column {1}.".format(column[1], column[0])) pyspark_schema = StructType(fields) df = tc.sql_context.read.format( "com.databricks.spark.csv.org.trustedanalytics.sparktk").options( delimiter=delimiter, header=header_str, dateformat=datetime_format, inferschema=infer_schema_str).load(path, schema=pyspark_schema) df_schema = [] if schema is None: for i, column in enumerate(df.schema.fields): try: datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(column.dataType)) except ValueError: raise TypeError("Unsupported data type ({0}) for column {1}.".format(str(column.dataType), column.name)) column_name = column_names[i] if (i < len(column_names)) else column.name df_schema.append((column_name, datatype)) else: df_column_count = len(df.schema.fields) custom_column_count = len(schema) if (df_column_count != custom_column_count): raise ValueError("Bad schema value. The number of columns in the custom schema ({0}) must match the" "number of columns in the csv file data ({1}).".format(custom_column_count, df_column_count)) df_schema = schema def cast_datetime(row): """ The spark data frame gives uses datetime objects. Convert them to long (ms since epoch) for our frame. 
""" data = [] for column_index in xrange(0, len(df_schema)): if df_schema[column_index][1] == dtypes.datetime and isinstance(row[column_index], datetime): data.append(long(dtypes.datetime_to_ms(row[column_index]))) else: data.append(row[column_index]) return data jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(df._jdf.rdd()) rdd = RDD(jrdd, tc.sc) if any(c[1] == dtypes.datetime for c in df_schema): # If any columns are date/time we must do this map rdd = df.rdd.map(cast_datetime) from sparktk.frame.frame import Frame # circular dependency, so import late return Frame(tc, rdd, df_schema)
def import_xml(file_name, record_tag, tc=TkContext.implicit): """ Imports a file of XML records XML records can span multiple lines. Returns a Frame of one column containing a XML string per row Note: Only records which start with the given tag will be included (multiple different tags not supported) Parameters ---------- :param file_name: file path :param record_tag: value of the XML element which contains a record :return: Frame Examples -------- Consider a file of XML records: <?xml version="1.0" encoding="UTF-8"?> <table> <shape type="triangle"> <x>0</x> <y>0</y> <size>12</size> </shape> <shape type="square"> <x>8</x> <y>0</y> <size>4</size> </shape> <shape color="blue" type="pentagon"> <x>0</x> <y>10</y> <size>2</size> </shape> <shape type="square"> <x>-4</x> <y>6</y> <size>7</size> </shape> </table> We can parse this file into a frame of records: >>> f = tc.frame.import_xml("../datasets/shapes1.xml", "shape") >>> f.inspect() [#] records ========================================= [0] <shape type="triangle"> <x>0</x> <y>0</y> <size>12</size> </shape> [1] <shape type="square"> <x>8</x> <y>0</y> <size>4</size> </shape> [2] <shape color="blue" type="pentagon"> <x>0</x> <y>10</y> <size>2</size> </shape> [3] <shape type="square"> <x>-4</x> <y>6</y> <size>7</size> </shape> We can further break the XML records into individual columns with a map_columns (or add_columns) operation: >>> import xml.etree.ElementTree as ET >>> def parse_my_xml(row): ... ele = ET.fromstring(row[0]) ... return [ele.get("type"), int(ele.find("x").text), int(ele.find("y").text), int(ele.find("size").text)] >>> f2 = f.map_columns(parse_my_xml, [('shape', str), ('x', int), ('y', int), ('size', int)]) >>> f2.inspect() [#] shape x y size =========================== [0] triangle 0 0 12 [1] square 8 0 4 [2] pentagon 0 10 2 [3] square -4 6 7 Consider another file of XML records, this time with different element names for the records: <?xml version="1.0" encoding="UTF-8"?> <shapes> <triangle> <x>0</x> <y>0</y> <size>12</size> </triangle> <square> <x>8</x> <y>0</y> <size>4</size> </square> <pentagon color="blue"> <x>0</x> <y>10</y> <size>2</size> </pentagon> <square> <x>-4</x> <y>6</y> <size>7</size> </square> </shapes> We can parse this file into a frame of records of a single type. We must pick only one. The others will be filtered out: >>> f3 = tc.frame.import_xml("../datasets/shapes2.xml", "square") >>> f3.inspect() [#] records =========================== [0] <square> <x>8</x> <y>0</y> <size>4</size> </square> [1] <square> <x>-4</x> <y>6</y> <size>7</size> </square> We can further break the XML records into individual columns with a map_columns (or add_columns) operation: >>> def parse_my_squares(row): ... ele = ET.fromstring(row[0]) ... return [int(ele.find("x").text), int(ele.find("y").text), int(ele.find("size").text)] >>> f4 = f3.map_columns(parse_my_squares, [('x', int), ('y', int), ('size', int)]) >>> f4.inspect() [#] x y size ================ [0] 8 0 4 [1] -4 6 7 """ TkContext.validate(tc) require_type.non_empty_str(file_name, "file_name") require_type.non_empty_str(record_tag, "record_tag") scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.\ frame.internal.constructors.ImportMultiLineRecords.importXml(tc.jutils.get_scala_sc(), file_name, record_tag) from sparktk.frame.frame import Frame return Frame(tc, scala_frame)
def import_json(file_name, tc=TkContext.implicit): """ Imports a file of JSON records JSON records can span multiple lines. Returns a Frame of one column containing a JSON string per row Parameters ---------- :param file_name: file path :return: Frame Examples -------- Consider a file of JSON records: { "obj": { "color": "blue", "size": 4, "shape": "square" } } { "obj": { "color": "green", "size": 3, "shape": "triangle" } } { "obj": { "color": "yellow", "size": 5, "shape": "pentagon" } } { "obj": { "color": "orange", "size": 2, "shape": "lentil" } } We can parse this file into a frame of records: >>> f = tc.frame.import_json("../datasets/shapes.json") >>> f.inspect() [#] records ===================================================================== [0] { "obj": { "color": "blue", "size": 4, "shape": "square" } } [1] { "obj": { "color": "green", "size": 3, "shape": "triangle" } } [2] { "obj": { "color": "yellow", "size": 5, "shape": "pentagon" } } [3] { "obj": { "color": "orange", "size": 2, "shape": "lentil" } } We can further break the JSON records into individual columns with a map_columns (or add_columns) operation: >>> import json >>> def parse_my_json(row): ... record = json.loads(row.records)['obj'] ... return [record['color'], record['size'], record['shape']] >>> f2 = f.map_columns(parse_my_json, [('color', str), ('size', int), ('shape', str)]) >>> f2.inspect() [#] color size shape =========================== [0] blue 4 square [1] green 3 triangle [2] yellow 5 pentagon [3] orange 2 lentil """ TkContext.validate(tc) require_type.non_empty_str(file_name, "file_name") scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.\ frame.internal.constructors.ImportMultiLineRecords.importJson(tc.jutils.get_scala_sc(), file_name) from sparktk.frame.frame import Frame return Frame(tc, scala_frame)
def import_jdbc(connection_url, table_name, tc=TkContext.implicit): """ Import data from jdbc table into frame. Parameters ---------- :param connection_url: (str) JDBC connection url to database server :param table_name: (str) JDBC table name :return: (Frame) returns frame with jdbc table data Examples -------- Load a frame from a jdbc table specifying the connection url to the database server. <skip> >>> url = "jdbc:postgresql://localhost/postgres" >>> tb_name = "demo_test" >>> frame = tc.frame.import_jdbc(url, tb_name) -etc- >>> frame.inspect() [#] a b c d ================== [0] 1 0.2 -2 5 [1] 2 0.4 -1 6 [2] 3 0.6 0 7 [3] 4 0.8 1 8 >>> frame.schema [(u'a', int), (u'b', float), (u'c', int), (u'd', int)] </skip> Notes ----- java.sql.SQLException: No suitable driver found for <jdbcUrl> If this error is encountered while running your application, then your JDBC library cannot be found by the node running the application. If you're running in Local mode, make sure that you have used the --driver-class-path parameter. If a Spark cluster is involved, make sure that each cluster member has a copy of library, and that each node of the cluster has been restarted since you modified the spark-defaults.conf file. See this [site](https://sparkour.urizone.net/recipes/using-jdbc/). Sparktk does not come with any JDBC drivers. A driver compatible with the JDBC data source must be supplied when creating the TkContext instance: <skip> >>> tc = sparktk.TkContext(pyspark_submit_args='--jars myJDBCDriver.jar') </skip> """ if not isinstance(connection_url, basestring): raise ValueError("connection url parameter must be a string, but is {0}.".format(type(connection_url))) if not isinstance(table_name, basestring): raise ValueError("table name parameter must be a string, but is {0}.".format(type(table_name))) TkContext.validate(tc) scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importJdbc(tc.jutils.get_scala_sc(), connection_url, table_name) from sparktk.frame.frame import Frame return Frame(tc, scala_frame)
def import_orientdb_graph(db_url, user_name, password, root_password,tc=TkContext.implicit): """ Import graph from OrientDB to spark-tk as spark-tk graph (Spark GraphFrame) Parameters ---------- :param:(str) db_url: OrientDB URI :param:(str) user_name: the database username :param:(str) password: the database password :param :(str)root_password: OrientDB server password Example ------- >>> v = tc.frame.create([("a", "Alice", 34,"F"), ... ("b", "Bob", 36,"M"), ... ("c", "Charlie", 30,"M"), ... ("d", "David", 29,"M"), ... ("e", "Esther", 32,"F"), ... ("f", "Fanny", 36,"F"), ... ], ["id", "name", "age","gender"]) >>> e = tc.frame.create([("a", "b", "friend"), ... ("b", "c", "follow"), ... ("c", "b", "follow"), ... ("f", "c", "follow"), ... ("e", "f", "follow"), ... ("e", "d", "friend"), ... ("d", "a", "friend"), ... ("a", "e", "friend") ... ], ["src", "dst", "relationship"]) >>> sparktk_graph = tc.graph.create(v,e) <skip> >>> db = "test_db" >>> sparktk_graph.export_to_orientdb(db_url="remote:hostname:2424/%s" % db,user_name= "admin",password = "******",root_password = "******",vertex_type_column_name= "gender",edge_type_column_name="relationship") >>> imported_gf = tc.graph.import_orientdb_graph(db_url="remote:hostname:2424/%s" % db,user_name= "admin",password = "******",root_password = "******") >>> imported_gf.graphframe.vertices.show() +-------+------+---+---+ | name|gender| id|age| +-------+------+---+---+ | Bob| M| b| 36| | David| M| d| 29| |Charlie| M| c| 30| | Alice| F| a| 34| | Esther| F| e| 32| | Fanny| F| f| 36| +-------+------+---+---+ >>> imported_gf.graphframe.edges.show() +---+------------+---+ |dst|relationship|src| +---+------------+---+ | f| follow| e| | b| follow| c| | c| follow| b| | c| follow| f| | b| friend| a| | a| friend| d| | d| friend| e| | e| friend| a| +---+------------+---+ </skip> """ TkContext.validate(tc) scala_graph = tc.sc._jvm.org.trustedanalytics.sparktk.graph.internal.constructors.fromorientdb.ImportFromOrientdb.importOrientdbGraph(tc.jutils.get_scala_sc(), db_url,user_name,password,root_password) from sparktk.graph.graph import Graph return Graph(tc, scala_graph)
def import_orientdb_graph(orient_conf, db_name, db_properties=None, tc=TkContext.implicit): """ Import graph from OrientDB to spark-tk as spark-tk graph (Spark GraphFrame) Parameters ---------- :param orient_conf: (OrientConf) configuration settings for the OrientDB database :param db_name: (str) the database name :param db_properties: (Optional(dict(str,any))) additional properties for OrientDB database, for more OrientDB database properties options. See http://orientdb.com/docs/2.1/Configuration.html Example ------- >>> v = tc.frame.create([("a", "Alice", 34,"F"), ... ("b", "Bob", 36,"M"), ... ("c", "Charlie", 30,"M"), ... ("d", "David", 29,"M"), ... ("e", "Esther", 32,"F"), ... ("f", "Fanny", 36,"F"), ... ], ["id", "name", "age","gender"]) >>> e = tc.frame.create([("a", "b", "friend"), ... ("b", "c", "follow"), ... ("c", "b", "follow"), ... ("f", "c", "follow"), ... ("e", "f", "follow"), ... ("e", "d", "friend"), ... ("d", "a", "friend"), ... ("a", "e", "friend") ... ], ["src", "dst", "relationship"]) >>> sparktk_graph = tc.graph.create(v,e) <skip> >>> hostname = "localhost" >>> port_number = "2424" >>> db_name = "GraphDatabase" >>> root_password = "******" >>> orient_conf = tc.graph.create_orientdb_conf(hostname, port_number, "admin", "admin", root_password) >>> sparktk_graph.export_to_orientdb(orient_conf, ... db_name, ... vertex_type_column_name= "gender", ... edge_type_column_name="relationship") >>> imported_gf = tc.graph.import_orientdb_graph(orient_conf, db_name, db_properties = ({"db.validation":"false"})) >>> imported_gf.graphframe.vertices.show() +-------+------+---+---+ | name|gender| id|age| +-------+------+---+---+ | Bob| M| b| 36| | David| M| d| 29| |Charlie| M| c| 30| | Alice| F| a| 34| | Esther| F| e| 32| | Fanny| F| f| 36| +-------+------+---+---+ >>> imported_gf.graphframe.edges.show() +---+------------+---+ |dst|relationship|src| +---+------------+---+ | f| follow| e| | b| follow| c| | c| follow| b| | c| follow| f| | b| friend| a| | a| friend| d| | d| friend| e| | e| friend| a| +---+------------+---+ </skip> """ TkContext.validate(tc) scala_obj = tc.sc._jvm.org.trustedanalytics.sparktk.graph.internal.constructors.fromorientdb.ImportFromOrientdb scala_graph = scala_obj.importOrientdbGraph(tc.jutils.get_scala_sc(), orient_conf._scala, db_name, tc.jutils.convert.to_scala_option_map(db_properties)) from sparktk.graph.graph import Graph return Graph(tc, scala_graph)
def import_orientdb_graph(orient_conf, db_name, db_properties=None, tc=TkContext.implicit):
    """
    Import a graph from OrientDB to spark-tk as a spark-tk graph (Spark GraphFrame)

    Parameters
    ----------

    :param orient_conf: (OrientConf) configuration settings for the OrientDB database
    :param db_name: (str) the database name
    :param db_properties: (Optional(dict(str, any))) additional properties for the OrientDB database; for more
                          database property options, see http://orientdb.com/docs/2.1/Configuration.html

    Example
    -------

        >>> v = tc.frame.create([("a", "Alice", 34, "F"),
        ...                      ("b", "Bob", 36, "M"),
        ...                      ("c", "Charlie", 30, "M"),
        ...                      ("d", "David", 29, "M"),
        ...                      ("e", "Esther", 32, "F"),
        ...                      ("f", "Fanny", 36, "F"),
        ...                      ], ["id", "name", "age", "gender"])

        >>> e = tc.frame.create([("a", "b", "friend"),
        ...                      ("b", "c", "follow"),
        ...                      ("c", "b", "follow"),
        ...                      ("f", "c", "follow"),
        ...                      ("e", "f", "follow"),
        ...                      ("e", "d", "friend"),
        ...                      ("d", "a", "friend"),
        ...                      ("a", "e", "friend")
        ...                      ], ["src", "dst", "relationship"])

        >>> sparktk_graph = tc.graph.create(v, e)

        <skip>
        >>> hostname = "localhost"

        >>> port_number = "2424"

        >>> db_name = "GraphDatabase"

        >>> root_password = "******"

        >>> orient_conf = tc.graph.create_orientdb_conf(hostname, port_number, "admin", "admin", root_password)

        >>> sparktk_graph.export_to_orientdb(orient_conf,
        ...                                  db_name,
        ...                                  vertex_type_column_name="gender",
        ...                                  edge_type_column_name="relationship")

        >>> imported_gf = tc.graph.import_orientdb_graph(orient_conf, db_name, db_properties=({"db.validation": "false"}))

        >>> imported_gf.graphframe.vertices.show()
        +-------+------+---+---+
        |   name|gender| id|age|
        +-------+------+---+---+
        |    Bob|     M|  b| 36|
        |  David|     M|  d| 29|
        |Charlie|     M|  c| 30|
        |  Alice|     F|  a| 34|
        | Esther|     F|  e| 32|
        |  Fanny|     F|  f| 36|
        +-------+------+---+---+

        >>> imported_gf.graphframe.edges.show()
        +---+------------+---+
        |dst|relationship|src|
        +---+------------+---+
        |  f|      follow|  e|
        |  b|      follow|  c|
        |  c|      follow|  b|
        |  c|      follow|  f|
        |  b|      friend|  a|
        |  a|      friend|  d|
        |  d|      friend|  e|
        |  e|      friend|  a|
        +---+------------+---+
        </skip>

    """
    TkContext.validate(tc)
    scala_obj = tc.sc._jvm.org.trustedanalytics.sparktk.graph.internal.constructors.fromorientdb.ImportFromOrientdb
    scala_graph = scala_obj.importOrientdbGraph(
        tc.jutils.get_scala_sc(), orient_conf._scala, db_name,
        tc.jutils.convert.to_scala_option_map(db_properties))
    from sparktk.graph.graph import Graph
    return Graph(tc, scala_graph)
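# Hedged usage sketch (illustrative only): condenses the docstring example above into a helper.
# The hostname, port, credentials and database name are placeholders, and the call assumes the
# graph was previously exported with export_to_orientdb as shown above.
def _example_import_orientdb(tc, root_password):
    """Illustrative sketch: build an OrientConf and import a previously exported graph."""
    orient_conf = tc.graph.create_orientdb_conf("localhost", "2424", "admin", "admin", root_password)
    # db.validation is disabled here, matching the db_properties used in the docstring example.
    return tc.graph.import_orientdb_graph(orient_conf, "GraphDatabase",
                                          db_properties={"db.validation": "false"})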
def import_pandas(pandas_frame, schema=None, row_index=True, validate_schema=False, tc=TkContext.implicit):
    """
    Imports data from the specified pandas data frame.

    Parameters
    ----------

    :param pandas_frame: (pandas.DataFrame) pandas dataframe object
    :param schema: (Optional(list[tuples(string, type)])) Schema description of the fields for a given line.  It is a
                   list of tuples which describe each field, (field name, field type), where the field name is a
                   string and the field type is a supported type.  If no schema is provided, the schema will be
                   inferred based on the column names and types from the pandas_frame.
    :param row_index: (Optional(bool)) Indicates if the row_index is present in the pandas dataframe and needs to be
                      ignored when looking at the data values.  Default value is True.
    :param validate_schema: (Optional(bool)) If true, validates the data against the schema and attempts to cast the
                            data to the specified type, if it does not match the schema.  Defaults to False.
    :return: (Frame) spark-tk frame that contains data from the pandas_frame

    Examples
    --------

    Create a pandas data frame:

        >>> import pandas

        >>> ratings_data = [[0, "invalid"], [1, "Very Poor"], [2, "Poor"], [3, "Average"], [4, "Good"], [5, "Very Good"]]

        >>> df = pandas.DataFrame(ratings_data, columns=['rating_id', 'rating_text'])

        >>> df
           rating_id rating_text
        0          0     invalid
        1          1   Very Poor
        2          2        Poor
        3          3     Average
        4          4        Good
        5          5   Very Good

        >>> df.columns.tolist()
        ['rating_id', 'rating_text']

        >>> df.dtypes
        rating_id       int64
        rating_text    object
        dtype: object

    When calling import_pandas with just the pandas data frame, the column names and types from the pandas data frame
    are used to generate the schema.

        >>> frame = tc.frame.import_pandas(df)

        >>> frame.inspect()
        [#]  rating_id  rating_text
        ===========================
        [0]          0  invalid
        [1]          1  Very Poor
        [2]          2  Poor
        [3]          3  Average
        [4]          4  Good
        [5]          5  Very Good

        >>> frame.schema
        [('rating_id', long), ('rating_text', str)]

    Alternatively, you can specify a schema when importing the pandas data frame.  There is also the option to validate
    the data against the schema.  If this option is enabled, we will attempt to cast the data to the column's data
    type, if it does not match the schema.

    For example, here we will specify a schema where the rating_id column will instead be called 'rating_float' and
    its data type will be a float.  We will also enable the validate_schema option so that the rating_id value will
    get cast to a float:

        >>> schema = [("rating_float", float), ("rating_str", unicode)]

        >>> frame = tc.frame.import_pandas(df, schema, validate_schema=True)

        >>> frame.inspect()
        [#]  rating_float  rating_str
        =============================
        [0]           0.0  invalid
        [1]           1.0  Very Poor
        [2]           2.0  Poor
        [3]           3.0  Average
        [4]           4.0  Good
        [5]           5.0  Very Good

        >>> frame.schema
        [('rating_float', float), ('rating_str', unicode)]

    """
    try:
        import pandas
    except:
        raise RuntimeError("pandas module not found, unable to download.  Install pandas or try the take command.")

    if not isinstance(pandas_frame, pandas.DataFrame):
        raise TypeError("pandas_frame must be a pandas DataFrame.")
    TkContext.validate(tc)
    if schema is not None:
        schema = _validate(schema)
    else:
        schema = _get_schema_from_df(pandas_frame)

    if not row_index:
        pandas_frame = pandas_frame.reset_index()

    pandas_frame = pandas_frame.dropna(thresh=len(pandas_frame.columns))
    field_names = [x[0] for x in schema]
    if len(pandas_frame.columns) != len(field_names):
        raise ValueError("Number of columns in the pandas frame ({0}) does not match the number of columns in the "
                         "schema provided ({1}).".format(len(pandas_frame.columns), len(field_names)))

    date_time_columns = [i for i, x in enumerate(pandas_frame.dtypes) if x == "datetime64[ns]"]
    has_date_time = len(date_time_columns) > 0

    # pandas gives us the date/time in ns or as a Timestamp, and spark-tk expects it as ms, so we need to do the conversion
    def pandas_datetime_to_ms(row):
        for i in date_time_columns:
            if isinstance(row[i], long):
                row[i] = row[i] / 1000000
            elif isinstance(row[i], pandas.tslib.Timestamp) or isinstance(row[i], datetime):
                dt = row[i]
                # get the number of seconds since epoch (%s) and multiply by 1000 for ms, then add the
                # microseconds converted to ms to keep ms precision.
                row[i] = long((long(dt.strftime("%s")) * 1000) + (dt.microsecond // 1000))
        return row

    pandas_rows = pandas_frame[0:len(pandas_frame.index)].values.tolist()

    # if the dataframe has date/time columns, map them to ms
    if (has_date_time):
        pandas_rows = map(pandas_datetime_to_ms, pandas_rows)

    # create frame with the pandas_rows
    frame = tc.frame.create(pandas_rows, schema)

    if validate_schema:
        frame = tc.frame.create(frame.rdd, schema, validate_schema)

    return frame
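# Hedged usage sketch (illustrative only): the conversion above turns pandas datetime64[ns]
# columns into milliseconds since epoch when the frame is created.  The column names and
# timestamps below are made up for illustration.
def _example_import_pandas_with_datetime(tc):
    """Illustrative sketch: a datetime64[ns] column is stored as ms since epoch in the frame."""
    import pandas
    df = pandas.DataFrame([["a", pandas.Timestamp("2015-01-01 12:00:00")],
                           ["b", pandas.Timestamp("2015-06-15 08:30:00")]],
                          columns=["event", "when"])
    # "when" is datetime64[ns] in pandas; import_pandas converts each value to a long of ms.
    return tc.frame.import_pandas(df)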
def import_jdbc(connection_url, table_name, tc=TkContext.implicit):
    """
    Import data from a jdbc table into a frame.

    Parameters
    ----------

    :param connection_url: (str) JDBC connection url to the database server
    :param table_name: (str) JDBC table name
    :return: (Frame) returns a frame with the jdbc table data

    Examples
    --------

    Load a frame from a jdbc table by specifying the connection url to the database server.

        <skip>
        >>> url = "jdbc:postgresql://localhost/postgres"
        >>> tb_name = "demo_test"

        >>> frame = tc.frame.import_jdbc(url, tb_name)
        -etc-

        >>> frame.inspect()
        [#]  a  b    c   d
        ==================
        [0]  1  0.2  -2  5
        [1]  2  0.4  -1  6
        [2]  3  0.6   0  7
        [3]  4  0.8   1  8

        >>> frame.schema
        [(u'a', int), (u'b', float), (u'c', int), (u'd', int)]
        </skip>

    Notes
    -----

        java.sql.SQLException: No suitable driver found for <jdbcUrl>

    If this error is encountered while running your application, then your JDBC library cannot be found by the node
    running the application.  If you're running in Local mode, make sure that you have used the --driver-class-path
    parameter.  If a Spark cluster is involved, make sure that each cluster member has a copy of the library, and that
    each node of the cluster has been restarted since you modified the spark-defaults.conf file.  See this
    [site](https://sparkour.urizone.net/recipes/using-jdbc/).

    Sparktk does not come with any JDBC drivers.  A driver compatible with the JDBC data source must be supplied when
    creating the TkContext instance:

        <skip>
        >>> tc = sparktk.TkContext(pyspark_submit_args='--jars myJDBCDriver.jar')
        </skip>

    """
    if not isinstance(connection_url, basestring):
        raise ValueError("connection_url parameter must be a string, but is {0}.".format(type(connection_url)))
    if not isinstance(table_name, basestring):
        raise ValueError("table_name parameter must be a string, but is {0}.".format(type(table_name)))
    TkContext.validate(tc)

    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importJdbc(
        tc.jutils.get_scala_sc(), connection_url, table_name)

    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)
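# Hedged usage sketch (illustrative only): ties together the notes above -- sparktk ships no
# JDBC drivers, so the driver jar is passed when the TkContext is created.  The jar name is a
# placeholder; the connection url and table name are taken from the docstring example.
def _example_import_jdbc():
    """Illustrative sketch: create a TkContext with a JDBC driver jar and import a table."""
    import sparktk
    tc = sparktk.TkContext(pyspark_submit_args='--jars my-postgresql-driver.jar')
    return tc.frame.import_jdbc("jdbc:postgresql://localhost/postgres", "demo_test")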
def import_csv(path, delimiter=",", header=False, infer_schema=True, schema=None, tc=TkContext.implicit): """ Creates a frame with data from a csv file. Parameters ---------- :param path: (str) Full path to the csv file :param delimiter: (Optional[str]) A string which indicates the separation of data fields. This is usually a single character and could be a non-visible character, such as a tab. The default delimiter is a comma (,). :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns, and not be included in the data. The default value is false. :param infer_schema:(Optional[bool]) Boolean value indicating if the column types will be automatically inferred. It requires one extra pass over the data and is false by default. :param: schema: (Optional[List[tuple(str, type)]]) Optionally specify the schema for the dataset. Number of columns specified in the schema must match the number of columns in the csv file provided. If the value from the csv file cannot be converted to the data type specified by the schema (for example, if the csv file has a string, and the schema specifies an int), the value will show up as missing (None) in the frame. :return: (Frame) Frame that contains the data from the csv file Examples -------- Load a frame from a csv file by specifying the path to the file, delimiter, and options that specify that there is a header and to infer the schema based on the data. >>> file_path = "../integration-tests/datasets/cities.csv" >>> frame = tc.load_frame_from_csv(file_path, "|", header=True, infer_schema=True) -etc- >>> frame.inspect() [#] rank city population_2013 population_2010 change county ============================================================================ [0] 1 Portland 609456 583776 4.40% Multnomah [1] 2 Salem 160614 154637 3.87% Marion [2] 3 Eugene 159190 156185 1.92% Lane [3] 4 Gresham 109397 105594 3.60% Multnomah [4] 5 Hillsboro 97368 91611 6.28% Washington [5] 6 Beaverton 93542 89803 4.16% Washington [6] 15 Grants Pass 35076 34533 1.57% Josephine [7] 16 Oregon City 34622 31859 8.67% Clackamas [8] 17 McMinnville 33131 32187 2.93% Yamhill [9] 18 Redmond 27427 26215 4.62% Deschutes >>> frame.schema [('rank', int), ('city', str), ('population_2013', int), ('population_2010', int), ('change', str), ('county', str)] """ if schema is not None: infer_schema = False # if a custom schema is provided, don't waste time inferring the schema during load if not isinstance(header, bool): raise ValueError("header parameter must be a boolean, but is {0}.".format(type(header))) if not isinstance(infer_schema, bool): raise ValueError("infer_schema parameter must be a boolean, but is {0}.".format(type(infer_schema))) TkContext.validate(tc) header_str = str(header).lower() infer_schema_str = str(infer_schema).lower() pyspark_schema = None if (not infer_schema) and (schema is not None): fields = [] for column in schema: if dtypes._data_type_to_pyspark_type_table.has_key(column[1]): fields.append(StructField(column[0], dtypes._data_type_to_pyspark_type_table[column[1]], True)) else: raise TypeError("Unsupported type {0} in schema for column {1}.".format(column[1], column[0])) pyspark_schema = StructType(fields) df = tc.sql_context.read.format( "com.databricks.spark.csv.org.trustedanalytics.sparktk").options( delimiter=delimiter, header=header_str, dateformat="yyyy-MM-dd'T'HH:mm:ss.SSSX", inferschema=infer_schema_str).load(path, schema=pyspark_schema) df_schema = [] if schema is None: for column in df.schema.fields: try: 
datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(column.dataType)) except ValueError: raise TypeError("Unsupported data type ({0}) for column {1}.".format(str(column.dataType), column.name)) df_schema.append((column.name, datatype)) else: df_column_count = len(df.schema.fields) custom_column_count = len(schema) if (df_column_count != custom_column_count): raise ValueError("Bad schema value. The number of columns in the custom schema ({0}) must match the" "number of columns in the csv file data ({1}).".format(custom_column_count, df_column_count)) df_schema = schema def cast_datetime(row): """ The spark data frame gives uses datetime objects. Convert them to long (ms since epoch) for our frame. """ data = [] for column_index in xrange(0, len(df_schema)): if df_schema[column_index][1] == dtypes.datetime and isinstance(row[column_index], datetime): data.append(long(dtypes.datetime_to_ms(row[column_index]))) else: data.append(row[column_index]) return data jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(df._jdf.rdd()) rdd = RDD(jrdd, tc.sc) if any(c[1] == dtypes.datetime for c in df_schema): # If any columns are date/time we must do this map rdd = df.rdd.map(cast_datetime) from sparktk.frame.frame import Frame # circular dependency, so import late return Frame(tc, rdd, df_schema)
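# Hedged usage sketch (illustrative only): the schema parameter notes above say that a value
# which cannot be converted to the declared type shows up as missing (None).  Forcing the string
# "city" column to int below demonstrates that behaviour; the dataset path is the one used in
# the docstring example.
def _example_import_csv_bad_cast(tc):
    """Illustrative sketch: string city names cannot be cast to int, so they become None."""
    schema = [("rank", int), ("city", int), ("population_2013", int),
              ("population_2010", int), ("change", str), ("county", str)]
    return tc.frame.import_csv("../integration-tests/datasets/cities.csv", "|",
                               header=True, schema=schema)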
def import_orientdb_graph(db_url, user_name, password, root_password, tc=TkContext.implicit):
    """
    Import graph from OrientDB to spark-tk as spark-tk graph (Spark GraphFrame)

    Parameters
    ----------

    :param db_url: (str) OrientDB URI
    :param user_name: (str) the database username
    :param password: (str) the database password
    :param root_password: (str) OrientDB server password

    Example
    -------

        >>> v = tc.frame.create([("a", "Alice", 34, "F"),
        ...                      ("b", "Bob", 36, "M"),
        ...                      ("c", "Charlie", 30, "M"),
        ...                      ("d", "David", 29, "M"),
        ...                      ("e", "Esther", 32, "F"),
        ...                      ("f", "Fanny", 36, "F"),
        ...                      ], ["id", "name", "age", "gender"])

        >>> e = tc.frame.create([("a", "b", "friend"),
        ...                      ("b", "c", "follow"),
        ...                      ("c", "b", "follow"),
        ...                      ("f", "c", "follow"),
        ...                      ("e", "f", "follow"),
        ...                      ("e", "d", "friend"),
        ...                      ("d", "a", "friend"),
        ...                      ("a", "e", "friend")
        ...                      ], ["src", "dst", "relationship"])

        >>> sparktk_graph = tc.graph.create(v, e)

        <skip>
        >>> db = "test_db"

        >>> sparktk_graph.export_to_orientdb(db_url="remote:hostname:2424/%s" % db, user_name="admin",
        ...                                  password="******", root_password="******",
        ...                                  vertex_type_column_name="gender", edge_type_column_name="relationship")

        >>> imported_gf = tc.graph.import_orientdb_graph(db_url="remote:hostname:2424/%s" % db, user_name="admin",
        ...                                              password="******", root_password="******")

        >>> imported_gf.graphframe.vertices.show()
        +-------+------+---+---+
        |   name|gender| id|age|
        +-------+------+---+---+
        |    Bob|     M|  b| 36|
        |  David|     M|  d| 29|
        |Charlie|     M|  c| 30|
        |  Alice|     F|  a| 34|
        | Esther|     F|  e| 32|
        |  Fanny|     F|  f| 36|
        +-------+------+---+---+

        >>> imported_gf.graphframe.edges.show()
        +---+------------+---+
        |dst|relationship|src|
        +---+------------+---+
        |  f|      follow|  e|
        |  b|      follow|  c|
        |  c|      follow|  b|
        |  c|      follow|  f|
        |  b|      friend|  a|
        |  a|      friend|  d|
        |  d|      friend|  e|
        |  e|      friend|  a|
        +---+------------+---+
        </skip>

    """
    TkContext.validate(tc)
    scala_graph = tc.sc._jvm.org.trustedanalytics.sparktk.graph.internal.constructors.fromorientdb.ImportFromOrientdb.importOrientdbGraph(
        tc.jutils.get_scala_sc(), db_url, user_name, password, root_password)
    from sparktk.graph.graph import Graph
    return Graph(tc, scala_graph)