def import_dcm(dicom_dir_path, tc=TkContext.implicit): """ Creates a dicom object with metadataFrame and pixeldataFrame from a dcm file(s) Parameters ---------- :param dicom_dir_path: (str) Local/HDFS path of the dcm file(s) :return: (Dicom) returns a dicom object with metadata and pixeldata frames Examples -------- #Path can be local/hdfs to dcm file(s) >>> dicom_path = "../datasets/dicom_uncompressed" #use import_dcm available inside dicom module to create a dicom object from given dicom_path >>> dicom = tc.dicom.import_dcm(dicom_path) #Type of dicom object created >>> type(dicom) <class 'sparktk.dicom.dicom.Dicom'> #Inspect metadata property to see dicom metadata xml content <skip> >>> dicom.metadata.inspect(truncate=30) [#] id metadata ======================================= [0] 0 <?xml version="1.0" encodin... [1] 1 <?xml version="1.0" encodin... [2] 2 <?xml version="1.0" encodin... </skip> #pixeldata property is sparktk frame >>> pixeldata = dicom.pixeldata.take(1) <skip> >>> pixeldata [[0L, array([[ 0., 0., 0., ..., 0., 0., 0.], [ 0., 7., 5., ..., 5., 7., 8.], [ 0., 7., 6., ..., 5., 6., 7.], ..., [ 0., 6., 7., ..., 5., 5., 6.], [ 0., 2., 5., ..., 5., 5., 4.], [ 1., 1., 3., ..., 1., 1., 0.]])]] </skip> """ if not isinstance(dicom_dir_path, basestring): raise ValueError( "dicom_dir_path parameter must be a string, but is {0}.".format( type(dicom_dir_path))) TkContext.validate(tc) scala_dicom = tc.sc._jvm.org.trustedanalytics.sparktk.dicom.internal.constructors.Import.importDcm( tc.jutils.get_scala_sc(), dicom_dir_path) from sparktk.dicom.dicom import Dicom return Dicom._from_scala(tc, scala_dicom)
def create(source_or_vertices_frame, edges_frame=None, tc=TkContext.implicit):
    """
    Create a sparktk Graph from two sparktk Frames (or some other source)

    Parameters
    ----------

    :param source_or_vertices_frame: a graph source or a vertices frame.  Valid sources include a Python or
                                     Spark GraphFrame, or a Scala Graph.  Otherwise, if a vertices frame is
                                     provided, the edges_frame argument must also be supplied.  A vertices
                                     frame defines the vertices for the graph and must have a schema with a
                                     column named "id" which provides a unique vertex ID.  All other columns
                                     are treated as vertex properties.  If a column named "vertex_type" is
                                     also found, it will be used as a special label to denote the type of
                                     vertex, for example, when interfacing with logic (such as a graph DB)
                                     which expects a specific vertex type.
    :param edges_frame: (valid only if the source_or_vertices_frame arg is a vertices frame)  An edges frame
                        defines the edges of the graph; its schema must have columns named "src" and "dst"
                        which provide the vertex ids of the edge.  All other columns are treated as edge
                        properties.  If a column named "edge_type" is also found, it will be used as a
                        special label to denote the type of edge, for example, when interfacing with logic
                        (such as a graph DB) which expects a specific edge type.
    """
    TkContext.validate(tc)
    from sparktk.graph.graph import Graph
    return Graph(tc, source_or_vertices_frame, edges_frame)
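# A minimal usage sketch (added for illustration, not from the original source): the helper
# name `_example_create_graph` and the sample rows are hypothetical, and a TkContext `tc` is
# assumed.  It shows a vertices frame carrying the required "id" column and an edges frame
# carrying the required "src"/"dst" columns, mirroring the pattern used in the
# import_orientdb_graph examples later in this module.
def _example_create_graph(tc):
    # vertices: "id" is required; other columns become vertex properties
    v = tc.frame.create([("a", "Alice", 34),
                         ("b", "Bob", 36)],
                        ["id", "name", "age"])
    # edges: "src" and "dst" are required; other columns become edge properties
    e = tc.frame.create([("a", "b", "friend")],
                        ["src", "dst", "relationship"])
    return tc.graph.create(v, e)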
def import_dcm(dicom_dir_path, min_partitions=2, tc=TkContext.implicit): """ Creates a dicom object with metadataFrame and pixeldataFrame from a dcm file(s) Parameters ---------- :param dicom_dir_path: (str) Local/HDFS path of the dcm file(s) :param min_partitions: (int) Minimum no.of HDFS partitions to use for import dcm :return: (Dicom) returns a dicom object with metadata and pixeldata frames Examples -------- #Path can be local/hdfs to dcm file(s) >>> dicom_path = "../datasets/dicom_uncompressed" #use import_dcm available inside dicom module to create a dicom object from given dicom_path >>> dicom = tc.dicom.import_dcm(dicom_path) #Type of dicom object created >>> type(dicom) <class 'sparktk.dicom.dicom.Dicom'> #Inspect metadata property to see dicom metadata xml content <skip> >>> dicom.metadata.inspect(truncate=30) [#] id metadata ======================================= [0] 0 <?xml version="1.0" encodin... [1] 1 <?xml version="1.0" encodin... [2] 2 <?xml version="1.0" encodin... </skip> #pixeldata property is sparktk frame >>> pixeldata = dicom.pixeldata.take(1) <skip> >>> pixeldata [[0L, array([[ 0., 0., 0., ..., 0., 0., 0.], [ 0., 7., 5., ..., 5., 7., 8.], [ 0., 7., 6., ..., 5., 6., 7.], ..., [ 0., 6., 7., ..., 5., 5., 6.], [ 0., 2., 5., ..., 5., 5., 4.], [ 1., 1., 3., ..., 1., 1., 0.]])]] </skip> """ require_type.non_empty_str(dicom_dir_path, "dicom_dir_path") require_type.non_negative_int(min_partitions, "min_partitions") TkContext.validate(tc) scala_dicom = tc.sc._jvm.org.trustedanalytics.sparktk.dicom.internal.constructors.Import.importDcm( tc.jutils.get_scala_sc(), dicom_dir_path, min_partitions) from sparktk.dicom.dicom import Dicom return Dicom._from_scala(tc, scala_dicom)
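# A hedged usage sketch (not from the original docstring): the helper name and the partition
# count are illustrative; the dataset path comes from the example above.  Passing
# min_partitions controls the minimum number of HDFS partitions used while reading .dcm files.
def _example_import_dcm_with_partitions(tc):
    return tc.dicom.import_dcm("../datasets/dicom_uncompressed", min_partitions=4)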
def create_orientdb_conf(hostname, port_number, db_user_name, db_password, root_password, tc=TkContext.implicit):
    """
    Create OrientDB connection settings to be passed to export_to_orientdb and import_orientdb_graph APIs.

    Parameters
    ----------

    :param hostname: (str) OrientDB server hostname
    :param port_number: (str) OrientDB server port number
    :param db_user_name: (str) OrientDB database user name
    :param db_password: (str) the database password
    :param root_password: (str) OrientDB server root password
    :return: (OrientdbConf) OrientDB connection settings

    Example
    -------

        >>> hostname = "localhost"

        >>> port_number = "2424"

        >>> root_password = "******"

        >>> orient_conf = tc.graph.create_orientdb_conf(hostname,
        ...                                             port_number,
        ...                                             "admin",
        ...                                             "admin",
        ...                                             root_password)

        >>> orient_conf
        db_password = admin
        db_user_name = admin
        hostname = localhost
        port_number = 2424
        root_password = root

    """
    TkContext.validate(tc)
    scala_obj = tc.sc._jvm.org.trustedanalytics.sparktk.graph.internal.ops.orientdb.OrientdbConnection
    return OrientdbConf(tc,
                        scala_obj.createOrientdbConf(hostname,
                                                     port_number,
                                                     db_user_name,
                                                     db_password,
                                                     root_password))
def set_orientdb_configurations(hostname, port_number, db_user_name, db_password, root_password, db_properties=None, batch_size=1000, tc=TkContext.implicit):
    """
    Set OrientDB configurations to be passed to export_to_orientdb and import_orientdb_graph APIs.

    Parameters
    ----------

    :param hostname: (str) OrientDB server hostname
    :param port_number: (str) OrientDB server port number
    :param db_user_name: (str) OrientDB database user name
    :param db_password: (str) the database password
    :param root_password: (str) OrientDB server root password
    :param db_properties: (Optional(dict(str,any))) additional properties for the OrientDB database
    :param batch_size: (int) batch size for graph ETL to the OrientDB database
    :return: (OrientConf) OrientDB configurations

    Example
    -------

        >>> hostname = "localhost"

        >>> port_number = "2424"

        >>> root_password = "******"

        >>> orient_conf = tc.graph.set_orientdb_configurations(hostname, port_number, "admin", "admin", root_password)

        >>> orient_conf
        batch_size = 1000
        db_password = admin
        db_properties = None
        db_user_name = admin
        hostname = localhost
        port_number = 2424
        root_password = root

    """
    TkContext.validate(tc)
    scala_obj = tc.sc._jvm.org.trustedanalytics.sparktk.graph.internal.ops.orientdb.ExportToOrientdb
    return OrientConf(tc,
                      scala_obj.setOrientdbConfigurations(hostname,
                                                          port_number,
                                                          db_user_name,
                                                          db_password,
                                                          root_password,
                                                          tc.jutils.convert.to_scala_option_map(db_properties),
                                                          batch_size))
def import_jdbc(connection_url, table_name, tc=TkContext.implicit): """ Import data from jdbc table into frame. Parameters ---------- :param connection_url: (str) JDBC connection url to database server :param table_name: (str) JDBC table name :return: (Frame) returns frame with jdbc table data Examples -------- Load a frame from a jdbc table specifying the connection url to the database server. <skip> >>> url = "jdbc:postgresql://localhost/postgres" >>> tb_name = "demo_test" >>> frame = tc.frame.import_jdbc(url, tb_name) -etc- >>> frame.inspect() [#] a b c d ================== [0] 1 0.2 -2 5 [1] 2 0.4 -1 6 [2] 3 0.6 0 7 [3] 4 0.8 1 8 >>> frame.schema [(u'a', int), (u'b', float), (u'c', int), (u'd', int)] </skip> """ if not isinstance(connection_url, basestring): raise ValueError( "connection url parameter must be a string, but is {0}.".format( type(connection_url))) if not isinstance(table_name, basestring): raise ValueError( "table name parameter must be a string, but is {0}.".format( type(table_name))) TkContext.validate(tc) scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importJdbc( tc.jutils.get_scala_sc(), connection_url, table_name) from sparktk.frame.frame import Frame return Frame(tc, scala_frame)
def import_orientdb_graph(db_url, user_name, password, root_password, tc=TkContext.implicit):
    """
    Import a graph from OrientDB into spark-tk as a spark-tk graph (Spark GraphFrame)

    Parameters
    ----------

    :param db_url: (str) OrientDB URI
    :param user_name: (str) the database username
    :param password: (str) the database password
    :param root_password: (str) OrientDB server root password
    """
    TkContext.validate(tc)
    scala_graph = tc.sc._jvm.org.trustedanalytics.sparktk.graph.internal.constructors.fromorientdb.ImportFromOrientdb.importOrientdbGraph(
        tc.jutils.get_scala_sc(), db_url, user_name, password, root_password)
    from sparktk.graph.graph import Graph
    return Graph(tc, scala_graph)
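# A hedged usage sketch (the helper name and connection values are placeholders): importing a
# graph from a remote OrientDB database using the signature documented above.  The
# "remote:hostname:2424/<db>" URI form follows the export_to_orientdb example later in this
# module.
def _example_import_orientdb(tc):
    return tc.graph.import_orientdb_graph("remote:hostname:2424/test_db",
                                          user_name="admin",
                                          password="admin",
                                          root_password="root")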
def import_hbase(table_name, schema, start_tag=None, end_tag=None, tc=TkContext.implicit):
    """
    Import data from hbase table into frame

    :param table_name: (str) hbase table name
    :param schema: (list[list[str, str, type]]) hbase schema as a list of lists (columnFamily, columnName,
                   dataType for the cell value)
    :param start_tag: (Optional(str)) optional start tag for filtering
    :param end_tag: (Optional(str)) optional end tag for filtering
    :return: (Frame) frame with data from hbase table

    Example
    ---------

    Load data into frame from a hbase table

        <skip>
        >>> frame = tc.frame.import_hbase("demo_test_hbase", [["test_family", "a", int],["test_family", "b", float], ["test_family", "c", int],["test_family", "d", int]])
        -etc-

        >>> frame.inspect()
        [#]  test_family_a  test_family_b  test_family_c  test_family_d
        ===============================================================
        [0]              1            0.2             -2              5
        [1]              2            0.4             -1              6
        [2]              3            0.6              0              7
        [3]              4            0.8              1              8

    Use of start_tag and end_tag (hbase creates a unique row id for the data in hbase tables):

        start_tag: the unique row id from which the row scan should start
        end_tag: the unique row id at which the row scan should end

    Assume the hbase table "test_startendtag" has a single column named "number" under the "startendtag"
    column family, containing values from 1 to 99, with row ids generated by hbase.  A few sample rows
    from that table look as below:

        hbase(main):002:0> scan "test_startendtag"
        ROW             COLUMN+CELL
        0               column=startendtag:number, timestamp=1465342524846, value=1
        1               column=startendtag:number, timestamp=1465342524846, value=25
        10              column=startendtag:number, timestamp=1465342524847, value=51
        103             column=startendtag:number, timestamp=1465342524851, value=98
        107             column=startendtag:number, timestamp=1465342524851, value=99
        11              column=startendtag:number, timestamp=1465342524851, value=75
        12              column=startendtag:number, timestamp=1465342524846, value=4
        13              column=startendtag:number, timestamp=1465342524846, value=28
        14              column=startendtag:number, timestamp=1465342524847, value=52
        15              column=startendtag:number, timestamp=1465342524851, value=76
        16              column=startendtag:number, timestamp=1465342524846, value=5
        17              column=startendtag:number, timestamp=1465342524846, value=29
        18              column=startendtag:number, timestamp=1465342524847, value=53
        19              column=startendtag:number, timestamp=1465342524851, value=77
        2               column=startendtag:number, timestamp=1465342524847, value=49
        20              column=startendtag:number, timestamp=1465342524846, value=6
        21              column=startendtag:number, timestamp=1465342524846, value=30

        >>> frame = tc.frame.import_hbase("test_startendtag", [["startendtag", "number", int]], start_tag="20", end_tag="50")
        -etc-

        >>> frame.count()
        33

        >>> frame.inspect(33)
        [##]  startendtag_number
        ========================
        [0]                    6
        [1]                   30
        [2]                   54
        [3]                   78
        [4]                    7
        [5]                   31
        [6]                   55
        [7]                   79
        [8]                    8
        [9]                   32
        [10]                  73
        [11]                  56
        [12]                  80
        [13]                   9
        [14]                  33
        [15]                  57
        [16]                  81
        [17]                  10
        [18]                  34
        [19]                  58

        [##]  startendtag_number
        ========================
        [20]                  82
        [21]                   2
        [22]                  11
        [23]                  35
        [24]                  59
        [25]                  83
        [26]                  12
        [27]                  36
        [28]                  60
        [29]                  84
        [30]                  13
        [31]                  37
        [32]                  26
        </skip>

    """
    if not isinstance(table_name, basestring):
        raise ValueError("table name parameter must be a string, but is {0}.".format(type(table_name)))
    if not isinstance(schema, list):
        raise ValueError("schema parameter must be a list, but is {0}.".format(type(schema)))
    TkContext.validate(tc)
    inner_lists = [tc._jutils.convert.to_scala_list([item[0], item[1], dtypes.to_string(item[2])])
                   for item in schema]
    scala_final_schema = tc.jutils.convert.to_scala_list(inner_lists)

    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importHbase(
        tc.jutils.get_scala_sc(), table_name, scala_final_schema,
        tc._jutils.convert.to_scala_option(start_tag), tc._jutils.convert.to_scala_option(end_tag))
    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)
def import_csv_raw(path, delimiter=",", header=False, tc=TkContext.implicit): """ Creates a frame by importing the data as strings from the specified csv file. If the csv file has a header row, those values will be used as column names. Otherwise, columns will be named generically, like 'C0', 'C1', 'C2', etc. Parameters ---------- :param path: (str) Full path to the csv file :param delimiter: (str) A string which indicates the separation of data fields. This is usually a single character and could be a non-visible character, such as a tab. The default delimiter is a comma (,). :param header: (bool) Boolean value indicating if the first line of the file will be used to name columns, and not be included in the data. The default value is false. :return: (Frame) Frame that contains the data from the csv file Examples -------- Import raw data from a csv file by specifying the path to the file, delimiter, and header option. All data will be brought in the frame as strings, and columns will be named according to the header row, if there was one. >>> file_path = "../datasets/cities.csv" >>> frame = tc.frame.import_csv_raw(file_path, delimiter="|", header=True) -etc- >>> frame.inspect() [#] rank city population_2013 population_2010 change county ============================================================================ [0] 1 Portland 609456 583776 4.40% Multnomah [1] 2 Salem 160614 154637 3.87% Marion [2] 3 Eugene 159190 156185 1.92% Lane [3] 4 Gresham 109397 105594 3.60% Multnomah [4] 5 Hillsboro 97368 91611 6.28% Washington [5] 6 Beaverton 93542 89803 4.16% Washington [6] 15 Grants Pass 35076 34533 1.57% Josephine [7] 16 Oregon City 34622 31859 8.67% Clackamas [8] 17 McMinnville 33131 32187 2.93% Yamhill [9] 18 Redmond 27427 26215 4.62% Deschutes >>> frame.schema [('rank', <type 'str'>), ('city', <type 'str'>), ('population_2013', <type 'str'>), ('population_2010', <type 'str'>), ('change', <type 'str'>), ('county', <type 'str'>)] """ TkContext.validate(tc) require_type.non_empty_str(path, "path") require_type.non_empty_str(delimiter, "delimiter") require_type(bool, header, "header") df = tc.sql_context.read.format( "com.databricks.spark.csv.org.trustedanalytics.sparktk").options( delimiter=delimiter, header=str(header).lower(), inferschema="false").load(path, schema=None) df_schema = [] for column in df.schema.fields: try: datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type( type(column.dataType)) except ValueError: raise TypeError( "Unsupported data type ({0}) for column {1}.".format( str(column.dataType), column.name)) df_schema.append((column.name, datatype)) jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython( df._jdf.rdd()) rdd = RDD(jrdd, tc.sc) from sparktk.frame.frame import Frame # circular dependency, so import late return Frame(tc, rdd, df_schema)
def import_csv(path, delimiter=",", header=False, infer_schema=True, schema=None, tc=TkContext.implicit): """ Creates a frame with data from a csv file. Parameters ---------- :param path: (str) Full path to the csv file :param delimiter: (Optional[str]) A string which indicates the separation of data fields. This is usually a single character and could be a non-visible character, such as a tab. The default delimiter is a comma (,). :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns, and not be included in the data. The default value is false. :param infer_schema:(Optional[bool]) Boolean value indicating if the column types will be automatically inferred. It requires one extra pass over the data and is false by default. :param: schema: (Optional[List[tuple(str, type)]]) Optionally specify the schema for the dataset. Number of columns specified in the schema must match the number of columns in the csv file provided. :return: (Frame) Frame that contains the data from the csv file Examples -------- Load a frame from a csv file by specifying the path to the file, delimiter, and options that specify that there is a header and to infer the schema based on the data. >>> file_path = "../integration-tests/datasets/cities.csv" >>> frame = tc.load_frame_from_csv(file_path, "|", header=True, infer_schema=True) -etc- >>> frame.inspect() [#] rank city population_2013 population_2010 change county ============================================================================ [0] 1 Portland 609456 583776 4.40% Multnomah [1] 2 Salem 160614 154637 3.87% Marion [2] 3 Eugene 159190 156185 1.92% Lane [3] 4 Gresham 109397 105594 3.60% Multnomah [4] 5 Hillsboro 97368 91611 6.28% Washington [5] 6 Beaverton 93542 89803 4.16% Washington [6] 15 Grants Pass 35076 34533 1.57% Josephine [7] 16 Oregon City 34622 31859 8.67% Clackamas [8] 17 McMinnville 33131 32187 2.93% Yamhill [9] 18 Redmond 27427 26215 4.62% Deschutes >>> frame.schema [('rank', int), ('city', str), ('population_2013', int), ('population_2010', int), ('change', str), ('county', str)] """ if schema is not None: infer_schema = False # if a custom schema is provided, don't waste time inferring the schema during load if not isinstance(header, bool): raise ValueError( "header parameter must be a boolean, but is {0}.".format( type(header))) if not isinstance(infer_schema, bool): raise ValueError( "infer_schema parameter must be a boolean, but is {0}.".format( type(infer_schema))) TkContext.validate(tc) header_str = str(header).lower() infer_schema_str = str(infer_schema).lower() pyspark_schema = None if (not infer_schema) and (schema is not None): fields = [] for column in schema: if dtypes._data_type_to_pyspark_type_table.has_key(column[1]): fields.append( StructField( column[0], dtypes._data_type_to_pyspark_type_table[column[1]], True)) else: raise TypeError( "Unsupported type {0} in schema for column {1}.".format( column[1], column[0])) pyspark_schema = StructType(fields) df = tc.sql_context.read.format( "com.databricks.spark.csv.org.trustedanalytics.sparktk").options( delimiter=delimiter, header=header_str, dateformat="yyyy-MM-dd'T'HH:mm:ss.SSSX", inferschema=infer_schema_str).load(path, schema=pyspark_schema) df_schema = [] if schema is None: for column in df.schema.fields: try: datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type( type(column.dataType)) except ValueError: raise TypeError( "Unsupported data type ({0}) for column {1}.".format( str(column.dataType), column.name)) 
df_schema.append((column.name, datatype)) else: df_column_count = len(df.schema.fields) custom_column_count = len(schema) if (df_column_count != custom_column_count): raise ValueError( "Bad schema value. The number of columns in the custom schema ({0}) must match the" "number of columns in the csv file data ({1}).".format( custom_column_count, df_column_count)) df_schema = schema def cast_datetime(row): """ The spark data frame gives uses datetime objects. Convert them to long (ms since epoch) for our frame. """ data = [] for column_index in xrange(0, len(df_schema)): if df_schema[column_index][1] == dtypes.datetime and isinstance( row[column_index], datetime): data.append(long(dtypes.datetime_to_ms(row[column_index]))) else: data.append(row[column_index]) return data jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython( df._jdf.rdd()) rdd = RDD(jrdd, tc.sc) if any(c[1] == dtypes.datetime for c in df_schema): # If any columns are date/time we must do this map rdd = df.rdd.map(cast_datetime) from sparktk.frame.frame import Frame # circular dependency, so import late return Frame(tc, rdd, df_schema)
def import_hive(hive_query, tc=TkContext.implicit):
    """
    Import data from hive table into frame.

    Define the sql query to retrieve the data from a hive table.

    Only a subset of Hive data types are supported:

        DataType    Support
        ----------  ------------------------------------
        boolean     cast to int
        bigint      native support
        int         native support
        tinyint     cast to int
        smallint    cast to int
        decimal     cast to double, may lose precision
        double      native support
        float       native support
        date        cast to string
        string      native support
        timestamp   cast to string
        varchar     cast to string
        arrays      not supported
        binary      not supported
        char        not supported
        maps        not supported
        structs     not supported
        union       not supported

    Parameters
    ----------

    :param hive_query: (str) hive query to fetch data from table
    :param tc: (TkContext) TK context
    :return: (Frame) returns frame with hive table data

    Examples
    --------

    Load data into frame from a hive table based on hive query

        <skip>
        >>> h_query = "select * from demo_test"
        >>> frame = tc.frame.import_hive(h_query)
        -etc-

        >>> frame.inspect()
        [#]  number  strformat
        ======================
        [0]       1  one
        [1]       2  two
        [2]       3  three
        [3]       4  four
        </skip>
    """
    if not isinstance(hive_query, basestring):
        raise ValueError("hive query parameter must be a string, but is {0}.".format(type(hive_query)))
    TkContext.validate(tc)
    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importHive(tc.jutils.get_scala_sc(), hive_query)
    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)
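# A hedged sketch (the helper name, table, and column names are hypothetical): Hive types
# that import_hive does not support (e.g. char, binary, arrays) can often be cast to a
# supported type inside the query itself before the frame is created.
def _example_import_hive_with_cast(tc):
    query = "SELECT id, CAST(code AS STRING) AS code FROM demo_table"
    return tc.frame.import_hive(query)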
def import_csv(path, delimiter=",", header=False, schema=None, datetime_format="yyyy-MM-dd'T'HH:mm:ss.SSSX", tc=TkContext.implicit): """ Creates a frame with data from a csv file. Parameters ---------- :param path: (str) Full path to the csv file :param delimiter: (Optional[str]) A string which indicates the separation of data fields. This is usually a single character and could be a non-visible character, such as a tab. The default delimiter is a comma (,). :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns (unless a schema is provided), and not be included in the data. The default value is false. :param schema: (Optional(list[tuple(str, type)] or list[str])) The are different options for specifying a schema: * Provide the full schema for the frame as a list of tuples (string column name and data type) * Provide the column names as a list of strings. Column data types will be inferred, based on the data. The column names specified will override column names that are found in the header row. * None, where the schema is automatically inferred based on the data. Columns are named based on the header, or will be named generically ("C0", "C1", "C2", etc). :param datetime_format: (str) String specifying how date/time columns are formatted, using the java.text.SimpleDateFormat specified at https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html :return: (Frame) Frame that contains the data from the csv file Examples -------- Load a frame from a csv file by specifying the path to the file, delimiter >>> file_path = "../datasets/cities.csv" >>> frame = tc.frame.import_csv(file_path, "|", header=True) -etc- >>> frame.inspect() [#] rank city population_2013 population_2010 change county ============================================================================ [0] 1 Portland 609456 583776 4.40% Multnomah [1] 2 Salem 160614 154637 3.87% Marion [2] 3 Eugene 159190 156185 1.92% Lane [3] 4 Gresham 109397 105594 3.60% Multnomah [4] 5 Hillsboro 97368 91611 6.28% Washington [5] 6 Beaverton 93542 89803 4.16% Washington [6] 15 Grants Pass 35076 34533 1.57% Josephine [7] 16 Oregon City 34622 31859 8.67% Clackamas [8] 17 McMinnville 33131 32187 2.93% Yamhill [9] 18 Redmond 27427 26215 4.62% Deschutes >>> frame.schema [('rank', <type 'int'>), ('city', <type 'str'>), ('population_2013', <type 'int'>), ('population_2010', <type 'int'>), ('change', <type 'str'>), ('county', <type 'str'>)] The schema parameter can be used to specify a custom schema (column names and data types) or column names (and the data types are inferred based on the data). Here, we will specify the column names, which will override the header from the csv file. 
>>> column_names = ["Rank", "City", "2013", "2010", "Percent_Change", "County"] >>> frame = tc.frame.import_csv(file_path, "|", header=True, schema=column_names) -etc- >>> frame.schema [('Rank', <type 'int'>), ('City', <type 'str'>), ('2013', <type 'int'>), ('2010', <type 'int'>), ('Percent_Change', <type 'str'>), ('County', <type 'str'>)] <hide> >>> file_path = "../datasets/unicode.csv" >>> schema = [("a", unicode),("b", unicode),("c",unicode)] >>> frame = tc.frame.import_csv(file_path, schema=schema, header=False) -etc- >>> print unicode(frame.get_inspect()).encode('utf-8') # because this file is UT8 and this docstring is str [#] a b c ============ [0] à ë ñ [1] ã ê ü </hide> """ TkContext.validate(tc) require_type.non_empty_str(path, "path") require_type.non_empty_str(delimiter, "delimiter") require_type(bool, header, "header") require_type(str, datetime_format, "datetime_format") infer_schema = True column_names = [] # custom column names if schema is not None: if not isinstance(schema, list): raise TypeError("Unsupported type %s for schema parameter." % type(schema)) elif all(isinstance(item, basestring) for item in schema): # schema is just column names column_names = schema schema = None else: infer_schema = False # if a custom schema is provided, don't waste time inferring the schema during load sparktk_schema.validate(schema) header_str = str(header).lower() infer_schema_str = str(infer_schema).lower() pyspark_schema = None if schema is not None: fields = [] for column in schema: if dtypes._data_type_to_pyspark_type_table.has_key(column[1]): fields.append(StructField(column[0], dtypes._data_type_to_pyspark_type_table[column[1]], True)) else: raise TypeError("Unsupported type {0} in schema for column {1}.".format(column[1], column[0])) pyspark_schema = StructType(fields) df = tc.sql_context.read.format( "com.databricks.spark.csv.org.trustedanalytics.sparktk").options( delimiter=delimiter, header=header_str, dateformat=datetime_format, inferschema=infer_schema_str).load(path, schema=pyspark_schema) df_schema = [] if schema is None: for i, column in enumerate(df.schema.fields): try: datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(column.dataType)) except ValueError: raise TypeError("Unsupported data type ({0}) for column {1}.".format(str(column.dataType), column.name)) column_name = column_names[i] if (i < len(column_names)) else column.name df_schema.append((column_name, datatype)) else: df_column_count = len(df.schema.fields) custom_column_count = len(schema) if (df_column_count != custom_column_count): raise ValueError("Bad schema value. The number of columns in the custom schema ({0}) must match the" "number of columns in the csv file data ({1}).".format(custom_column_count, df_column_count)) df_schema = schema def cast_datetime(row): """ The spark data frame gives uses datetime objects. Convert them to long (ms since epoch) for our frame. 
""" data = [] for column_index in xrange(0, len(df_schema)): if df_schema[column_index][1] == dtypes.datetime and isinstance(row[column_index], datetime): data.append(long(dtypes.datetime_to_ms(row[column_index]))) else: data.append(row[column_index]) return data jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(df._jdf.rdd()) rdd = RDD(jrdd, tc.sc) if any(c[1] == dtypes.datetime for c in df_schema): # If any columns are date/time we must do this map rdd = df.rdd.map(cast_datetime) from sparktk.frame.frame import Frame # circular dependency, so import late return Frame(tc, rdd, df_schema)
def import_xml(file_name, record_tag, tc=TkContext.implicit): """ Imports a file of XML records XML records can span multiple lines. Returns a Frame of one column containing a XML string per row Note: Only records which start with the given tag will be included (multiple different tags not supported) Parameters ---------- :param file_name: file path :param record_tag: value of the XML element which contains a record :return: Frame Examples -------- Consider a file of XML records: <?xml version="1.0" encoding="UTF-8"?> <table> <shape type="triangle"> <x>0</x> <y>0</y> <size>12</size> </shape> <shape type="square"> <x>8</x> <y>0</y> <size>4</size> </shape> <shape color="blue" type="pentagon"> <x>0</x> <y>10</y> <size>2</size> </shape> <shape type="square"> <x>-4</x> <y>6</y> <size>7</size> </shape> </table> We can parse this file into a frame of records: >>> f = tc.frame.import_xml("../datasets/shapes1.xml", "shape") >>> f.inspect() [#] records ========================================= [0] <shape type="triangle"> <x>0</x> <y>0</y> <size>12</size> </shape> [1] <shape type="square"> <x>8</x> <y>0</y> <size>4</size> </shape> [2] <shape color="blue" type="pentagon"> <x>0</x> <y>10</y> <size>2</size> </shape> [3] <shape type="square"> <x>-4</x> <y>6</y> <size>7</size> </shape> We can further break the XML records into individual columns with a map_columns (or add_columns) operation: >>> import xml.etree.ElementTree as ET >>> def parse_my_xml(row): ... ele = ET.fromstring(row[0]) ... return [ele.get("type"), int(ele.find("x").text), int(ele.find("y").text), int(ele.find("size").text)] >>> f2 = f.map_columns(parse_my_xml, [('shape', str), ('x', int), ('y', int), ('size', int)]) >>> f2.inspect() [#] shape x y size =========================== [0] triangle 0 0 12 [1] square 8 0 4 [2] pentagon 0 10 2 [3] square -4 6 7 Consider another file of XML records, this time with different element names for the records: <?xml version="1.0" encoding="UTF-8"?> <shapes> <triangle> <x>0</x> <y>0</y> <size>12</size> </triangle> <square> <x>8</x> <y>0</y> <size>4</size> </square> <pentagon color="blue"> <x>0</x> <y>10</y> <size>2</size> </pentagon> <square> <x>-4</x> <y>6</y> <size>7</size> </square> </shapes> We can parse this file into a frame of records of a single type. We must pick only one. The others will be filtered out: >>> f3 = tc.frame.import_xml("../datasets/shapes2.xml", "square") >>> f3.inspect() [#] records =========================== [0] <square> <x>8</x> <y>0</y> <size>4</size> </square> [1] <square> <x>-4</x> <y>6</y> <size>7</size> </square> We can further break the XML records into individual columns with a map_columns (or add_columns) operation: >>> def parse_my_squares(row): ... ele = ET.fromstring(row[0]) ... return [int(ele.find("x").text), int(ele.find("y").text), int(ele.find("size").text)] >>> f4 = f3.map_columns(parse_my_squares, [('x', int), ('y', int), ('size', int)]) >>> f4.inspect() [#] x y size ================ [0] 8 0 4 [1] -4 6 7 """ TkContext.validate(tc) require_type.non_empty_str(file_name, "file_name") require_type.non_empty_str(record_tag, "record_tag") scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.\ frame.internal.constructors.ImportMultiLineRecords.importXml(tc.jutils.get_scala_sc(), file_name, record_tag) from sparktk.frame.frame import Frame return Frame(tc, scala_frame)
def import_json(file_name, tc=TkContext.implicit): """ Imports a file of JSON records JSON records can span multiple lines. Returns a Frame of one column containing a JSON string per row Parameters ---------- :param file_name: file path :return: Frame Examples -------- Consider a file of JSON records: { "obj": { "color": "blue", "size": 4, "shape": "square" } } { "obj": { "color": "green", "size": 3, "shape": "triangle" } } { "obj": { "color": "yellow", "size": 5, "shape": "pentagon" } } { "obj": { "color": "orange", "size": 2, "shape": "lentil" } } We can parse this file into a frame of records: >>> f = tc.frame.import_json("../datasets/shapes.json") >>> f.inspect() [#] records ===================================================================== [0] { "obj": { "color": "blue", "size": 4, "shape": "square" } } [1] { "obj": { "color": "green", "size": 3, "shape": "triangle" } } [2] { "obj": { "color": "yellow", "size": 5, "shape": "pentagon" } } [3] { "obj": { "color": "orange", "size": 2, "shape": "lentil" } } We can further break the JSON records into individual columns with a map_columns (or add_columns) operation: >>> import json >>> def parse_my_json(row): ... record = json.loads(row.records)['obj'] ... return [record['color'], record['size'], record['shape']] >>> f2 = f.map_columns(parse_my_json, [('color', str), ('size', int), ('shape', str)]) >>> f2.inspect() [#] color size shape =========================== [0] blue 4 square [1] green 3 triangle [2] yellow 5 pentagon [3] orange 2 lentil """ TkContext.validate(tc) require_type.non_empty_str(file_name, "file_name") scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.\ frame.internal.constructors.ImportMultiLineRecords.importJson(tc.jutils.get_scala_sc(), file_name) from sparktk.frame.frame import Frame return Frame(tc, scala_frame)
def import_jdbc(connection_url, table_name, tc=TkContext.implicit): """ Import data from jdbc table into frame. Parameters ---------- :param connection_url: (str) JDBC connection url to database server :param table_name: (str) JDBC table name :return: (Frame) returns frame with jdbc table data Examples -------- Load a frame from a jdbc table specifying the connection url to the database server. <skip> >>> url = "jdbc:postgresql://localhost/postgres" >>> tb_name = "demo_test" >>> frame = tc.frame.import_jdbc(url, tb_name) -etc- >>> frame.inspect() [#] a b c d ================== [0] 1 0.2 -2 5 [1] 2 0.4 -1 6 [2] 3 0.6 0 7 [3] 4 0.8 1 8 >>> frame.schema [(u'a', int), (u'b', float), (u'c', int), (u'd', int)] </skip> Notes ----- java.sql.SQLException: No suitable driver found for <jdbcUrl> If this error is encountered while running your application, then your JDBC library cannot be found by the node running the application. If you're running in Local mode, make sure that you have used the --driver-class-path parameter. If a Spark cluster is involved, make sure that each cluster member has a copy of library, and that each node of the cluster has been restarted since you modified the spark-defaults.conf file. See this [site](https://sparkour.urizone.net/recipes/using-jdbc/). Sparktk does not come with any JDBC drivers. A driver compatible with the JDBC data source must be supplied when creating the TkContext instance: <skip> >>> tc = sparktk.TkContext(pyspark_submit_args='--jars myJDBCDriver.jar') </skip> """ if not isinstance(connection_url, basestring): raise ValueError("connection url parameter must be a string, but is {0}.".format(type(connection_url))) if not isinstance(table_name, basestring): raise ValueError("table name parameter must be a string, but is {0}.".format(type(table_name))) TkContext.validate(tc) scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importJdbc(tc.jutils.get_scala_sc(), connection_url, table_name) from sparktk.frame.frame import Frame return Frame(tc, scala_frame)
def import_orientdb_graph(db_url, user_name, password, root_password,tc=TkContext.implicit): """ Import graph from OrientDB to spark-tk as spark-tk graph (Spark GraphFrame) Parameters ---------- :param:(str) db_url: OrientDB URI :param:(str) user_name: the database username :param:(str) password: the database password :param :(str)root_password: OrientDB server password Example ------- >>> v = tc.frame.create([("a", "Alice", 34,"F"), ... ("b", "Bob", 36,"M"), ... ("c", "Charlie", 30,"M"), ... ("d", "David", 29,"M"), ... ("e", "Esther", 32,"F"), ... ("f", "Fanny", 36,"F"), ... ], ["id", "name", "age","gender"]) >>> e = tc.frame.create([("a", "b", "friend"), ... ("b", "c", "follow"), ... ("c", "b", "follow"), ... ("f", "c", "follow"), ... ("e", "f", "follow"), ... ("e", "d", "friend"), ... ("d", "a", "friend"), ... ("a", "e", "friend") ... ], ["src", "dst", "relationship"]) >>> sparktk_graph = tc.graph.create(v,e) <skip> >>> db = "test_db" >>> sparktk_graph.export_to_orientdb(db_url="remote:hostname:2424/%s" % db,user_name= "admin",password = "******",root_password = "******",vertex_type_column_name= "gender",edge_type_column_name="relationship") >>> imported_gf = tc.graph.import_orientdb_graph(db_url="remote:hostname:2424/%s" % db,user_name= "admin",password = "******",root_password = "******") >>> imported_gf.graphframe.vertices.show() +-------+------+---+---+ | name|gender| id|age| +-------+------+---+---+ | Bob| M| b| 36| | David| M| d| 29| |Charlie| M| c| 30| | Alice| F| a| 34| | Esther| F| e| 32| | Fanny| F| f| 36| +-------+------+---+---+ >>> imported_gf.graphframe.edges.show() +---+------------+---+ |dst|relationship|src| +---+------------+---+ | f| follow| e| | b| follow| c| | c| follow| b| | c| follow| f| | b| friend| a| | a| friend| d| | d| friend| e| | e| friend| a| +---+------------+---+ </skip> """ TkContext.validate(tc) scala_graph = tc.sc._jvm.org.trustedanalytics.sparktk.graph.internal.constructors.fromorientdb.ImportFromOrientdb.importOrientdbGraph(tc.jutils.get_scala_sc(), db_url,user_name,password,root_password) from sparktk.graph.graph import Graph return Graph(tc, scala_graph)
def import_orientdb_graph(orient_conf, db_name, db_properties=None, tc=TkContext.implicit): """ Import graph from OrientDB to spark-tk as spark-tk graph (Spark GraphFrame) Parameters ---------- :param orient_conf: (OrientConf) configuration settings for the OrientDB database :param db_name: (str) the database name :param db_properties: (Optional(dict(str,any))) additional properties for OrientDB database, for more OrientDB database properties options. See http://orientdb.com/docs/2.1/Configuration.html Example ------- >>> v = tc.frame.create([("a", "Alice", 34,"F"), ... ("b", "Bob", 36,"M"), ... ("c", "Charlie", 30,"M"), ... ("d", "David", 29,"M"), ... ("e", "Esther", 32,"F"), ... ("f", "Fanny", 36,"F"), ... ], ["id", "name", "age","gender"]) >>> e = tc.frame.create([("a", "b", "friend"), ... ("b", "c", "follow"), ... ("c", "b", "follow"), ... ("f", "c", "follow"), ... ("e", "f", "follow"), ... ("e", "d", "friend"), ... ("d", "a", "friend"), ... ("a", "e", "friend") ... ], ["src", "dst", "relationship"]) >>> sparktk_graph = tc.graph.create(v,e) <skip> >>> hostname = "localhost" >>> port_number = "2424" >>> db_name = "GraphDatabase" >>> root_password = "******" >>> orient_conf = tc.graph.create_orientdb_conf(hostname, port_number, "admin", "admin", root_password) >>> sparktk_graph.export_to_orientdb(orient_conf, ... db_name, ... vertex_type_column_name= "gender", ... edge_type_column_name="relationship") >>> imported_gf = tc.graph.import_orientdb_graph(orient_conf, db_name, db_properties = ({"db.validation":"false"})) >>> imported_gf.graphframe.vertices.show() +-------+------+---+---+ | name|gender| id|age| +-------+------+---+---+ | Bob| M| b| 36| | David| M| d| 29| |Charlie| M| c| 30| | Alice| F| a| 34| | Esther| F| e| 32| | Fanny| F| f| 36| +-------+------+---+---+ >>> imported_gf.graphframe.edges.show() +---+------------+---+ |dst|relationship|src| +---+------------+---+ | f| follow| e| | b| follow| c| | c| follow| b| | c| follow| f| | b| friend| a| | a| friend| d| | d| friend| e| | e| friend| a| +---+------------+---+ </skip> """ TkContext.validate(tc) scala_obj = tc.sc._jvm.org.trustedanalytics.sparktk.graph.internal.constructors.fromorientdb.ImportFromOrientdb scala_graph = scala_obj.importOrientdbGraph(tc.jutils.get_scala_sc(), orient_conf._scala, db_name, tc.jutils.convert.to_scala_option_map(db_properties)) from sparktk.graph.graph import Graph return Graph(tc, scala_graph)
def import_orientdb_graph(orient_conf, db_name, db_properties=None, tc=TkContext.implicit):
    """
    Import a graph from OrientDB to spark-tk as a spark-tk graph (Spark GraphFrame)

    Parameters
    ----------

    :param orient_conf: (OrientConf) configuration settings for the OrientDB database
    :param db_name: (str) the database name
    :param db_properties: (Optional(dict(str, any))) additional properties for the OrientDB database; for more
                          database property options, see http://orientdb.com/docs/2.1/Configuration.html

    Example
    -------

        >>> v = tc.frame.create([("a", "Alice", 34, "F"),
        ...                      ("b", "Bob", 36, "M"),
        ...                      ("c", "Charlie", 30, "M"),
        ...                      ("d", "David", 29, "M"),
        ...                      ("e", "Esther", 32, "F"),
        ...                      ("f", "Fanny", 36, "F"),
        ...                      ], ["id", "name", "age", "gender"])

        >>> e = tc.frame.create([("a", "b", "friend"),
        ...                      ("b", "c", "follow"),
        ...                      ("c", "b", "follow"),
        ...                      ("f", "c", "follow"),
        ...                      ("e", "f", "follow"),
        ...                      ("e", "d", "friend"),
        ...                      ("d", "a", "friend"),
        ...                      ("a", "e", "friend")
        ...                      ], ["src", "dst", "relationship"])

        >>> sparktk_graph = tc.graph.create(v, e)

        <skip>
        >>> hostname = "localhost"

        >>> port_number = "2424"

        >>> db_name = "GraphDatabase"

        >>> root_password = "******"

        >>> orient_conf = tc.graph.create_orientdb_conf(hostname, port_number, "admin", "admin", root_password)

        >>> sparktk_graph.export_to_orientdb(orient_conf,
        ...                                  db_name,
        ...                                  vertex_type_column_name="gender",
        ...                                  edge_type_column_name="relationship")

        >>> imported_gf = tc.graph.import_orientdb_graph(orient_conf, db_name, db_properties=({"db.validation": "false"}))

        >>> imported_gf.graphframe.vertices.show()
        +-------+------+---+---+
        |   name|gender| id|age|
        +-------+------+---+---+
        |    Bob|     M|  b| 36|
        |  David|     M|  d| 29|
        |Charlie|     M|  c| 30|
        |  Alice|     F|  a| 34|
        | Esther|     F|  e| 32|
        |  Fanny|     F|  f| 36|
        +-------+------+---+---+

        >>> imported_gf.graphframe.edges.show()
        +---+------------+---+
        |dst|relationship|src|
        +---+------------+---+
        |  f|      follow|  e|
        |  b|      follow|  c|
        |  c|      follow|  b|
        |  c|      follow|  f|
        |  b|      friend|  a|
        |  a|      friend|  d|
        |  d|      friend|  e|
        |  e|      friend|  a|
        +---+------------+---+
        </skip>

    """
    TkContext.validate(tc)
    scala_obj = tc.sc._jvm.org.trustedanalytics.sparktk.graph.internal.constructors.fromorientdb.ImportFromOrientdb
    scala_graph = scala_obj.importOrientdbGraph(
        tc.jutils.get_scala_sc(), orient_conf._scala, db_name,
        tc.jutils.convert.to_scala_option_map(db_properties))
    from sparktk.graph.graph import Graph
    return Graph(tc, scala_graph)
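# Hedged usage sketch (illustrative only): condenses the docstring example above into a helper.
# The hostname, port, credentials and database name are placeholders, and the call assumes the
# graph was previously exported with export_to_orientdb as shown above.
def _example_import_orientdb(tc, root_password):
    """Illustrative sketch: build an OrientConf and import a previously exported graph."""
    orient_conf = tc.graph.create_orientdb_conf("localhost", "2424", "admin", "admin", root_password)
    # db.validation is disabled here, matching the db_properties used in the docstring example.
    return tc.graph.import_orientdb_graph(orient_conf, "GraphDatabase",
                                          db_properties={"db.validation": "false"})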
def import_pandas(pandas_frame, schema=None, row_index=True, validate_schema=False, tc=TkContext.implicit):
    """
    Imports data from the specified pandas data frame.

    Parameters
    ----------

    :param pandas_frame: (pandas.DataFrame) pandas dataframe object
    :param schema: (Optional(list[tuples(string, type)])) Schema description of the fields for a given line.  It is a
                   list of tuples which describe each field, (field name, field type), where the field name is a
                   string and the field type is a supported type.  If no schema is provided, the schema will be
                   inferred based on the column names and types from the pandas_frame.
    :param row_index: (Optional(bool)) Indicates if the row_index is present in the pandas dataframe and needs to be
                      ignored when looking at the data values.  Default value is True.
    :param validate_schema: (Optional(bool)) If true, validates the data against the schema and attempts to cast the
                            data to the specified type, if it does not match the schema.  Defaults to False.
    :return: (Frame) spark-tk frame that contains data from the pandas_frame

    Examples
    --------

    Create a pandas data frame:

        >>> import pandas

        >>> ratings_data = [[0, "invalid"], [1, "Very Poor"], [2, "Poor"], [3, "Average"], [4, "Good"], [5, "Very Good"]]

        >>> df = pandas.DataFrame(ratings_data, columns=['rating_id', 'rating_text'])

        >>> df
           rating_id rating_text
        0          0     invalid
        1          1   Very Poor
        2          2        Poor
        3          3     Average
        4          4        Good
        5          5   Very Good

        >>> df.columns.tolist()
        ['rating_id', 'rating_text']

        >>> df.dtypes
        rating_id       int64
        rating_text    object
        dtype: object

    When calling import_pandas with just the pandas data frame, the column names and types from the pandas data frame
    are used to generate the schema.

        >>> frame = tc.frame.import_pandas(df)

        >>> frame.inspect()
        [#]  rating_id  rating_text
        ===========================
        [0]          0  invalid
        [1]          1  Very Poor
        [2]          2  Poor
        [3]          3  Average
        [4]          4  Good
        [5]          5  Very Good

        >>> frame.schema
        [('rating_id', long), ('rating_text', str)]

    Alternatively, you can specify a schema when importing the pandas data frame.  There is also the option to validate
    the data against the schema.  If this option is enabled, we will attempt to cast the data to the column's data
    type, if it does not match the schema.

    For example, here we will specify a schema where the rating_id column will instead be called 'rating_float' and
    its data type will be a float.  We will also enable the validate_schema option so that the rating_id value will
    get cast to a float:

        >>> schema = [("rating_float", float), ("rating_str", unicode)]

        >>> frame = tc.frame.import_pandas(df, schema, validate_schema=True)

        >>> frame.inspect()
        [#]  rating_float  rating_str
        =============================
        [0]           0.0  invalid
        [1]           1.0  Very Poor
        [2]           2.0  Poor
        [3]           3.0  Average
        [4]           4.0  Good
        [5]           5.0  Very Good

        >>> frame.schema
        [('rating_float', float), ('rating_str', unicode)]

    """
    try:
        import pandas
    except:
        raise RuntimeError("pandas module not found, unable to download.  Install pandas or try the take command.")

    if not isinstance(pandas_frame, pandas.DataFrame):
        raise TypeError("pandas_frame must be a pandas DataFrame.")
    TkContext.validate(tc)
    if schema is not None:
        schema = _validate(schema)
    else:
        schema = _get_schema_from_df(pandas_frame)

    if not row_index:
        pandas_frame = pandas_frame.reset_index()

    pandas_frame = pandas_frame.dropna(thresh=len(pandas_frame.columns))
    field_names = [x[0] for x in schema]
    if len(pandas_frame.columns) != len(field_names):
        raise ValueError("Number of columns in the pandas frame ({0}) does not match the number of columns in the "
                         "schema provided ({1}).".format(len(pandas_frame.columns), len(field_names)))

    date_time_columns = [i for i, x in enumerate(pandas_frame.dtypes) if x == "datetime64[ns]"]
    has_date_time = len(date_time_columns) > 0

    # pandas gives us the date/time in ns or as a Timestamp, and spark-tk expects it as ms, so we need to do the conversion
    def pandas_datetime_to_ms(row):
        for i in date_time_columns:
            if isinstance(row[i], long):
                row[i] = row[i] / 1000000
            elif isinstance(row[i], pandas.tslib.Timestamp) or isinstance(row[i], datetime):
                dt = row[i]
                # get the number of seconds since epoch (%s) and multiply by 1000 for ms, then add the
                # microseconds converted to ms to keep ms precision.
                row[i] = long((long(dt.strftime("%s")) * 1000) + (dt.microsecond // 1000))
        return row

    pandas_rows = pandas_frame[0:len(pandas_frame.index)].values.tolist()

    # if the dataframe has date/time columns, map them to ms
    if (has_date_time):
        pandas_rows = map(pandas_datetime_to_ms, pandas_rows)

    # create frame with the pandas_rows
    frame = tc.frame.create(pandas_rows, schema)

    if validate_schema:
        frame = tc.frame.create(frame.rdd, schema, validate_schema)

    return frame
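# Hedged usage sketch (illustrative only): the conversion above turns pandas datetime64[ns]
# columns into milliseconds since epoch when the frame is created.  The column names and
# timestamps below are made up for illustration.
def _example_import_pandas_with_datetime(tc):
    """Illustrative sketch: a datetime64[ns] column is stored as ms since epoch in the frame."""
    import pandas
    df = pandas.DataFrame([["a", pandas.Timestamp("2015-01-01 12:00:00")],
                           ["b", pandas.Timestamp("2015-06-15 08:30:00")]],
                          columns=["event", "when"])
    # "when" is datetime64[ns] in pandas; import_pandas converts each value to a long of ms.
    return tc.frame.import_pandas(df)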
def import_jdbc(connection_url, table_name, tc=TkContext.implicit):
    """
    Import data from a jdbc table into a frame.

    Parameters
    ----------

    :param connection_url: (str) JDBC connection url to the database server
    :param table_name: (str) JDBC table name
    :return: (Frame) returns a frame with the jdbc table data

    Examples
    --------

    Load a frame from a jdbc table by specifying the connection url to the database server.

        <skip>
        >>> url = "jdbc:postgresql://localhost/postgres"
        >>> tb_name = "demo_test"

        >>> frame = tc.frame.import_jdbc(url, tb_name)
        -etc-

        >>> frame.inspect()
        [#]  a  b    c   d
        ==================
        [0]  1  0.2  -2  5
        [1]  2  0.4  -1  6
        [2]  3  0.6   0  7
        [3]  4  0.8   1  8

        >>> frame.schema
        [(u'a', int), (u'b', float), (u'c', int), (u'd', int)]
        </skip>

    Notes
    -----

        java.sql.SQLException: No suitable driver found for <jdbcUrl>

    If this error is encountered while running your application, then your JDBC library cannot be found by the node
    running the application.  If you're running in Local mode, make sure that you have used the --driver-class-path
    parameter.  If a Spark cluster is involved, make sure that each cluster member has a copy of the library, and that
    each node of the cluster has been restarted since you modified the spark-defaults.conf file.  See this
    [site](https://sparkour.urizone.net/recipes/using-jdbc/).

    Sparktk does not come with any JDBC drivers.  A driver compatible with the JDBC data source must be supplied when
    creating the TkContext instance:

        <skip>
        >>> tc = sparktk.TkContext(pyspark_submit_args='--jars myJDBCDriver.jar')
        </skip>

    """
    if not isinstance(connection_url, basestring):
        raise ValueError("connection_url parameter must be a string, but is {0}.".format(type(connection_url)))
    if not isinstance(table_name, basestring):
        raise ValueError("table_name parameter must be a string, but is {0}.".format(type(table_name)))
    TkContext.validate(tc)

    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importJdbc(
        tc.jutils.get_scala_sc(), connection_url, table_name)

    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)
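# Hedged usage sketch (illustrative only): ties together the notes above -- sparktk ships no
# JDBC drivers, so the driver jar is passed when the TkContext is created.  The jar name is a
# placeholder; the connection url and table name are taken from the docstring example.
def _example_import_jdbc():
    """Illustrative sketch: create a TkContext with a JDBC driver jar and import a table."""
    import sparktk
    tc = sparktk.TkContext(pyspark_submit_args='--jars my-postgresql-driver.jar')
    return tc.frame.import_jdbc("jdbc:postgresql://localhost/postgres", "demo_test")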
def import_csv(path, delimiter=",", header=False, infer_schema=True, schema=None, tc=TkContext.implicit): """ Creates a frame with data from a csv file. Parameters ---------- :param path: (str) Full path to the csv file :param delimiter: (Optional[str]) A string which indicates the separation of data fields. This is usually a single character and could be a non-visible character, such as a tab. The default delimiter is a comma (,). :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns, and not be included in the data. The default value is false. :param infer_schema:(Optional[bool]) Boolean value indicating if the column types will be automatically inferred. It requires one extra pass over the data and is false by default. :param: schema: (Optional[List[tuple(str, type)]]) Optionally specify the schema for the dataset. Number of columns specified in the schema must match the number of columns in the csv file provided. If the value from the csv file cannot be converted to the data type specified by the schema (for example, if the csv file has a string, and the schema specifies an int), the value will show up as missing (None) in the frame. :return: (Frame) Frame that contains the data from the csv file Examples -------- Load a frame from a csv file by specifying the path to the file, delimiter, and options that specify that there is a header and to infer the schema based on the data. >>> file_path = "../integration-tests/datasets/cities.csv" >>> frame = tc.load_frame_from_csv(file_path, "|", header=True, infer_schema=True) -etc- >>> frame.inspect() [#] rank city population_2013 population_2010 change county ============================================================================ [0] 1 Portland 609456 583776 4.40% Multnomah [1] 2 Salem 160614 154637 3.87% Marion [2] 3 Eugene 159190 156185 1.92% Lane [3] 4 Gresham 109397 105594 3.60% Multnomah [4] 5 Hillsboro 97368 91611 6.28% Washington [5] 6 Beaverton 93542 89803 4.16% Washington [6] 15 Grants Pass 35076 34533 1.57% Josephine [7] 16 Oregon City 34622 31859 8.67% Clackamas [8] 17 McMinnville 33131 32187 2.93% Yamhill [9] 18 Redmond 27427 26215 4.62% Deschutes >>> frame.schema [('rank', int), ('city', str), ('population_2013', int), ('population_2010', int), ('change', str), ('county', str)] """ if schema is not None: infer_schema = False # if a custom schema is provided, don't waste time inferring the schema during load if not isinstance(header, bool): raise ValueError("header parameter must be a boolean, but is {0}.".format(type(header))) if not isinstance(infer_schema, bool): raise ValueError("infer_schema parameter must be a boolean, but is {0}.".format(type(infer_schema))) TkContext.validate(tc) header_str = str(header).lower() infer_schema_str = str(infer_schema).lower() pyspark_schema = None if (not infer_schema) and (schema is not None): fields = [] for column in schema: if dtypes._data_type_to_pyspark_type_table.has_key(column[1]): fields.append(StructField(column[0], dtypes._data_type_to_pyspark_type_table[column[1]], True)) else: raise TypeError("Unsupported type {0} in schema for column {1}.".format(column[1], column[0])) pyspark_schema = StructType(fields) df = tc.sql_context.read.format( "com.databricks.spark.csv.org.trustedanalytics.sparktk").options( delimiter=delimiter, header=header_str, dateformat="yyyy-MM-dd'T'HH:mm:ss.SSSX", inferschema=infer_schema_str).load(path, schema=pyspark_schema) df_schema = [] if schema is None: for column in df.schema.fields: try: 
datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(column.dataType)) except ValueError: raise TypeError("Unsupported data type ({0}) for column {1}.".format(str(column.dataType), column.name)) df_schema.append((column.name, datatype)) else: df_column_count = len(df.schema.fields) custom_column_count = len(schema) if (df_column_count != custom_column_count): raise ValueError("Bad schema value. The number of columns in the custom schema ({0}) must match the" "number of columns in the csv file data ({1}).".format(custom_column_count, df_column_count)) df_schema = schema def cast_datetime(row): """ The spark data frame gives uses datetime objects. Convert them to long (ms since epoch) for our frame. """ data = [] for column_index in xrange(0, len(df_schema)): if df_schema[column_index][1] == dtypes.datetime and isinstance(row[column_index], datetime): data.append(long(dtypes.datetime_to_ms(row[column_index]))) else: data.append(row[column_index]) return data jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(df._jdf.rdd()) rdd = RDD(jrdd, tc.sc) if any(c[1] == dtypes.datetime for c in df_schema): # If any columns are date/time we must do this map rdd = df.rdd.map(cast_datetime) from sparktk.frame.frame import Frame # circular dependency, so import late return Frame(tc, rdd, df_schema)
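# Hedged usage sketch (illustrative only): the schema parameter notes above say that a value
# which cannot be converted to the declared type shows up as missing (None).  Forcing the string
# "city" column to int below demonstrates that behaviour; the dataset path is the one used in
# the docstring example.
def _example_import_csv_bad_cast(tc):
    """Illustrative sketch: string city names cannot be cast to int, so they become None."""
    schema = [("rank", int), ("city", int), ("population_2013", int),
              ("population_2010", int), ("change", str), ("county", str)]
    return tc.frame.import_csv("../integration-tests/datasets/cities.csv", "|",
                               header=True, schema=schema)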
def import_orientdb_graph(db_url, user_name, password, root_password, tc=TkContext.implicit):
    """
    Import graph from OrientDB to spark-tk as spark-tk graph (Spark GraphFrame)

    Parameters
    ----------

    :param db_url: (str) OrientDB URI
    :param user_name: (str) the database username
    :param password: (str) the database password
    :param root_password: (str) OrientDB server password

    Example
    -------

        >>> v = tc.frame.create([("a", "Alice", 34, "F"),
        ...                      ("b", "Bob", 36, "M"),
        ...                      ("c", "Charlie", 30, "M"),
        ...                      ("d", "David", 29, "M"),
        ...                      ("e", "Esther", 32, "F"),
        ...                      ("f", "Fanny", 36, "F"),
        ...                      ], ["id", "name", "age", "gender"])

        >>> e = tc.frame.create([("a", "b", "friend"),
        ...                      ("b", "c", "follow"),
        ...                      ("c", "b", "follow"),
        ...                      ("f", "c", "follow"),
        ...                      ("e", "f", "follow"),
        ...                      ("e", "d", "friend"),
        ...                      ("d", "a", "friend"),
        ...                      ("a", "e", "friend")
        ...                      ], ["src", "dst", "relationship"])

        >>> sparktk_graph = tc.graph.create(v, e)

        <skip>
        >>> db = "test_db"

        >>> sparktk_graph.export_to_orientdb(db_url="remote:hostname:2424/%s" % db, user_name="admin",
        ...                                  password="******", root_password="******",
        ...                                  vertex_type_column_name="gender", edge_type_column_name="relationship")

        >>> imported_gf = tc.graph.import_orientdb_graph(db_url="remote:hostname:2424/%s" % db, user_name="admin",
        ...                                              password="******", root_password="******")

        >>> imported_gf.graphframe.vertices.show()
        +-------+------+---+---+
        |   name|gender| id|age|
        +-------+------+---+---+
        |    Bob|     M|  b| 36|
        |  David|     M|  d| 29|
        |Charlie|     M|  c| 30|
        |  Alice|     F|  a| 34|
        | Esther|     F|  e| 32|
        |  Fanny|     F|  f| 36|
        +-------+------+---+---+

        >>> imported_gf.graphframe.edges.show()
        +---+------------+---+
        |dst|relationship|src|
        +---+------------+---+
        |  f|      follow|  e|
        |  b|      follow|  c|
        |  c|      follow|  b|
        |  c|      follow|  f|
        |  b|      friend|  a|
        |  a|      friend|  d|
        |  d|      friend|  e|
        |  e|      friend|  a|
        +---+------------+---+
        </skip>

    """
    TkContext.validate(tc)
    scala_graph = tc.sc._jvm.org.trustedanalytics.sparktk.graph.internal.constructors.fromorientdb.ImportFromOrientdb.importOrientdbGraph(
        tc.jutils.get_scala_sc(), db_url, user_name, password, root_password)
    from sparktk.graph.graph import Graph
    return Graph(tc, scala_graph)