def import_jdbc(connection_url, table_name, tc=implicit):
    """
    Import data from jdbc table into frame.

    Parameters
    ----------

    :param connection_url: JDBC connection url to database server
    :param table_name: JDBC table name
    :return: returns frame with jdbc table data

    Examples
    --------
    Load a frame from a jdbc table specifying the connection url to the database server.

    <skip>
        >>> url = "jdbc:postgresql://localhost/postgres"
        >>> tb_name = "demo_test"

        >>> frame = tc.frame.import_jdbc(url, tb_name)
        -etc-

        >>> frame.inspect()
        [#]  a  b    c   d
        ===================
        [0]  1  0.2  -2  5
        [1]  2  0.4  -1  6
        [2]  3  0.6   0  7
        [3]  4  0.8   1  8

        >>> frame.schema
        [(u'a', int), (u'b', float), (u'c', int), (u'd', int)]
    </skip>
    """
    if not isinstance(connection_url, basestring):
        raise ValueError("connection url parameter must be a string, but is {0}.".format(type(connection_url)))
    if not isinstance(table_name, basestring):
        raise ValueError("table name parameter must be a string, but is {0}.".format(type(table_name)))
    if tc is implicit:
        implicit.error("tc")
    if not isinstance(tc, TkContext):
        raise ValueError("tc must be type TkContext, received %s" % type(tc))

    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importJdbc(
        tc.jutils.get_scala_sc(), connection_url, table_name)

    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)

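# Hedged usage sketch for import_jdbc: Spark can only reach the database if the JDBC
# driver jar is on its classpath (e.g. supplied via spark-submit --jars or the
# spark.driver/executor extraClassPath properties).  The context setup and names below
# are illustrative assumptions, not part of the function above.
import sparktk
tc = sparktk.TkContext()   # assumes a default, locally-configured context
frame = tc.frame.import_jdbc("jdbc:postgresql://localhost/postgres", "demo_test")
print frame.schema
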
def import_hive(hive_query, tc=implicit):
    """
    Import data from hive table into frame.

    Define the sql query to retrieve the data from a hive table.

    Only a subset of Hive data types are supported:

        Data Type    Support
        ___________  ___________________________
        boolean      cast to int
        bigint       native support
        int          native support
        tinyint      cast to int
        smallint     cast to int
        decimal      cast to double, may lose precision
        double       native support
        float        native support
        date         cast to string
        string       native support
        timestamp    cast to string
        varchar      cast to string
        arrays       not supported
        binary       not supported
        char         not supported
        maps         not supported
        structs      not supported
        union        not supported

    Parameters
    ----------

    :param hive_query: (str) hive query to fetch data from table
    :return: (Frame) returns frame with hive table data

    Examples
    --------
    Load data into frame from a hive table based on hive query

    <skip>
        >>> h_query = "select * from demo_test"
        >>> frame = tc.frame.import_hive(h_query)
        -etc-

        >>> frame.inspect()
        [#]  number  strformat
        ======================
        [0]       1  one
        [1]       2  two
        [2]       3  three
        [3]       4  four
    </skip>
    """
    if not isinstance(hive_query, basestring):
        raise ValueError("hive query parameter must be a string, but is {0}.".format(type(hive_query)))
    if tc is implicit:
        implicit.error("tc")
    if not isinstance(tc, TkContext):
        raise ValueError("tc must be type TkContext, received %s" % type(tc))

    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importHive(
        tc.jutils.get_scala_sc(), hive_query)

    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)

def import_csv(path, delimiter=",", header=False, inferschema=True, schema=None, tc=implicit):
    """
    Creates a frame with data from a csv file.

    :param path: Full path to the csv file
    :param delimiter: A string which indicates the separation of data fields.  This is usually a single character
                      and could be a non-visible character, such as a tab.  The default delimiter is a comma (,).
    :param header: Boolean value indicating if the first line of the file will be used to name columns, and not be
                   included in the data.  The default value is false.
    :param inferschema: Boolean value indicating if the column types will be automatically inferred.  It requires
                        one extra pass over the data and is false by default.
    :param schema: Optionally specify the schema for the dataset.  Number of columns specified in the schema must
                   match the number of columns in the csv file provided.
    :return: Frame that contains the data from the csv file

    Examples
    --------
    Load a frame from a csv file by specifying the path to the file, delimiter, and options that specify that
    there is a header and to infer the schema based on the data.

    .. code::

        >>> file_path = "../integration-tests/datasets/cities.csv"

        >>> frame = tc.load_frame_from_csv(file_path, "|", header=True, inferschema=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]    15  Grants Pass            35076            34533  1.57%   Josephine
        [7]    16  Oregon City            34622            31859  8.67%   Clackamas
        [8]    17  McMinnville            33131            32187  2.93%   Yamhill
        [9]    18  Redmond                27427            26215  4.62%   Deschutes

        >>> frame.schema
        [('rank', int), ('city', str), ('population_2013', int), ('population_2010', int), ('change', str), ('county', str)]
    """
    from pyspark.sql import SQLContext

    if schema is not None:
        inferschema = False   # if a custom schema is provided, don't waste time inferring the schema during load
    if not isinstance(header, bool):
        raise ValueError("header parameter must be a boolean, but is {0}.".format(type(header)))
    if not isinstance(inferschema, bool):
        raise ValueError("inferschema parameter must be a boolean, but is {0}.".format(type(inferschema)))
    if tc is implicit:
        implicit.error('tc')
    if not isinstance(tc, TkContext):
        raise ValueError("tc must be type TkContext, received %s" % type(tc))

    header_str = str(header).lower()
    inferschema_str = str(inferschema).lower()

    sqlContext = SQLContext(tc.sc)
    df = sqlContext.read.format("com.databricks.spark.csv").options(delimiter=delimiter,
                                                                    header=header_str,
                                                                    inferschema=inferschema_str).load(path)
    df_schema = []

    if schema is None:
        for column in df.schema.fields:
            datatype = str   # fall back to string if no type mapping is found
            import sparktk.dtypes as dtypes
            try:
                datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(column.dataType))
            except ValueError:
                print "Warning: No mapping for type: {0}.  Column '{1}' will default to use strings.".format(str(column.dataType), column.name)
            df_schema.append((column.name, datatype))
    else:
        df_column_count = len(df.schema.fields)
        custom_column_count = len(schema)
        if (df_column_count != custom_column_count):
            raise ValueError("Bad schema value.  The number of columns in the custom schema ({0}) must match the "
                             "number of columns in the csv file data ({1}).".format(custom_column_count, df_column_count))
        df_schema = schema

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, df.rdd, df_schema)

def create(data, schema=None, validate_schema=False, tc=implicit):
    """
    Creates a frame from the given data and schema.  If no schema data types are provided, the schema is inferred
    based on the data in the first 100 rows.

    If schema validation is enabled, all data is checked to ensure that it matches the schema.  If the data does
    not match the schema's data type, it attempts to cast the data to the proper data type.  When the data is
    unable to be cast to the schema's data type, the item will be missing (None) in the frame.

    :param data: Data source
    :param schema: Optionally specify a schema (list of tuples of string column names and data type), column names
                   (list of strings, and the column data types will be inferred) or None (column data types will be
                   inferred and column names will be numbered like C0, C1, C2, etc).
    :param validate_schema: When True, all data is checked to ensure that it matches the schema.  If the data does
                            not match the schema's data type, it attempts to cast the data to the proper data type.
                            When the data is unable to be cast to the schema's data type, a ValueError is raised.
                            Defaults to False.
    :param tc: TkContext
    :return: Frame loaded with the specified data

    Examples
    --------
    Create a frame with the specified data.

        >>> data = [["Bob", 30, 8], ["Jim", 45, 9.5], ["Sue", 25, 7], ["George", 15, 6], ["Jennifer", 18, 8.5]]
        >>> frame = tc.frame.create(data)

    Since no schema is provided, the schema will be inferred.  Note that the data set had a mix of strings and
    integers in the third column.  The schema will use the most general data type from the data that it sees, so in
    this example, the column is treated as a float.

        >>> frame.schema
        [('C0', str), ('C1', int), ('C2', float)]

        >>> frame.inspect()
        [#]  C0        C1  C2
        ======================
        [0]  Bob       30    8
        [1]  Jim       45  9.5
        [2]  Sue       25    7
        [3]  George    15    6
        [4]  Jennifer  18  8.5

    We could also enable schema validation, which checks the data against the schema.  If the data does not match
    the schema's data type, it attempts to cast the data to the proper data type.

        >>> frame = tc.frame.create(data, validate_schema=True)

    In this example with schema validation enabled, the integers in column C2 get cast to floats:

        >>> frame.inspect()
        [#]  C0        C1  C2
        ======================
        [0]  Bob       30  8.0
        [1]  Jim       45  9.5
        [2]  Sue       25  7.0
        [3]  George    15  6.0
        [4]  Jennifer  18  8.5

    We could also provide a list of column names when creating the frame.  When a list of column names is provided,
    the data types for the schema are still inferred, but the columns in the schema are labeled with the specified
    names.

        >>> frame = tc.frame.create(data, schema=["name", "age", "shoe_size"], validate_schema=True)

        >>> frame.schema
        [('name', str), ('age', int), ('shoe_size', float)]

        >>> frame.inspect()
        [#]  name      age  shoe_size
        =============================
        [0]  Bob        30        8.0
        [1]  Jim        45        9.5
        [2]  Sue        25        7.0
        [3]  George     15        6.0
        [4]  Jennifer   18        8.5

    Note that if a value cannot be parsed as the specified data type in the schema, it will show up as missing
    (None), if validate_schema is enabled.  For example, consider the following frame where columns are defined as
    integers, but the data specified has a string in the second row.

        >>> data = [[1, 2, 3], [4, "five", 6]]
        >>> schema = [("a", int), ("b", int), ("c", int)]

        >>> frame = tc.frame.create(data, schema, validate_schema = True)

        >>> frame.inspect()
        [#]  a  b     c
        ===============
        [0]  1     2  3
        [1]  4  None  6

    Note that the spot where the string was located now has its value missing (None), since it couldn't be parsed
    to an integer.  If validate_schema were disabled, no attempt would be made to parse the data to the data type
    specified by the schema, and further frame operations may fail due to the data type discrepancy.
    """
    if tc is implicit:
        implicit.error('tc')
    from sparktk.frame.frame import Frame
    return Frame(tc, data, schema, validate_schema)

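# Hedged sketch (not from the original docstring): per the description above, type
# inference samples only the first 100 rows, so a wider type appearing later may not be
# reflected in the inferred schema.  Supplying an explicit schema sidesteps that.
rows = [["x", i] for i in xrange(150)]
rows[120][1] = 3.5                     # a float past the 100-row inference window
frame = tc.frame.create(rows)          # C1 may be inferred as int from the sample
frame_explicit = tc.frame.create(rows, schema=[("label", str), ("value", float)])
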
def import_hbase(table_name, schema, start_tag=None, end_tag=None, tc=implicit):
    """
    Import data from hbase table into frame.

    :param table_name: hbase table name
    :param schema: hbase schema as a List of List(string) (columnFamily, columnName, dataType for cell value)
    :param start_tag: optional start tag for filtering
    :param end_tag: optional end tag for filtering
    :return: frame with data from hbase table

    Example
    -------
    Load data into frame from a hbase table

    <skip>
        >>> frame = tc.frame.import_hbase("demo_test_hbase", [["test_family", "a", int], ["test_family", "b", float],
        ...                                                   ["test_family", "c", int], ["test_family", "d", int]])
        -etc-

        >>> frame.inspect()
        [#]  test_family_a  test_family_b  test_family_c  test_family_d
        ===============================================================
        [0]              1            0.2             -2              5
        [1]              2            0.4             -1              6
        [2]              3            0.6              0              7
        [3]              4            0.8              1              8

        Use of start_tag and end_tag (hbase creates a unique row id for data in hbase tables):
        start_tag: the unique row id from where the row scan should start
        end_tag: the unique row id where the row scan should end

        Assume there is already data in the hbase table "test_startendtag", under the "startendtag" family name,
        with a single column named "number".  The column contains values from 1 to 99, and the row id is generated
        by hbase.  A few rows from the hbase table look like this:

        hbase(main):002:0> scan "test_startendtag"
        ROW     COLUMN+CELL
         0      column=startendtag:number, timestamp=1465342524846, value=1
         1      column=startendtag:number, timestamp=1465342524846, value=25
         10     column=startendtag:number, timestamp=1465342524847, value=51
         103    column=startendtag:number, timestamp=1465342524851, value=98
         107    column=startendtag:number, timestamp=1465342524851, value=99
         11     column=startendtag:number, timestamp=1465342524851, value=75
         12     column=startendtag:number, timestamp=1465342524846, value=4
         13     column=startendtag:number, timestamp=1465342524846, value=28
         14     column=startendtag:number, timestamp=1465342524847, value=52
         15     column=startendtag:number, timestamp=1465342524851, value=76
         16     column=startendtag:number, timestamp=1465342524846, value=5
         17     column=startendtag:number, timestamp=1465342524846, value=29
         18     column=startendtag:number, timestamp=1465342524847, value=53
         19     column=startendtag:number, timestamp=1465342524851, value=77
         2      column=startendtag:number, timestamp=1465342524847, value=49
         20     column=startendtag:number, timestamp=1465342524846, value=6
         21     column=startendtag:number, timestamp=1465342524846, value=30

        >>> frame = tc.frame.import_hbase("test_startendtag", [["startendtag", "number", int]], start_tag="20", end_tag="50")
        -etc-

        >>> frame.row_count
        33

        >>> frame.inspect(frame.row_count)
        [##]  startendtag_number
        ========================
        [0]                    6
        [1]                   30
        [2]                   54
        [3]                   78
        [4]                    7
        [5]                   31
        [6]                   55
        [7]                   79
        [8]                    8
        [9]                   32
        [10]                  73
        [11]                  56
        [12]                  80
        [13]                   9
        [14]                  33
        [15]                  57
        [16]                  81
        [17]                  10
        [18]                  34
        [19]                  58

        [##]  startendtag_number
        ========================
        [20]                  82
        [21]                   2
        [22]                  11
        [23]                  35
        [24]                  59
        [25]                  83
        [26]                  12
        [27]                  36
        [28]                  60
        [29]                  84
        [30]                  13
        [31]                  37
        [32]                  26
    </skip>
    """
    if not isinstance(table_name, basestring):
        raise ValueError("table name parameter must be a string, but is {0}.".format(type(table_name)))
    if not isinstance(schema, list):
        raise ValueError("schema parameter must be a list, but is {0}.".format(type(schema)))
    if tc is implicit:
        implicit.error("tc")
    if not isinstance(tc, TkContext):
        raise ValueError("tc must be type TkContext, received %s" % type(tc))

    inner_lists = [tc._jutils.convert.to_scala_list([item[0], item[1], dtypes.to_string(item[2])]) for item in schema]
    scala_final_schema = tc.jutils.convert.to_scala_list(inner_lists)

    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importHbase(
        tc.jutils.get_scala_sc(), table_name, scala_final_schema,
        tc._jutils.convert.to_scala_option(start_tag), tc._jutils.convert.to_scala_option(end_tag))

    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)

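# Hedged usage sketch (names taken from the docstring example above): each inner list in
# the schema is [column family, column qualifier, data type for the cell value], and the
# resulting frame names each column "<family>_<qualifier>" (e.g. test_family_a).
hbase_schema = [["test_family", "a", int],
                ["test_family", "b", float]]
frame = tc.frame.import_hbase("demo_test_hbase", hbase_schema)
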
def import_csv(path, delimiter=",", header=False, inferschema=True, schema=None, tc=implicit):
    """
    Creates a frame with data from a csv file.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (Optional[str]) A string which indicates the separation of data fields.  This is usually a
                      single character and could be a non-visible character, such as a tab.  The default delimiter
                      is a comma (,).
    :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name
                   columns, and not be included in the data.  The default value is false.
    :param inferschema: (Optional[bool]) Boolean value indicating if the column types will be automatically
                        inferred.  It requires one extra pass over the data and is false by default.
    :param schema: (Optional[List[tuple(str, type)]]) Optionally specify the schema for the dataset.  Number of
                   columns specified in the schema must match the number of columns in the csv file provided.
    :return: (Frame) Frame that contains the data from the csv file

    Examples
    --------
    Load a frame from a csv file by specifying the path to the file, delimiter, and options that specify that
    there is a header and to infer the schema based on the data.

        >>> file_path = "../integration-tests/datasets/cities.csv"

        >>> frame = tc.load_frame_from_csv(file_path, "|", header=True, inferschema=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]    15  Grants Pass            35076            34533  1.57%   Josephine
        [7]    16  Oregon City            34622            31859  8.67%   Clackamas
        [8]    17  McMinnville            33131            32187  2.93%   Yamhill
        [9]    18  Redmond                27427            26215  4.62%   Deschutes

        >>> frame.schema
        [('rank', int), ('city', str), ('population_2013', int), ('population_2010', int), ('change', str), ('county', str)]
    """
    from pyspark.sql import SQLContext
    from pyspark.sql.types import StructType, StructField
    from pyspark.rdd import RDD
    from datetime import datetime
    import sparktk.dtypes as dtypes

    if schema is not None:
        inferschema = False   # if a custom schema is provided, don't waste time inferring the schema during load
    if not isinstance(header, bool):
        raise ValueError("header parameter must be a boolean, but is {0}.".format(type(header)))
    if not isinstance(inferschema, bool):
        raise ValueError("inferschema parameter must be a boolean, but is {0}.".format(type(inferschema)))
    if tc is implicit:
        implicit.error('tc')
    if not isinstance(tc, TkContext):
        raise ValueError("tc must be type TkContext, received %s" % type(tc))

    header_str = str(header).lower()
    inferschema_str = str(inferschema).lower()

    # translate the custom schema, if provided, into a pyspark StructType for the csv reader
    pyspark_schema = None
    if (not inferschema) and (schema is not None):
        fields = []
        for column in schema:
            if column[1] in dtypes._data_type_to_pyspark_type_table:
                fields.append(StructField(column[0], dtypes._data_type_to_pyspark_type_table[column[1]], True))
            else:
                raise TypeError("Unsupported type {0} in schema for column {1}.".format(column[1], column[0]))
        pyspark_schema = StructType(fields)

    sqlContext = SQLContext(tc.sc)
    df = sqlContext.read.format("com.databricks.spark.csv").options(delimiter=delimiter,
                                                                    header=header_str,
                                                                    dateformat="yyyy-MM-dd'T'HH:mm:ss.SSSX",
                                                                    inferschema=inferschema_str).load(path, schema=pyspark_schema)
    df_schema = []

    if schema is None:
        for column in df.schema.fields:
            try:
                datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(column.dataType))
            except ValueError:
                raise TypeError("Unsupported data type ({0}) for column {1}.".format(str(column.dataType), column.name))
            df_schema.append((column.name, datatype))
    else:
        df_column_count = len(df.schema.fields)
        custom_column_count = len(schema)
        if (df_column_count != custom_column_count):
            raise ValueError("Bad schema value.  The number of columns in the custom schema ({0}) must match the "
                             "number of columns in the csv file data ({1}).".format(custom_column_count, df_column_count))
        df_schema = schema

    def cast_datetime(row):
        """
        The spark data frame gives us datetime objects.  Convert them to long (ms since epoch) for our frame.
        """
        data = []
        for column_index in xrange(0, len(df_schema)):
            if df_schema[column_index][1] == dtypes.datetime and isinstance(row[column_index], datetime):
                data.append(long(dtypes.datetime_to_ms(row[column_index])))
            else:
                data.append(row[column_index])
        return data

    from sparktk.frame.frame import Frame  # circular dependency, so import late

    # convert the scala-backed rdd to a python rdd
    jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(df._jdf.rdd())
    rdd = RDD(jrdd, tc.sc)

    if any(c[1] == dtypes.datetime for c in df_schema):
        # If any columns are date/time we must do this map
        rdd = rdd.map(cast_datetime)

    return Frame(tc, rdd, df_schema)

def load(path, tc=implicit):
    """load GaussianMixtureModel from given path"""
    if tc is implicit:
        implicit.error("tc")
    return tc.load(path, GaussianMixtureModel)

def load(path, tc=implicit):
    """load RandomForestClassifierModel from given path"""
    if tc is implicit:
        implicit.error("tc")
    return tc.load(path, RandomForestClassifierModel)

def import_pandas(pandas_frame, schema=None, row_index=True, validate_schema=False, tc=implicit):
    """
    Imports data from the specified pandas data frame.

    Parameters
    ----------

    :param pandas_frame: (pandas.DataFrame) pandas dataframe object
    :param schema: (Optional(list[tuple(string, type)])) Schema description of the fields for a given line.  It is
                   a list of tuples which describe each field, (field name, field type), where the field name is a
                   string and the field type is a supported type.  If no schema is provided, the schema will be
                   inferred based on the column names and types from the pandas_frame.
    :param row_index: (Optional(bool)) Indicates if the row_index is present in the pandas dataframe and needs to
                      be ignored when looking at the data values.  Default value is True.
    :param validate_schema: (Optional(bool)) If true, validates the data against the schema and attempts to cast
                            the data to the specified type, if it does not match the schema.  Defaults to False.
    :return: (Frame) spark-tk frame that contains data from the pandas_frame

    Examples
    --------
    Create a pandas data frame:

        >>> import pandas

        >>> ratings_data = [[0, "invalid"], [1, "Very Poor"], [2, "Poor"], [3, "Average"], [4, "Good"], [5, "Very Good"]]
        >>> df = pandas.DataFrame(ratings_data, columns=['rating_id', 'rating_text'])

        >>> df
           rating_id rating_text
        0          0     invalid
        1          1   Very Poor
        2          2        Poor
        3          3     Average
        4          4        Good
        5          5   Very Good
        >>> df.columns.tolist()
        ['rating_id', 'rating_text']
        >>> df.dtypes
        rating_id       int64
        rating_text    object
        dtype: object

    When using import_pandas by just passing the pandas data frame, it will use the column names and types from
    the pandas data frame to generate the schema.

        >>> frame = tc.frame.import_pandas(df)

        >>> frame.inspect()
        [#]  rating_id  rating_text
        ===========================
        [0]          0  invalid
        [1]          1  Very Poor
        [2]          2  Poor
        [3]          3  Average
        [4]          4  Good
        [5]          5  Very Good

        >>> frame.schema
        [('rating_id', long), ('rating_text', str)]

    Alternatively, you can specify a schema when importing the pandas data frame.  There is also the option to
    validate the data against the schema.  If this option is enabled, we will attempt to cast the data to the
    column's data type, if it does not match the schema.

    For example, here we will specify a schema where the rating_id column will instead be called 'rating_float'
    and its data type will be a float.  We will also enable the validate_schema option so that the rating_id value
    will get cast to a float:

        >>> schema = [("rating_float", float), ("rating_str", unicode)]

        >>> frame = tc.frame.import_pandas(df, schema, validate_schema=True)

        >>> frame.inspect()
        [#]  rating_float  rating_str
        =============================
        [0]           0.0  invalid
        [1]           1.0  Very Poor
        [2]           2.0  Poor
        [3]           3.0  Average
        [4]           4.0  Good
        [5]           5.0  Very Good

        >>> frame.schema
        [('rating_float', float), ('rating_str', unicode)]
    """
    try:
        import pandas
    except ImportError:
        raise RuntimeError("pandas module not found, unable to download.  Install pandas or try the take command.")
    if not isinstance(pandas_frame, pandas.DataFrame):
        raise TypeError("data_frame must be a pandas DataFrame.")
    if tc is implicit:
        implicit.error('tc')
    if not isinstance(tc, TkContext):
        raise ValueError("tc must be type TkContext, received %s" % type(tc))

    if schema is not None:
        schema = _validate(schema)
    else:
        schema = _get_schema_from_df(pandas_frame)

    if not row_index:
        pandas_frame = pandas_frame.reset_index()

    pandas_frame = pandas_frame.dropna(thresh=len(pandas_frame.columns))
    field_names = [x[0] for x in schema]
    if len(pandas_frame.columns) != len(field_names):
        raise ValueError("Number of columns in Pandasframe {0} does not match the number of columns in the"
                         " schema provided {1}.".format(len(pandas_frame.columns), len(field_names)))

    date_time_columns = [i for i, x in enumerate(pandas_frame.dtypes) if x == "datetime64[ns]"]
    has_date_time = len(date_time_columns) > 0

    # pandas gives us the date/time in ns or as a Timestamp, and spark-tk expects it as ms, so we need to do the conversion
    def pandas_datetime_to_ms(row):
        for i in date_time_columns:
            if isinstance(row[i], long):
                row[i] = row[i] / 1000000
            elif isinstance(row[i], pandas.tslib.Timestamp) or isinstance(row[i], datetime):
                dt = row[i]
                # get number of seconds since epoch (%s) and multiply by 1000 for ms, then add the
                # microseconds (divided by 1000) to get ms precision
                row[i] = long((long(dt.strftime("%s")) * 1000) + (dt.microsecond // 1000))
        return row

    pandas_rows = pandas_frame[0:len(pandas_frame.index)].values.tolist()

    # if the dataframe has date/time columns, map them to ms
    if (has_date_time):
        pandas_rows = map(pandas_datetime_to_ms, pandas_rows)

    # create frame with the pandas_rows
    frame = tc.frame.create(pandas_rows, schema)

    if validate_schema:
        frame = tc.frame.create(frame.rdd, schema, validate_schema)

    return frame

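# Hedged sketch of the date/time handling performed by pandas_datetime_to_ms above:
# pandas datetime64[ns] columns arrive as Timestamps and are converted to long
# ms-since-epoch values in the resulting frame.  The data below is illustrative only.
import pandas
events = pandas.DataFrame({"when": pandas.to_datetime(["2015-01-01", "2015-06-15"])})
frame = tc.frame.import_pandas(events)   # "when" values land as ms since epoch
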
def load(path, tc=implicit):
    """load MaxModel from given path"""
    if tc is implicit:
        implicit.error("tc")
    return tc.load(path, MaxModel)

def subject(a, b, c=implicit, d=4):
    """I am the subject"""
    if c is implicit:
        implicit.error('c')
    return ':'.join([str(a), str(b), str(c), str(d)])

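# Hedged illustration of the `implicit` default-argument pattern used throughout this
# module: a parameter defaulting to `implicit` is expected to be filled in (directly or
# by the framework); if it is still `implicit` at call time, implicit.error(name) raises
# to flag the missing argument.
print subject(1, 2, c='three')   # prints 1:2:three:4
# subject(1, 2)                  # c was never supplied, so implicit.error('c') raises
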
def load(path, tc=implicit):
    """load LdaModel from given path"""
    if tc is implicit:
        implicit.error("tc")
    return tc.load(path, LdaModel)

def load(path, tc=implicit):
    """load ArimaxModel from given path"""
    if tc is implicit:
        implicit.error("tc")
    return tc.load(path, ArimaxModel)

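# Hedged sketch of the save/load round trip these one-line `load` helpers support:
# spark-tk models expose save(path), and tc.load(path) restores the saved object (the
# model class argument pins the expected type).  The path below is illustrative only.
model.save("sandbox/arimax_model")         # assumes `model` is a trained ArimaxModel
restored = tc.load("sandbox/arimax_model")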