def __init__(self):
    # TODO: make the app name configurable, and per (linux) user
    self.spark = SparkSession.builder.appName("DatasetClient").getOrCreate()
    self.df = None  # initialize before the loop so the first read can be detected
    for path in PATHS:
        if self.df is None:
            self.df = self.spark.read.schema(
                StructType.fromJson(json.loads(PATH_TO_SCHEMA[path]))).parquet(path)
        else:
            new_df = self.spark.read.schema(
                StructType.fromJson(json.loads(PATH_TO_SCHEMA[path]))).parquet(path)
            self.df = self.df.join(new_df, on='user_id')
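# A minimal sketch of the module-level constants the constructor above assumes.
# PATHS lists the parquet datasets to join on user_id, and PATH_TO_SCHEMA maps
# each path to its JSON-serialized Spark schema; the paths and field names here
# are hypothetical, not from the original source.
USERS_SCHEMA = {"type": "struct", "fields": [
    {"name": "user_id", "type": "long", "nullable": False, "metadata": {}},
    {"name": "name", "type": "string", "nullable": True, "metadata": {}}]}
PURCHASES_SCHEMA = {"type": "struct", "fields": [
    {"name": "user_id", "type": "long", "nullable": False, "metadata": {}},
    {"name": "amount", "type": "double", "nullable": True, "metadata": {}}]}

PATHS = ["/data/users.parquet", "/data/purchases.parquet"]
PATH_TO_SCHEMA = {
    "/data/users.parquet": json.dumps(USERS_SCHEMA),
    "/data/purchases.parquet": json.dumps(PURCHASES_SCHEMA),
}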
def test_rmse():
    # TODO: revise so that it takes user input instead of hardcoded values
    movies_schema = None
    ratings_schema = None

    # load the schemas
    with open("movielens_20m_movies_schema.json", "r") as json_schema_file:
        movies_schema = StructType.fromJson(json.load(json_schema_file))
    with open("movielens_20m_ratings_schema.json", "r") as json_schema_file:
        ratings_schema = StructType.fromJson(json.load(json_schema_file))

    # create an hdfs directory
    os.system("hdfs dfs -mkdir datasets")
    # load the json file into the hdfs directory
    os.system("hdfs dfs -put movielens_10m_ratings.json.gz datasets/movielens_10m_ratings.json.gz")

    # create a DataFrame based on the content of the json file
    ratingsDF = scsingleton.sqlCtx.read.json(
        "hdfs://localhost:9000/datasets/movielens_10m_ratings.json.gz",
        schema=ratings_schema)
    # explicitly repartition after loading so that more tasks can run in parallel;
    # by default, defaultMinPartitions == defaultParallelism == estimated number
    # of cores across all of the machines in your cluster
    ratingsDF = ratingsDF.repartition(scsingleton.sc.defaultParallelism * 3)

    # parse the ratings DataFrame into an RDD of [(userId, itemId, rating)]
    ratingsRDD = ratingsDF.rdd.map(lambda row: (row.user_id, row.movie_id, row.rating))
    ratingsRDD.cache()

    # split data into train (60%) and test (40%)
    # TODO: add validation in the future? train (60%), validation (20%), test (20%)?
    trainingRDD, testRDD = ratingsRDD.randomSplit([0.6, 0.4])
    trainingRDD.cache()
    testRDD.cache()

    # run the training algorithm to build the model (without validation)
    with Timer() as t:
        model = ALS.train(trainingRDD, rank=3)
    print("ALS.train(trainingRDD, rank=3): %s seconds" % t.secs)

    # make a prediction
    with Timer() as t:
        testPredRDD = model.predictAll(testRDD.map(lambda x: (x[0], x[1]))).cache()
    print("testPredRDD: %s seconds" % t.secs)

    # calculate RMSE
    with Timer() as t:
        testRmse = pm.calculate_rmse_using_rdd(testRDD, testPredRDD)
    print("testRmse: %s seconds" % t.secs)
    print("testRmse", testRmse)
def _retrieve_schema(self, manifest_file):
    rdd = self.spark.sparkContext.wholeTextFiles(manifest_file)
    text = rdd.collect()[0][1]
    schema_dict = json.loads(text)  # renamed to avoid shadowing the built-in `dict`
    custom_schema = StructType.fromJson(schema_dict)
    return custom_schema
def get_dataset(sc, spark, base_path, connector, input_path, start_day, end_day):
    # Ship code to executors
    ship_dir(base_path + "/algos", sc, base_path)
    ship_dir(base_path + "/core", sc, base_path)
    ship_dir(base_path + "/connectors", sc, base_path)

    # Find connector
    connector_module = my_import(connector, sc)

    # Parse dates
    y1, m1, d1 = start_day.split("_")
    date1 = date(int(y1), int(m1), int(d1))
    y2, m2, d2 = end_day.split("_")
    date2 = date(int(y2), int(m2), int(d2))

    # Instantiate connector
    connector_instance = connector_module(input_path, date1, date2)

    # Get and enforce schema
    output_type = connector_instance.output_type
    schema_file = base_path + "/schema/" + output_type + ".json"
    with open(schema_file, "r") as f:  # close the file instead of leaking the handle
        schema_json = json.load(f)
    schema = StructType.fromJson(schema_json)
    connector_instance.set_schema(schema)

    # Get and return the dataset
    return connector_instance.get_DF(sc, spark)
def json_to_spark_schema(json_schema: Dict[str, JsonSchemaType]) -> StructType:
    """
    Return a Spark schema for a JSON schema.

    Args:
        json_schema (Dict[str, JsonSchemaType]): schema in JSON format.

    Returns:
        StructType: Spark schema for the corresponding JSON schema.

    Raises:
        KeyError: missing schema key fields name/type/nullable.
        TypeError: invalid JSON was provided.
    """
    try:
        return StructType.fromJson(json_schema)
    except KeyError as key_error:
        LOGGING.error(str(key_error))
        raise KeyError('Missing key: {0}. Valid format: {1}'.format(
            str(key_error),
            'All schema columns must have a name, type and nullable key'))
    except TypeError as type_error:
        LOGGING.error(str(type_error))
        raise TypeError('Invalid json was provided')
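# A hedged usage example for json_to_spark_schema: StructType.fromJson expects a
# dict in the format produced by StructType.jsonValue(), where every column
# carries name, type, nullable, and metadata keys. The column names below are
# illustrative only.
example_schema = {
    "type": "struct",
    "fields": [
        {"name": "id", "type": "long", "nullable": False, "metadata": {}},
        {"name": "label", "type": "string", "nullable": True, "metadata": {}},
    ],
}
spark_schema = json_to_spark_schema(example_schema)  # -> StructType with two fields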
def getSchema(dataPath, externalSystem, schema, table, stepLogGuid, basepath,
              samplingRatio=.5, timeout=6000, zone="silver", delimiter="",
              header=True, multiLine="False"):
    if zone == "silver":
        path = "{0}/query/schemas/{1}/{2}/{3}/schema.json".format(basepath, externalSystem, schema, table)
        args = {
            "stepLogGuid": stepLogGuid,
            "dataPath": dataPath,
            "externalSystem": externalSystem,
            "schemaName": schema,
            "tableName": table,
            "samplingRatio": samplingRatio,
            "schemaPath": path
        }
    elif zone == "bronze":
        path = "{0}/raw/schemas/{1}/{2}/schema.json".format(basepath, externalSystem, table)
        args = {
            "stepLogGuid": stepLogGuid,
            "dataPath": dataPath,
            "externalSystem": externalSystem,
            "tableName": table,  # was the undefined name `tableName`
            "samplingRatio": samplingRatio,
            "delimiter": delimiter,
            "hasHeader": header,
            "schemaPath": path,
            "multiLine": multiLine
        }
    try:
        head = dbutils.fs.head(path, 256000)
    except Exception as e:
        # schema file does not exist yet: generate it, then retry the read
        dbutils.notebook.run("/Framework/Data Engineering/Silver Zone/Get Schema", timeout, args)
        head = dbutils.fs.head(path, 256000)

    import json
    from pyspark.sql.types import StructType
    return StructType.fromJson(json.loads(head))
def run_job(job_info, df):
    table_name = job_info.target_table_name if job_info.target_table_name else "ods_{}.sync_{}".format(
        job_info.source_db_name, job_info.source_table_name)
    schema = table_name.split(".")[0]
    if job_info.target_type == 'hive':
        spark.sql("create database if not exists {}".format(schema))
        df.write.mode("overwrite").format("orc").saveAsTable(table_name)
    if job_info.target_type == 'phoenix':
        df.write \
            .format("org.apache.phoenix.spark") \
            .mode("overwrite") \
            .option("table", table_name) \
            .option("zkUrl", job_info.target_host) \
            .save()
    if job_info.target_type in ('mysql', 'postgresql'):
        jdbc_url = "jdbc:{}://{}:{}/{}".format(job_info.target_type, job_info.target_host,
                                               job_info.target_port, job_info.target_db_name)
        properties = {"user": job_info.target_db_user, "password": job_info.target_db_psw}
        # cast complex types (struct, array, etc.) to string;
        # a plain map() is lazy in Python 3 and would never run, so use a loop
        schema = df.schema.jsonValue()
        for field in schema.get("fields"):
            if isinstance(field.get("type"), dict):
                field.update(type='string')
        struct = StructType.fromJson(schema)
        new_df = spark.createDataFrame(df.rdd, struct)
        new_df.write \
            .mode("overwrite") \
            .option("truncate", True) \
            .jdbc(jdbc_url, table_name, properties=properties)
def schema(self):
    """Load the schema from the json file."""
    if self.fullSchemaPath() is None:
        return None
    else:
        with open(self.fullSchemaPath(), "r") as sj:
            schema_st = sj.read()
        return StructType.fromJson(json.loads(schema_st))
def read_source(self, source):
    """
    Get a spark dataframe from a source.

    :param source: a source that contains file_path, source_type, options and schema_json
    :return: spark dataframe
    """
    if source.schema_json:
        if isinstance(source.schema_json, str):
            schema = StructType.fromJson(json.loads(source.schema_json))
        elif isinstance(source.schema_json, dict):
            schema = StructType.fromJson(source.schema_json)
        else:
            raise TypeError("source schema should be a str or dict")
        return self.sparkSession.read.format(source.source_type) \
            .options(**json.loads(source.options)).schema(schema).load(source.file_path).cache()
    else:
        return self.sparkSession.read.format(source.source_type) \
            .options(**json.loads(source.options)).load(source.file_path).cache()
def load_df_schema(bucket, fname):
    import boto3, json
    from pyspark.sql.types import StructType
    # reload the schema from s3
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=bucket, Key=fname)
    json_schema = obj['Body'].read().decode('utf-8')
    return StructType.fromJson(json.loads(json_schema))
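# The saving half of the round trip, sketched under the same boto3 assumptions:
# serialize the schema with .json() and upload it to the same bucket/key. This
# companion helper is an assumption, not part of the original snippet.
def save_df_schema(df, bucket, fname):
    import boto3
    s3 = boto3.client('s3')
    s3.put_object(Bucket=bucket, Key=fname, Body=df.schema.json().encode('utf-8'))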
def schema_load(option):
    """Loads Spark DataFrame schema from JSON file.

    :param option: File name suffix for the DataFrame schema.
    :type option: string
    :returns: DataFrame schema.
    :rtype: StructType
    """
    with open(f"trending_{option}.json", "r", encoding="UTF-8") as f_schema:
        return StructType.fromJson(load(f_schema))
def BuildSparkSchema(table, forceAllFieldsToString=False, useValidation=False, excludeComputed=False):
    '''returns the schema for spark'''
    from pyspark.sql.types import StructType  # @UnresolvedImport
    schemaJson = SparkUtilities.BuildSparkSchemaJson(
        table, forceAllFieldsToString, useValidation, excludeComputed)
    schema = StructType.fromJson(schemaJson)
    return schema
def load_table(self, sc, spark, table_path, table_name):
    parquet_reader = spark.read.format('parquet')
    if self.args.table_schema is not None:
        self.get_logger(sc).info("Reading table schema from {}".format(
            self.args.table_schema))
        with open(self.args.table_schema, 'r') as s:
            schema = StructType.fromJson(json.loads(s.read()))
        parquet_reader = parquet_reader.schema(schema)
    df = parquet_reader.load(table_path)
    df.createOrReplaceTempView(table_name)
    self.get_logger(sc).info("Schema of table {}:\n{}".format(
        table_name, df.schema))
def sample_data(spark):
    root = os.path.dirname(__file__)
    schema_path = os.path.join(root, "resources", "experiments-summary.schema.json")
    with open(schema_path) as f:
        d = json.load(f)
    schema = StructType.fromJson(d)
    rows_path = os.path.join(root, "resources", "experiments-summary-190-rows.json")
    # FAILFAST causes us to abort early if the data doesn't match the given
    # schema. Without this there was a very annoying problem where
    # dataframe.collect() would return an empty set.
    frame = spark.read.json(rows_path, schema, mode="FAILFAST")
    return frame
def process_json_to_dataframe(schema_name, paths):
    """Processes JSON to Spark DataFrame.

    :param schema_name: Schema name.
    :type schema_name: string
    :param paths: S3 paths to process.
    :type paths: list
    :returns: Spark DataFrame.
    :rtype: DataFrame
    """
    drop_subset = [
        "dut_type", "dut_version",
        "passed",
        "test_name_long", "test_name_short",
        "test_type",
        "version"
    ]

    # load schemas
    with open(f"iterative_{schema_name}.json", "r", encoding="UTF-8") as f_schema:
        schema = StructType.fromJson(load(f_schema))

    # create empty DF out of schemas
    sdf = spark.createDataFrame([], schema)

    # filter list
    filtered = [path for path in paths if schema_name in path]

    # select
    for path in filtered:
        print(path)
        sdf_loaded = spark \
            .read \
            .option("multiline", "true") \
            .schema(schema) \
            .json(path) \
            .withColumn("job", lit(path.split("/")[4])) \
            .withColumn("build", lit(path.split("/")[5]))
        sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True)

    # drop rows with all nulls and drop rows with null in critical columns
    sdf = sdf.na.drop(how="all")
    sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset)

    # flatten frame
    sdf = flatten_frame(sdf)

    return sdf
def __init__(self, jresolved_table_schema):
    """
    Create a resolved table schema from the underlying Java object.

    :param jresolved_table_schema: Java object of ResolvedTableSchema
    """
    from pyspark.sql.types import StructType
    table_name = jresolved_table_schema.tableName()
    json_schema = json.loads(jresolved_table_schema.schema().json())
    jschema = StructType.fromJson(json_schema)
    pk_columns = []
    it = jresolved_table_schema.pkColumns().iterator()
    while it.hasNext():
        pk_columns.append(it.next())
    sharding_columns = []
    it = jresolved_table_schema.shardingColumns().iterator()
    while it.hasNext():
        sharding_columns.append(it.next())
    self.pk_indexes = []
    it = jresolved_table_schema.pkIndex().iterator()
    while it.hasNext():
        jindex_spec = it.next()
        json_ispec = json.loads(jindex_spec.toJsonStr())
        self.pk_indexes.append(IndexSpecification.from_json(json_ispec))
    partition_columns = []
    it_option_wrapper = jresolved_table_schema.partitionColumns().iterator()
    if "{}".format(it_option_wrapper) == "non-empty iterator":
        # remove the Option wrapper
        it_part_cols = it_option_wrapper.next().iterator()
        if "{}".format(it_part_cols) == "non-empty iterator":
            while it_part_cols.hasNext():
                partition_columns.append(it_part_cols.next())
        # else: an empty list for partition_columns
    elif "{}".format(it_option_wrapper) != "empty iterator":
        raise Exception(
            "Expected returned value for partition_columns to be a JavaObject "
            "representing an iterator, not {}".format(it_option_wrapper))
    # else: a wrapper for None
    if len(partition_columns) > 0:
        super(ResolvedTableSchema, self).__init__(
            table_name, jschema, sharding_columns, pk_columns, partition_columns)
    else:
        super(ResolvedTableSchema, self).__init__(
            table_name, jschema, sharding_columns, pk_columns)
    self.jresolved_table_schema = jresolved_table_schema
def init_source(self, spark_session, options):
    source_format = options.get("format", "text")
    schema = options["schema"]
    input_path = options["path"]
    max_files_in_batch = options["max-files-per-trigger"]
    schema_type = StructType.fromJson(loads(schema))
    return spark_session \
        .readStream \
        .format(source_format) \
        .schema(schema_type) \
        .option("path", input_path) \
        .option("maxFilesPerTrigger", max_files_in_batch) \
        .load()
def schema_from_json(path):
    """
    Create a pyspark schema from the json representation.

    The json representation must be from a StructType. This can be
    generated from any StructType using the `.json()` method. The
    schema for a dataframe can be obtained using the `.schema`
    accessor. For example, to generate the json from the
    `topline_summary`, run the following in the pyspark repl:

    >>> path = 's3a://telemetry-parquet/topline_summary/v1/mode=weekly'
    >>> json_data = spark.read.parquet(path).schema.json()

    :path str: Path to the json data
    """
    with pkg_resources.resource_stream(mozetl.topline.__name__, path) as f:
        data = json.load(f)
    return StructType.fromJson(data)
def do(self, workflow, etl_process):
    from pyspark.sql.types import StructType, StructField, StringType
    from json import load

    self.create()
    self.location = self.action_details.pop("location")
    self.format = self.action_details.pop("format")
    self.schema = self.action_details.pop("schema", None)

    # self.schema should be a relative filepath to a spark-compliant schema
    if self.schema:
        with open(self.schema) as fp:
            schema = StructType.fromJson(load(fp))
    # if there is no spark-compliant schema, attempt to use the columns and set
    # all fields to string
    elif self.columns:
        schema = StructType(
            [StructField(i, StringType()) for i in self.columns])
    # if there are no columns, then allow the schema to be inferred using the
    # inference rules native to the format
    else:
        schema = None

    schema_types = {
        "csv": self.spark.read.csv,
        "json": self.spark.read.json
    }
    schemaless_types = {
        "parquet": self.spark.read.parquet,
        "orc": self.spark.read.orc,
    }
    if self.format in schema_types:
        reader = schema_types[self.format]
        workflow.df = reader(self.location, schema=schema, **self.action_details)
    else:
        reader = schemaless_types[self.format]
        workflow.df = reader(self.location, **self.action_details)
def _generate_struct(message_definition):
    lexer = RosMessageLexer(InputStream(message_definition))
    stream = CommonTokenStream(lexer)
    parser = RosMessageParser(stream)
    tree = parser.rosbag_input()
    visitor = RosMessageSchemaVisitor()
    visitor.visit(tree)
    struct_fields = []
    for f in visitor.fields:
        # every parsed field is currently mapped to a nullable integer column
        struct_fields.append({
            'metadata': {},
            'name': f[0],
            'nullable': True,
            'type': 'integer'
        })
    schema_dict = {'fields': struct_fields, 'type': 'struct'}
    return StructType.fromJson(schema_dict)
def generate_schema(columns, nullable_columns='all'):
    """
    Parameters
    ----------
    columns: dict of column names (keys) and types (values)
    nullable_columns: list of nullable columns, optional, default is 'all'

    Returns
    -------
    schema: StructType
        Spark DataFrame schema corresponding to Python/numpy types.
    """
    columns = sorted(columns.items())
    colnames = list(map(itemgetter(0), columns))
    coltypes = list(map(itemgetter(1), columns))

    invalid_types = []
    new_types = []
    keys = list(_mapping.keys())
    for coltype in coltypes:
        if coltype not in keys:
            invalid_types.append(coltype)
        else:
            if coltype == np.dtype('O'):
                new_types.append(str)
            else:
                new_types.append(keys[keys.index(coltype)])
    assert len(invalid_types) == 0, "Invalid type(s) specified: {}".format(
        str(invalid_types))

    if nullable_columns == 'all':
        nullables = [True] * len(colnames)
    else:
        nullables = [col in nullable_columns for col in colnames]

    fields = [{
        "metadata": {},
        "name": name,
        "nullable": nullable,
        "type": _mapping[typ]
    } for name, typ, nullable in zip(colnames, new_types, nullables)]
    return StructType.fromJson({"type": "struct", "fields": fields})
def run(self, processor_context: ProcessorContext) -> Dependency:
    dependency_config = {}
    default_options = processor_context.get_property_group(
        self.DEFAULT_PROPS_GROUP)
    load_options = processor_context.get_property_group(
        self.LOAD_OPTIONS_GROUP)

    view_name = default_options.get_property(self.VIEW_NAME)
    if view_name is not None:
        dependency_config['view_name'] = view_name

    path = default_options.get_property(self.PATH)
    load_format = default_options.get_property(self.FORMAT)
    schema = default_options.get_property(self.SCHEMA)
    struct_type = StructType.fromJson(schema)

    df = processor_context.spark_session.readStream.load(
        path=path, format=load_format, schema=struct_type, **load_options)
    return Dependency(df, dependency_config)
def generate_schema(colnames, coltypes, nullables=None):
    """
    Parameters
    ----------
    colnames: list of string
    coltypes: list of type
    nullables: list of boolean, optional

    Returns
    -------
    schema: StructType
        Spark DataFrame schema corresponding to Python/numpy types.
    """
    assert len(colnames) == len(
        coltypes), "You must specify types for all columns."

    invalid_types = []
    new_types = []
    keys = list(_mapping.keys())
    for coltype in coltypes:
        if coltype not in keys:
            invalid_types.append(coltype)
        else:
            if coltype == np.dtype('O'):
                new_types.append(str)
            else:
                new_types.append(keys[keys.index(coltype)])
    assert len(invalid_types) == 0, "Invalid type(s) specified: {}".format(
        str(invalid_types))

    if nullables is None:
        nullables = [True] * len(colnames)

    fields = [{
        "metadata": {},
        "name": name,
        "nullable": nullable,
        "type": _mapping[typ]
    } for name, typ, nullable in zip(colnames, new_types, nullables)]
    return StructType.fromJson({"type": "struct", "fields": fields})
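# Both generate_schema variants above rely on a module-level _mapping from
# Python/numpy types to Spark type names, which is not shown in these snippets.
# A plausible minimal version (an assumption, not the original) might look like:
import numpy as np

_mapping = {
    str: 'string',
    int: 'integer',
    float: 'double',
    bool: 'boolean',
    np.dtype('int32'): 'integer',
    np.dtype('int64'): 'long',
    np.dtype('float64'): 'double',
    np.dtype('O'): 'string',  # object columns are treated as strings
}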
def test_drop_dup_keep_latest(tsv_path, csv_schema_path, json_schema_path,
                              id_col, date_col, keep_date_null):
    # setup
    spark = pytest.spark
    pwd = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(pwd, csv_schema_path)) as f:
        csv_schema = StructType.fromJson(json.load(f))
    with open(os.path.join(pwd, json_schema_path)) as f:
        json_schema = ArrayType.fromJson(json.load(f))
    df = spark.read.csv(os.path.join(pwd, tsv_path), header=True, sep='\t',
                        schema=csv_schema)
    df = df.withColumn(
        'tmp_payload',
        functions.explode(
            functions.from_json(functions.col('payload'), json_schema)))
    df = df.withColumn('status', functions.col('tmp_payload.status')) \
        .withColumn('is_old', functions.col('tmp_payload.is_old')) \
        .withColumn('order_date', functions.col('tmp_payload.order_date')) \
        .withColumn('timestamp', functions.col('tmp_payload.timestamp')) \
        .drop('payload', 'tmp_payload')

    # exec
    tmp_df = df.groupBy(id_col).agg(functions.max(date_col).alias(date_col)) \
        .sort(id_col)
    if not keep_date_null:
        tmp_df = tmp_df.dropna(subset=date_col)
    res_df = drop_dup_keep_latest(pytest.spark, df, id_col, date_col,
                                  keep_date_null)

    # assert
    ans = [list(row) for row in tmp_df.collect()]
    res = [list(row) for row in res_df.select(id_col, date_col).collect()]
    assert res == ans
def init_schema(json_location) -> StructType:
    with open(json_location) as source:
        data = json.load(source)
    return StructType.fromJson(data)
import ujson as json
import os
import pkg_resources

from pyspark.sql.types import StructType

import mozetl

SCHEMA_DIR = 'json'
MAIN_SUMMARY_SCHEMA_BASENAME = 'main_summary.v4.schema.json'

main_summary_path = os.path.join(SCHEMA_DIR, MAIN_SUMMARY_SCHEMA_BASENAME)
with pkg_resources.resource_stream(mozetl.__name__, main_summary_path) as f:
    d = json.load(f)
    MAIN_SUMMARY_SCHEMA = StructType.fromJson(d)
def str_to_schema(s):
    return StructType.fromJson(json.loads(s))
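# A hedged round-trip example for str_to_schema: StructType.json() emits exactly
# the string fromJson expects, so any DataFrame schema survives serialization.
# Assumes an existing SparkSession named `spark`.
df = spark.range(5).withColumnRenamed("id", "user_id")
restored = str_to_schema(df.schema.json())
assert restored == df.schema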
def get_twitter_schema(json_file_name):
    with open(json_file_name) as json_file:  # close the file instead of leaking the handle
        schema_dict = json.load(json_file)
    schema_struct = StructType.fromJson(schema_dict)
    return schema_struct
import json

import pyspark
import configFile
from pyspark.sql import functions
from pyspark.sql.functions import explode
from pyspark.sql.types import *
from pyspark.sql.types import StructType, StructField, StringType

# Create Spark Session
sparkSession = pyspark.sql.SparkSession.builder.config(conf=configFile.sparkConfig).getOrCreate()

dfDatabaseData = sparkSession.read.option("multiline", "true") \
    .json(configFile.dbaasSourceFile)

# renamed from `str` to avoid shadowing the built-in; close the file handle
with open("/Users/prammitr/Documents/Doc/my_projects/pyspark/dbaas_excra_schema.json") as schema_file:
    dbaas_schema = StructType.fromJson(json.load(schema_file))

# 1. dbaas_db_system_dim
## Dependencies:
### $.dbSystemData --> dbaas_db_system_dim
dfdbaas_db_system_dim = sparkSession.read.schema(dbaas_schema).option("multiline", "true") \
    .json(configFile.dbaasSourceFile) \
    .withColumn("dbSystemData", explode("dbSystemData")) \
    .select("dbSystemData.*")
dfdbaas_db_system_dim.createOrReplaceTempView("dbaas_db_system_dim")
dbaas_db_system_dim = sparkSession.sql(
    "select id, displayName, computeShape, dbSystemShape, databaseEdition, nodeCount, "
    "timeCreated, licenseType, tempHostSerial from dbaas_db_system_dim a")
# print("Printing Data for ----> dbSystemData --> dbaas_db_system_dim")
# dbaas_db_system_dim.show(2)
dbutils.fs.help()

# COMMAND ----------

dbutils.fs.put(untappd_raw_path, json.dumps(full_data), True)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Ingest our schema file from the Data Lake

# COMMAND ----------

head = dbutils.fs.head(untappd_raw_schema_path, 10000)
schema = StructType.fromJson(json.loads(head))

# COMMAND ----------

df = spark.read.schema(schema).json(untappd_raw_path)

# COMMAND ----------

# df.show()

# COMMAND ----------

df.write.format('delta').mode("append").save(untappd_raw_delta_path)

# COMMAND ----------
# continents path is assumed, following the pattern of the other two datasets
CONTINENTS_FILE_PATH = '/opt/SparkDatasets/geography/continents.csv'
COUNTRIES_FILE_PATH = '/opt/SparkDatasets/geography/countries.csv'
CITIES_FILE_PATH = '/opt/SparkDatasets/geography/cities.csv'

CONTINENT_STRUCTURE = \
    [ ('continent_id'  , 'integer')
    , ('continent_name', 'string' ) ]

COUNTRY_STRUCTURE = \
    [ ('country_id'  , 'integer')
    , ('continent_id', 'integer')
    , ('country_name', 'string' ) ]

CITY_STRUCTURE = \
    [ ('city_id'   , 'integer')
    , ('country_id', 'integer')
    , ('city_name' , 'string' ) ]

CONTINENT_SCHEMA = StructType.fromJson(generate_schema_dict(CONTINENT_STRUCTURE))
COUNTRY_SCHEMA   = StructType.fromJson(generate_schema_dict(COUNTRY_STRUCTURE))
CITY_SCHEMA      = StructType.fromJson(generate_schema_dict(CITY_STRUCTURE))

spark = SparkSession.builder.getOrCreate()

continents_df = generate_dataframe(spark, CONTINENT_SCHEMA, CONTINENTS_FILE_PATH)
countries_df  = generate_dataframe(spark, COUNTRY_SCHEMA  , COUNTRIES_FILE_PATH)
cities_df     = generate_dataframe(spark, CITY_SCHEMA     , CITIES_FILE_PATH)

continents_df.registerTempTable('continents')
countries_df.registerTempTable('countries')
cities_df.registerTempTable('cities')

print(continents_df.count())
print(countries_df.count())
df_json_vals = trainer_df.toJSON().collect()[0]
df_json_schema = trainer_df.schema.jsonValue()

full_scoring_record = {"vals": json.loads(df_json_vals), "schema": df_json_schema}

# json.dump requires a file object, not a file name
with open("full_scoring_record.json", "w") as full_record_out_file:
    json.dump(full_scoring_record, full_record_out_file)

# sanity check that the record can be turned back into a DataFrame
spark.createDataFrame([full_scoring_record['vals']])

with open('json_vals.json', 'w') as json_vals_out_file:
    json_vals_out_file.write(df_json_vals)
with open('json_schema.json', 'w') as json_schema_out_file:
    json.dump(df_json_schema, json_schema_out_file)

with open('json_vals.json', 'r') as json_vals_in_file:
    in_df_json_vals = json.load(json_vals_in_file)
with open('json_schema.json', 'r') as json_schema_in_file:
    in_df_json_schema = StructType.fromJson(json.load(json_schema_in_file))

in_df = spark.read.json('json_vals.json', schema=in_df_json_schema)