def _get_schema(header, schema):
    if schema is None or len(schema) == 0:
        # Use header to generate schema
        if header is None or len(header) == 0:
            return None
        elif len(header) > 4:
            warnings.warn(WARNING_MOVIE_LENS_HEADER)
            header = header[:4]
        schema = StructType()
        try:
            schema.add(StructField(header[0], IntegerType())).add(
                StructField(header[1], IntegerType())
            ).add(StructField(header[2], FloatType())).add(
                StructField(header[3], LongType())
            )
        except IndexError:
            pass
    else:
        if header is not None:
            warnings.warn(WARNING_HAVE_SCHEMA_AND_HEADER)
        if len(schema) > 4:
            warnings.warn(WARNING_MOVIE_LENS_HEADER)
            schema = schema[:4]
    return schema
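# A minimal usage sketch (not part of the original module) showing how _get_schema resolves
# header vs. schema; it assumes the module's pyspark.sql.types imports and MovieLens warning
# constants are in scope, and the column names below are illustrative.
header_only = _get_schema(["userId", "itemId", "rating", "timestamp"], None)
print(header_only.fieldNames())   # ['userId', 'itemId', 'rating', 'timestamp']

explicit = StructType([StructField("userId", IntegerType()),
                       StructField("itemId", IntegerType())])
with_both = _get_schema(["a", "b"], explicit)   # warns, then keeps the explicit schema
print(with_both.fieldNames())     # ['userId', 'itemId']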
def get_spark_schema(header=DEFAULT_HEADER):
    ## create schema
    schema = StructType()
    ## do label + ints
    n_ints = 14
    for i in range(n_ints):
        schema.add(StructField(header[i], IntegerType()))
    ## do categoricals
    for i in range(26):
        schema.add(StructField(header[i + n_ints], StringType()))
    return schema
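# Hedged usage sketch for get_spark_schema(): the 40-column layout above (1 integer label,
# 13 integer features, 26 categorical string features) can be applied directly when reading a
# Criteo-style TSV; the file path and app name below are assumptions, not part of the module.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("criteo-schema-sketch").getOrCreate()
criteo_df = spark.read.csv("dac_sample.txt", schema=get_spark_schema(), sep="\t", header=False)
criteo_df.printSchema()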
def test_rmse():
    # TODO: revise so that it takes user input instead of hard-coded values
    movies_schema = None
    ratings_schema = None

    # load the schemas
    with open("movielens_20m_movies_schema.json", "r") as json_schema_file:
        movies_schema = StructType.fromJson(json.load(json_schema_file))
    with open("movielens_20m_ratings_schema.json", "r") as json_schema_file:
        ratings_schema = StructType.fromJson(json.load(json_schema_file))

    # create an HDFS directory
    os.system("hdfs dfs -mkdir datasets")
    # load the json file into the HDFS directory
    os.system("hdfs dfs -put movielens_10m_ratings.json.gz datasets/movielens_10m_ratings.json.gz")

    # create a DataFrame based on the content of the json file
    ratingsDF = scsingleton.sqlCtx.read.json(
        "hdfs://localhost:9000/datasets/movielens_10m_ratings.json.gz",
        schema=ratings_schema)
    # explicitly repartition the RDD after loading so that more tasks can run on it in parallel;
    # by default, defaultMinPartitions == defaultParallelism == estimated number of cores
    # across all of the machines in your cluster
    ratingsDF = ratingsDF.repartition(scsingleton.sc.defaultParallelism * 3)

    # parse the ratings DataFrame into an RDD of [(userId, itemId, rating)]
    ratingsRDD = ratingsDF.map(lambda row: (row.user_id, row.movie_id, row.rating))
    ratingsRDD.cache()

    # split data into train (60%), test (40%)
    # TODO: add validation in the future? train (60%), validation (20%), test (20%)?
    trainingRDD, testRDD = ratingsRDD.randomSplit([0.6, 0.4])
    trainingRDD.cache()
    testRDD.cache()

    # run the training algorithm to build the model (without validation)
    with Timer() as t:
        model = ALS.train(trainingRDD, rank=3)
    print("ALS.train(trainingRDD, rank=3): %s seconds" % t.secs)

    # make a prediction
    with Timer() as t:
        testPredRDD = model.predictAll(testRDD.map(lambda x: (x[0], x[1]))).cache()
    print("testPredRDD: %s seconds" % t.secs)

    # calculate RMSE
    with Timer() as t:
        testRmse = pm.calculate_rmse_using_rdd(testRDD, testPredRDD)
    print("testRmse: %s seconds" % t.secs)
    print("testRmse", testRmse)
    return
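# Hedged sketch of what an RDD-based RMSE helper such as pm.calculate_rmse_using_rdd might do;
# the real implementation lives in the imported module, so this is only an illustration:
# join predictions to held-out ratings on (userId, itemId) keys and average the squared error.
import math

def rmse_from_rdds(actual_rdd, predicted_rdd):
    actual = actual_rdd.map(lambda r: ((r[0], r[1]), r[2]))
    predicted = predicted_rdd.map(lambda p: ((p[0], p[1]), p[2]))
    squared_errors = actual.join(predicted).map(lambda kv: (kv[1][0] - kv[1][1]) ** 2)
    return math.sqrt(squared_errors.mean())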
def __init__(self, predictionAndObservations):
    sc = predictionAndObservations.ctx
    sql_ctx = SQLContext.getOrCreate(sc)
    numCol = len(predictionAndObservations.first())
    schema = StructType([
        StructField("prediction", DoubleType(), nullable=False),
        StructField("observation", DoubleType(), nullable=False)])
    if numCol == 3:
        schema.add("weight", DoubleType(), False)
    df = sql_ctx.createDataFrame(predictionAndObservations, schema=schema)
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.RegressionMetrics
    java_model = java_class(df._jdf)
    super(RegressionMetrics, self).__init__(java_model)
def __init__(self, predAndLabelsWithOptWeight):
    sc = predAndLabelsWithOptWeight.ctx
    sql_ctx = SQLContext.getOrCreate(sc)
    numCol = len(predAndLabelsWithOptWeight.first())
    schema = StructType([
        StructField("prediction", DoubleType(), nullable=False),
        StructField("label", DoubleType(), nullable=False)])
    if numCol == 3:
        schema.add("weight", DoubleType(), False)
    df = sql_ctx.createDataFrame(predAndLabelsWithOptWeight, schema)
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.MulticlassMetrics
    java_model = java_class(df._jdf)
    super(MulticlassMetrics, self).__init__(java_model)
def __init__(self, scoreAndLabels):
    sc = scoreAndLabels.ctx
    sql_ctx = SQLContext.getOrCreate(sc)
    numCol = len(scoreAndLabels.first())
    schema = StructType([
        StructField("score", DoubleType(), nullable=False),
        StructField("label", DoubleType(), nullable=False)])
    if numCol == 3:
        schema.add("weight", DoubleType(), False)
    df = sql_ctx.createDataFrame(scoreAndLabels, schema=schema)
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
    java_model = java_class(df._jdf)
    super(BinaryClassificationMetrics, self).__init__(java_model)
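# The three __init__ wrappers above follow the same pattern: build a typed DataFrame from the
# incoming RDD and hand its _jdf to the JVM metrics class. A hedged usage sketch for the
# regression case (toy numbers, illustrative app name):
from pyspark.sql import SparkSession
from pyspark.mllib.evaluation import RegressionMetrics

spark = SparkSession.builder.appName("regression-metrics-sketch").getOrCreate()
pred_obs = spark.sparkContext.parallelize([(2.5, 3.0), (0.0, -0.5), (2.0, 2.0)])
metrics = RegressionMetrics(pred_obs)
print(metrics.rootMeanSquaredError, metrics.r2)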
def get_twitter_schema(json_file_name):
    # read the schema definition from a JSON file and rebuild the StructType
    with open(json_file_name) as json_file:
        schema_dict = json.load(json_file)
    schema_struct = StructType.fromJson(schema_dict)
    return schema_struct
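# Hedged companion sketch: a schema file consumed by get_twitter_schema() can be produced from
# any existing DataFrame, since StructType.jsonValue() is the inverse of StructType.fromJson();
# the tweets_df name and file name below are illustrative only.
import json

def save_schema(df, json_file_name):
    # persist the DataFrame's schema as JSON for later reuse
    with open(json_file_name, "w") as f:
        json.dump(df.schema.jsonValue(), f)

# save_schema(tweets_df, "twitter_schema.json")
# schema = get_twitter_schema("twitter_schema.json")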
"<slide duration> must be less than or equal to <window duration>", file=sys.stderr) windowDuration = '{} seconds'.format(windowSize) slideDuration = '{} seconds'.format(slideSize) monitoring_dir = sys.argv[1] spark = SparkSession\ .builder\ .appName("InteractionCount")\ .config("spark.eventLog.enabled","true")\ .config("spark.eventLog.dir","hdfs://10.254.0.33:8020/user/ubuntu/applicationHistory")\ .master("spark://10.254.0.33:7077")\ .getOrCreate() userSchema = StructType().add("userA", "string").add("userB", "string").add( "timestamp", "timestamp").add("interaction", "string") csvDF = spark\ .readStream \ .schema(userSchema) \ .csv(monitoring_dir) interactions = csvDF.select(csvDF['interaction'], csvDF['timestamp']) windowedCounts = interactions.groupBy( window(interactions.timestamp, windowDuration, slideDuration), interactions.interaction).count() query = windowedCounts\ .writeStream\ .outputMode('complete')\
def remove_duplicate_encounters(ds, owner_name='user', transmitter_name='participant_identifier', start_time_name='start_time', end_time_name='end_time', centroid_id_name='centroid_id', distance_threshold=12): schema = StructType([ StructField('timestamp', TimestampType()), StructField('localtime', TimestampType()), StructField('start_time', DoubleType()), StructField('end_time', DoubleType()), StructField('user', StringType()), StructField('participant_identifier', StringType()), StructField('version', IntegerType()), StructField('os', StringType()), StructField('latitude', DoubleType()), StructField('distances', ArrayType(DoubleType())), StructField('longitude', DoubleType()), StructField('average_count', DoubleType()), StructField('centroid_longitude', DoubleType()), StructField('centroid_latitude', DoubleType()), StructField('centroid_id', IntegerType()), StructField('centroid_area', DoubleType()), StructField('distance_mean', DoubleType()), StructField('distance_std', DoubleType()), StructField('distance_count', DoubleType()) ]) @pandas_udf(schema, PandasUDFType.GROUPED_MAP) def remove_duplicates(data): data['distance_mean'] = data['distances'].apply(lambda a: np.mean(a)) data['distance_std'] = data['distances'].apply(lambda a: np.std(a)) data['distance_count'] = data['distances'].apply( lambda a: len(np.array(a)[np.array(a) < distance_threshold])) if data.shape[0] < 2: return data if len( np.intersect1d(data[owner_name].values, data[transmitter_name].values)) == 0: return data data = data.sort_values('distance_mean').reset_index(drop=True) not_visited = [] for i, row in data.iterrows(): if i in not_visited: continue temp_df = data[ data.participant_identifier.isin( [row[owner_name], row[transmitter_name]]) & data.user.isin([row[owner_name], row[transmitter_name]])] if temp_df.shape[0] == 0: continue else: indexes = [u for u in list(temp_df.index.values) if u != i] not_visited += indexes data = data[~data.index.isin(not_visited)] return data ds = ds.withColumn(start_time_name, F.col(start_time_name).cast('double')).withColumn( end_time_name, F.col(end_time_name).cast('double')) data = ds.groupBy([centroid_id_name, 'version']).apply(remove_duplicates) data = data.withColumn( start_time_name, F.col(start_time_name).cast('timestamp')).withColumn( end_time_name, F.col(end_time_name).cast('timestamp')) data = data.withColumn('covid', F.lit(0)) return DataStream(data=data, metadata=Metadata())
def test_load_spark_df(size, num_samples, num_movies, title_example, genres_example):
    """Test MovieLens dataset load into pySpark.DataFrame"""
    spark = start_or_get_spark("MovieLensLoaderTesting")

    # Check that the function loads the correct dataset
    df = movielens.load_spark_df(spark, size=size)
    assert df.count() == num_samples
    assert len(df.columns) == 4

    # Test whether it can handle different sizes of header columns
    header = ["a"]
    df = movielens.load_spark_df(spark, header=header)
    assert len(df.columns) == len(header)

    header = ["a", "b", "c", "d", "e"]
    with pytest.warns(Warning):
        df = movielens.load_spark_df(spark, header=header)
        assert len(df.columns) == 4

    # Test title load
    df = movielens.load_spark_df(spark, size=size, title_col="Title")
    assert len(df.columns) == 5
    # Movie 1 is Toy Story
    title = df.filter(col(DEFAULT_ITEM_COL) == 1).select("Title").limit(2).collect()
    assert title[0][0] == title[1][0]
    assert title[0][0] == title_example

    # Test genres load
    df = movielens.load_spark_df(spark, size=size, genres_col="Genres")
    assert len(df.columns) == 5
    # Movie 1 is Toy Story
    genres = df.filter(col(DEFAULT_ITEM_COL) == 1).select("Genres").limit(2).collect()
    assert genres[0][0] == genres[1][0]
    assert genres[0][0] == genres_example

    # Test movie data load (not rating data)
    df = movielens.load_spark_df(spark, size=size, header=None, title_col="Title", genres_col="Genres")
    assert df.count() == num_movies
    assert len(df.columns) == 3

    # Test whether it can handle a wrong size argument
    with pytest.raises(ValueError):
        movielens.load_spark_df(spark, size='10k')
    # Test whether it can handle a wrong cache path argument
    with pytest.raises(ValueError):
        movielens.load_spark_df(spark, local_cache_path='.')

    # Test that the schema is used when both schema and header are provided
    header = ["1", "2"]
    schema = StructType([StructField("u", IntegerType())])
    with pytest.warns(Warning):
        df = movielens.load_spark_df(spark, header=header, schema=schema)
        assert len(df.columns) == len(schema)
mama mia mama mia

first thing we need to do is create the training data
 - check if we already have trained data
 - clean the data
then we need to pull it through the algorithm
 - gather the training data
 - gather the input sentences
 - run the analysis
'''

the_trained_file_exist = path.isfile(trained_data)
if the_trained_file_exist:
    trained_data = spark.read.csv(trained_data, header=False)
else:
    the_schema = StructType([
        StructField("classified", StringType()),
        StructField("content", StringType())
    ])
    raw_data = spark.read.csv(training_data, header=False, schema=the_schema)
    raw_data.printSchema()
    raw_data.show(5)

    # clean up data...
    raw_data = raw_data.withColumn('content', lower(col('content')))
    raw_data = raw_data.withColumn('content', split(col('content'), ' '))
    '''
    from there, we just need to get rid of all the special chars and stuff
    so we can just have words.
    '''
    raw_data.show(5)
    spam_clean_data = raw_data.filter(raw_data.classified == 'spam')
    ham_clean_data = raw_data.filter(raw_data.classified == 'ham')
    ham_clean_data.show(5)
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType

spark = SparkSession.builder.appName("tweets").getOrCreate()
spark.sparkContext.setLogLevel('WARN')

userSchema = StructType()\
    .add("id", "integer")\
    .add("lang", "string")\
    .add("date", "string")\
    .add("source", "string")\
    .add("len", "integer")\
    .add("likes", "integer")\
    .add("RT", "integer")\
    .add("hashtags", "string")\
    .add("usernames", "string")\
    .add("userid", "string")\
    .add("name", "string")\
    .add("place", "string")\
    .add("followers", "integer")\
    .add("friends", "integer")

dfCSV = spark.readStream.option("sep", ";").option(
    "header", "false").schema(userSchema).csv("/stream")

dfCSV.createOrReplaceTempView("tweets")
q2 = spark.sql(
    "select name,max(followers/friends) as FRRatio from tweets group by name order by FRRatio desc limit 5"
)

query = q2.writeStream.outputMode("complete").format("console").start()
query.awaitTermination(60)
query.stop()
    ('spark.app.name', 'nyctaxi'), \
    ('spark.cores.max', '16'), \
    ('spark.driver.memory', '124g')])

sc = SparkSession.builder.config(conf=conf).getOrCreate()

# Need to define schema for NYC taxi data.
schema = StructType([
    StructField('VendorID', IntegerType(), True),
    StructField("tpep_pickup_datetime", StringType(), True),
    StructField("tpep_dropoff_datetime", StringType(), True),
    StructField("passenger_count", IntegerType(), True),
    StructField("trip_distance", DoubleType(), True),
    StructField('RateCodeID', IntegerType(), True),
    StructField('store_and_fwd_flag', StringType(), True),
    StructField("PULocationID", IntegerType(), True),
    StructField("DOLocationID", IntegerType(), True),
    StructField("payment_type", IntegerType(), True),
    StructField("fare_amount", DoubleType(), True),
    StructField("extra", DoubleType(), True),
    StructField("mta_tax", DoubleType(), True),
    StructField("tip_amount", DoubleType(), True),
    StructField("tolls_amount", DoubleType(), True),
    StructField("improvement_surcharge", DoubleType(), True),
    StructField("total_amount", DoubleType(), True),
    StructField("congestion_surcharge", DoubleType(), True)
])

df = sc.read.format("csv").options(header='True').schema(schema).load(
    "../../dan606/nyctaxi/trip data/yellow_tripdata_2019-11.csv")
df.printSchema()
# df = sqlContext.createDataFrame(df.head(500000), df.schema)  # not a random sample
df.count()
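# Hedged alternative to the commented-out head()-based subset above: sample() draws an
# approximately random fraction instead of the first N rows; the fraction and seed are illustrative.
df_sample = df.sample(withReplacement=False, fraction=0.05, seed=606)
df_sample.count()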
sqlContext.cacheTable('Ratings')
tt = time() - t0
print("Data is loaded in %s seconds" % round(tt, 3))

rank = 8
seed = 5
iterations = 10
regularization_parameter = 0.1

t0 = time()
print("Training the ALS model...")
model = ALS.train(dfRates.rdd.map(lambda r: (int(r[0]), int(r[1]), r[2])).cache(),
                  rank=rank, seed=seed, iterations=iterations,
                  lambda_=regularization_parameter)
tt = time() - t0
print("ALS model built!")
print("New model trained in %s seconds" % round(tt, 3))

predictions = model.recommendProductsForUsers(10) \
    .flatMap(lambda pair: pair[1]) \
    .map(lambda rating: (rating.user, rating.product, rating.rating))

schema = StructType([StructField("userId", StringType(), True),
                     StructField("movieId", StringType(), True),
                     StructField("prediction", FloatType(), True)])
dfToSave = sqlContext.createDataFrame(predictions, schema)

# dfToSave.write.jdbc(url=jdbcUrl, table=TABLE_RECOMMENDATIONS)
t0 = time()
dfToSave.write.option('driver', 'org.postgresql.Driver').jdbc(jdbcUrl, TABLE_RECOMMENDATIONS, mode='overwrite')
tt = time() - t0
print("Recommendation saved to DB in %s seconds" % round(tt, 3))
def _transform(self, df): import copy from pyspark.sql.types import StructField, StructType from pyspark.ml.linalg import VectorUDT model_pre_predict = self.getModel() deserialize = deserialize_fn() serialize = serialize_fn() serialized_model = serialize(model_pre_predict) input_shapes = self.getInputShapes() label_cols = self.getLabelColumns() output_cols = self.getOutputCols() feature_cols = self.getFeatureColumns() metadata = self._get_metadata() final_output_cols = util.get_output_cols(df.schema, output_cols) def predict(rows): from pyspark import Row from pyspark.ml.linalg import DenseVector, SparseVector model = deserialize(serialized_model) # Perform predictions. for row in rows: fields = row.asDict().copy() # Note: if the col is SparseVector, torch.tensor(col) correctly converts it to a # dense torch tensor. data = [ torch.tensor([row[col]]).reshape(shape) for col, shape in zip(feature_cols, input_shapes) ] with torch.no_grad(): preds = model(*data) if not isinstance(preds, list) and not isinstance( preds, tuple): preds = [preds] for label_col, output_col, pred in zip(label_cols, output_cols, preds): meta = metadata[label_col] col_type = meta['spark_data_type'] # dtype for dense and spark tensor is always np.float64 if col_type == DenseVector: shape = np.prod(pred.shape) flattened_pred = pred.reshape(shape, ) field = DenseVector(flattened_pred) elif col_type == SparseVector: shape = meta['shape'] flattened_pred = pred.reshape(shape, ) nonzero_indices = flattened_pred.nonzero()[0] field = SparseVector(shape, nonzero_indices, flattened_pred[nonzero_indices]) elif pred.shape.numel() == 1: # If the column is scalar type, int, float, etc. value = pred.item() python_type = util.spark_scalar_to_python_type( col_type) if issubclass(python_type, numbers.Integral): value = round(value) field = python_type(value) else: field = DenseVector(pred.reshape(-1)) fields[output_col] = field values = [fields[col] for col in final_output_cols] yield Row(*values) spark0 = SparkSession._instantiatedSession final_output_fields = [] # copy input schema for field in df.schema.fields: final_output_fields.append(copy.deepcopy(field)) # append output schema override_fields = df.limit(1).rdd.mapPartitions( predict).toDF().schema.fields[-len(output_cols):] for name, override, label in zip(output_cols, override_fields, label_cols): # default data type as label type data_type = metadata[label]['spark_data_type']() if type(override.dataType) == VectorUDT: # Override output to vector. This is mainly for torch's classification loss # where label is a scalar but model output is a vector. data_type = VectorUDT() final_output_fields.append( StructField(name=name, dataType=data_type, nullable=True)) final_output_schema = StructType(final_output_fields) pred_rdd = df.rdd.mapPartitions(predict) # Use the schema from previous section to construct the final DF with prediction return spark0.createDataFrame(pred_rdd, schema=final_output_schema)
def run( self, predicted_data_file_path: str, passed_paper_exam_data_file_path: str, send_scout_mail_data_file_path: str, aodrno_to_wjoid_data_file_path: str, applied_data_file_path: str, item_data_file_path: str ) -> bool: """execute.""" # check parameter self.__param_check( predicted_data_file_path, passed_paper_exam_data_file_path, send_scout_mail_data_file_path, aodrno_to_wjoid_data_file_path, applied_data_file_path, item_data_file_path ) # make spark context spark = SparkSession\ .builder\ .appName('aggregate_data')\ .config('spark.sql.crossJoin.enabled', 'true')\ .config('spark.debug.maxToStringFields', 500)\ .getOrCreate() sqlContext = SQLContext(sparkContext=spark.sparkContext, sparkSession=spark) # load predicted data custom_schema = StructType([ StructField('predicted_wjoid', StringType(), True), StructField('predicted_wsfid', StringType(), True), StructField('predicted_score', FloatType(), True), StructField('predicted_aodrno', StringType(), True), StructField('predicted_wsfno', StringType(), True) ]) predicted_df = sqlContext\ .read\ .format('csv')\ .options(header='true')\ .load(predicted_data_file_path, schema=custom_schema)\ .na\ .fill(0.0) # load aodrno to wjoid mapping data custom_schema = StructType([ StructField('mapping_aodrno', StringType(), True), StructField('mapping_aodrid', StringType(), True), StructField('mapping_wjoid', StringType(), True) ]) mapping_df = sqlContext\ .read\ .format('csv')\ .options(header='true')\ .load(aodrno_to_wjoid_data_file_path, schema=custom_schema)\ .na\ .fill(0.0) # join df = predicted_df.join( mapping_df, predicted_df['predicted_aodrno'] == mapping_df['mapping_aodrno'], 'left_outer' ) # delete unnecessary variables del(predicted_df) del(mapping_df) # load passed data custom_schema = StructType([ StructField('passed_astfid', StringType(), True), StructField('passed_aodrid', StringType(), True) ]) passed_df = sqlContext\ .read\ .format('csv')\ .options(header='true')\ .load(passed_paper_exam_data_file_path, schema=custom_schema)\ .na\ .fill(0.0) # join df = df.join( passed_df, ( (df['mapping_aodrid'] == passed_df['passed_aodrid']) & (df['predicted_wsfid'] == passed_df['passed_astfid']) ), 'left_outer' ) # delete unnecessary variables del(passed_df) # load send scout mail data custom_schema = StructType([ StructField('scout_mail_wsfid', StringType(), True), StructField('scout_mail_wjoid', StringType(), True) ]) scout_mail_df = sqlContext\ .read\ .format('csv')\ .options(header='true')\ .load(send_scout_mail_data_file_path, schema=custom_schema)\ .na\ .fill(0.0) # join df = df.join( scout_mail_df, ( (df['mapping_wjoid'] == scout_mail_df['scout_mail_wjoid']) & (df['predicted_wsfid'] == scout_mail_df['scout_mail_wsfid']) ), 'left_outer' ) # delete unnecessary variables del(scout_mail_df) # load applied data custom_schema = StructType([ StructField('applied_astfid', StringType(), True), StructField('applied_aodrid', StringType(), True) ]) applied_df = sqlContext\ .read\ .format('csv')\ .options(header='true')\ .load(applied_data_file_path, schema=custom_schema)\ .na\ .fill(0.0) # join df = df.join( applied_df, ( (df['mapping_aodrid'] == applied_df['applied_aodrid']) & (df['predicted_wsfid'] == applied_df['applied_astfid']) ), 'left_outer' ) # delete unnecessary variables del(applied_df) # load item data items_df = sqlContext\ .read\ .format('csv')\ .options(header='true')\ .load(item_data_file_path)\ .na\ .fill(0.0) # join df = df.join( items_df, df['predicted_aodrno'] == items_df['i_aodrno'], 'left_outer' ) # delete unnecessary variables 
del(items_df) df.cache() # make udf f_column_check = udf(lambda l_1, l_2: self.__column_check(l_1, l_2), StringType()) f_make_area_cd = udf(lambda l: self.__make_area_cd(l), StringType()) # passed check df = df.withColumn( 'passed_paper_exam', f_column_check(df['passed_astfid'], df['passed_aodrid']) ) df = df.withColumn( 'send_scout_mail', f_column_check(df['scout_mail_wsfid'], df['scout_mail_wjoid']) ) df = df.withColumn( 'applied', f_column_check(df['applied_astfid'], df['applied_aodrid']) ) # area code df = df.withColumn('area_cd', f_make_area_cd(df['i_atdfkid'])) df.cache() # aggregate # all data data = {} d = self.__calc_distribution(df) for k in d.keys(): data[k] = [d[k]] # send scout mail tmp_df = df.where(df['send_scout_mail'] == '1') d = self.__calc_distribution(tmp_df) for k in d.keys(): data[k].append(d[k]) # send scout mail -> applied tmp_df = df.where( (df['send_scout_mail'] == '1') & (df['applied'] == '1') ) d = self.__calc_distribution(tmp_df) for k in d.keys(): data[k].append(d[k]) # send scout mail -> passed paper exam tmp_df = df.where( (df['send_scout_mail'] == '1') & (df['passed_paper_exam'] == '1') ) d = self.__calc_distribution(tmp_df) for k in d.keys(): data[k].append(d[k]) # not send scout mail -> applied tmp_df = df.where( (df['send_scout_mail'] != '1') & (df['applied'] == '1') ) d = self.__calc_distribution(tmp_df) for k in d.keys(): data[k].append(d[k]) # not send scout mail -> passed paper exam tmp_df = df.where( (df['send_scout_mail'] != '1') & (df['passed_paper_exam'] == '1') ) d = self.__calc_distribution(tmp_df) for k in d.keys(): data[k].append(d[k]) print('all') for k, v in sorted(data.items()): if int(v[2]) == 0: div_1 = 0 else: div_1 = int(v[3]) / int(v[2]) if int(v[4]) == 0: div_2 = 0 else: div_2 = int(v[5]) / int(v[4]) t = [v[0], v[1], v[2], v[3], div_1, v[4], v[5], div_2] print('%s\t%s' % (k, '\t'.join(map(str, t)))) # count by each item for column in ['i_assyulid', 'area_cd']: print('') print(column) if column == 'i_assyulid': master = ['%02d' % (n) for n in range(1, 16)] elif column == 'area_cd': master = ['2', '3', '4', '5', '6', '7', '8', '0'] else: master = [] data = defaultdict(list) for m in master: tmp_data = {} column_df = df.where(df[column] == m) # send scout mail -> applied tmp_df = column_df.where( (column_df['send_scout_mail'] == '1') & (column_df['applied'] == '1') ) d = self.__calc_distribution(tmp_df) for k in d.keys(): tmp_data[k] = [d[k]] # send scout mail -> passed paper exam tmp_df = column_df.where( (column_df['send_scout_mail'] == '1') & (column_df['passed_paper_exam'] == '1') ) d = self.__calc_distribution(tmp_df) for k in d.keys(): tmp_data[k].append(d[k]) # not send scout mail -> applied tmp_df = column_df.where( (column_df['send_scout_mail'] != '1') & (column_df['applied'] == '1') ) d = self.__calc_distribution(tmp_df) for k in d.keys(): tmp_data[k].append(d[k]) # not send scout mail -> passed paper exam tmp_df = column_df.where( (column_df['send_scout_mail'] != '1') & (column_df['passed_paper_exam'] == '1') ) d = self.__calc_distribution(tmp_df) for k in d.keys(): tmp_data[k].append(d[k]) for k, v in tmp_data.items(): if int(v[0]) == 0: div_1 = 0 else: div_1 = int(v[1]) / int(v[0]) if int(v[2]) == 0: div_2 = 0 else: div_2 = int(v[3]) / int(v[2]) data[k].append(v[0]) data[k].append(v[1]) data[k].append(div_1) data[k].append(v[2]) data[k].append(v[3]) data[k].append(div_2) for k, v in sorted(data.items()): print('%s\t%s' % (k, '\t'.join(map(str, v)))) # delete unnecessary variables del(column_df) # job with experience data = 
defaultdict(list) i = 1 while i <= 2: if i == 1: must_assyulid_df = df.where(df['i_must_assyulid'].isNotNull()) elif i == 2: must_assyulid_df = df.where(df['i_must_assyulid'].isNull()) else: break # send scout mail -> applied tmp_df = must_assyulid_df.where( (must_assyulid_df['send_scout_mail'] == '1') & (must_assyulid_df['applied'] == '1') ) d = self.__calc_distribution(tmp_df) for k in d.keys(): data[k].append(d[k]) # send scout mail -> passed paper exam tmp_df = must_assyulid_df.where( (must_assyulid_df['send_scout_mail'] == '1') & (must_assyulid_df['passed_paper_exam'] == '1') ) d = self.__calc_distribution(tmp_df) for k in d.keys(): data[k].append(d[k]) # not send scout mail -> applied tmp_df = must_assyulid_df.where( (must_assyulid_df['send_scout_mail'] != '1') & (must_assyulid_df['applied'] == '1') ) d = self.__calc_distribution(tmp_df) for k in d.keys(): data[k].append(d[k]) # not send scout mail -> passed paper exam tmp_df = must_assyulid_df.where( (must_assyulid_df['send_scout_mail'] != '1') & (must_assyulid_df['passed_paper_exam'] == '1') ) d = self.__calc_distribution(tmp_df) for k in d.keys(): data[k].append(d[k]) i += 1 print('') print('job with experience') for k, v in sorted(data.items()): if int(v[0]) == 0: div_1 = 0 else: div_1 = int(v[1]) / int(v[0]) if int(v[2]) == 0: div_2 = 0 else: div_2 = int(v[3]) / int(v[2]) if int(v[4]) == 0: div_3 = 0 else: div_3 = int(v[5]) / int(v[4]) if int(v[6]) == 0: div_4 = 0 else: div_4 = int(v[7]) / int(v[6]) t = [v[0], v[1], div_1, v[2], v[3], div_2, v[4], v[5], div_3, v[6], v[7], div_4] print('%s\t%s' % (k, '\t'.join(map(str, t)))) # delete unnecessary variables del(must_assyulid_df) return True
    ArrayType,
    BooleanType,
    DoubleType,
    LongType,
    StringType,
    StructField,
    StructType,
)

HN_ITEMS_SCHEMA = StructType([
    StructField("id", LongType()),
    StructField("parent", DoubleType()),
    StructField("time", LongType()),
    StructField("type", StringType()),
    StructField("by", StringType()),
    StructField("text", StringType()),
    StructField("kids", ArrayType(LongType())),
    StructField("dead", BooleanType()),
    StructField("score", DoubleType()),
    StructField("title", StringType()),
    StructField("descendants", DoubleType()),
    StructField("url", StringType()),
])

ITEM_FIELD_NAMES = [field.name for field in HN_ITEMS_SCHEMA.fields]


@asset(
    io_manager_key="parquet_io_manager",
    required_resource_keys={"hn_client"},
    description="Items from the Hacker News API: each is a story or a comment on a story.",
def main(iso_date, base_path): APP_NAME = "make_predictions.py" # If there is no SparkSession, create the environment try: sc and spark except NameError as e: import findspark findspark.init() import pyspark import pyspark.sql sc = pyspark.SparkContext() spark = pyspark.sql.SparkSession(sc).builder.appName( APP_NAME).getOrCreate() # # Load each and every model in the pipeline # # Load the arrival delay bucketizer from pyspark.ml.feature import Bucketizer arrival_bucketizer_path = "{}/models/arrival_bucketizer.bin".format( base_path) arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path) # Load the departure delay bucketizer departure_bucketizer_path = "{}/models/departure_bucketizer.bin".format( base_path) departure_bucketizer = Bucketizer.load(departure_bucketizer_path) # Load all the string field vectorizer pipelines into a dict from pyspark.ml import PipelineModel string_vectorizer_pipeline_models = {} for column in [ "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin", "Dest", "FlightNum", "DepDelayBucket" ]: string_pipeline_model_path = "{}/models/string_indexer_pipeline_model_{}.bin".format( base_path, column) string_pipeline_model = PipelineModel.load(string_pipeline_model_path) string_vectorizer_pipeline_models[column] = string_pipeline_model # Load the numeric vector assembler from pyspark.ml.feature import VectorAssembler vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format( base_path) vector_assembler = VectorAssembler.load(vector_assembler_path) # Load the final assembler final_assembler_path = "{}/models/final_vector_assembler.bin".format( base_path) final_assembler = VectorAssembler.load(final_assembler_path) # Load the classifier model from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.bin".format( base_path) rfc = RandomForestClassificationModel.load(random_forest_model_path) # # Run the requests through the transformations from training # # Get today and tomorrow's dates as iso strings to scope query today_dt = iso8601.parse_date(iso_date) rounded_today = today_dt.date() iso_today = rounded_today.isoformat() # Build the day's input path: a date based primary key directory structure today_input_path = "{}/data/prediction_tasks_daily.json/{}".format( base_path, iso_today) from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType from pyspark.sql.types import StructType, StructField schema = StructType([ StructField("Carrier", StringType(), True), StructField("DayOfMonth", IntegerType(), True), StructField("DayOfWeek", IntegerType(), True), StructField("DayOfYear", IntegerType(), True), StructField("DepDelay", DoubleType(), True), StructField("Dest", StringType(), True), StructField("Distance", DoubleType(), True), StructField("FlightDate", DateType(), True), StructField("FlightNum", StringType(), True), StructField("Origin", StringType(), True), StructField("Timestamp", TimestampType(), True), ]) prediction_requests = spark.read.json(today_input_path, schema=schema) prediction_requests.show() # Bucketize the departure and arrival delays for classification ml_bucketized_features = departure_bucketizer.transform( prediction_requests) # Check the buckets ml_bucketized_features.select("DepDelay", "DepDelayBucket").show() # Vectorize string fields with the corresponding pipeline for that column # Turn category fields into categoric feature vectors, then drop intermediate fields for column 
in [ "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin", "Dest", "FlightNum", "DepDelayBucket" ]: string_pipeline_path = "{}/models/string_indexer_pipeline_{}.bin".format( base_path, column) string_pipeline_model = string_vectorizer_pipeline_models[column] ml_bucketized_features = string_pipeline_model.transform( ml_bucketized_features) ml_bucketized_features = ml_bucketized_features.drop(column + "_index") # Vectorize numeric columns ml_bucketized_features = vector_assembler.transform(ml_bucketized_features) # Drop the original numeric columns numeric_columns = ["DepDelay", "Distance"] # Combine various features into one feature vector, 'features' final_vectorized_features = final_assembler.transform( ml_bucketized_features) final_vectorized_features.show() # Drop the individual vector columns feature_columns = [ "Carrier_vec", "DayOfMonth_vec", "DayOfWeek_vec", "DayOfYear_vec", "Origin_vec", "Dest_vec", "FlightNum_vec", "DepDelayBucket_vec", "NumericFeatures_vec" ] for column in feature_columns: final_vectorized_features = final_vectorized_features.drop(column) # Inspect the finalized features final_vectorized_features.show() # Make the prediction predictions = rfc.transform(final_vectorized_features) # Drop the features vector and prediction metadata to give the original fields predictions = predictions.drop("Features_vec") final_predictions = predictions.drop("indices").drop("values").drop( "rawPrediction").drop("probability") # Inspect the output final_predictions.show() # Build the day's output path: a date based primary key directory structure today_output_path = "{}/data/prediction_results_daily.json/{}".format( base_path, iso_today) # Save the output to its daily bucket final_predictions.repartition(1).write.mode("overwrite").json( today_output_path)
schema = StructType() \
    .add("conversationTitle", "string") \
    .add("tapeNumber", "string") \
    .add("conversationNumber", "string") \
    .add("identifier", "string") \
    .add("startDateTime", "string") \
    .add("endDateTime", "string") \
    .add("startDate", "string") \
    .add("startTime", "string") \
    .add("endDate", "string") \
    .add("endTime", "string") \
    .add("dateCertainty", "string") \
    .add("timeCertainty", "string") \
    .add("participants", "string") \
    .add("description", "string") \
    .add("locationCode", "string") \
    .add("recordingDevice", "string") \
    .add("latitudeEstimated", "string") \
    .add("longitudeEstimated", "string") \
    .add("collection", "string") \
    .add("collectionURL", "string") \
    .add("chronCode", "string") \
    .add("chronRelease", "string") \
    .add("chronReleaseDate", "string") \
    .add("aogpRelease", "string") \
    .add("aogpSegment", "string") \
    .add("digitalAccess", "string") \
    .add("physicalAccess", "string") \
    .add("contactEmail", "string") \
    .add("lastModified", "string") \
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, to_json, col, unbase64, base64, split, expr
from pyspark.sql.types import StructField, StructType, StringType, BooleanType, ArrayType, DateType, FloatType

# TO-DO: create a StructType for the Kafka redis-server topic which has all changes made to Redis
# - before Spark 3.0.0, schema inference is not automatic
redisMessageSchema = StructType([
    StructField('key', StringType()),
    StructField('value', StringType()),
    StructField('expiredType', StringType()),
    StructField('expiredValue', StringType()),
    StructField('existType', StringType()),
    StructField('ch', StringType()),
    StructField('incr', BooleanType()),
    StructField(
        'zSetEntries',
        ArrayType(
            StructType([
                StructField('element', StringType()),
                StructField('score', StringType())
            ])))
])

# TO-DO: create a StructType for the Customer JSON that comes from Redis
# - before Spark 3.0.0, schema inference is not automatic
customerJSONSchema = StructType([
    StructField('customerName', StringType()),
    StructField('email', StringType()),
    StructField('phone', StringType()),
    StructField('birthDay', StringType())
])

# TO-DO: create a StructType for the Kafka stedi-events topic which has the Customer Risk JSON
# that comes from Redis - before Spark 3.0.0, schema inference is not automatic
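# Hedged downstream sketch (not part of the original exercise): once the Kafka source is wired up,
# the base64-encoded customer record inside zSetEntries can be decoded and parsed against
# customerJSONSchema with from_json; the bootstrap server address below is illustrative.
spark = SparkSession.builder.appName("redis-customer-sketch").getOrCreate()
redisRawDF = spark.readStream.format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "redis-server") \
    .load()
customerDF = redisRawDF.selectExpr("CAST(value AS STRING) AS value") \
    .withColumn("value", from_json("value", redisMessageSchema)) \
    .selectExpr("value.zSetEntries[0].element AS encodedCustomer") \
    .withColumn("customer", from_json(unbase64("encodedCustomer").cast("string"), customerJSONSchema))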
def write_schema(src_name):
    """ define outgoing file's schema """
    return {
        'yellow': StructType([
            StructField('VendorID', StringType(), True),
            StructField('tpep_pickup_datetime', TimestampType(), True),
            StructField('tpep_dropoff_datetime', TimestampType(), True),
            StructField('passenger_count', IntegerType(), True),
            StructField('trip_distance', DecimalType(10, 4), True),
            StructField('RatecodeID', StringType(), True),
            StructField('store_and_fwd_flag', StringType(), True),
            StructField('PULocationID', StringType(), True),
            StructField('DOLocationID', StringType(), True),
            StructField('payment_type', StringType(), True),
            StructField('fare_amount', DecimalType(10, 2), True),
            StructField('extra', DecimalType(10, 2), True),
            StructField('mta_tax', DecimalType(10, 2), True),
            StructField('tip_amount', DecimalType(10, 2), True),
            StructField('tolls_amount', DecimalType(10, 2), True),
            StructField('improvement_surcharge', DecimalType(10, 2), True),
            StructField('total_amount', DecimalType(10, 2), True),
            StructField('congestion_surcharge', DecimalType(10, 2), True),
            StructField('file_name', StringType(), True),
            StructField('update_date', TimestampType(), True),
            StructField('dt', StringType(), True)
        ]),
        'green': StructType([
            StructField('VendorID', StringType(), True),
            StructField('lpep_pickup_datetime', TimestampType(), True),
            StructField('lpep_dropoff_datetime', TimestampType(), True),
            StructField('store_and_fwd_flag', StringType(), True),
            StructField('RatecodeID', StringType(), True),
            StructField('PULocationID', StringType(), True),
            StructField('DOLocationID', StringType(), True),
            StructField('passenger_count', IntegerType(), True),
            StructField('trip_distance', DecimalType(10, 4), True),
            StructField('fare_amount', DecimalType(10, 2), True),
            StructField('extra', DecimalType(10, 2), True),
            StructField('mta_tax', DecimalType(10, 2), True),
            StructField('tip_amount', DecimalType(10, 2), True),
            StructField('tolls_amount', DecimalType(10, 2), True),
            StructField('ehail_fee', DecimalType(10, 2), True),
            StructField('improvement_surcharge', DecimalType(10, 2), True),
            StructField('total_amount', DecimalType(10, 2), True),
            StructField('payment_type', StringType(), True),
            StructField('trip_type', StringType(), True),
            StructField('congestion_surcharge', DecimalType(10, 2), True),
            StructField('file_name', StringType(), True),
            StructField('update_date', TimestampType(), True),
            StructField('dt', StringType(), True)
        ]),
        'corrupted': StructType([
            StructField('corrupted', StringType(), True),
            StructField('file_name', StringType(), True),
            StructField('update_date', TimestampType(), True)
        ])
    }[src_name]
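# Hedged usage sketch: write_schema() keys the outgoing layout by source name, so a writer can
# request it per dataset; the session and empty-frame usage below are illustrative only.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("write-schema-sketch").getOrCreate()
empty_yellow = spark.createDataFrame([], schema=write_schema('yellow'))
empty_yellow.printSchema()   # typed yellow layout plus file_name / update_date / dt metadata columns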
sc = SparkContext('local',"task2") #rdd1 = sc.textFile("shape_stat.csv") #rdd2 = sc.textFile("shape.csv") rdd1 = sc.textFile(data2) rdd2 = sc.textFile(data1) mappedRdd1 = rdd1.map(lambda x: x.split(',')) mappedRdd2 = rdd2.map(lambda x: x.split(',')) spark = SparkSession(sc) header=mappedRdd1.first() fields = [StructField(field_name, StringType(), True) for field_name in header] schema = StructType(fields) frdd1=mappedRdd1.filter(lambda row:row != header) df1=spark.createDataFrame(frdd1,schema=schema) header=mappedRdd2.first() fields = [StructField(field_name, StringType(), True) for field_name in header] schema = StructType(fields) frdd2=mappedRdd2.filter(lambda row:row != header) df2=spark.createDataFrame(frdd2,schema=schema) df1=df1[df1.word==word] df2=df2[df2.word==word] df1=df1.repartitionByRange(100,'key_id') df2=df2.repartitionByRange(100,'key_id')
# https://medium.com/@nitingupta.bciit/apache-spark-streaming-from-text-file-fa40155b7a21
# https://hackersandslackers.com/structured-streaming-in-pyspark/
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType

spark = SparkSession.builder.appName("Streaming").config('').getOrCreate()

schema_1 = StructType([
    StructField('Athlete', StringType(), True),
    StructField('Age', IntegerType(), True),
    StructField('Country', StringType(), True),
    StructField('Year', IntegerType(), True),
    StructField('Closing Ceremony Date', TimestampType(), True),
    StructField('Sport', StringType(), True),
    StructField('Medal and Number', StringType(), True),
])

activityDataSample = 'folder_path'

# don't use maxFilesPerTrigger = 1 in production
streamingDF = (spark.readStream.schema(schema_1).option(
    "maxFilesPerTrigger", 1).csv(activityDataSample))

# keep only rows whose Country field is populated
streamingDF = streamingDF.filter((streamingDF.Country != 'null')
                                 & (streamingDF.Country != 'Null')
                                 & (streamingDF.Country != ''))

streamingActionCountsDF = streamingDF.groupBy('Country').count()
print(streamingActionCountsDF.isStreaming)  # as we are testing it in standalone
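# Hedged testing sketch (illustrative query name): in a standalone test the streaming counts can
# be materialized through the in-memory sink and inspected with plain SQL.
query = (streamingActionCountsDF.writeStream
         .format("memory")
         .queryName("country_counts")
         .outputMode("complete")
         .start())
query.processAllAvailable()
spark.sql("SELECT * FROM country_counts ORDER BY count DESC").show()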
schema = StructType([
    StructField(
        "results",
        ArrayType(
            StructType([
                StructField(
                    "user",
                    StructType([
                        StructField("gender", StringType()),
                        StructField(
                            "name",
                            StructType([
                                StructField("title", StringType()),
                                StructField("first", StringType()),
                                StructField("last", StringType())
                            ])),
                        StructField(
                            "location",
                            StructType([
                                StructField("street", StringType()),
                                StructField("city", StringType()),
                                StructField("state", StringType()),
                                StructField("zip", StringType())
                            ])),
                        StructField("email", StringType()),
                        StructField("username", StringType()),
                        StructField("password", StringType()),
                        StructField("salt", StringType()),
                        StructField("md5", StringType()),
                        StructField("sha1", StringType()),
                        StructField("sha256", StringType()),
                        StructField("registered", StringType()),
                        StructField("dob", StringType()),
                        StructField("phone", StringType()),
                        StructField("cell", StringType()),
                        StructField(
                            "picture",
                            StructType([
                                StructField("large", StringType()),
                                StructField("medium", StringType()),
                                StructField("thumbnail", StringType())
                            ]))
                    ]))
            ]))),
    StructField("nationality", StringType()),
    StructField("seed", StringType()),
    StructField("version", StringType())
])
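# Hedged usage sketch (illustrative file name): applying the nested schema above to
# randomuser-style JSON and flattening the results array with explode into plain columns.
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col

spark = SparkSession.builder.appName("nested-schema-sketch").getOrCreate()
users = spark.read.schema(schema).json("randomuser.json") \
    .select(explode(col("results")).alias("r"), "nationality") \
    .select(col("r.user.name.first").alias("first_name"),
            col("r.user.name.last").alias("last_name"),
            col("r.user.email").alias("email"),
            "nationality")
users.show(truncate=False)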
def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True): """ Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`. When ``schema`` is a list of column names, the type of each column will be inferred from ``data``. When ``schema`` is ``None``, it will try to infer the schema (column names and types) from ``data``, which should be an RDD of :class:`Row`, or :class:`namedtuple`, or :class:`dict`. When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match the real data, or an exception will be thrown at runtime. If the given schema is not :class:`pyspark.sql.types.StructType`, it will be wrapped into a :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value", each record will also be wrapped into a tuple, which can be converted to row later. If schema inference is needed, ``samplingRatio`` is used to determined the ratio of rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``. :param data: an RDD of any kind of SQL data representation(e.g. row, tuple, int, boolean, etc.), or :class:`list`, or :class:`pandas.DataFrame`. :param schema: a :class:`pyspark.sql.types.DataType` or a datatype string or a list of column names, default is ``None``. The data type string format equals to :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use ``int`` as a short name for ``IntegerType``. :param samplingRatio: the sample ratio of rows used for inferring :param verifySchema: verify data types of every row against schema. :return: :class:`DataFrame` .. versionchanged:: 2.1 Added verifySchema. >>> l = [('Alice', 1)] >>> spark.createDataFrame(l).collect() [Row(_1=u'Alice', _2=1)] >>> spark.createDataFrame(l, ['name', 'age']).collect() [Row(name=u'Alice', age=1)] >>> d = [{'name': 'Alice', 'age': 1}] >>> spark.createDataFrame(d).collect() [Row(age=1, name=u'Alice')] >>> rdd = sc.parallelize(l) >>> spark.createDataFrame(rdd).collect() [Row(_1=u'Alice', _2=1)] >>> df = spark.createDataFrame(rdd, ['name', 'age']) >>> df.collect() [Row(name=u'Alice', age=1)] >>> from pyspark.sql import Row >>> Person = Row('name', 'age') >>> person = rdd.map(lambda r: Person(*r)) >>> df2 = spark.createDataFrame(person) >>> df2.collect() [Row(name=u'Alice', age=1)] >>> from pyspark.sql.types import * >>> schema = StructType([ ... StructField("name", StringType(), True), ... StructField("age", IntegerType(), True)]) >>> df3 = spark.createDataFrame(rdd, schema) >>> df3.collect() [Row(name=u'Alice', age=1)] >>> spark.createDataFrame(df.toPandas()).collect() # doctest: +SKIP [Row(name=u'Alice', age=1)] >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP [Row(0=1, 1=2)] >>> spark.createDataFrame(rdd, "a: string, b: int").collect() [Row(a=u'Alice', b=1)] >>> rdd = rdd.map(lambda row: row[1]) >>> spark.createDataFrame(rdd, "int").collect() [Row(value=1)] >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... Py4JJavaError: ... 
""" if isinstance(data, DataFrame): raise TypeError("data is already a DataFrame") if isinstance(schema, basestring): schema = _parse_datatype_string(schema) elif isinstance(schema, (list, tuple)): # Must re-encode any unicode strings to be consistent with StructField names schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema] try: import pandas has_pandas = True except Exception: has_pandas = False if has_pandas and isinstance(data, pandas.DataFrame): from pyspark.sql.utils import require_minimum_pandas_version require_minimum_pandas_version() if self.conf.get("spark.sql.execution.pandas.respectSessionTimeZone").lower() \ == "true": timezone = self.conf.get("spark.sql.session.timeZone") else: timezone = None # If no schema supplied by user then get the names of columns only if schema is None: schema = [str(x) if not isinstance(x, basestring) else (x.encode('utf-8') if not isinstance(x, str) else x) for x in data.columns] if self.conf.get("spark.sql.execution.arrow.enabled", "false").lower() == "true" \ and len(data) > 0: try: return self._create_from_pandas_with_arrow(data, schema, timezone) except Exception as e: warnings.warn("Arrow will not be used in createDataFrame: %s" % str(e)) # Fallback to create DataFrame without arrow if raise some exception data = self._convert_from_pandas(data, schema, timezone) if isinstance(schema, StructType): verify_func = _make_type_verifier(schema) if verifySchema else lambda _: True def prepare(obj): verify_func(obj) return obj elif isinstance(schema, DataType): dataType = schema schema = StructType().add("value", schema) verify_func = _make_type_verifier( dataType, name="field value") if verifySchema else lambda _: True def prepare(obj): verify_func(obj) return obj, else: prepare = lambda obj: obj if isinstance(data, RDD): rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio) else: rdd, schema = self._createFromLocal(map(prepare, data), schema) jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd()) jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json()) df = DataFrame(jdf, self._wrapped) df._schema = schema return df
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

spark = SparkSession.builder.appName("total-amount-spent").getOrCreate()

schema = StructType([
    StructField("customerID", IntegerType(), True),
    StructField("itemID", IntegerType(), True),
    StructField("amountSpent", FloatType(), True)
])

df = spark.read.schema(schema).csv("file:///SparkCourse/customer-orders.csv")
df.printSchema()

customers = df.select("customerID", "amountSpent")
amountSpent = customers.groupBy("customerID").sum("amountSpent")
amountSpentRounded = amountSpent.withColumn(
    "amountSpent",
    func.round(func.col("sum(amountSpent)"), 2)).select("customerID", "amountSpent").sort("amountSpent")

results = amountSpentRounded.collect()
for result in results:
    print(result[0], result[1])

spark.stop()
@F.udf(returnType=IntegerType())
def boro_to_borocode(x):
    '''Convert borough string to borocode (1 Manhattan, 2 Bronx, 3 Brooklyn, 4 Queens, 5 Staten Island)'''
    return {"NY": 1, "MN": 1, "BX": 2, "BRONX": 2,
            "BK": 3, "K": 3, "KINGS": 3, "KING": 3, "BKLYN": 3,
            "Q": 4, "QUEEN": 4, "QN": 4, "QNS": 4, "QU": 4,
            "ST": 5, "SI": 5, "R": 5, "RICHMOND": 5}.get(x, None)


@F.udf(returnType=StringType())
def trim_street(x):
    '''Remove repeating whitespaces'''
    return ' '.join(x.upper().split())


count_schema = StructType([StructField('PHYSICALID', IntegerType(), True),
                           StructField('year', IntegerType(), True),
                           StructField('count', IntegerType(), True)])

# get the violation data into the dataframe
# filter for null values and make sure the home address has the right format
# extract the year from the issue date and make sure the year is in the 2015 to 2019 range
# convert boro to borocode and only let NYC boroughs through by using the null filter
# trim the street name of any extra white spaces and collect violations based on a common street name and borough.
# The violations collection row is called `violations`
df_violations = csv_df(sqlContext, os.path.join(sys.argv[1] if len(sys.argv) > 1 else "nyc_parking_violation", "*.csv"))\
    .select("Issue Date", "Street Name", "House Number", "Violation County").dropna()\
    .filter(F.col("House Number").rlike("^(\d+)((-(\d+))*)$"))\
    .withColumn("year", date_to_year(F.col("Issue Date")))\
    .filter("2015 <= year and year <= 2019")\
    .withColumn("BOROCODE", boro_to_borocode(F.col("Violation County")))\
    .dropna(subset=["BOROCODE"])\
import unittest

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

from pipeline import pipeline

message_schema = StructType([StructField("name", StringType())])


class Test(unittest.TestCase):
    def test_pipeline(self) -> None:
        spark = _spark()
        data = spark.readStream.json(path="test_data/", schema=message_schema)
        stream = pipeline.process(data)
        stream.processAllAvailable()
        stream.stop()

        self.assertEqual(
            2,
            spark.sql("SELECT COUNT(1) AS count FROM example").first()["count"])
        self.assertEqual(
            1,
            spark.sql("SELECT COUNT(1) AS count FROM example WHERE name = 'a'").first()["count"])
        self.assertEqual(
            1,
            spark.sql("SELECT COUNT(1) AS count FROM example WHERE name = 'b'").first()["count"])
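# Hedged sketch of the _spark() helper the test above relies on (its definition is not shown in
# this snippet); a small local session is a common choice for streaming unit tests.
def _spark() -> SparkSession:
    return (SparkSession.builder
            .master("local[1]")
            .appName("pipeline-test")
            .config("spark.sql.shuffle.partitions", "1")
            .getOrCreate())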
secret = open(app_secrets_path)
app_secret = yaml.load(secret, Loader=yaml.FullLoader)

# Setup spark to use s3
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", app_secret["s3_conf"]["access_key"])
hadoop_conf.set("fs.s3a.secret.key", app_secret["s3_conf"]["secret_access_key"])

print("\nCreating dataframe ingestion CSV file using 'SparkSession.read.format()'")

fin_schema = StructType() \
    .add("id", IntegerType(), True) \
    .add("has_debt", BooleanType(), True) \
    .add("has_financial_dependents", BooleanType(), True) \
    .add("has_student_loans", BooleanType(), True) \
    .add("income", DoubleType(), True)

fin_df = spark.read \
    .option("header", "false") \
    .option("delimiter", ",") \
    .format("csv") \
    .schema(fin_schema) \
    .load("s3a://" + app_conf["s3_conf"]["s3_bucket"] + "/finances.csv")

fin_df.printSchema()
fin_df.show()

print("Creating dataframe ingestion CSV file using 'SparkSession.read.csv()',"
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, max, struct
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType

spark = SparkSession\
    .builder\
    .appName("BigData")\
    .getOrCreate()

schema = StructType([
    StructField("id", StringType(), True),
    StructField("language", StringType(), True),
    StructField("date", StringType(), True),
    StructField("source", StringType(), True),
    StructField("len", StringType(), True),
    StructField("likes", StringType(), True),
    StructField("RTs", StringType(), True),
    StructField("Hashtags", StringType(), True),
    StructField("usermentionnames", StringType(), True),
    StructField("usermentionid", StringType(), True),
    StructField("name", StringType(), True),
    StructField("place", StringType(), True),
    StructField("followers", DoubleType(), True),
    StructField("friends", DoubleType(), True),
])

lines = spark \
    .readStream \
    .option("sep", ";") \
    .schema(schema) \
    .csv("hdfs://localhost:8080/stream")

tags = lines.select("Hashtags")
h = tags.select(explode(split("Hashtags", ",")).alias("Hashtags"))
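# Hedged continuation sketch (not from the original script): count the exploded hashtags and
# stream the running totals to the console sink; the output options below are illustrative.
hashtag_counts = h.groupBy("Hashtags").count()
query = hashtag_counts.writeStream \
    .outputMode("complete") \
    .format("console") \
    .option("truncate", "false") \
    .start()
query.awaitTermination()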
def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True): """ Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`. When ``schema`` is a list of column names, the type of each column will be inferred from ``data``. When ``schema`` is ``None``, it will try to infer the schema (column names and types) from ``data``, which should be an RDD of :class:`Row`, or :class:`namedtuple`, or :class:`dict`. When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match the real data, or an exception will be thrown at runtime. If the given schema is not :class:`pyspark.sql.types.StructType`, it will be wrapped into a :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value", each record will also be wrapped into a tuple, which can be converted to row later. If schema inference is needed, ``samplingRatio`` is used to determined the ratio of rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``. :param data: an RDD of any kind of SQL data representation(e.g. row, tuple, int, boolean, etc.), or :class:`list`, or :class:`pandas.DataFrame`. :param schema: a :class:`pyspark.sql.types.DataType` or a datatype string or a list of column names, default is ``None``. The data type string format equals to :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use ``int`` as a short name for ``IntegerType``. :param samplingRatio: the sample ratio of rows used for inferring :param verifySchema: verify data types of every row against schema. :return: :class:`DataFrame` .. versionchanged:: 2.1 Added verifySchema. .. note:: Usage with spark.sql.execution.arrow.enabled=True is experimental. >>> l = [('Alice', 1)] >>> spark.createDataFrame(l).collect() [Row(_1=u'Alice', _2=1)] >>> spark.createDataFrame(l, ['name', 'age']).collect() [Row(name=u'Alice', age=1)] >>> d = [{'name': 'Alice', 'age': 1}] >>> spark.createDataFrame(d).collect() [Row(age=1, name=u'Alice')] >>> rdd = sc.parallelize(l) >>> spark.createDataFrame(rdd).collect() [Row(_1=u'Alice', _2=1)] >>> df = spark.createDataFrame(rdd, ['name', 'age']) >>> df.collect() [Row(name=u'Alice', age=1)] >>> from pyspark.sql import Row >>> Person = Row('name', 'age') >>> person = rdd.map(lambda r: Person(*r)) >>> df2 = spark.createDataFrame(person) >>> df2.collect() [Row(name=u'Alice', age=1)] >>> from pyspark.sql.types import * >>> schema = StructType([ ... StructField("name", StringType(), True), ... StructField("age", IntegerType(), True)]) >>> df3 = spark.createDataFrame(rdd, schema) >>> df3.collect() [Row(name=u'Alice', age=1)] >>> spark.createDataFrame(df.toPandas()).collect() # doctest: +SKIP [Row(name=u'Alice', age=1)] >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP [Row(0=1, 1=2)] >>> spark.createDataFrame(rdd, "a: string, b: int").collect() [Row(a=u'Alice', b=1)] >>> rdd = rdd.map(lambda row: row[1]) >>> spark.createDataFrame(rdd, "int").collect() [Row(value=1)] >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... Py4JJavaError: ... 
""" SparkSession._activeSession = self self._jvm.SparkSession.setActiveSession(self._jsparkSession) if isinstance(data, DataFrame): raise TypeError("data is already a DataFrame") if isinstance(schema, basestring): schema = _parse_datatype_string(schema) elif isinstance(schema, (list, tuple)): # Must re-encode any unicode strings to be consistent with StructField names schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema] try: import pandas has_pandas = True except Exception: has_pandas = False if has_pandas and isinstance(data, pandas.DataFrame): from pyspark.sql.utils import require_minimum_pandas_version require_minimum_pandas_version() if self._wrapped._conf.pandasRespectSessionTimeZone(): timezone = self._wrapped._conf.sessionLocalTimeZone() else: timezone = None # If no schema supplied by user then get the names of columns only if schema is None: schema = [str(x) if not isinstance(x, basestring) else (x.encode('utf-8') if not isinstance(x, str) else x) for x in data.columns] if self._wrapped._conf.arrowEnabled() and len(data) > 0: try: return self._create_from_pandas_with_arrow(data, schema, timezone) except Exception as e: from pyspark.util import _exception_message if self._wrapped._conf.arrowFallbackEnabled(): msg = ( "createDataFrame attempted Arrow optimization because " "'spark.sql.execution.arrow.enabled' is set to true; however, " "failed by the reason below:\n %s\n" "Attempting non-optimization as " "'spark.sql.execution.arrow.fallback.enabled' is set to " "true." % _exception_message(e)) warnings.warn(msg) else: msg = ( "createDataFrame attempted Arrow optimization because " "'spark.sql.execution.arrow.enabled' is set to true, but has reached " "the error below and will not continue because automatic fallback " "with 'spark.sql.execution.arrow.fallback.enabled' has been set to " "false.\n %s" % _exception_message(e)) warnings.warn(msg) raise data = self._convert_from_pandas(data, schema, timezone) if isinstance(schema, StructType): verify_func = _make_type_verifier(schema) if verifySchema else lambda _: True def prepare(obj): verify_func(obj) return obj elif isinstance(schema, DataType): dataType = schema schema = StructType().add("value", schema) verify_func = _make_type_verifier( dataType, name="field value") if verifySchema else lambda _: True def prepare(obj): verify_func(obj) return obj, else: prepare = lambda obj: obj if isinstance(data, RDD): rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio) else: rdd, schema = self._createFromLocal(map(prepare, data), schema) jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd()) jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json()) df = DataFrame(jdf, self._wrapped) df._schema = schema return df
def bluetooth_encounter(data, st: datetime, et: datetime, distance_threshold=12, n_rows_threshold=8, time_threshold=10 * 60, ltime=True):
    """
    :param data: Input DataStream
    :param st: Start time of the time window in UTC
    :param et: End time of the time window in UTC
    :param distance_threshold: Threshold on mean distance per encounter
    :param n_rows_threshold: Minimum number of rows per group/encounter
    :param time_threshold: Minimum duration (in seconds) of an encounter
    :param ltime: If True, filter the window on localtime; otherwise on timestamp
    :return: A sparse representation of the Bluetooth encounters
    """
    schema = StructType([
        StructField('timestamp', TimestampType()),
        StructField('localtime', TimestampType()),
        StructField('start_time', TimestampType()),
        StructField('end_time', TimestampType()),
        StructField('user', StringType()),
        StructField('version', IntegerType()),
        StructField('latitude', DoubleType()),
        StructField('distances', ArrayType(DoubleType())),
        StructField('longitude', DoubleType()),
        StructField('average_count', DoubleType()),
        StructField('major', LongType()),
        StructField('minor', LongType())
    ])

    output_columns = ['user', 'major', 'minor', 'start_time', 'end_time', 'version',
                      'distances', 'timestamp', 'localtime', 'latitude', 'longitude',
                      'average_count']

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def get_encounters(data):
        # Too few observations in this (user, major, minor, version) group: no encounter.
        if data.shape[0] < n_rows_threshold:
            return pd.DataFrame([], columns=output_columns)
        data = data.sort_values('time').reset_index(drop=True)
        data_filtered = data[data.distance_estimate < distance_threshold]
        # Not enough close-range rows, or the group does not span the minimum duration.
        if data_filtered.shape[0] < n_rows_threshold or \
                data_filtered['time'].max() - data_filtered['time'].min() < time_threshold:
            return pd.DataFrame([], columns=output_columns)
        else:
            data_all = []
            k = 0
            i = data.shape[0]
            c = 'localtime'
            data_all.append([
                data_filtered['user'].iloc[k],
                data['major'].iloc[k],
                data['minor'].iloc[k],
                data[c].iloc[k],
                data[c].iloc[i - 1],
                data['version'].iloc[k],
                data['distance_estimate'].iloc[k:i].values,
                data['timestamp'].iloc[int((i + k) / 2)],
                data['localtime'].iloc[int((i + k) / 2)],
                np.mean(data['latitude'].values[k:i]),
                np.mean(data['longitude'].values[k:i]),
                np.mean(data['count'].values[k:i])
            ])
            return pd.DataFrame(data_all, columns=output_columns)

    data = data.withColumn('time', F.col('timestamp').cast('double'))
    if ltime:
        data_filtered = data.filter((data.localtime >= F.lit(st)) & (data.localtime < F.lit(et)))
    else:
        data_filtered = data.filter((data.timestamp >= F.lit(st)) & (data.timestamp < F.lit(et)))
    # Drop rows carrying the placeholder longitude value (200).
    data_filtered = data_filtered.filter(data_filtered.longitude != 200)
    data_result = data_filtered.groupBy(['user', 'major', 'minor', 'version']).apply(get_encounters)
    print(data_result.count(), 'encounter count')
    return DataStream(data=data_result, metadata=Metadata())
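A hedged usage sketch: beacon_stream and the window bounds below are hypothetical; the function only assumes the input DataStream exposes the columns referenced above (timestamp, localtime, user, version, latitude, longitude, major, minor, distance_estimate, count).

from datetime import datetime

# Hypothetical input: a DataStream of beacon observations with the columns listed above.
window_start = datetime(2020, 3, 1)
window_end = datetime(2020, 3, 2)

# Returns one row per (user, major, minor, version) group that passes the
# distance, row-count and duration thresholds.
encounters = bluetooth_encounter(beacon_stream, st=window_start, et=window_end,
                                 distance_threshold=12, n_rows_threshold=8,
                                 time_threshold=10 * 60, ltime=True)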
json_obj = json.loads(array_str) for item in json_obj: yield item["a"], item["b"] if __name__ == "__main__": spark = SparkSession \ .builder \ .appName("compute_stats") \ .getOrCreate() csv_schema = StructType([ StructField('InvoiceNo', StringType(), nullable=False), StructField('Customer_ID', StringType(), nullable=False), StructField('Date', StringType(), nullable=False), StructField('Planet', StringType(), nullable=False), StructField('Purchase', StringType(), nullable=False) ]) # json_schema = ArrayType(StructType( [ StructField('ItemNo', StringType(), nullable=False), # StructField('UnitPrice', StringType(), nullable=False), # StructField('Description', StringType(), nullable=False), # StructField('Quantity', StringType(), nullable=False) ] ) ) json_schema = StructType([ StructField( "My_array", ArrayType( StructType([ StructField('ItemNo', StringType(), nullable=True),
# Assumed to follow the same naming pattern as the other dataset paths below.
CONTINENTS_FILE_PATH = '/opt/SparkDatasets/geography/continents.csv'
COUNTRIES_FILE_PATH = '/opt/SparkDatasets/geography/countries.csv'
CITIES_FILE_PATH = '/opt/SparkDatasets/geography/cities.csv'

CONTINENT_STRUCTURE = [('continent_id', 'integer'),
                       ('continent_name', 'string')]
COUNTRY_STRUCTURE = [('country_id', 'integer'),
                     ('continent_id', 'integer'),
                     ('country_name', 'string')]
CITY_STRUCTURE = [('city_id', 'integer'),
                  ('country_id', 'integer'),
                  ('city_name', 'string')]

CONTINENT_SCHEMA = StructType.fromJson(generate_schema_dict(CONTINENT_STRUCTURE))
COUNTRY_SCHEMA = StructType.fromJson(generate_schema_dict(COUNTRY_STRUCTURE))
CITY_SCHEMA = StructType.fromJson(generate_schema_dict(CITY_STRUCTURE))

spark = SparkSession.builder.getOrCreate()

continents_df = generate_dataframe(spark, CONTINENT_SCHEMA, CONTINENTS_FILE_PATH)
countries_df = generate_dataframe(spark, COUNTRY_SCHEMA, COUNTRIES_FILE_PATH)
cities_df = generate_dataframe(spark, CITY_SCHEMA, CITIES_FILE_PATH)

continents_df.registerTempTable('continents')
countries_df.registerTempTable('countries')
cities_df.registerTempTable('cities')

print(continents_df.count())
print(countries_df.count())
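The schemas above are built with a generate_schema_dict helper that is not shown in this excerpt. StructType.fromJson expects Spark's JSON schema layout, so a sketch of what such a helper could look like follows; the helper itself is an assumption, only the dict layout is fixed by Spark:

def generate_schema_dict(structure):
    """Build the dict layout StructType.fromJson expects from a list of
    (column_name, type_name) pairs."""
    return {
        "type": "struct",
        "fields": [
            {"name": name, "type": type_name, "nullable": True, "metadata": {}}
            for name, type_name in structure
        ],
    }

# Example: generate_schema_dict(CONTINENT_STRUCTURE) produces
# {'type': 'struct', 'fields': [
#     {'name': 'continent_id', 'type': 'integer', 'nullable': True, 'metadata': {}},
#     {'name': 'continent_name', 'type': 'string', 'nullable': True, 'metadata': {}}]}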
from pyspark.sql import SparkSession spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate() dataDictionary = [ ('James',{'hair':'black','eye':'brown'}), ('Michael',{'hair':'brown','eye':None}), ('Robert',{'hair':'red','eye':'black'}), ('Washington',{'hair':'grey','eye':'grey'}), ('Jefferson',{'hair':'brown','eye':''}) ] # Using StructType schema from pyspark.sql.types import StructField, StructType, StringType, MapType schema = StructType([ StructField('name', StringType(), True), StructField('properties', MapType(StringType(),StringType()),True) ]) df = spark.createDataFrame(data=dataDictionary, schema = schema) df.printSchema() df.show(truncate=False) df3=df.rdd.map(lambda x: \ (x.name,x.properties["hair"],x.properties["eye"])) \ .toDF(["name","hair","eye"]) df3.printSchema() df3.show() df.withColumn("hair",df.properties.getItem("hair")) \ .withColumn("eye",df.properties.getItem("eye")) \ .drop("properties") \ .show()
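As a further illustration of drilling into the MapType column above, explode() turns each map entry into its own row with key and value columns:

from pyspark.sql.functions import explode

# explode() on a MapType column yields one row per map entry ('key', 'value' columns).
df.select(df.name, explode(df.properties)).show(truncate=False)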
# MAGIC Complex outputs are helpful when you need to return multiple values from a UDF. The pattern is to return a single struct column and then drill down into it to pull out the desired fields.

# COMMAND ----------

# MAGIC %md-sandbox
# MAGIC Start by determining the desired output. This takes the form of a schema: a top-level `StructType` containing one `StructField` per value.
# MAGIC
# MAGIC <img alt="Side Note" title="Side Note" style="vertical-align: text-bottom; position: relative; height:1.75em; top:0.05em; transform:rotate(15deg)" src="https://files.training.databricks.com/static/images/icon-note.webp"/> For a refresher on this, see the lesson **Applying Schemas to JSON Data** in the <a href="https://academy.databricks.com/collections/frontpage/products/etl-part-1-data-extraction/" target="_blank">ETL Part 1 course from Databricks Academy</a>.

# COMMAND ----------

from pyspark.sql.types import DoubleType, StructType, StructField

mathOperationsSchema = StructType([
    StructField("sum", DoubleType(), True),
    StructField("multiplication", DoubleType(), True),
    StructField("division", DoubleType(), True)
])

# COMMAND ----------

# MAGIC %md
# MAGIC Create a function that returns a tuple matching this schema.

# COMMAND ----------

def manual_math(x, y):
    return (float(x + y), float(x * y), x / float(y))

manual_math(1, 2)
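To complete the pattern, a minimal sketch of registering manual_math as a UDF with mathOperationsSchema as its return type and drilling into the resulting struct column, assuming an active SparkSession named spark (as in a Databricks notebook); the DataFrame and column names x and y are illustrative:

# COMMAND ----------

from pyspark.sql.functions import col, udf

# Wrap the function as a UDF whose return type is the struct defined above.
manualMathUDF = udf(manual_math, mathOperationsSchema)

exampleDF = spark.createDataFrame([(1.0, 2.0), (3.0, 4.0)], ["x", "y"])

(exampleDF
  .withColumn("results", manualMathUDF(col("x"), col("y")))   # single struct column
  .select("x", "y", "results.sum", "results.multiplication", "results.division")  # drill down
  .show())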
from pyspark.sql import SparkSession from pyspark.sql.functions import from_json, to_json, col, unbase64, base64, split, expr from pyspark.sql.types import StructField, StructType, StringType, BooleanType, ArrayType, DateType # this is a manually created schema - before Spark 3.0.0, schema inference is not automatic redisMessageSchema = StructType( [ StructField("key", StringType()), StructField("value", StringType()), StructField("expiredType", StringType()), StructField("expiredValue",StringType()), StructField("existType", StringType()), StructField("ch", StringType()), StructField("incr",BooleanType()), StructField("zSetEntries", ArrayType( \ StructType([ StructField("element", StringType()),\ StructField("score", StringType()) \ ])) \ ) ] ) # this is a manually created schema - before Spark 3.0.0, schema inference is not automatic # since we are not using the date or the amount in sql calculations, we are going # to cast them as strings # {"accountNumber":"703934969","location":"spain"} customerLocationSchema = StructType([ StructField("accountNumber", StringType()),
def _create_from_pandas_with_arrow(self, pdf, schema, timezone): """ Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the data types will be used to coerce the data in Pandas to Arrow conversion. """ from distutils.version import LooseVersion from pyspark.serializers import ArrowStreamPandasSerializer from pyspark.sql.types import from_arrow_type, to_arrow_type, TimestampType from pyspark.sql.utils import require_minimum_pandas_version, \ require_minimum_pyarrow_version require_minimum_pandas_version() require_minimum_pyarrow_version() from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype import pyarrow as pa # Create the Spark schema from list of names passed in with Arrow types if isinstance(schema, (list, tuple)): if LooseVersion(pa.__version__) < LooseVersion("0.12.0"): temp_batch = pa.RecordBatch.from_pandas(pdf[0:100], preserve_index=False) arrow_schema = temp_batch.schema else: arrow_schema = pa.Schema.from_pandas(pdf, preserve_index=False) struct = StructType() for name, field in zip(schema, arrow_schema): struct.add(name, from_arrow_type(field.type), nullable=field.nullable) schema = struct # Determine arrow types to coerce data when creating batches if isinstance(schema, StructType): arrow_types = [to_arrow_type(f.dataType) for f in schema.fields] elif isinstance(schema, DataType): raise ValueError("Single data type %s is not supported with Arrow" % str(schema)) else: # Any timestamps must be coerced to be compatible with Spark arrow_types = [to_arrow_type(TimestampType()) if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None for t in pdf.dtypes] # Slice the DataFrame to be batched step = -(-len(pdf) // self.sparkContext.defaultParallelism) # round int up pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step)) # Create list of Arrow (columns, type) for serializer dump_stream arrow_data = [[(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)] for pdf_slice in pdf_slices] jsqlContext = self._wrapped._jsqlContext safecheck = self._wrapped._conf.arrowSafeTypeConversion() col_by_name = True # col by name only applies to StructType columns, can't happen here ser = ArrowStreamPandasSerializer(timezone, safecheck, col_by_name) def reader_func(temp_filename): return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename) def create_RDD_server(): return self._jvm.ArrowRDDServer(jsqlContext) # Create Spark DataFrame from Arrow stream file, using one batch per partition jrdd = self._sc._serialize_to_jvm(arrow_data, ser, reader_func, create_RDD_server) jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext) df = DataFrame(jdf, self._wrapped) df._schema = schema return df
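A brief usage sketch of this Arrow path, assuming a running SparkSession named spark and pandas installed:

import pandas as pd

# With Arrow enabled, createDataFrame(pandas_df) is routed through
# _create_from_pandas_with_arrow; on failure it falls back to the non-Arrow path
# when 'spark.sql.execution.arrow.fallback.enabled' is true (see createDataFrame above).
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

pdf = pd.DataFrame({"id": [1, 2, 3], "score": [0.5, 1.5, 2.5]})
df = spark.createDataFrame(pdf)                     # schema inferred from pandas dtypes
df2 = spark.createDataFrame(pdf, ["id", "score"])   # column names passed explicitly
df.printSchema()
df2.show()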
def load_data():
    """Load the data from the different files and join them together."""
    users_schema = StructType([
        StructField('timestamp', TimestampType(), nullable=False),
        StructField('userId', IntegerType(), False),
        StructField('nick', StringType(), False),
        StructField('twitter', StringType(), False),
        StructField('dob', DateType(), False),
        StructField('country', StringType(), False)
    ])
    users = spark.read.load('data/users.csv', format='csv', schema=users_schema, header=True)
    # 'MM' is month-of-year in Spark date patterns ('mm' would be minutes).
    users = users.withColumn(
        'age',
        F.datediff(F.to_date(F.lit('2016-06-16'), 'yyyy-MM-dd'), users.dob) / 365)
    ages = users.select(['userId', 'age'])

    buy_click = spark.read.load('data/buy-clicks.csv', format='csv', inferSchema=True,
                                header=True, timestampFormat="yyyy-MM-dd HH:mm:ss")
    revenue_by_session_by_user = buy_click.groupBy(['userId', 'userSessionId']) \
        .agg(F.sum('price').alias('revenue')).select(['userId', 'revenue'])
    revenues = revenue_by_session_by_user.groupBy('userId').agg(
        F.mean('revenue').alias('avg_buy'),
        F.min('revenue').alias('min_buy'),
        F.max('revenue').alias('max_buy'))

    game_click = spark.read.load('data/game-clicks.csv', format='csv', inferSchema=True,
                                 header=True, timestampFormat="yyyy-MM-dd HH:mm:ss")
    avg_ishit = game_click.groupBy(['userId']).agg(F.mean('isHit').alias('avg_isHit'))

    user_session = spark.read.load('data/user-session.csv', format='csv', inferSchema=True,
                                   header=True, timestampFormat="yyyy-MM-dd HH:mm:ss")
    team = spark.read.load('data/team.csv', format='csv', inferSchema=True,
                           header=True, timestampFormat="yyyy-MM-dd HH:mm:ss")
    strengths = team.join(user_session, on='teamId', how='inner') \
        .select(['userId', 'strength']).dropDuplicates()

    data = ages.join(revenues, on='userId', how='inner') \
        .join(avg_ishit, on='userId', how='inner') \
        .join(strengths, on='userId', how='left').na.fill(0)
    return data
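A minimal usage sketch, assuming the CSV files exist under data/ and that spark and pyspark.sql.functions as F are already in scope as above:

features = load_data()
features.printSchema()   # userId, age, avg_buy, min_buy, max_buy, avg_isHit, strength
features.show(5)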
def createDataFrame(self, data, schema=None, samplingRatio=None): """ Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`. When ``schema`` is a list of column names, the type of each column will be inferred from ``data``. When ``schema`` is ``None``, it will try to infer the schema (column names and types) from ``data``, which should be an RDD of :class:`Row`, or :class:`namedtuple`, or :class:`dict`. When ``schema`` is :class:`DataType` or datatype string, it must match the real data, or exception will be thrown at runtime. If the given schema is not StructType, it will be wrapped into a StructType as its only field, and the field name will be "value", each record will also be wrapped into a tuple, which can be converted to row later. If schema inference is needed, ``samplingRatio`` is used to determined the ratio of rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``. :param data: an RDD of any kind of SQL data representation(e.g. row, tuple, int, boolean, etc.), or :class:`list`, or :class:`pandas.DataFrame`. :param schema: a :class:`DataType` or a datatype string or a list of column names, default is None. The data type string format equals to `DataType.simpleString`, except that top level struct type can omit the `struct<>` and atomic types use `typeName()` as their format, e.g. use `byte` instead of `tinyint` for ByteType. We can also use `int` as a short name for IntegerType. :param samplingRatio: the sample ratio of rows used for inferring :return: :class:`DataFrame` .. versionchanged:: 2.0 The schema parameter can be a DataType or a datatype string after 2.0. If it's not a StructType, it will be wrapped into a StructType and each record will also be wrapped into a tuple. >>> l = [('Alice', 1)] >>> spark.createDataFrame(l).collect() [Row(_1=u'Alice', _2=1)] >>> spark.createDataFrame(l, ['name', 'age']).collect() [Row(name=u'Alice', age=1)] >>> d = [{'name': 'Alice', 'age': 1}] >>> spark.createDataFrame(d).collect() [Row(age=1, name=u'Alice')] >>> rdd = sc.parallelize(l) >>> spark.createDataFrame(rdd).collect() [Row(_1=u'Alice', _2=1)] >>> df = spark.createDataFrame(rdd, ['name', 'age']) >>> df.collect() [Row(name=u'Alice', age=1)] >>> from pyspark.sql import Row >>> Person = Row('name', 'age') >>> person = rdd.map(lambda r: Person(*r)) >>> df2 = spark.createDataFrame(person) >>> df2.collect() [Row(name=u'Alice', age=1)] >>> from pyspark.sql.types import * >>> schema = StructType([ ... StructField("name", StringType(), True), ... StructField("age", IntegerType(), True)]) >>> df3 = spark.createDataFrame(rdd, schema) >>> df3.collect() [Row(name=u'Alice', age=1)] >>> spark.createDataFrame(df.toPandas()).collect() # doctest: +SKIP [Row(name=u'Alice', age=1)] >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP [Row(0=1, 1=2)] >>> spark.createDataFrame(rdd, "a: string, b: int").collect() [Row(a=u'Alice', b=1)] >>> rdd = rdd.map(lambda row: row[1]) >>> spark.createDataFrame(rdd, "int").collect() [Row(value=1)] >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... Py4JJavaError: ... 
""" if isinstance(data, DataFrame): raise TypeError("data is already a DataFrame") if isinstance(schema, basestring): schema = _parse_datatype_string(schema) try: import pandas has_pandas = True except Exception: has_pandas = False if has_pandas and isinstance(data, pandas.DataFrame): if schema is None: schema = [str(x) for x in data.columns] data = [r.tolist() for r in data.to_records(index=False)] if isinstance(schema, StructType): def prepare(obj): _verify_type(obj, schema) return obj elif isinstance(schema, DataType): datatype = schema def prepare(obj): _verify_type(obj, datatype) return (obj, ) schema = StructType().add("value", datatype) else: prepare = lambda obj: obj if isinstance(data, RDD): rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio) else: rdd, schema = self._createFromLocal(map(prepare, data), schema) jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd()) jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json()) df = DataFrame(jdf, self._wrapped) df._schema = schema return df
"bool": "boolean", "boolean": "boolean", "struct": "struct", "array": "array", "date": "date", "datetime": "datetime", "byte": "byte", "short": "short", "binary": "binary", "null": "null", "vector": "vector", "timestamp": "datetime" } SPARK_DTYPES_DICT = {"string": StringType, "int": IntegerType, "float": FloatType, "double": DoubleType, "boolean": BooleanType, "struct": StructType, "array": ArrayType, "bigint": LongType, "date": DateType, "byte": ByteType, "short": ShortType, "datetime": TimestampType, "binary": BinaryType, "null": NullType, "vector": VectorUDT } SPARK_DTYPES_DICT_OBJECTS = \ {"string": StringType(), "int": IntegerType(), "float": FloatType(), "double": DoubleType(), "boolean": BooleanType(), "struct": StructType(), "array": ArrayType(StringType()), "bigint": LongType(), "date": DateType(), "byte": ByteType(), "short": ShortType(), "datetime": TimestampType(), "binary": BinaryType(), "null": NullType() } PROFILER_COLUMN_TYPES = {"categorical", "numeric", "date", "null", "array", "binary"} PYTHON_TO_PROFILER = {"string": "categorical", "boolean": "categorical", "int": "numeric", "decimal": "numeric", "date": "date", "array": "array", "binaty": "binary", "null": "null"} SPARK_DTYPES_TO_PROFILER = {"int": ["smallint", "tinyint", "bigint", "int"], "decimal": ["float", "double"], "string": "string", "date": {"date", "timestamp"}, "boolean": "boolean", "binary": "binary", "array": "array", "object": "object", "null": "null", "missing": "missing"}