def _get_mon_metric_json_schema():
    """get the schema of the incoming monasca metric."""
    metric_struct_field = StructField(
        "metric",
        StructType([
            StructField("dimensions",
                        MapType(StringType(), StringType(), True), True),
            StructField("value_meta",
                        MapType(StringType(), StringType(), True), True),
            StructField("name", StringType(), True),
            StructField("timestamp", StringType(), True),
            StructField("value", StringType(), True)
        ]), True)

    meta_struct_field = StructField(
        "meta", MapType(StringType(), StringType(), True), True)

    creation_time_struct_field = StructField("creation_time",
                                             StringType(), True)

    schema = StructType([
        creation_time_struct_field, meta_struct_field, metric_struct_field
    ])
    return schema
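# Hedged usage sketch (not part of the original module): parse raw monasca metric
# JSON strings with the schema above. Assumes an active SparkSession `spark` and
# an RDD of JSON strings `json_rdd`; names are illustrative.
def _parse_mon_metrics_example(spark, json_rdd):
    from pyspark.sql.functions import from_json, col
    schema = _get_mon_metric_json_schema()
    df = spark.createDataFrame(json_rdd.map(lambda s: (s,)), ["raw"])
    # from_json yields a struct column; expand it into top-level columns
    return df.select(from_json(col("raw"), schema).alias("m")).select("m.*")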
def add_ucdoc_bb_allocation_map(cfg, df, bookings_map):
    def helper(ands, minus, amount, day, allocated):
        es_client_predictions = ESClient(
            cfg['es_host'], cfg['es_port'],
            cfg['es_predictions_index'], cfg['es_predictions_type'])

        _, result = get_ucdoc_prediction_count(
            ands, minus, bookings_map, day, es_client_predictions)

        # apply tbr on prediction values
        prediction_inventory = sum(result.values())
        tbr_ratio = amount * 1.0 / prediction_inventory
        result.update((x, int(y * tbr_ratio)) for x, y in result.items())

        # {'magazinelock,3,5G,g_x,2,pt,1004,icc': 788, 'minusonepage,1,5G,g_f,4,pt,1003,icc': 5017}
        resources = result

        # {'b2': 800, 'b3': 1000, 'b1': 500}
        demands = allocated

        # the order of bookings here is random
        allocation_map = hwm_generic_allocation(
            resources, resources.keys(), demands, demands.keys())
        return allocation_map

    _map_type = MapType(StringType(), IntegerType())
    df = df.withColumn(
        'allocation_map',
        udf(helper, MapType(StringType(), _map_type))(
            df.ands, df.minus, df.amount, df.day, df.allocated))
    return df
def transfrom_comment_degrees(spark, database, comments_degree_collection):
    def deserialize(d):
        d.pop('_id')
        for k, v in d.items():
            if k == 'cDegrees':
                d[k] = json.loads(v)
                for sd in d[k]:
                    for i in sd['sDegrees']:
                        i['negAdv'] = {} if not i['negAdv'] else i['negAdv']
            else:
                d[k] = v
        return d

    degrees_df = spark.read.format("com.mongodb.spark.sql.DefaultSource") \
        .option("uri", "mongodb://127.0.0.1/") \
        .option("database", database) \
        .option("collection", comments_degree_collection) \
        .load()
    degrees_rdd = degrees_df.rdd \
        .map(lambda y: y.asDict(recursive=True)) \
        .map(lambda x: deserialize(x))

    fields = [
        StructField('pId', LongType(), False),
        StructField('cId', LongType(), False),
        StructField('cDegValue', DoubleType(), False),
        StructField(
            'cDegrees',
            ArrayType(
                StructType([
                    StructField('sId', IntegerType(), True),
                    StructField(
                        'sDegrees',
                        ArrayType(
                            StructType([
                                StructField('feature',
                                            MapType(StringType(), StringType(), True), True),
                                StructField('sentiment',
                                            MapType(StringType(), StringType(), True), True),
                                StructField('negAdv',
                                            MapType(StringType(), StringType(), True), True),
                                StructField('relate', StringType(), False),
                                StructField('degValue', IntegerType(), True)
                            ]), True), True)
                ]), True), True)
    ]
    schema = StructType(fields)
    temp = spark.createDataFrame(degrees_rdd, schema)
    temp.write.format("com.mongodb.spark.sql.DefaultSource").mode("overwrite") \
        .option("uri", "mongodb://127.0.0.1/") \
        .option("database", database) \
        .option("collection", 'temp3') \
        .save()
def classify_tweets(time, rdd):
    # Get the singleton instance of SparkSession
    spark = utils.get_spark_session_instance(rdd.context.getConf())
    sql_context = SQLContext(spark.sparkContext)

    # Filter tweets without text
    row_rdd = rdd.map(lambda tweet: Row(
        id_str=tweet["id_str"],
        text=tweet["text"],
        timestamp_ms=tweet["timestamp_ms"],
        created_at=tweet["created_at"],
        user=tweet["user"],
        sentiment=tweet["sentiment"]
    )).filter(lambda tweet: tweet["text"])
    print(row_rdd.take(5))

    schema = StructType([
        StructField("id_str", StringType(), True),
        StructField("text", StringType(), True),
        StructField("timestamp_ms", StringType(), True),
        StructField("created_at", StringType(), True),
        StructField("user", MapType(StringType(), StringType()), True),
        StructField("sentiment", MapType(StringType(), FloatType()), True),
    ])
    tweets_df = spark.createDataFrame(row_rdd, schema=schema)

    # Fit the texts in the LDA model and get the topics
    try:
        custom_stop_words = []
        pipeline = ml_utils.set_pipeline(custom_stop_words)
        model = pipeline.fit(tweets_df)
        result = model.transform(tweets_df)

        lda_model = LocalLDAModel.load("s3a://current-models/LDAModel")
        prediction = lda_model.transform(result)
        prediction.show(truncate=True)

        tweets_with_prediction = prediction.rdd.map(lambda tweet: Row(
            id_str=tweet["id_str"],
            text=tweet["text"],
            timestamp_ms=int(tweet["timestamp_ms"]),
            created_at=tweet["created_at"],
            user=tweet["user"],
            sentiment=tweet["sentiment"],
            topic_distribution=topic_distibution_to_dict(tweet["topicDistribution"])
        ))
        print(tweets_with_prediction.take(5))

        save_to_elastic(tweets_with_prediction)

        tweets_with_prediction_df = sql_context.createDataFrame(tweets_with_prediction)
        tweets_with_prediction_df.registerTempTable("tweets")
        exploded_tweets = sql_context.sql(
            "select id_str, timestamp_ms, explode(topic_distribution) from tweets")
        exploded_tweets.foreachPartition(write_to_kafka)
    except Exception:
        print(sys.exc_info())
def create_user_df(hive_context, num_users, num_time_intervals, num_did_buckets):
    time_intervals = [1586822400 - i*86400 for i in range(num_time_intervals)]
    keyword_list = [
        "education", "entertainment", "game-act", "game-avg", "game-cnc",
        "game-ent", "game-fishing", "game-moba", "game-mon", "game-rpg",
        "game-sim", "game-slg", "health", "info", "living-car", "living-food",
        "living-house", "living-insurance", "living-makeup", "living-map",
        "living-mon", "living-photo", "other", "reading", "shopping", "social",
        "sports", "travel", "video",
    ]
    keyword_index_map = {
        "education": 1, "entertainment": 2, "game-act": 3, "game-avg": 4,
        "game-cnc": 5, "game-ent": 6, "game-fishing": 7, "game-moba": 8,
        "game-mon": 9, "game-rpg": 10, "game-sim": 11, "game-slg": 12,
        "health": 13, "info": 14, "living-car": 15, "living-food": 16,
        "living-house": 17, "living-insurance": 18, "living-makeup": 19,
        "living-map": 20, "living-mon": 21, "living-photo": 22, "other": 23,
        "reading": 24, "shopping": 25, "social": 26, "sports": 27,
        "travel": 28, "video": 29,
    }

    user_data = []
    for i in range(num_users):
        keyword_samples = [random.sample(keyword_list, random.randint(1, 3))
                           for _ in range(num_time_intervals)]
        kws_sample = random.sample(keyword_list, int(0.9*len(keyword_list)))
        show_counts = [[int(random.expovariate(1)) + 1 for _ in keywords]
                       for keywords in keyword_samples]
        user_data.append(Row(
            age=random.randint(1, 6),
            gender=random.randint(0, 1),
            did=hex(random.getrandbits(256)).lstrip('0x').rstrip('L'),
            did_index=i,
            interval_start_time=time_intervals,
            interval_keywords=[','.join(keyword) for keyword in keyword_samples],
            kwi=[','.join([str(keyword_index_map[keyword]) for keyword in keywords])
                 for keywords in keyword_samples],
            kwi_show_counts=[','.join('{}:{}'.format(keyword_index_map[keyword], count)
                                      for count, keyword in zip(show_count, keywords))
                             for show_count, keywords in zip(show_counts, keyword_samples)],
            kwi_click_counts=[','.join('{}:{}'.format(keyword_index_map[keyword],
                                                      count - random.randint(0, count))
                                       for count, keyword in zip(show_count, keywords))
                              for show_count, keywords in zip(show_counts, keyword_samples)],
            kws={keyword: random.random() for keyword in kws_sample},
            kws_norm={keyword: random.random() for keyword in kws_sample},
            did_bucket=random.randrange(num_did_buckets)
        ))

    schema = StructType([
        StructField("age", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("did", StringType(), True),
        StructField("did_index", IntegerType(), True),
        StructField("interval_start_time", ArrayType(StringType()), True),
        StructField("interval_keywords", ArrayType(StringType()), True),
        StructField("kwi", ArrayType(StringType()), True),
        StructField("kwi_show_counts", ArrayType(StringType()), True),
        StructField("kwi_click_counts", ArrayType(StringType()), True),
        StructField("kws", MapType(StringType(), FloatType()), True),
        StructField("kws_norm", MapType(StringType(), FloatType()), True),
        StructField("did_bucket", IntegerType(), True),
    ])
    return hive_context.createDataFrame(user_data, schema)
def num_featurizer(self, data_frame, ref_df=None,
                   featurize_process=["summary_stat", "sustainment_q"],
                   inputCol="VALUE", labelCol="ITEMID",
                   outputCol="num_features", REPARTITION_CONST=None):
    from pyspark.sql.functions import udf, array
    from pyspark.sql.types import StringType, DoubleType, MapType

    if not data_frame:
        return

    ret_data_frame = self.value_aggregator(data_frame)
    if REPARTITION_CONST is not None:
        ret_data_frame = ret_data_frame.checkpoint()
        self.logger.debug(
            "[NUM_FEATURIZER] ret_dataframe checkpointed:{0}".format(
                ret_data_frame.count()))

    if "summary_stat" in featurize_process:
        udf_summary_stat = udf(preprocessor_gen.calc_summary_stat,
                               MapType(StringType(), DoubleType()))
        ret_data_frame = ret_data_frame.withColumn(
            "summary_stat", udf_summary_stat(inputCol + "_LIST", labelCol))
        if REPARTITION_CONST is not None:
            ret_data_frame = ret_data_frame.checkpoint()
            self.logger.debug(
                "[NUM_FEATURIZER] summary_stat, ret_dataframe checkpointed:{0}"
                .format(ret_data_frame.count()))

    if "sustainment_q" in featurize_process:
        udf_sustainment_quant = udf(preprocessor_gen.sustainment_quantifier,
                                    MapType(StringType(), DoubleType()))
        ret_data_frame = ret_data_frame.join(ref_df, labelCol).withColumn(
            "sustainment_q",
            udf_sustainment_quant("summary_stat", labelCol, "ref_avg",
                                  "ref_std", "ref_count")).drop("ref_avg") \
            .drop("ref_std").drop("ref_count")
        if REPARTITION_CONST is not None:
            ret_data_frame = ret_data_frame.checkpoint()
            self.logger.debug(
                "[NUM_FEATURIZER] sustainment_q, ret_dataframe checkpointed:{0}"
                .format(ret_data_frame.count()))

    udf_merge_dict_all = udf(preprocessor_gen.merge_dict_all,
                             MapType(StringType(), DoubleType()))
    ret_data_frame = ret_data_frame.withColumn(
        outputCol, udf_merge_dict_all(array(featurize_process)))
    return ret_data_frame
def convert(self, ma_field: ma_fields.Dict) -> DataType:
    key_field_converter = self.converter_map.get(type(ma_field.key_field), StringConverter)
    value_field_converter = self.converter_map.get(type(ma_field.value_field), StringConverter)
    return MapType(
        key_field_converter(self.converter_map).convert(ma_field.key_field),
        value_field_converter(self.converter_map).convert(ma_field.value_field)
    )
def test_data_type_ops(self):
    _mock_spark_type = DataType()
    _mock_dtype = ExtensionDtype()
    _mappings = (
        (CategoricalDtype(), _mock_spark_type, CategoricalOps),
        (_mock_dtype, DecimalType(), DecimalOps),
        (_mock_dtype, FractionalType(), FractionalOps),
        (_mock_dtype, IntegralType(), IntegralOps),
        (_mock_dtype, StringType(), StringOps),
        (_mock_dtype, BooleanType(), BooleanOps),
        (_mock_dtype, TimestampType(), DatetimeOps),
        (_mock_dtype, TimestampNTZType(), DatetimeNTZOps),
        (_mock_dtype, DateType(), DateOps),
        (_mock_dtype, DayTimeIntervalType(), TimedeltaOps),
        (_mock_dtype, BinaryType(), BinaryOps),
        (_mock_dtype, ArrayType(StringType()), ArrayOps),
        (_mock_dtype, MapType(StringType(), IntegralType()), MapOps),
        (_mock_dtype, StructType(), StructOps),
        (_mock_dtype, NullType(), NullOps),
        (_mock_dtype, UserDefinedType(), UDTOps),
    )
    for _dtype, _spark_type, _ops in _mappings:
        self.assertIsInstance(DataTypeOps(_dtype, _spark_type), _ops)

    _unknow_spark_type = _mock_spark_type
    self.assertRaises(TypeError, DataTypeOps, BooleanType(), _unknow_spark_type)
def calculate_factdata_traffic(hive_context, factdata_table, bucket_id, day):
    def _list_to_map(count_array):
        count_map = {}
        for item in count_array:
            key_value = item.split(':')
            count_map[key_value[0]] = key_value[1]
        return count_map

    command = """
        SELECT
            FACTDATA.count_array,
            FACTDATA.day,
            FACTDATA.hour,
            FACTDATA.uckey
        FROM {} AS FACTDATA
        WHERE FACTDATA.bucket_id='{}' AND day='{}'
    """.format(factdata_table, str(bucket_id), str(day))

    df = hive_context.sql(command)

    list_to_map_udf = fn.udf(_list_to_map, MapType(StringType(), StringType(), False))
    df = df.withColumn('count_map', list_to_map_udf(df.count_array))

    df = df.select('uckey', 'day', 'hour', fn.explode(df.count_map)) \
        .withColumnRenamed("key", "price_cat") \
        .withColumnRenamed("value", "count")

    # [Row(uckey='native,72bcd2720e5011e79bc8fa163e05184e,WIFI,g_m,5,CPM,15,76', day='2019-11-02', hour=19, price_cat='3', count='4')]
    return df.groupby().agg(fn.sum('count').alias('count')).take(1)[0]['count']
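# A minimal pure-Python sketch of the count_array -> count_map conversion done by
# the UDF above; the sample values are hypothetical "price_cat:count" strings.
def _list_to_map_example():
    count_array = ['0:5', '1:3', '3:4']
    count_map = dict(item.split(':') for item in count_array)
    # fn.explode(count_map) then turns each (price_cat, count) pair into its own row
    assert count_map == {'0': '5', '1': '3', '3': '4'}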
def test_apply_schema(self):
    from datetime import date, datetime
    rdd = self.sc.parallelize([(127, -128, -32768, 32767, 2147483647, 1.0,
                                date(2010, 1, 1), datetime(2010, 1, 1, 1, 1, 1),
                                {"a": 1}, (2,), [1, 2, 3], None)])
    schema = StructType([
        StructField("byte1", ByteType(), False),
        StructField("byte2", ByteType(), False),
        StructField("short1", ShortType(), False),
        StructField("short2", ShortType(), False),
        StructField("int1", IntegerType(), False),
        StructField("float1", FloatType(), False),
        StructField("date1", DateType(), False),
        StructField("time1", TimestampType(), False),
        StructField("map1", MapType(StringType(), IntegerType(), False), False),
        StructField("struct1", StructType([StructField("b", ShortType(), False)]), False),
        StructField("list1", ArrayType(ByteType(), False), False),
        StructField("null1", DoubleType(), True)])
    df = self.spark.createDataFrame(rdd, schema)
    results = df.rdd.map(lambda x: (x.byte1, x.byte2, x.short1, x.short2, x.int1,
                                    x.float1, x.date1, x.time1, x.map1["a"],
                                    x.struct1.b, x.list1, x.null1))
    r = (127, -128, -32768, 32767, 2147483647, 1.0, date(2010, 1, 1),
         datetime(2010, 1, 1, 1, 1, 1), 1, 2, [1, 2, 3], None)
    self.assertEqual(r, results.first())

    with self.tempView("table2"):
        df.createOrReplaceTempView("table2")
        r = self.spark.sql("SELECT byte1 - 1 AS byte1, byte2 + 1 AS byte2, " +
                           "short1 + 1 AS short1, short2 - 1 AS short2, int1 - 1 AS int1, " +
                           "float1 + 1.5 as float1 FROM table2").first()
        self.assertEqual((126, -127, -32767, 32766, 2147483646, 2.5), tuple(r))
def filterPosts(fileList, sc, ss, subs=set(), minwords='100'):
    tokensUDF = udf(tokenize, MapType(StringType(), IntegerType()))
    countUDF = udf(sumCounter, IntegerType())

    firstFile = True
    for filename in fileList:
        month = filename[-9:-4]
        print('\n\n\n reading', month, filename)
        monthData = ss.read.json(filename)
        if subs != set():
            monthData = monthData.filter(monthData.subreddit.isin(subs))
        filtered = monthData \
            .filter(monthData['is_self'] == True) \
            .select('id', 'subreddit', tokensUDF('selftext').alias('counter')) \
            .withColumn('wordcount', countUDF('counter')) \
            .filter('wordcount >=' + minwords) \
            .select('id', 'subreddit', 'counter', 'wordcount') \
            .withColumn('month', lit(month))
        print('\n\n\n saving', month)
        filtered.write.parquet('filtered_' + month + '.parquet', mode='overwrite')
        if firstFile:
            alldata = filtered
            firstFile = False
        else:
            alldata = alldata.union(filtered)
    return alldata
def _get_instance_usage_schema():
    """get instance usage schema."""

    # Initialize columns for all string fields
    columns = ["tenant_id", "user_id", "resource_uuid",
               "geolocation", "region", "zone", "host", "project_id",
               "aggregated_metric_name", "firstrecord_timestamp_string",
               "lastrecord_timestamp_string",
               "service_group", "service_id",
               "usage_date", "usage_hour", "usage_minute",
               "aggregation_period", "namespace", "pod_name", "app",
               "container_name", "interface", "deployment", "daemon_set"]

    columns_struct_fields = [StructField(field_name, StringType(), True)
                             for field_name in columns]

    # Add columns for non-string fields
    columns_struct_fields.append(
        StructField("firstrecord_timestamp_unix", DoubleType(), True))
    columns_struct_fields.append(
        StructField("lastrecord_timestamp_unix", DoubleType(), True))
    columns_struct_fields.append(
        StructField("quantity", DoubleType(), True))
    columns_struct_fields.append(
        StructField("record_count", DoubleType(), True))
    columns_struct_fields.append(
        StructField("processing_meta",
                    MapType(StringType(), StringType(), True), True))

    schema = StructType(columns_struct_fields)
    return schema
def process_corpus(raw_corpus, normalizer=normalize, tokenizer=tokenize,
                   ngram_counter=ngram_counts):
    """
    :param raw_corpus: RDD[Tuple[int, str]] as returned from load_raw_corpus
    :param normalizer: Callable[[str], str] preprocessing function
    :param tokenizer: Callable[[str], Iterable[str]]
    :param ngram_counter: Callable[[Iterable[str]], Dict[str, int]]
    :return: DataFrame[document_index: bigint, wc: bigint, token_counts: map<string,int>]
    """
    schema = StructType([
        StructField("document_index", LongType()),
        StructField(
            "data",
            StructType([
                StructField("wc", LongType()),
                StructField("token_counts", MapType(StringType(), IntegerType())),
            ]),
        ),
    ])
    # Use the injected callables so custom normalizers/tokenizers take effect.
    normalized = raw_corpus.mapValues(normalizer)
    return (normalized.mapValues(tokenizer).mapValues(ngram_counter).toDF(schema)
            .select("document_index", "data.wc", "data.token_counts"),
            normalized)
def parametric_action_preprocessing(
    df,
    actions: List[str],
    multi_steps: Optional[int] = None,
    include_possible_actions: bool = True,
):
    assert (
        not include_possible_actions
    ), "currently we don't support include_possible_actions"

    next_map_udf = make_next_udf(multi_steps, MapType(LongType(), FloatType()))
    df = df.withColumn("next_action", next_map_udf("next_action"))

    def make_not_terminal_udf():
        """ Return true iff next_action is a non-empty map """

        def get_not_terminal(next_action):
            return len(next_action) > 0

        return udf(get_not_terminal, BooleanType())

    not_terminal_udf = make_not_terminal_udf()
    df = df.withColumn("not_terminal", not_terminal_udf("next_action"))

    df = make_sparse2dense(df, "action", actions)
    df = make_sparse2dense(df, "next_action", actions)

    return df
def _transform(self, dataframe):
    out_col = self.getOutputCol()
    in_col = self.getInputCol()

    def get_content(data):
        contents = {}
        lines = data.splitlines(keepends=False)
        for line in lines:
            json_line = json.loads(line)
            feature_array = json_line.get('features')
            for element in feature_array:
                name = element.get('name')
                value = element.get('value')
                if name in contents:
                    contents[name].append(value)
                else:
                    contents[name] = [value]
        return contents

    get_cntn = udf(get_content, MapType(StringType(), ArrayType(DoubleType())))
    return dataframe.withColumn(out_col, get_cntn(in_col))
def _transform(self, dataframe):
    out_col = self.getOutputCol()
    in_col = self.getInputCol()

    def tags_sum_by_key(tags):
        types = {}
        for tag in tags:
            if tag[1] in types:
                types[tag[1]] += 1
            else:
                types[tag[1]] = 1
        return types

    def extract_speech_parts(data):
        tags = []
        for post in data:
            tags.extend(TextBlob(post).tags)
        speech_parts = tags_sum_by_key(tags)
        return speech_parts

    ext_speech_parts = udf(extract_speech_parts, MapType(StringType(), IntegerType()))
    return dataframe.withColumn(out_col, ext_speech_parts(in_col))
def test_toPandas_fallback_enabled(self):
    with self.sql_conf({"spark.sql.execution.arrow.pyspark.fallback.enabled": True}):
        schema = StructType(
            [StructField("map", MapType(StringType(), IntegerType()), True)])
        df = self.spark.createDataFrame([({u'a': 1},)], schema=schema)
        with QuietTest(self.sc):
            with self.warnings_lock:
                with warnings.catch_warnings(record=True) as warns:
                    # we want the warnings to appear even if this test is run from a subclass
                    warnings.simplefilter("always")
                    pdf = df.toPandas()
                    # Catch and check the last UserWarning.
                    user_warns = [
                        warn.message for warn in warns
                        if isinstance(warn.message, UserWarning)]
                    self.assertTrue(len(user_warns) > 0)
                    self.assertTrue("Attempting non-optimization" in str(user_warns[-1]))
                    assert_frame_equal(pdf, pd.DataFrame({u'map': [{u'a': 1}]}))
def _transform(self, dataframe):
    out_col = self.getOutputCol()
    in_col = self.getInputCol()

    def extract_speech_parts(data):
        speech_parts = {}
        for tag in load('help/tagsets/upenn_tagset.pickle').keys():
            if any(c.isalpha() for c in tag):
                speech_parts[tag] = []
        for data_line in data:
            data_line_tags = {}
            for tag_tuple in TextBlob(data_line).tags:
                if tag_tuple[1] in data_line_tags:
                    data_line_tags[tag_tuple[1]] += 1
                else:
                    data_line_tags[tag_tuple[1]] = 1
            for tag in speech_parts.keys():
                if tag not in data_line_tags.keys():
                    speech_parts[tag].append(0)
                else:
                    speech_parts[tag].append(data_line_tags[tag])
        return speech_parts

    ext_speech_parts = udf(extract_speech_parts,
                           MapType(StringType(), ArrayType(IntegerType())))
    return dataframe.withColumn(out_col, ext_speech_parts(in_col))
def test_parse_datatype_string(self):
    from pyspark.sql.types import _all_atomic_types, _parse_datatype_string
    for k, t in _all_atomic_types.items():
        if t != NullType:
            self.assertEqual(t(), _parse_datatype_string(k))
    self.assertEqual(IntegerType(), _parse_datatype_string("int"))
    self.assertEqual(DecimalType(1, 1), _parse_datatype_string("decimal(1 ,1)"))
    self.assertEqual(DecimalType(10, 1), _parse_datatype_string("decimal( 10,1 )"))
    self.assertEqual(DecimalType(11, 1), _parse_datatype_string("decimal(11,1)"))
    self.assertEqual(ArrayType(IntegerType()), _parse_datatype_string("array<int >"))
    self.assertEqual(MapType(IntegerType(), DoubleType()),
                     _parse_datatype_string("map< int, double >"))
    self.assertEqual(
        StructType([StructField("a", IntegerType()), StructField("c", DoubleType())]),
        _parse_datatype_string("struct<a:int, c:double >"))
    self.assertEqual(
        StructType([StructField("a", IntegerType()), StructField("c", DoubleType())]),
        _parse_datatype_string("a:int, c:double"))
    self.assertEqual(
        StructType([StructField("a", IntegerType()), StructField("c", DoubleType())]),
        _parse_datatype_string("a INT, c DOUBLE"))
def to_schema_type(typ, elem):
    if typ is None:
        return hint_to_schema_type('None')
    if issubclass(typ, basestring):
        return hint_to_schema_type('str')
    if issubclass(typ, bool):
        return hint_to_schema_type('bool')
    if issubclass(typ, float):
        return hint_to_schema_type('float')
    if issubclass(typ, (int, long)):
        # Some integers cannot be stored in long, but we cannot tell this
        # from the column type. Let it fail in spark.
        return hint_to_schema_type('int')
    if issubclass(typ, datetime.datetime):
        return hint_to_schema_type('datetime')
    if issubclass(typ, list):
        if elem is None or len(elem) == 0:
            raise ValueError('Schema type cannot be determined.')
        elem_type = to_schema_type(type(elem[0]), None)
        if elem_type is None:
            raise TypeError('Element type cannot be determined')
        return ArrayType(elem_type)
    if issubclass(typ, dict):
        if elem is None or len(elem) == 0:
            raise ValueError('Schema type cannot be determined.')
        key_type = to_schema_type(type(elem.keys()[0]), None)
        if key_type is None:
            raise TypeError('Key type cannot be determined')
        val_type = to_schema_type(type(elem.values()[0]), None)
        if val_type is None:
            raise TypeError('Value type cannot be determined')
        return MapType(key_type, val_type)
    if issubclass(typ, types.NoneType):
        return None
    return hint_to_schema_type('str')
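# Hedged usage sketch for to_schema_type (Python 2 era, like the code above).
# It assumes hint_to_schema_type maps 'str'/'int'/... to Spark atomic types;
# the sample values are illustrative.
def _to_schema_type_examples():
    # a dict sample element yields a MapType built from its first key/value
    assert isinstance(to_schema_type(dict, {'a': 1}), MapType)
    # a list sample element yields an ArrayType of the element type
    assert isinstance(to_schema_type(list, [1, 2, 3]), ArrayType)
    # an empty container cannot be typed and raises
    try:
        to_schema_type(list, [])
    except ValueError:
        pass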
def as_pings_subset_df(as_df, date_start, total_period, slug=None):
    """
    get subset of activity stream pings with some columns standardized

    providing an experiment slug will add a branch field and remove
    pings without the slug

    table schema: client_id, activity_dt, branch (optional), as cols
    """
    # function can take datetime or the s3 date string or as date string
    if type(date_start) == str:
        if '-' in date_start:
            date_start = string_to_date(date_start, '%Y-%m-%d')
        else:
            date_start = string_to_date(date_start)

    # get date end of maximum possible observation period
    date_obs_end = date_plus_N(date_start, total_period)

    # convert everything into as date string format
    date_start_str = date_to_string(date_start, '%Y-%m-%d')
    date_obs_end_str = date_to_string(date_obs_end, '%Y-%m-%d')

    # if we're looking back, switch start and end
    if date_obs_end < date_start:
        date_start_str = date_to_string(date_obs_end, '%Y-%m-%d')
        date_obs_end_str = date_to_string(date_start, '%Y-%m-%d')

    # ----------------- subset dates -----------------
    as_df = as_df.filter("date >= '%s'" % date_start_str)
    as_df = as_df.filter("date <= '%s'" % date_obs_end_str)

    # ----------------- tagged only if slug provided -----------------
    if slug is not None:
        # set up udf for parsing activity stream experiment field
        schema = MapType(StringType(), StringType())
        as_experiment_field_udf = udf(as_experiment_field, schema)

        # get experiments field into standard format
        as_df = as_df.withColumn('experiments',
                                 as_experiment_field_udf(F.col('shield_id')))

        # keep only data tagged with experiment and get branch column
        as_df = as_df.filter("experiments['%s'] is not null" % slug)
        as_df = as_df.withColumn('branch', F.col('experiments')[slug])
        as_df = as_df.drop('experiments')

    as_df = as_df.withColumn('activity_dt', F.col('date'))
    as_df = as_df.drop('shield_id').drop('date')

    return as_df
def transfrom_fea_sen_pairs(spark, database, fea_sen_pairs_collection):
    def deserialize(d):
        d.pop('_id')
        for k, v in d.items():
            d[k] = json.loads(v) if k == 'cFeaSenPairs' else v
        return d

    fea_sen_pairs_df = spark.read.format("com.mongodb.spark.sql.DefaultSource") \
        .option("uri", "mongodb://127.0.0.1/") \
        .option("database", database) \
        .option("collection", fea_sen_pairs_collection) \
        .load()
    fea_sen_pairs_rdd = fea_sen_pairs_df.rdd \
        .map(lambda y: y.asDict(recursive=True)) \
        .map(lambda x: deserialize(x))

    fields = [
        StructField('pId', LongType(), False),
        StructField('cId', LongType(), False),
        StructField(
            'cFeaSenPairs',
            ArrayType(
                StructType([
                    StructField('sId', IntegerType(), True),
                    StructField(
                        'sFeaSenPairs',
                        ArrayType(
                            StructType([
                                StructField('feature',
                                            MapType(StringType(), StringType(), True), True),
                                StructField('sentiment',
                                            MapType(StringType(), StringType(), True), True),
                                StructField('relate', StringType(), False),
                            ]), True), True)
                ]), True), True)
    ]
    schema = StructType(fields)
    temp = spark.createDataFrame(fea_sen_pairs_rdd, schema)
    temp.write.format("com.mongodb.spark.sql.DefaultSource").mode("overwrite") \
        .option("uri", "mongodb://127.0.0.1/") \
        .option("database", database) \
        .option("collection", 'temp') \
        .save()
def test_toPandas_fallback_disabled(self):
    schema = StructType(
        [StructField("map", MapType(StringType(), IntegerType()), True)])
    df = self.spark.createDataFrame([(None,)], schema=schema)
    with QuietTest(self.sc):
        with self.warnings_lock:
            with self.assertRaisesRegexp(Exception, 'Unsupported type'):
                df.toPandas()
def get_schema(self):
    """
    Returns the data schema. In case the schema changes in the future,
    one can redefine this method.
    :return: data schema
    """
    schema = StructType([
        StructField("user", StringType(), True),
        StructField("timestamp", TimestampType(), True),
        StructField(
            "items",
            ArrayType(MapType(StringType(), MapType(StringType(), DoubleType()))),
            True),
    ])
    return schema
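# Hedged usage sketch (not from the original module): build an empty DataFrame
# that conforms to the schema above. `spark` and `reader` (an instance of the
# class defining get_schema) are assumed names.
def _empty_items_df_example(spark, reader):
    df = spark.createDataFrame([], schema=reader.get_schema())
    df.printSchema()  # items: array<map<string,map<string,double>>>
    return df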
def generate_idx_for_df(df: DataFrame, col_name: str, col_schema):
    idx_udf = udf(lambda x: udf_array_to_map(x),
                  MapType(IntegerType(), col_schema, True))
    df = df.withColumn("map", idx_udf(col(col_name)))
    df = df.select("problem_type", "user_id", "oms_protected", "problem_id",
                   "create_at", explode("map").alias("item_id", "answer"))
    return df
def _get_record_store_df_schema():
    """get record store dataframe schema."""

    columns = ["event_timestamp_string", "event_type", "event_quantity_name",
               "event_status", "event_version", "record_type", "resource_uuid",
               "tenant_id", "user_id", "region", "zone", "host", "project_id",
               "event_date", "event_hour", "event_minute", "event_second",
               "metric_group", "metric_id"]

    columns_struct_fields = [StructField(field_name, StringType(), True)
                             for field_name in columns]

    # Add columns for non-string fields
    columns_struct_fields.insert(0, StructField("event_timestamp_unix",
                                                DoubleType(), True))
    columns_struct_fields.insert(0, StructField("event_quantity",
                                                DoubleType(), True))

    # map to metric meta
    columns_struct_fields.append(
        StructField("meta", MapType(StringType(), StringType(), True), True))
    # map to dimensions
    columns_struct_fields.append(
        StructField("dimensions", MapType(StringType(), StringType(), True), True))
    # map to value_meta
    columns_struct_fields.append(
        StructField("value_meta", MapType(StringType(), StringType(), True), True))

    schema = StructType(columns_struct_fields)
    return schema
def get_common_pyspark_schema():
    schema = StructType([
        StructField('day', StringType(), True),
        StructField('ands', ArrayType(StringType()), True),
        StructField('minus', ArrayType(StringType()), True),
        StructField('allocated', MapType(StringType(), IntegerType()), True),
        StructField('amount', IntegerType(), True)
    ])
    return schema
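# Minimal sketch, assuming an active SparkSession `spark`; the row values are
# illustrative and match the field types in get_common_pyspark_schema above.
def _common_schema_example(spark):
    rows = [('2020-01-01', ['a1', 'a2'], [], {'b1': 500, 'b2': 800}, 1300)]
    return spark.createDataFrame(rows, get_common_pyspark_schema())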
def force_decimal_precision_scale(dt: DataType, precision: int = 38, scale: int = 18) -> DataType:
    """
    Returns a data type with a fixed decimal type.

    The precision and scale of the decimal type are fixed with the given values.

    Examples
    --------
    >>> from pyspark.sql.types import *
    >>> force_decimal_precision_scale(StructType([
    ...     StructField("A", DecimalType(10, 0), True),
    ...     StructField("B", DecimalType(14, 7), False)]))  # doctest: +NORMALIZE_WHITESPACE
    StructType(List(StructField(A,DecimalType(38,18),true),StructField(B,DecimalType(38,18),false)))

    >>> force_decimal_precision_scale(StructType([
    ...     StructField("A",
    ...                 StructType([
    ...                     StructField('a',
    ...                                 MapType(DecimalType(5, 0),
    ...                                         ArrayType(DecimalType(20, 0), False), False), False),
    ...                     StructField('b', StringType(), True)])),
    ...     StructField("B", DecimalType(30, 15), False)]),
    ...     precision=30, scale=15)  # doctest: +NORMALIZE_WHITESPACE
    StructType(List(StructField(A,StructType(List(StructField(a,MapType(DecimalType(30,15),\
    ArrayType(DecimalType(30,15),false),false),false),StructField(b,StringType,true))),true),\
    StructField(B,DecimalType(30,15),false)))
    """
    if isinstance(dt, StructType):
        new_fields = []
        for field in dt.fields:
            new_fields.append(
                StructField(
                    field.name,
                    force_decimal_precision_scale(field.dataType, precision, scale),
                    nullable=field.nullable,
                    metadata=field.metadata,
                ))
        return StructType(new_fields)
    elif isinstance(dt, ArrayType):
        return ArrayType(
            force_decimal_precision_scale(dt.elementType, precision, scale),
            containsNull=dt.containsNull,
        )
    elif isinstance(dt, MapType):
        return MapType(
            force_decimal_precision_scale(dt.keyType, precision, scale),
            force_decimal_precision_scale(dt.valueType, precision, scale),
            valueContainsNull=dt.valueContainsNull,
        )
    elif isinstance(dt, DecimalType):
        return DecimalType(precision=precision, scale=scale)
    else:
        return dt
def processGraphDFLogsV2(filepath, df_logs, spark, sc):
    def reduce_ccf(key, values):
        min_value = values.pop(values.index(min(values)))
        ret = {}
        ret[key] = min_value
        if min_value < key:
            for value in values:
                acc.add(1)
                ret[value] = min_value
        else:
            ret = None
        return ret

    reducer = F.udf(lambda x, y: reduce_ccf(x, y), MapType(IntegerType(), IntegerType()))

    schema = StructType([
        StructField("key", IntegerType(), True),
        StructField("value", IntegerType(), True)])

    df = spark.read.format('csv').load(filepath, headers=False, delimiter='\t', schema=schema)
    df = df.na.drop()

    df_logs = add_log(df_logs, filepath, "python-df", 0, "start", datetime.now(), 0, 0)

    acc = sc.accumulator(1)
    loop_counter = 1

    while acc.value != 0:
        acc.value = 0
        print(f"----------\nStart loop at {datetime.now()}, accumulator value is {acc.value}")

        # CCF-Iterate
        df_inverter = df.select(F.col('value').alias('key'), F.col('key').alias('value'))
        df = df.union(df_inverter)

        # if logs==True:
        #     acc.value = 0
        #     debug = df.collect()
        #     df_logs_ret = add_log(df_logs, filepath, "python-df", loop_counter, "it-map", datetime.now(), acc.value, len(debug))

        df = df.groupBy('key').agg(F.collect_list('value').alias('value'))
        df = df.withColumn('reducer', reducer('key', 'value')).select('reducer')
        df = df.select(F.explode('reducer'))
        df = df.na.drop()

        # CCF - Dedup
        df = df.distinct()

        collected = df.collect()
        df = spark.createDataFrame(sc.parallelize(collected), schema)

        print(f"End loop at {datetime.now()}, final value is {acc.value}")
        df_logs = add_log(df_logs, filepath, "python-df", loop_counter, "ded_reduce",
                          datetime.now(), acc.value, len(collected))
        loop_counter += 1

    print(f"----------\nProcessed file at {datetime.now()}\n----------")
    return df_logs
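# A minimal pure-Python sketch of the reduce_ccf step above (the CCF-Iterate
# reduce); the accumulator side effect is left out and the node ids are illustrative.
def _reduce_ccf_example():
    key, values = 5, [2, 7, 9]
    min_value = values.pop(values.index(min(values)))  # smallest neighbour id
    ret = {key: min_value}
    if min_value < key:
        for value in values:       # every remaining neighbour...
            ret[value] = min_value  # ...is pointed at the minimum id
    assert ret == {5: 2, 7: 2, 9: 2}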
def load_sql_user_functions(sc, sqlContext):
    """Load our custom UDAFs into a sql context."""
    sqlContext.udf.register('format_id', format_id, StringType())
    sqlContext.udf.register('format_metrics', format_metrics,
                            MapType(StringType(), IntegerType()))

    # custom aggregation function. Needs a jar provided in runner script.
    agg_counter = sc._jvm.com.jbennet.daskvsspark.udafs.AggregateCounter()
    sqlContext.sparkSession._jsparkSession.udf().register('count_values', agg_counter)