def _transform(self, dataset):
    inputCol = self.getInputCol()
    dataType = dataset.schema[inputCol].dataType
    assert isinstance(dataType, T.MapType)
    assert isinstance(dataType.keyType, T.StringType)
    assert isinstance(dataType.valueType, (T.NumericType, T.StringType))
    seed = _mh3(inputCol, seed=self.getSeed())

    @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
    def hashNumeric(v):
        if not v:
            return {}
        hashVector = defaultdict(float)
        for key, value in v.items():
            h = _mh3(key, seed=seed)
            hashVector[h] += value
        return dict(hashVector)

    @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
    def hashString(v):
        if not v:
            return v
        hashVector = defaultdict(float)
        for key, value in v.items():
            h = _mh3(value, seed=_mh3(key, seed=seed))
            hashVector[h] += 1.
        return dict(hashVector)

    if isinstance(dataType.valueType, T.NumericType):
        return dataset.withColumn(self.getOutputCol(), hashNumeric(dataset[inputCol]))
    else:
        return dataset.withColumn(self.getOutputCol(), hashString(dataset[inputCol]))
def test_update(self):
    denominator = FeatureRequestTotal.feature_name_from_class()
    numerator = FeatureResponse5xxTotal.feature_name_from_class()
    schema = T.StructType([
        T.StructField(self.feature.current_features_column,
                      T.MapType(T.StringType(), T.FloatType())),
        T.StructField(self.feature.past_features_column,
                      T.MapType(T.StringType(), T.FloatType())),
    ])
    sub_df = self.session.createDataFrame([{
        self.feature.current_features_column: {
            self.feature.feature_name: 1.,
            numerator: 2.,
            denominator: 1.,
        },
        self.feature.past_features_column: {
            self.feature.feature_name: 1.,
            numerator: 4.,
            denominator: 2.,
        }
    }], schema=schema)
    result_df = self.feature.update(sub_df)
    result_df.show()
    value = result_df.select(
        self.feature.updated_feature_col_name).collect()[0][
        self.feature.updated_feature_col_name]
    expected_value = 2.
    self.assertAlmostEqual(value, expected_value, places=2)
def test_update(self):
    count_col = FeatureRequestTotal.feature_name_from_class()
    mean_col = FeaturePathDepthAverage.feature_name_from_class()
    schema = T.StructType([
        T.StructField(self.feature.current_features_column,
                      T.MapType(T.StringType(), T.FloatType())),
        T.StructField(self.feature.past_features_column,
                      T.MapType(T.StringType(), T.FloatType())),
    ])
    sub_df = self.session.createDataFrame([{
        self.feature.current_features_column: {
            self.feature.feature_name: 6.,
            count_col: 3.,
            mean_col: 5.,
        },
        self.feature.past_features_column: {
            self.feature.feature_name: 2.,
            count_col: 1.,
            mean_col: 4.,
        }
    }], schema=schema)
    result_df = self.feature.update(sub_df)
    result_df.show()
    value = result_df.select(
        self.feature.updated_feature_col_name).collect()[0][
        self.feature.updated_feature_col_name]
    from baskerville.features.helpers import update_variance
    expected_value = update_variance(2., 6., 1., 3., 4., 5.)
    print(expected_value)
    self.assertAlmostEqual(value, expected_value, places=2)
def _transform(self, dataset):
    inputCol = self.getInputCol()
    dataType = dataset.schema[inputCol].dataType
    assert isinstance(dataType, (T.BooleanType, T.NumericType, T.StringType))
    seed = _mh3(inputCol, seed=self.getSeed())

    @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
    def hashNumeric(v):
        if not v:
            return {}
        return {seed: float(v)}

    @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
    def hashString(v):
        if not v:
            return {}
        return {_mh3(v, seed=seed): 1.}

    if isinstance(dataType, (T.BooleanType, T.NumericType)):
        return dataset.withColumn(self.getOutputCol(), hashNumeric(dataset[inputCol]))
    else:
        return dataset.withColumn(self.getOutputCol(), hashString(dataset[inputCol]))
def test_update(self):
    schema = T.StructType([
        T.StructField(self.feature.current_features_column,
                      T.MapType(T.StringType(), T.FloatType())),
        T.StructField(self.feature.past_features_column,
                      T.MapType(T.StringType(), T.FloatType())),
    ])
    sub_df = self.session.createDataFrame([{
        self.feature.current_features_column: {
            self.feature.feature_name: 1.,
        },
        self.feature.past_features_column: {
            self.feature.feature_name: 2.,
        }
    }], schema=schema)
    result_df = self.feature.update(sub_df)
    result_df.show()
    value = result_df.select(
        self.feature.updated_feature_col_name).collect()[0][
        self.feature.updated_feature_col_name]
    expected_value = 3.
    self.assertAlmostEqual(value, expected_value, places=2)
def test_type_mismatch(self):
    with six.assertRaisesRegex(self, AssertionError,
                               'Cannot compare heterogeneous types'):
        schema_has(
            T.StructType([T.StructField('f1', T.IntegerType())]),
            T.ArrayType(T.IntegerType()),
        )

    with six.assertRaisesRegex(self, AssertionError,
                               'Cannot compare heterogeneous types'):
        schema_has(
            T.ArrayType(T.IntegerType()),
            {'f1': T.IntegerType()},
        )

    with six.assertRaisesRegex(self, TypeError,
                               'f1 is IntegerType, expected LongType'):
        schema_has(
            T.StructType([T.StructField('f1', T.IntegerType())]),
            T.StructType([T.StructField('f1', T.LongType())]),
        )

    with six.assertRaisesRegex(
            self,
            TypeError,
            r'f1\.element\.s1 is IntegerType, expected LongType',
    ):
        schema_has(
            T.StructType([
                T.StructField(
                    'f1',
                    T.ArrayType(T.StructType([T.StructField('s1', T.IntegerType())])),
                ),
            ]),
            T.StructType([
                T.StructField(
                    'f1',
                    T.ArrayType(T.StructType([T.StructField('s1', T.LongType())])),
                ),
            ]),
        )

    with six.assertRaisesRegex(self, TypeError,
                               'element is IntegerType, expected LongType'):
        schema_has(
            T.ArrayType(T.IntegerType()),
            T.ArrayType(T.LongType()),
        )

    with six.assertRaisesRegex(self, TypeError,
                               'key is StringType, expected LongType'):
        schema_has(
            T.MapType(T.StringType(), T.IntegerType()),
            T.MapType(T.LongType(), T.IntegerType()),
        )

    with six.assertRaisesRegex(self, TypeError,
                               'value is IntegerType, expected LongType'):
        schema_has(
            T.MapType(T.StringType(), T.IntegerType()),
            T.MapType(T.StringType(), T.LongType()),
        )
def _proto3_field_to_spark_data_type(field_desc: FieldDescriptor) -> DataType:
    """Convert ProtoBuf field descriptor to Spark `DataType` or `StructField` object.

    Args:
        field_desc (FieldDescriptor): A ProtoBuf field descriptor.

    Returns:
        DataType: A Spark `DataType` or `StructField` object.
    """
    # map type field
    if _IsMapEntry(field_desc):
        key_field_desc = field_desc.message_type.fields_by_name["key"]
        value_field_desc = field_desc.message_type.fields_by_name["value"]
        key_struct_type = _proto3_field_to_spark_data_type(key_field_desc)
        value_struct_type = _proto3_field_to_spark_data_type(value_field_desc)
        return types.MapType(key_struct_type, value_struct_type)

    if field_desc.type == FieldDescriptor.TYPE_MESSAGE:
        # nested message
        field_data_type = _proto3_message_descriptor_to_spark_schema(
            field_desc.message_type)
    else:
        # scalar value types
        field_data_type = _SPARK_SQL_TYPE_MAP[field_desc.type]

    # list type field
    if field_desc.label == FieldDescriptor.LABEL_REPEATED:
        return types.ArrayType(field_data_type)

    return field_data_type
def _sort_structs(dt, ignore_order_depth):
    if ignore_order_depth == 0:
        return dt
    if dt.typeName() == 'array':
        return T.ArrayType(
            elementType=_sort_structs(dt.elementType, ignore_order_depth),
            containsNull=ignore_nullability or dt.containsNull,
        )
    if dt.typeName() == 'map':
        return T.MapType(
            keyType=_sort_structs(dt.keyType, ignore_order_depth),
            valueType=_sort_structs(dt.valueType, ignore_order_depth),
            valueContainsNull=ignore_nullability or dt.valueContainsNull,
        )
    if dt.typeName() == 'struct':
        return T.StructType([
            _sort_structs(f, ignore_order_depth - 1)
            for f in sorted(dt.fields, key=lambda f: f.name)
        ])
    if dt.typeName() == 'structf':
        return T.StructField(
            dt.name,
            _sort_structs(dt.dataType, ignore_order_depth),
            nullable=ignore_nullability or dt.nullable,
            metadata=dt.metadata,
        )
    return dt
def task_a_2_step_1_final(spark):
    a2_struct = T.StructType([
        T.StructField("datetime_start", T.TimestampType()),
        T.StructField("datetime_end", T.TimestampType()),
        T.StructField("map_topics", T.MapType(
            T.StringType(),
            T.ArrayType(T.StringType())
        ))
    ])

    result = kafka_source(spark, config.BOOTSTRAP_SERVERS, "topics-by-state_step-0") \
        .parse_json(a2_struct) \
        .withWatermark("datetime_end", "1 minute") \
        .groupBy(F.window("datetime_end", "3 hour", "1 hour")) \
        .agg(
            F.first("window.start").alias("timestamp_start"),
            F.first("window.end").alias("timestamp_end"),
            F.collect_list("map_topics").alias("statistics")
        ) \
        .select(
            F.struct(
                F.concat(F.hour('timestamp_start'), lit(":"),
                         F.minute('timestamp_start')).alias("time_start"),
                F.concat(F.hour('timestamp_end'), lit(":"),
                         F.minute('timestamp_end')).alias("time_end"),
                concat_maps_udf(col('statistics')).alias("statistics")
            ).alias("res")
        ) \
        .send_to_kafka(config.BOOTSTRAP_SERVERS, "topics-by-state", config.LOG_PREFIX)

    return result
def main(args):
    spark = sql.SparkSession.builder.appName('update-mutator').getOrCreate()

    msg_struct = types.StructType([
        types.StructField('text', types.StringType(), True),
        types.StructField('user_id', types.StringType(), True),
        types.StructField('update_id', types.StringType(), True)
    ])

    sentiments_struct = types.ArrayType(
        types.MapType(types.StringType(), types.FloatType(), False))

    analyzer = vader.SentimentIntensityAnalyzer()
    analyzer_bcast = spark.sparkContext.broadcast(analyzer)

    def sentiment_generator_impl(text):
        va = analyzer_bcast.value
        english = SpacyMagic.get('en_core_web_sm')
        result = english(text)
        sents = [str(sent) for sent in result.sents]
        sentiment = [va.polarity_scores(str(s)) for s in sents]
        return sentiment

    sentiment_generator = functions.udf(sentiment_generator_impl, sentiments_struct)

    def json_converter_impl(user_id, update_id, text, sentiments):
        obj = dict(user_id=user_id,
                   update_id=update_id,
                   text=text,
                   sentiments=sentiments)
        return json.dumps(obj)

    json_converter = functions.udf(json_converter_impl, types.StringType())

    records = (spark.readStream.format('kafka')
               .option('kafka.bootstrap.servers', args.brokers)
               .option('subscribe', args.intopic)
               .load()
               .select(functions.column('value').cast(types.StringType()).alias('value'))
               .select(functions.from_json(functions.column('value'), msg_struct).alias('json'))
               .select(functions.column('json.user_id'),
                       functions.column('json.update_id'),
                       functions.column('json.text'),
                       sentiment_generator(functions.column('json.text')).alias('sentiments'))
               .select(json_converter(functions.column('user_id'),
                                      functions.column('update_id'),
                                      functions.column('text'),
                                      functions.column('sentiments')).alias('value'))
               .writeStream.format('kafka')
               .option('kafka.bootstrap.servers', args.brokers)
               .option('topic', args.outtopic)
               .option('checkpointLocation', '/tmp')
               .start())

    records.awaitTermination()
def get_title_er_schema():
    return types.StructType([
        types.StructField('id', types.LongType(), nullable=False),
        types.StructField('O*NET-SOC Code', types.StringType(), nullable=False),
        types.StructField('Title', types.StringType(), nullable=False),
        # MapType requires explicit key and value types; string keys and values
        # are assumed here, since a bare types.MapType() call would raise.
        types.StructField('Alternate Title',
                          types.ArrayType(types.MapType(types.StringType(),
                                                        types.StringType())),
                          nullable=False),
    ])
def _rec_build_types(t):
    if type(t) == list:
        return T.ArrayType(_rec_build_types(t[0]))
    elif type(t) == dict:
        k, v = list(t.items())[0]
        return T.MapType(_rec_build_types(k), _rec_build_types(v), True)
    elif type(t) == tuple:
        return T.StructType([
            T.StructField("v_" + str(i), _rec_build_types(f), True)
            for i, f in enumerate(t)
        ])
    elif t in T._type_mappings:
        return T._type_mappings[t]()
    else:
        raise TypeError(repr(t) + " is not supported")
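# Hedged usage sketch for _rec_build_types above. It relies on pyspark's private
# T._type_mappings table, whose contents vary by version (recent releases map
# int -> LongType, float -> DoubleType, str -> StringType).
print(_rec_build_types({str: [int]}))
# roughly: MapType(StringType(), ArrayType(LongType(), True), True)
print(_rec_build_types((float, str)))
# roughly: StructType with fields v_0: DoubleType and v_1: StringType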
def get_cache_schema():
    return T.StructType([
        T.StructField("id", T.IntegerType(), False),
        T.StructField("target", T.StringType(), False),
        T.StructField("ip", T.StringType(), False),
        T.StructField("first_ever_request", T.TimestampType(), True),
        T.StructField("old_subset_count", T.IntegerType(), True),
        T.StructField("old_features", T.MapType(T.StringType(), T.DoubleType()), True),
        T.StructField("old_num_requests", T.IntegerType(), True),
        T.StructField("updated_at", T.TimestampType(), True)
    ])
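# Hedged usage sketch: materialise an empty cache DataFrame with the schema
# above, assuming an active SparkSession named `spark` is in scope.
empty_cache_df = spark.createDataFrame([], schema=get_cache_schema())
empty_cache_df.printSchema()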
def _initialize_results(self, scaffolds):
    data = [
        ps.Row(smiles=scaffold, scaffold=scaffold, decorations={}, count=1)
        for scaffold in scaffolds
    ]
    data_schema = pst.StructType([
        pst.StructField("smiles", pst.StringType()),
        pst.StructField("scaffold", pst.StringType()),
        pst.StructField("decorations", pst.MapType(pst.IntegerType(), pst.StringType())),
        pst.StructField("count", pst.IntegerType())
    ])
    return SPARK.createDataFrame(data, schema=data_schema)
def infer_complex_spark_type(typeclass):
    if typeclass.__origin__ in {list, List}:
        co_T, *_ = typeclass.__args__
        is_nullable, py_type = maybe_unlift_optional(co_T)
        return t.ArrayType(infer_spark_type(py_type), is_nullable)
    elif typeclass.__origin__ in {dict, Dict}:
        k_T, v_T, *_ = typeclass.__args__
        is_nullable_key, py_key_type = maybe_unlift_optional(k_T)
        is_nullable_value, py_value_type = maybe_unlift_optional(v_T)
        if is_nullable_key:
            raise TypeError(
                f"Nullable keys of type {py_key_type} are not allowed in {typeclass}")
        return t.MapType(infer_spark_type(py_key_type),
                         infer_spark_type(py_value_type),
                         is_nullable_value)
    else:
        raise TypeError(f"Don't know how to represent {typeclass} in Spark")
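# Hedged usage sketch for infer_complex_spark_type above, assuming the helper
# functions maybe_unlift_optional and infer_spark_type (not shown here) behave
# as their names suggest, e.g. mapping str to StringType and int to an integer
# Spark type.
from typing import Dict, List, Optional

infer_complex_spark_type(List[str])
# -> ArrayType(StringType(), containsNull=False)
infer_complex_spark_type(Dict[str, Optional[int]])
# -> MapType(StringType(), an integer type, valueContainsNull=True)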
def _transform(self, dataset):
    @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
    def cross(*values):
        if not values:
            return {}
        hashVector = defaultdict(float)
        for d1, d2 in combinations(values, 2):
            if not d1 or not d2:
                continue
            for (k1, v1), (k2, v2) in product(d1.items(), d2.items()):
                h = (k1 ^ k2)
                hashVector[h] += v1 * v2
        return dict(hashVector)

    return dataset.withColumn(self.getOutputCol(), cross(*dataset[self.getInputCols()]))
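# Illustration of the cross() hashing logic above, outside of Spark: for a pair
# of hashed-feature maps, keys are combined with XOR and values are multiplied,
# so {1: 2.0} crossed with {4: 3.0} yields {1 ^ 4: 2.0 * 3.0} == {5: 6.0}.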
def _transform(self, dataset):
    inputCol = self.getInputCol()
    dataType = dataset.schema[inputCol].dataType
    assert isinstance(dataType, T.ArrayType)
    assert isinstance(dataType.elementType, T.StringType)
    seed = _mh3(inputCol, seed=self.getSeed())

    @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
    def hash_(v):
        if not v:
            return {}
        hashVector = defaultdict(float)
        for x in v:
            h = _mh3(x, seed=seed)
            hashVector[h] += 1.
        return dict(hashVector)

    return dataset.withColumn(self.getOutputCol(), hash_(dataset[inputCol]))
def get_ret_type(value):
    if isinstance(value, float):
        return T.FloatType()
    if isinstance(value, int):
        return T.IntegerType()
    if isinstance(value, str):
        return T.StringType()
    if isinstance(value, list):
        if len(value) == 0:
            raise Exception(
                "Python Dataset Wrapper: Failed to parse return type for list")
        return T.ArrayType(DataSet.get_ret_type(value[0]))
    if isinstance(value, Mapping):
        if len(value) == 0:
            raise Exception(
                "Python Dataset Wrapper: Failed to parse return type for dict")
        x, y = value.popitem()
        return T.MapType(DataSet.get_ret_type(x), DataSet.get_ret_type(y))
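# Hedged usage sketch for DataSet.get_ret_type above. Note that the Mapping
# branch calls popitem(), so the sample value passed in is mutated.
DataSet.get_ret_type({"a": 1.0})   # -> MapType(StringType(), FloatType())
DataSet.get_ret_type([1, 2, 3])    # -> ArrayType(IntegerType())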
def test_maps_nested_subset(self):
    schema_has(
        T.MapType(
            T.StringType(),
            T.MapType(
                T.StringType(),
                T.StructType([
                    T.StructField('f1', T.MapType(T.StringType(), T.LongType())),
                    T.StructField('f2', T.MapType(T.StringType(), T.IntegerType())),
                ]),
            ),
        ),
        T.MapType(
            T.StringType(),
            T.MapType(
                T.StringType(),
                T.StructType([
                    T.StructField('f1', T.MapType(T.StringType(), T.LongType())),
                ]),
            ),
        ),
    )
def _join_results_single(self, scaffolds_df, sampled_df):
    def _join_scaffold(scaff, decs):
        mol = usc.join_joined_attachments(scaff, decs)
        if mol:
            return usc.to_smiles(mol)
    join_scaffold_udf = psf.udf(_join_scaffold, pst.StringType())

    def _create_decorations_map(decorations_smi, attachment_points):
        decorations = decorations_smi.split(usc.ATTACHMENT_SEPARATOR_TOKEN)
        return {
            idx: _cleanup_decoration(dec)
            for dec, idx in zip(decorations, attachment_points)
        }
    create_decorations_map_udf = psf.udf(
        _create_decorations_map, pst.MapType(pst.IntegerType(), pst.StringType()))

    return sampled_df.join(scaffolds_df, on="id")\
        .select(
            join_scaffold_udf("randomized_scaffold", "decoration_smi").alias("smiles"),
            create_decorations_map_udf("decoration_smi", "attachment_points").alias("decorations"),
            "scaffold")
def raw_spark_data_flow():
    # Register all the UDFs that will be needed
    get_xedk_udf = fun.udf(get_xedk, tp.IntegerType())
    get_rzdb_udf = fun.udf(get_rzdb, tp.IntegerType())
    get_jycs_udf = fun.udf(get_jycs, tp.IntegerType())
    get_smjj_udf = fun.udf(get_smjj, tp.IntegerType())
    get_wljd_udf = fun.udf(get_wljd, tp.IntegerType())
    get_xxjr_udf = fun.udf(get_xxjr, tp.IntegerType())
    get_zdgz_num_udf = fun.udf(get_zdgz_num, tp.IntegerType())
    get_cxjk_num_udf = fun.udf(get_cxjk_num, tp.IntegerType())
    get_change_info_2_udf = fun.udf(
        get_change_info_2, tp.MapType(tp.StringType(), tp.IntegerType()))
    get_change_info_udf = fun.udf(get_change_info, tp.IntegerType())

    # Read the raw input
    old_df = spark.read.parquet(
        "{path}/all_company_info/{version}".format(path=IN_PATH, version=OLD_VERSION)
    ).fillna({'city': u'无', 'county': u'无', 'province': u'无'})
    new_df = spark.read.parquet(
        "{path}/all_company_info/{version}".format(path=IN_PATH, version=NEW_VERSION)
    ).fillna({'city': u'无', 'county': u'无', 'province': u'无'})

    # Number of high-risk companies
    high_risk_count_df = new_df.select(
        'province', 'city', 'county', 'bbd_qyxx_id'
    ).where(
        new_df.risk_rank == u'高危预警'
    ).groupBy(
        ['province', 'city', 'county']
    ).count().withColumnRenamed(
        'count', 'high_risk_num'
    ).fillna({'city': u'无', 'county': u'无', 'province': u'无'}).cache()

    # Number of key-focus companies
    focus_on_count_df = new_df.select(
        'province', 'city', 'county', 'bbd_qyxx_id'
    ).where(
        new_df.risk_rank == u'重点关注'
    ).groupBy(
        ['province', 'city', 'county']
    ).count().withColumnRenamed(
        'count', 'focus_on_num'
    ).fillna({'city': u'无', 'county': u'无', 'province': u'无'}).cache()

    # Number of continuously monitored companies
    constantly_monitor_count_df = new_df.select(
        'province', 'city', 'county', 'bbd_qyxx_id'
    ).where(
        new_df.risk_rank == u'持续监控'
    ).groupBy(
        ['province', 'city', 'county']
    ).count().withColumnRenamed(
        'count', 'constantly_monitor_num'
    ).fillna({'city': u'无', 'county': u'无', 'province': u'无'}).cache()

    # Number of monitored companies
    supervise_count_df = new_df.select(
        'province', 'city', 'county', 'bbd_qyxx_id'
    ).groupBy(
        ['province', 'city', 'county']
    ).count().withColumnRenamed(
        'count', 'supervise_num'
    ).fillna({'city': u'无', 'county': u'无', 'province': u'无'}).cache()

    # Emerging finance, online lending, private funds, trading venues
    raw_types_num_df = new_df.select(
        'province', 'city', 'county', 'company_type'
    ).groupBy(['province', 'city', 'county', 'company_type']).count()
    tid_types_num_df = raw_types_num_df.select(
        'province', 'city', 'county',
        fun.concat_ws(':', 'company_type', 'count').alias('company_type_merge')
    ).groupBy(
        ['province', 'city', 'county']
    ).agg(
        {'company_type_merge': 'collect_list'}
    ).withColumnRenamed(
        'collect_list(company_type_merge)', 'company_type_merge'
    ).fillna({'city': u'无', 'county': u'无', 'province': u'无'}).cache()

    # Newly added / removed high-risk companies
    tmp_new_df = new_df.select(
        'province', 'city', 'county', 'bbd_qyxx_id', 'company_type', 'data_version'
    ).where(new_df.risk_rank == u'高危预警')
    tmp_old_df = old_df.select(
        'province', 'city', 'county', 'bbd_qyxx_id', 'company_type', 'data_version'
    ).where(old_df.risk_rank == u'高危预警')
    tmp_new_2_df = tmp_new_df.union(tmp_old_df).groupBy(
        ['province', 'city', 'county', 'bbd_qyxx_id']
    ).agg(
        {'data_version': 'collect_list'}
    ).select(
        'province', 'city', 'county', 'bbd_qyxx_id',
        'collect_list(data_version)',
        get_change_info_udf('collect_list(data_version)').alias('risk_change')
    ).groupBy(
        ['province', 'city', 'county']
    ).agg(
        {'risk_change': 'collect_list'}
    ).select(
        'province', 'city', 'county',
        get_change_info_2_udf('collect_list(risk_change)').alias('risk_change_num')
    )
    tmp_new_3_df = tmp_new_2_df.select(
        'province', 'city', 'county',
        tmp_new_2_df.risk_change_num.getItem('decline').alias('risk_decline_num'),
        tmp_new_2_df.risk_change_num.getItem('rise').alias('risk_rise_num')
    ).fillna({'city': u'无', 'county': u'无', 'province': u'无'}).cache()

    # Newly added / removed high-risk companies per industry:
    # emerging finance, online lending, private funds, trading venues,
    # financing guarantee, petty loans
    tmp_new_6_df = tmp_new_df.union(tmp_old_df).groupBy(
        ['province', 'city', 'county', 'bbd_qyxx_id', 'company_type']
    ).agg(
        {'data_version': 'collect_list'}
    ).select(
        'province', 'city', 'county', 'bbd_qyxx_id', 'company_type',
        'collect_list(data_version)',
        get_change_info_udf('collect_list(data_version)').alias('risk_change')
    ).groupBy(
        ['province', 'city', 'county', 'company_type']
    ).agg(
        {'risk_change': 'collect_list'}
    ).select(
        'province', 'city', 'county', 'company_type',
        get_change_info_2_udf('collect_list(risk_change)').alias('risk_change_num')
    )
    tmp_new_7_df = tmp_new_6_df.select(
        'province', 'city', 'county', 'company_type',
        tmp_new_6_df.risk_change_num.getItem('decline').alias('risk_decline_num'),
        tmp_new_6_df.risk_change_num.getItem('rise').alias('risk_rise_num')
    ).fillna({'city': u'无', 'county': u'无', 'province': u'无'}).cache()

    # Split by industry
    os.system("hadoop fs -rmr {path}".format(path=TMP_PATH))
    tmp_new_7_df.where(
        tmp_new_7_df.company_type == u'新兴金融'
    ).coalesce(10).write.parquet(
        "{path}/tmp_xxjr_change_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_7_df.where(
        tmp_new_7_df.company_type == u'网络借贷'
    ).coalesce(10).write.parquet(
        "{path}/tmp_wljd_change_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_7_df.where(
        tmp_new_7_df.company_type == u'私募基金'
    ).coalesce(10).write.parquet(
        "{path}/tmp_smjj_change_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_7_df.where(
        tmp_new_7_df.company_type == u'交易场所'
    ).coalesce(10).write.parquet(
        "{path}/tmp_jycs_change_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_7_df.where(
        tmp_new_7_df.company_type == u'融资担保'
    ).coalesce(10).write.parquet(
        "{path}/tmp_rzdb_change_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_7_df.where(
        tmp_new_7_df.company_type == u'小额贷款'
    ).coalesce(10).write.parquet(
        "{path}/tmp_xedk_change_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_xxjr_change_df = spark.read.parquet(
        "{path}/tmp_xxjr_change_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_wljd_change_df = spark.read.parquet(
        "{path}/tmp_wljd_change_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_smjj_change_df = spark.read.parquet(
        "{path}/tmp_smjj_change_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_jycs_change_df = spark.read.parquet(
        "{path}/tmp_jycs_change_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_rzdb_change_df = spark.read.parquet(
        "{path}/tmp_rzdb_change_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_xedk_change_df = spark.read.parquet(
        "{path}/tmp_xedk_change_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))

    # Changes in monitored companies
    tmp_new_df = new_df.select('province', 'city', 'county', 'bbd_qyxx_id',
                               'company_type', 'data_version')
    tmp_old_df = old_df.select('province', 'city', 'county', 'bbd_qyxx_id',
                               'company_type', 'data_version')
    tmp_new_4_df = tmp_new_df.union(tmp_old_df).groupBy(
        ['province', 'city', 'county', 'bbd_qyxx_id']
    ).agg(
        {'data_version': 'collect_list'}
    ).select(
        'province', 'city', 'county', 'bbd_qyxx_id',
        'collect_list(data_version)',
        get_change_info_udf('collect_list(data_version)').alias('risk_change')
    ).groupBy(
        ['province', 'city', 'county']
    ).agg(
        {'risk_change': 'collect_list'}
    ).select(
        'province', 'city', 'county',
        get_change_info_2_udf('collect_list(risk_change)').alias('risk_change_num')
    )
    tmp_new_5_df = tmp_new_4_df.select(
        'province', 'city', 'county',
        tmp_new_4_df.risk_change_num.getItem('decline').alias('all_decline_num'),
        tmp_new_4_df.risk_change_num.getItem('rise').alias('all_rise_num')
    ).fillna({'city': u'无', 'county': u'无', 'province': u'无'}).cache()

    # Changes in monitored companies per industry:
    # emerging finance, online lending, private funds, trading venues,
    # financing guarantee, petty loans
    tmp_new_8_df = tmp_new_df.union(tmp_old_df).groupBy(
        ['province', 'city', 'county', 'bbd_qyxx_id', 'company_type']
    ).agg(
        {'data_version': 'collect_list'}
    ).select(
        'province', 'city', 'county', 'bbd_qyxx_id', 'company_type',
        'collect_list(data_version)',
        get_change_info_udf('collect_list(data_version)').alias('risk_change')
    ).groupBy(
        ['province', 'city', 'county', 'company_type']
    ).agg(
        {'risk_change': 'collect_list'}
    ).select(
        'province', 'city', 'county', 'company_type',
        get_change_info_2_udf('collect_list(risk_change)').alias('risk_change_num')
    )
    tmp_new_9_df = tmp_new_8_df.select(
        'province', 'city', 'county', 'company_type',
        tmp_new_8_df.risk_change_num.getItem('decline').alias('all_decline_num'),
        tmp_new_8_df.risk_change_num.getItem('rise').alias('all_rise_num')
    ).fillna({'city': u'无', 'county': u'无', 'province': u'无'}).cache()

    # Split by industry
    tmp_new_9_df.where(
        tmp_new_9_df.company_type == u'新兴金融'
    ).coalesce(10).write.parquet(
        "{path}/tmp_xxjr_overall_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_9_df.where(
        tmp_new_9_df.company_type == u'网络借贷'
    ).coalesce(10).write.parquet(
        "{path}/tmp_wljd_overall_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_9_df.where(
        tmp_new_9_df.company_type == u'私募基金'
    ).coalesce(10).write.parquet(
        "{path}/tmp_smjj_overall_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_9_df.where(
        tmp_new_9_df.company_type == u'交易场所'
    ).coalesce(10).write.parquet(
        "{path}/tmp_jycs_overall_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_9_df.where(
        tmp_new_9_df.company_type == u'融资担保'
    ).coalesce(10).write.parquet(
        "{path}/tmp_rzdb_overall_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_9_df.where(
        tmp_new_9_df.company_type == u'小额贷款'
    ).coalesce(10).write.parquet(
        "{path}/tmp_xedk_overall_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_xxjr_overall_df = spark.read.parquet(
        "{path}/tmp_xxjr_overall_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_wljd_overall_df = spark.read.parquet(
        "{path}/tmp_wljd_overall_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_smjj_overall_df = spark.read.parquet(
        "{path}/tmp_smjj_overall_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_jycs_overall_df = spark.read.parquet(
        "{path}/tmp_jycs_overall_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_rzdb_overall_df = spark.read.parquet(
        "{path}/tmp_rzdb_overall_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_xedk_overall_df = spark.read.parquet(
        "{path}/tmp_xedk_overall_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))

    # Continuously monitored / key-focus companies per industry:
    # emerging finance, online lending, private funds, trading venues,
    # financing guarantee, petty loans
    tmp_new_10_df = new_df.select(
        'province', 'city', 'county', 'company_type', 'risk_rank'
    ).groupBy(
        ['province', 'city', 'county', 'company_type']
    ).agg(
        {'risk_rank': 'collect_list'}
    ).withColumnRenamed(
        'collect_list(risk_rank)', 'risk_rank'
    ).withColumn(
        'zdgz_num', get_zdgz_num_udf('risk_rank')
    ).withColumn(
        'cxjk_num', get_cxjk_num_udf('risk_rank')
    ).fillna({'city': u'无', 'county': u'无', 'province': u'无'}).cache()

    # Split by industry
    tmp_new_10_df.where(
        tmp_new_10_df.company_type == u'新兴金融'
    ).coalesce(10).write.parquet(
        "{path}/tmp_xxjr_monitoring_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_10_df.where(
        tmp_new_10_df.company_type == u'网络借贷'
    ).coalesce(10).write.parquet(
        "{path}/tmp_wljd_monitoring_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_10_df.where(
        tmp_new_10_df.company_type == u'私募基金'
    ).coalesce(10).write.parquet(
        "{path}/tmp_smjj_monitoring_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_10_df.where(
        tmp_new_10_df.company_type == u'交易场所'
    ).coalesce(10).write.parquet(
        "{path}/tmp_jycs_monitoring_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_10_df.where(
        tmp_new_10_df.company_type == u'融资担保'
    ).coalesce(10).write.parquet(
        "{path}/tmp_rzdb_monitoring_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_10_df.where(
        tmp_new_10_df.company_type == u'小额贷款'
    ).coalesce(10).write.parquet(
        "{path}/tmp_xedk_monitoring_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_xxjr_monitoring_df = spark.read.parquet(
        "{path}/tmp_xxjr_monitoring_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_wljd_monitoring_df = spark.read.parquet(
        "{path}/tmp_wljd_monitoring_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_smjj_monitoring_df = spark.read.parquet(
        "{path}/tmp_smjj_monitoring_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_jycs_monitoring_df = spark.read.parquet(
        "{path}/tmp_jycs_monitoring_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_rzdb_monitoring_df = spark.read.parquet(
        "{path}/tmp_rzdb_monitoring_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_xedk_monitoring_df = spark.read.parquet(
        "{path}/tmp_xedk_monitoring_df/{version}".format(path=TMP_PATH, version=NEW_VERSION))

    # Combine all the fields
    tid_new_df = new_df.dropDuplicates(
        ['province', 'city', 'county']
    ).join(
        high_risk_count_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        focus_on_count_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        constantly_monitor_count_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        supervise_count_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        tid_types_num_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        tmp_new_3_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        tmp_new_5_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        tmp_xxjr_change_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        tmp_wljd_change_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        tmp_smjj_change_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        tmp_jycs_change_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        tmp_rzdb_change_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        tmp_xedk_change_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        tmp_xxjr_overall_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        tmp_wljd_overall_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        tmp_smjj_overall_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        tmp_jycs_overall_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        tmp_rzdb_overall_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        tmp_xedk_overall_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        tmp_xxjr_monitoring_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        tmp_wljd_monitoring_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        tmp_smjj_monitoring_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        tmp_jycs_monitoring_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        tmp_rzdb_monitoring_df, ['province', 'city', 'county'], 'left_outer'
    ).join(
        tmp_xedk_monitoring_df, ['province', 'city', 'county'], 'left_outer'
    ).select(
        new_df.province,
        new_df.city,
        new_df.county,
        high_risk_count_df.high_risk_num,
        focus_on_count_df.focus_on_num,
        constantly_monitor_count_df.constantly_monitor_num,
        supervise_count_df.supervise_num,
        get_xxjr_udf(tid_types_num_df.company_type_merge).alias('xxjr'),
        get_smjj_udf(tid_types_num_df.company_type_merge).alias('smjj'),
        get_wljd_udf(tid_types_num_df.company_type_merge).alias('wljd'),
        get_jycs_udf(tid_types_num_df.company_type_merge).alias('jycs'),
        get_rzdb_udf(tid_types_num_df.company_type_merge).alias('rzdb'),
        get_xedk_udf(tid_types_num_df.company_type_merge).alias('xedk'),
        tmp_new_3_df.risk_decline_num,
        tmp_new_3_df.risk_rise_num,
        tmp_new_5_df.all_decline_num,
        tmp_new_5_df.all_rise_num,
        tmp_xxjr_change_df.risk_decline_num.alias('other_lessen_high_risk'),
        tmp_xxjr_change_df.risk_rise_num.alias('other_add_high_risk'),
        tmp_wljd_change_df.risk_decline_num.alias('net_lessen_high_risk'),
        tmp_wljd_change_df.risk_rise_num.alias('net_add_high_risk'),
        tmp_smjj_change_df.risk_decline_num.alias('private_fund_lessen_high_risk'),
        tmp_smjj_change_df.risk_rise_num.alias('private_fund_add_high_risk'),
        tmp_jycs_change_df.risk_decline_num.alias('trade_place_lessen_high_risk'),
        tmp_jycs_change_df.risk_rise_num.alias('trade_place_add_high_risk'),
        tmp_rzdb_change_df.risk_decline_num.alias('financing_guarantee_lessen_high_risk'),
        tmp_rzdb_change_df.risk_rise_num.alias('financing_guarantee_add_high_risk'),
        tmp_xedk_change_df.risk_decline_num.alias('petty_loan_lessen_high_risk'),
        tmp_xedk_change_df.risk_rise_num.alias('petty_loan_add_high_risk'),
        tmp_xxjr_overall_df.all_decline_num.alias('other_lessen_monitor'),
        tmp_xxjr_overall_df.all_rise_num.alias('other_add_monitor'),
        tmp_wljd_overall_df.all_decline_num.alias('net_lessen_monitor'),
        tmp_wljd_overall_df.all_rise_num.alias('net_add_monitor'),
        tmp_smjj_overall_df.all_decline_num.alias('private_fund_lessen_monitor'),
        tmp_smjj_overall_df.all_rise_num.alias('private_fund_add_monitor'),
        tmp_jycs_overall_df.all_decline_num.alias('trade_place_lessen_monitor'),
        tmp_jycs_overall_df.all_rise_num.alias('trade_place_add_monitor'),
        tmp_rzdb_overall_df.all_decline_num.alias('financing_guarantee_lessen_monitor'),
        tmp_rzdb_overall_df.all_rise_num.alias('financing_guarantee_add_monitor'),
        tmp_xedk_overall_df.all_decline_num.alias('petty_loan_lessen_monitor'),
        tmp_xedk_overall_df.all_rise_num.alias('petty_loan_add_monitor'),
        tmp_xxjr_monitoring_df.zdgz_num.alias('other_focus_on'),
        tmp_xxjr_monitoring_df.cxjk_num.alias('other_sustain_monitor'),
        tmp_wljd_monitoring_df.zdgz_num.alias('net_focus_on'),
        tmp_wljd_monitoring_df.cxjk_num.alias('net_sustain_monitor'),
        tmp_smjj_monitoring_df.zdgz_num.alias('private_fund_focus_on'),
        tmp_smjj_monitoring_df.cxjk_num.alias('private_fund_sustain_monitor'),
        tmp_jycs_monitoring_df.zdgz_num.alias('trade_place_focus_on'),
        tmp_jycs_monitoring_df.cxjk_num.alias('trade_place_sustain_monitor'),
        tmp_rzdb_monitoring_df.zdgz_num.alias('financing_guarantee_focus_on'),
        tmp_rzdb_monitoring_df.cxjk_num.alias('financing_guarantee_sustain_monitor'),
        tmp_xedk_monitoring_df.zdgz_num.alias('petty_loan_focus_on'),
        tmp_xedk_monitoring_df.cxjk_num.alias('petty_loan_sustain_monitor'),
        fun.current_timestamp().alias('gmt_create'),
        fun.current_timestamp().alias('gmt_update')
    ).cache()

    return tid_new_df
import sys
import numpy as np

assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+

from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession, functions, types
from pyspark.sql.functions import lower
from pyspark.ml import Pipeline
# add more functions as necessary

schema = types.StructType([
    types.StructField('lat', types.DoubleType(), nullable=False),
    types.StructField('lon', types.DoubleType(), nullable=False),
    types.StructField('timestamp', types.TimestampType(), nullable=False),
    types.StructField('amenity', types.StringType(), nullable=False),
    types.StructField('name', types.StringType(), nullable=True),
    types.StructField('tags', types.MapType(types.StringType(), types.StringType()),
                      nullable=False),
])

entertainments = [
    'arts_centre', 'bistro', 'nightclub', 'bbq', 'car_rental', 'leisure',
    'park', 'restaurant', 'bar', 'casino', 'gambling', 'cafe', 'theatre',
    'stripclub', 'pub'
]


def main():
    # main logic starts here
    data = spark.read.json("../amenities-vancouver.json.gzip", schema=schema)
    entertainments_data = data.filter(data.amenity.isin(entertainments))
    # entertainments_data.show()
    entertainments_data.write.json("../entertainments-vancouver",
    T.StructField('split_name', T.StringType(), nullable=False),
    T.StructField('path', T.StringType(), nullable=False),
    T.StructField('fold_id', T.IntegerType(), nullable=False),
])

ModelParameters = T.StructType([
    T.StructField('run_id', T.StringType(), nullable=False),
    T.StructField('parent_run_id', T.StringType(), nullable=True),
    T.StructField('wikiid', T.StringType(), nullable=False),
    T.StructField('started_at', T.TimestampType(), nullable=False),
    T.StructField('completed_at', T.TimestampType(), nullable=False),
    T.StructField('algorithm', T.StringType(), nullable=False),
    T.StructField('objective', T.StringType(), nullable=False),
    T.StructField('loss', T.DoubleType(), nullable=False),
    T.StructField('params', T.MapType(T.StringType(), T.StringType(), False),
                  nullable=False),
    T.StructField('folds', T.ArrayType(TrainingFiles), nullable=False),
    T.StructField(
        'metrics',
        T.ArrayType(
            T.StructType([
                T.StructField('key', T.StringType(), nullable=False),
                T.StructField('value', T.DoubleType(), nullable=False),
                T.StructField('step', T.IntegerType(), nullable=False),
                T.StructField('fold_id', T.IntegerType(), nullable=False),
                T.StructField('split', T.StringType(), nullable=False),
            ]))),
    T.StructField('artifacts', T.MapType(T.StringType(), T.StringType(), False),
                  nullable=False),
)

keyvalue_msg_schema = types.StructType(
    [
        types.StructField("key", types.StringType()),
        types.StructField("value", types.IntegerType()),
    ]
)

map_msg_schema = types.StructType(
    [
        types.StructField(
            "repeated_keyvalue_field", types.ArrayType(keyvalue_msg_schema)
        ),
        types.StructField(
            "map_field", types.MapType(types.StringType(), types.IntegerType())
        ),
    ]
)

# TODO: revise fake test
pb_duration_schema = proto3_message_type_to_spark_schema(Duration)
pb_timestamp_schema = proto3_message_type_to_spark_schema(Timestamp)

time_msg_schema = types.StructType(
    [
        types.StructField("start", pb_timestamp_schema),
        types.StructField("end", pb_timestamp_schema),
        types.StructField("duration", pb_duration_schema),
    ]
)
def spark_data_flow():
    json_to_obj_udf = fun.udf(json_to_obj,
                              tp.MapType(tp.StringType(), tp.FloatType()))
    get_float_udf = fun.udf(get_float, tp.FloatType())
    get_claim_transfer_udf = fun.udf(partial(get_keyword, u'不可转让'), tp.FloatType())
    get_bank_custody_udf = fun.udf(partial(get_keyword, u'无存管'), tp.FloatType())
    get_risk_reserve_udf = fun.udf(partial(get_keyword, u'无存管'), tp.FloatType())
    get_unique_string_udf = fun.udf(get_unique_string, tp.StringType())

    raw_wdzj_df = spark.sql('''
        SELECT
            bbd_qyxx_id,
            company_name,
            platform_name,
            automatic_bidding,
            claim_transfer,
            bank_custody,
            platform_state,
            risk_reserve
        FROM dw.qyxg_wdzj
        WHERE dt='{version}'
    '''.format(version=WDZJ_VERSION))
    tid_wdzj_df = raw_wdzj_df.select(
        'bbd_qyxx_id',
        'company_name',
        'platform_name',
        'platform_state',
        get_claim_transfer_udf('claim_transfer').alias('p2p_feature_18'),
        get_bank_custody_udf('bank_custody').alias('p2p_feature_19'),
        get_risk_reserve_udf('risk_reserve').alias('p2p_feature_20'))

    platform_df = spark.sql('''
        SELECT
            bbd_qyxx_id
            ,company_name
            ,platform_name
            ,platform_state
            ,regcap
            ,per_lending_amount
            ,avg_soldout_time
            ,total_num_of_lender
            ,total_turnover
            ,total_deal_volume
            ,monthly_deal_data
            ,per_lending_num
            ,avg_lend_time
            ,per_borrowing_num
            ,loan_balance
            ,per_borrowing_amount
            ,borrowing_dispersion
            ,total_num_of_borrower
        FROM dw.qyxg_platform_data
        WHERE dt = '{version}'
    '''.format(version=PLATFORM_VERSION))
    tid_platform_df = platform_df.select(
        'bbd_qyxx_id',
        'company_name',
        'platform_name',
        'platform_state',
        json_to_obj_udf('monthly_deal_data').getItem('turnover').alias('p2p_feature_1'),
        json_to_obj_udf('monthly_deal_data').getItem('num_of_lender').alias('p2p_feature_2'),
        fun.when(platform_df.platform_state == u'异常', 0)
           .when(platform_df.platform_state == u'正常', 2)
           .otherwise(1).alias('p2p_feature_3'),
        get_float_udf('borrowing_dispersion').alias('p2p_feature_4'),
        get_float_udf('per_lending_amount').alias('p2p_feature_5'),
        get_float_udf('regcap').alias('p2p_feature_6'),
        get_float_udf('avg_soldout_time').alias('p2p_feature_7'),
        get_float_udf('total_num_of_lender').alias('p2p_feature_8'),
        get_float_udf('total_turnover').alias('p2p_feature_9'),
        get_float_udf('total_deal_volume').alias('p2p_feature_10'),
        get_float_udf('per_lending_num').alias('p2p_feature_11'),
        get_float_udf('avg_lend_time').alias('p2p_feature_12'),
        get_float_udf('per_borrowing_num').alias('p2p_feature_13'),
        get_float_udf('per_borrowing_amount').alias('p2p_feature_14'),
        get_float_udf('loan_balance').alias('p2p_feature_15'),
        json_to_obj_udf('monthly_deal_data').getItem('nominal_interest_rate').alias('p2p_feature_16'),
        get_float_udf('total_num_of_borrower').alias('p2p_feature_17'))

    prd_platform_df = tid_platform_df.join(
        tid_wdzj_df,
        [tid_platform_df.platform_name == tid_wdzj_df.platform_name,
         tid_platform_df.company_name == tid_wdzj_df.company_name],
        'outer'
    ).select(
        get_unique_string_udf(tid_platform_df.bbd_qyxx_id,
                              tid_wdzj_df.bbd_qyxx_id).alias('bbd_qyxx_id'),
        get_unique_string_udf(tid_platform_df.company_name,
                              tid_wdzj_df.company_name).alias('company_name'),
        get_unique_string_udf(tid_platform_df.platform_name,
                              tid_wdzj_df.platform_name).alias('platform_name'),
        get_unique_string_udf(tid_platform_df.platform_state,
                              tid_wdzj_df.platform_state).alias('platform_state'),
        'p2p_feature_1', 'p2p_feature_2', 'p2p_feature_3', 'p2p_feature_4',
        'p2p_feature_5', 'p2p_feature_6', 'p2p_feature_7', 'p2p_feature_8',
        'p2p_feature_9', 'p2p_feature_10', 'p2p_feature_11', 'p2p_feature_12',
        'p2p_feature_13', 'p2p_feature_14', 'p2p_feature_15', 'p2p_feature_16',
        'p2p_feature_17',
        tid_wdzj_df.p2p_feature_18,
        tid_wdzj_df.p2p_feature_19,
        tid_wdzj_df.p2p_feature_20
    ).dropDuplicates(
        ['bbd_qyxx_id', 'platform_name']
    ).fillna(0.)

    return prd_platform_df
            when(F.col("naics_code").isin(722511), "full_service_restaurants").\
            when(F.col("naics_code").isin(722513), "limited_service_restaurants").\
            when(F.col("naics_code").isin(446110, 446191), "pharmacies_and_drug_stores").\
            when(F.col("naics_code").isin(311811, 722515), "snack_and_bakeries").\
            when(F.col("naics_code").isin(445210, 445220, 445230, 445291, 445292, 445299), "specialty_food_stores").\
            when(F.col("naics_code").isin(445110), "supermarkets_except_convenience_stores")).\
        select("placekey", "safegraph_place_id", "naics_code", "file_name")


def explodeVisits(date_range_start, visit_by_day):
    start = datetime.datetime(*map(int, date_range_start[:10].split('-')))
    return {(start + datetime.timedelta(days=days)): visits
            for days, visits in enumerate(json.loads(visit_by_day))}


# Credit to the professor, I leverage this piece of code from class
udfExpand = F.udf(explodeVisits, T.MapType(DateType(), T.IntegerType()))

df = spark.read.csv("hdfs:///data/share/bdm/weekly-patterns-nyc-2019-2020/*", header=True) \
    .select("placekey", "safegraph_place_id",
            F.explode(udfExpand('date_range_start', 'visits_by_day'))
             .alias('date', "visits"))
#   .where(f"date=='{date}'")


# Credit to the professor, I leverage this piece of code from class
def find_median(values_list):
    try:
        median = np.median(values_list)
        return round(float(median), 2)
    except Exception:
        return None
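# Hedged illustration of explodeVisits above in plain Python (outside Spark),
# assuming the function and its datetime/json imports are in scope:
explodeVisits("2020-03-01T00:00:00-04:00", "[5, 7, 3]")
# -> {datetime(2020, 3, 1): 5, datetime(2020, 3, 2): 7, datetime(2020, 3, 3): 3}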
def ibis_map_dtype_to_spark_dtype(ibis_dtype_obj):
    key_type = spark_dtype(ibis_dtype_obj.key_type)
    value_type = spark_dtype(ibis_dtype_obj.value_type)
    value_contains_null = ibis_dtype_obj.value_type.nullable
    return pt.MapType(key_type, value_type, value_contains_null)
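# Hedged usage sketch for ibis_map_dtype_to_spark_dtype above, assuming the
# ibis datatype API (ibis.expr.datatypes) where a map dtype is constructed as
# dt.Map(key_type, value_type); the call below is illustrative only.
import ibis.expr.datatypes as dt

ibis_map_dtype_to_spark_dtype(dt.Map(dt.string, dt.int64))
# -> MapType(StringType(), LongType(), True)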
def run_job(spark_context, sql_context, submission_date_range, use_test_data=False):
    """
    Compute crash aggregates for the specified submission date range,
    and upload the result to S3.
    """
    start_date = datetime.strptime(submission_date_range[0], "%Y%m%d").date()
    end_date = datetime.strptime(submission_date_range[1], "%Y%m%d").date()

    schema = types.StructType([
        types.StructField("activity_date", types.StringType(), nullable=False),
        types.StructField("dimensions",
                          types.MapType(types.StringType(), types.StringType(), True),
                          nullable=False),
        types.StructField("stats",
                          types.MapType(types.StringType(), types.DoubleType(), True),
                          nullable=False),
    ])

    current_date = start_date
    while current_date <= end_date:
        # useful statements for testing the program
        if use_test_data:
            # use test pings; very good for debugging the uploading process
            sys.path.append(
                os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "test"))
            import dataset
            pings = spark_context.parallelize(list(dataset.generate_pings()))
        else:
            pings = retrieve_crash_data(spark_context,
                                        current_date.strftime("%Y%m%d"),
                                        COMPARABLE_DIMENSIONS, FRACTION)

        (result, main_processed_count, main_ignored_count,
         crash_processed_count, crash_ignored_count) = compare_crashes(
            spark_context, pings, COMPARABLE_DIMENSIONS, DIMENSION_NAMES)
        result = result.coalesce(1)  # put everything into a single partition
        df = sql_context.createDataFrame(result, schema)
        print("SUCCESSFULLY COMPUTED CRASH AGGREGATES FOR {}".format(current_date))

        # upload the dataframe as Parquet to S3
        s3_result_url = (
            "s3n://telemetry-parquet/crash_aggregates/v1/submission_date={}".format(
                current_date))
        df.write.parquet(s3_result_url)

        print("SUCCESSFULLY UPLOADED CRASH AGGREGATES FOR {} TO S3:".format(current_date))
        print("{} main pings processed, {} main pings ignored".format(
            main_processed_count.value, main_ignored_count.value))
        print("{} crash pings processed, {} crash pings ignored".format(
            crash_processed_count.value, crash_ignored_count.value))

        current_date += timedelta(days=1)

    print("========================================")
    print("JOB COMPLETED SUCCESSFULLY")
    print("========================================")
class PA2Data(object):
    review_schema = T.StructType([
        T.StructField('reviewerID', T.StringType(), False),
        T.StructField('asin', T.StringType(), False),
        T.StructField('overall', T.FloatType(), False)
    ])
    product_schema = T.StructType([
        T.StructField('asin', T.StringType()),
        T.StructField('salesRank', T.StringType()),
        T.StructField('categories', T.StringType()),
        T.StructField('title', T.StringType()),
        T.StructField('price', T.FloatType()),
        T.StructField('related', T.StringType())
    ])
    product_processed_schema = T.StructType([
        T.StructField('asin', T.StringType()),
        T.StructField('title', T.StringType()),
        T.StructField('category', T.StringType())
    ])
    salesRank_schema = T.MapType(T.StringType(), T.IntegerType())
    categories_schema = T.ArrayType(T.ArrayType(T.StringType()))
    related_schema = T.MapType(T.StringType(), T.ArrayType(T.StringType()))
    schema = {
        'review': review_schema,
        'product': product_schema,
        'product_processed': product_processed_schema
    }
    metadata_schema = {
        'salesRank': salesRank_schema,
        'categories': categories_schema,
        'related': related_schema
    }

    def __init__(self, spark, path_dict, output_root, deploy,
                 input_format='dataframe'):
        self.spark = spark
        self.path_dict = path_dict
        self.output_root = output_root
        self.deploy = deploy
        self.input_format = input_format

    def load(self, name, path, infer_schema=False):
        if name in ['ml_features_train', 'ml_features_test']:
            data = self.spark.read.parquet(path)
        else:
            schema = self.schema[name] if not infer_schema else None
            data = self.spark.read.csv(path,
                                       schema=schema,
                                       escape='"',
                                       quote='"',
                                       inferSchema=infer_schema,
                                       header=True)
        if name == 'product':
            for column, column_schema in self.metadata_schema.items():
                if column in data.columns:
                    data = data.withColumn(
                        column, F.from_json(F.col(column), column_schema))
        return data

    def load_all(self, input_format='dataframe', no_cache=False):
        self.input_format = input_format
        print("Loading datasets ...", end='')  # noqa
        data_dict = {}
        count_dict = {}
        for name, path in self.path_dict.items():
            data = self.load(name, path)
            if input_format == 'rdd':
                data = data.rdd
            elif input_format == 'koalas':
                data = data.to_koalas()
            if self.deploy and not no_cache:
                data = data.cache()
            data_dict[name] = data
            count_dict[name] = data.count() if not no_cache else None
        print("Done")
        return data_dict, count_dict

    def cache_switch(self, data_dict, part):
        count_dict = {}
        if self.input_format == 'koalas':
            print('cache_switch() has no effect on Koalas')
        else:
            part_1_data = ['product', 'review', 'product_processed']
            part_2_data = ['ml_features_train', 'ml_features_test']
            if part == 'part_1':
                data_dict, count_dict = self.switch(data_dict, part_1_data, part_2_data)
            elif part == 'part_2':
                data_dict, count_dict = self.switch(data_dict, part_2_data, part_1_data)
            else:
                raise ValueError
        return data_dict, count_dict

    def switch(self, data_dict, to_persist, to_unpersist):
        count_dict = {}
        for name in to_unpersist:
            try:
                data_dict[name].unpersist()
            except Exception:
                pass
        for name in to_persist:
            data_dict[name] = data_dict[name].cache()
            count_dict[name] = data_dict[name].count()
        return data_dict, count_dict

    def save(self, res, task_name, filename=None):
        if task_name in TASK_NAMES or task_name in ['task_0', 'summary']:
            if not filename:
                filename = task_name
            if isinstance(res, Mapping):
                df = self.spark.createDataFrame([res])
            else:
                df = self.spark.createDataFrame(res)
            output_path = 'file://' + os.path.join(self.output_root, filename + EXT)
            df.coalesce(1).write.mode('overwrite').json(output_path)
        else:
            raise ValueError
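# Hedged sketch of how the MapType metadata schemas above are applied: from_json
# parses the JSON-encoded salesRank string into a map column. Assumes an active
# SparkSession named `spark` is in scope.
sample = spark.createDataFrame([('{"Books": 123}',)], ['salesRank'])
sample = sample.withColumn(
    'salesRank', F.from_json(F.col('salesRank'), PA2Data.salesRank_schema))
sample.printSchema()  # salesRank: map of string to int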
def spark_data_flow(smjj_version):
    # Private fund (simu) information
    smjj_df = spark.sql(
        '''
        SELECT *
        FROM dw.qyxg_jijin_simu
        WHERE dt = '{version}'
        '''.format(version=smjj_version)
    )
    tid_df = smjj_df.select(
        'bbd_qyxx_id',
        smjj_df.fund_manager_chinese.alias('company_name'),
        'managed_fund_type',
        'pic_millon',
        'regcap_paidpro',
        'law_firm',
        'no_qualification',
        'employees',
        'entitled_way',
        'ifcareer_qualification',
        'vip_type',
        'interim_after_fund',
        'interim_before_fund',
        'integrity_info',
        'special_message',
        'legal_opinion'
    ).dropDuplicates(
        ['company_name']
    ).cache()

    udf_return_type = tp.FloatType()
    prd_df = tid_df.select(
        'bbd_qyxx_id',
        'company_name',
        SparkUdf.define_spark_udf(
            1, udf_return_type)('managed_fund_type').alias('pe_feature_1'),
        SparkUdf.define_spark_udf(
            2, udf_return_type)('pic_millon').alias('pe_feature_2'),
        SparkUdf.define_spark_udf(
            3, udf_return_type)('regcap_paidpro').alias('pe_feature_3'),
        SparkUdf.define_spark_udf(
            4, udf_return_type)('law_firm').alias('pe_feature_4'),
        SparkUdf.define_spark_udf(
            5, udf_return_type)('no_qualification').alias('pe_feature_5'),
        SparkUdf.define_spark_udf(
            6, udf_return_type)('employees').alias('pe_feature_6'),
        SparkUdf.define_spark_udf(
            7, udf_return_type)('entitled_way').alias('pe_feature_7'),
        SparkUdf.define_spark_udf(
            8, udf_return_type)('ifcareer_qualification').alias('pe_feature_8'),
        SparkUdf.define_spark_udf(
            9, udf_return_type)('vip_type').alias('pe_feature_9'),
        SparkUdf.define_spark_udf(
            10, udf_return_type)('interim_after_fund').alias('pe_feature_10'),
        SparkUdf.define_spark_udf(
            11, udf_return_type)('interim_before_fund').alias('pe_feature_11'),
        SparkUdf.define_spark_udf(
            12, tp.MapType(tp.StringType(), tp.StringType())
        )('integrity_info').alias('pe_feature_12'),
        SparkUdf.define_spark_udf(
            13, tp.MapType(tp.StringType(), tp.StringType())
        )('integrity_info', 'special_message').alias('pe_feature_13'),
        SparkUdf.define_spark_udf(
            14, udf_return_type)('legal_opinion').alias('pe_feature_14')
    )
    return prd_df