def _transform(self, dataset):
        inputCol = self.getInputCol()
        dataType = dataset.schema[inputCol].dataType
        assert isinstance(dataType, T.MapType)
        assert isinstance(dataType.keyType, T.StringType)
        assert isinstance(dataType.valueType, (T.NumericType, T.StringType))
        seed = _mh3(inputCol, seed=self.getSeed())

        @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
        def hashNumeric(featureMap):
            if not featureMap:
                return {}
            hashVector = defaultdict(float)
            for key, value in featureMap.items():
                h = _mh3(key, seed=seed)
                hashVector[h] += value
            return dict(hashVector)

        @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
        def hashString(featureMap):
            if not featureMap:
                return {}
            hashVector = defaultdict(float)
            for key, value in featureMap.items():
                h = _mh3(value, seed=_mh3(key, seed=seed))
                hashVector[h] += 1.
            return dict(hashVector)

        if isinstance(dataType.valueType, T.NumericType):
            return dataset.withColumn(self.getOutputCol(),
                                      hashNumeric(dataset[inputCol]))
        else:
            return dataset.withColumn(self.getOutputCol(),
                                      hashString(dataset[inputCol]))
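The `_mh3` helper used throughout these hashing transformers is not shown in the excerpt; a minimal sketch of what it might be, assuming it wraps MurmurHash3 from the `mmh3` package (whose signed 32-bit output matches the `IntegerType` keys declared on the UDFs):

import mmh3

def _mh3(value, seed=0):
    # Hypothetical helper: hash any value to a signed 32-bit bucket id.
    # The real implementation in the source project may differ.
    return mmh3.hash(str(value), seed)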
Example #2
    def test_update(self):
        denominator = FeatureRequestTotal.feature_name_from_class()
        numerator = FeatureResponse5xxTotal.feature_name_from_class()
        schema = T.StructType([
            T.StructField(self.feature.current_features_column,
                          T.MapType(T.StringType(), T.FloatType())),
            T.StructField(self.feature.past_features_column,
                          T.MapType(T.StringType(), T.FloatType())),
        ])

        sub_df = self.session.createDataFrame([{
            self.feature.current_features_column: {
                self.feature.feature_name: 1.,
                numerator: 2.,
                denominator: 1.,
            },
            self.feature.past_features_column: {
                self.feature.feature_name: 1.,
                numerator: 4.,
                denominator: 2.,
            }
        }],
                                              schema=schema)
        result_df = self.feature.update(sub_df)

        result_df.show()
        value = result_df.select(
            self.feature.updated_feature_col_name).collect()[0][
                self.feature.updated_feature_col_name]
        expected_value = 2.
        self.assertAlmostEqual(value, expected_value, places=2)
Example #3
    def test_update(self):
        count_col = FeatureRequestTotal.feature_name_from_class()
        mean_col = FeaturePathDepthAverage.feature_name_from_class()
        schema = T.StructType([
            T.StructField(self.feature.current_features_column,
                          T.MapType(T.StringType(), T.FloatType())),
            T.StructField(self.feature.past_features_column,
                          T.MapType(T.StringType(), T.FloatType())),
        ])

        sub_df = self.session.createDataFrame([{
            self.feature.current_features_column: {
                self.feature.feature_name: 6.,
                count_col: 3.,
                mean_col: 5.,
            },
            self.feature.past_features_column: {
                self.feature.feature_name: 2.,
                count_col: 1.,
                mean_col: 4.,
            }
        }],
                                              schema=schema)
        result_df = self.feature.update(sub_df)

        result_df.show()
        value = result_df.select(
            self.feature.updated_feature_col_name).collect()[0][
                self.feature.updated_feature_col_name]
        from baskerville.features.helpers import update_variance
        expected_value = update_variance(2., 6., 1., 3., 4., 5.)
        print(expected_value)
        self.assertAlmostEqual(value, expected_value, places=2)
Example #4
    def _transform(self, dataset):
        inputCol = self.getInputCol()
        dataType = dataset.schema[inputCol].dataType
        assert isinstance(dataType,
                          (T.BooleanType, T.NumericType, T.StringType))
        seed = _mh3(inputCol, seed=self.getSeed())

        @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
        def hashNumeric(v):
            if not v:
                return {}
            return {seed: float(v)}

        @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
        def hashString(v):
            if not v:
                return {}
            return {_mh3(v, seed=seed): 1.}

        if isinstance(dataType, (T.BooleanType, T.NumericType)):
            return dataset.withColumn(self.getOutputCol(),
                                      hashNumeric(dataset[inputCol]))
        else:
            return dataset.withColumn(self.getOutputCol(),
                                      hashString(dataset[inputCol]))
Example #5
    def test_update(self):
        schema = T.StructType([
            T.StructField(self.feature.current_features_column,
                          T.MapType(T.StringType(), T.FloatType())),
            T.StructField(self.feature.past_features_column,
                          T.MapType(T.StringType(), T.FloatType())),
        ])

        sub_df = self.session.createDataFrame([{
            self.feature.current_features_column: {
                self.feature.feature_name: 1.,
            },
            self.feature.past_features_column: {
                self.feature.feature_name: 2.,
            }
        }],
                                              schema=schema)
        result_df = self.feature.update(sub_df)

        result_df.show()
        value = result_df.select(
            self.feature.updated_feature_col_name).collect()[0][
                self.feature.updated_feature_col_name]
        expected_value = 3.
        self.assertAlmostEqual(value, expected_value, places=2)
Example #6
    def test_type_mismatch(self):
        with six.assertRaisesRegex(self, AssertionError, 'Cannot compare heterogeneous types'):
            schema_has(
                T.StructType([T.StructField('f1', T.IntegerType())]),
                T.ArrayType(T.IntegerType()),
            )

        with six.assertRaisesRegex(self, AssertionError, 'Cannot compare heterogeneous types'):
            schema_has(
                T.ArrayType(T.IntegerType()),
                {'f1': T.IntegerType()},
            )

        with six.assertRaisesRegex(self, TypeError, 'f1 is IntegerType, expected LongType'):
            schema_has(
                T.StructType([T.StructField('f1', T.IntegerType())]),
                T.StructType([T.StructField('f1', T.LongType())]),
            )

        with six.assertRaisesRegex(
                self,
                TypeError,
                r'f1\.element\.s1 is IntegerType, expected LongType',
        ):
            schema_has(
                T.StructType([
                    T.StructField(
                        'f1',
                        T.ArrayType(T.StructType([T.StructField('s1', T.IntegerType())])),
                    ),
                ]),
                T.StructType([
                    T.StructField(
                        'f1',
                        T.ArrayType(T.StructType([T.StructField('s1', T.LongType())])),
                    ),
                ]),
            )

        with six.assertRaisesRegex(self, TypeError, 'element is IntegerType, expected LongType'):
            schema_has(
                T.ArrayType(T.IntegerType()),
                T.ArrayType(T.LongType()),
            )

        with six.assertRaisesRegex(self, TypeError, 'key is StringType, expected LongType'):
            schema_has(
                T.MapType(T.StringType(), T.IntegerType()),
                T.MapType(T.LongType(), T.IntegerType()),
            )

        with six.assertRaisesRegex(self, TypeError, 'value is IntegerType, expected LongType'):
            schema_has(
                T.MapType(T.StringType(), T.IntegerType()),
                T.MapType(T.StringType(), T.LongType()),
            )
Example #7
def _proto3_field_to_spark_data_type(field_desc: FieldDescriptor) -> DataType:
    """Convert ProtoBuf field descriptor to Spark `DataType` or `StructField` object.

    Args:
        field_desc (FieldDescriptor): A ProtoBuf field descriptor.
    Returns:
        DataType: A Spark `DataType` or `StructField` object.
    """
    # map type field
    if _IsMapEntry(field_desc):
        key_field_desc = field_desc.message_type.fields_by_name["key"]
        value_field_desc = field_desc.message_type.fields_by_name["value"]
        key_struct_type = _proto3_field_to_spark_data_type(key_field_desc)
        value_struct_type = _proto3_field_to_spark_data_type(value_field_desc)
        return types.MapType(key_struct_type, value_struct_type)

    if field_desc.type == FieldDescriptor.TYPE_MESSAGE:
        # nested message
        field_data_type = _proto3_message_descriptor_to_spark_schema(
            field_desc.message_type)
    else:
        # scalar value types
        field_data_type = _SPARK_SQL_TYPE_MAP[field_desc.type]

    # list type field
    if field_desc.label == FieldDescriptor.LABEL_REPEATED:
        return types.ArrayType(field_data_type)

    return field_data_type
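The `_SPARK_SQL_TYPE_MAP` lookup referenced above is not included in this excerpt; a plausible sketch of such a table for the scalar protobuf types, offered only as an assumption:

from google.protobuf.descriptor import FieldDescriptor
from pyspark.sql import types

# Hypothetical scalar mapping; the real table may cover more protobuf types.
_SPARK_SQL_TYPE_MAP = {
    FieldDescriptor.TYPE_DOUBLE: types.DoubleType(),
    FieldDescriptor.TYPE_FLOAT: types.FloatType(),
    FieldDescriptor.TYPE_INT64: types.LongType(),
    FieldDescriptor.TYPE_INT32: types.IntegerType(),
    FieldDescriptor.TYPE_BOOL: types.BooleanType(),
    FieldDescriptor.TYPE_STRING: types.StringType(),
    FieldDescriptor.TYPE_BYTES: types.BinaryType(),
}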
Example #8
 def _sort_structs(dt, ignore_order_depth):
     if ignore_order_depth == 0:
         return dt
     if dt.typeName() == 'array':
         return T.ArrayType(
             elementType=_sort_structs(dt.elementType, ignore_order_depth),
             containsNull=ignore_nullability or dt.containsNull,
         )
     if dt.typeName() == 'map':
         return T.MapType(
             keyType=_sort_structs(dt.keyType, ignore_order_depth),
             valueType=_sort_structs(dt.valueType, ignore_order_depth),
             valueContainsNull=ignore_nullability or dt.valueContainsNull,
         )
     if dt.typeName() == 'struct':
         return T.StructType([
             _sort_structs(f, ignore_order_depth - 1)
             for f in sorted(dt.fields, key=lambda f: f.name)
         ])
     if dt.typeName() == 'structf':
         return T.StructField(
             dt.name,
             _sort_structs(dt.dataType, ignore_order_depth),
             nullable=ignore_nullability or dt.nullable,
             metadata=dt.metadata,
         )
     return dt
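`_sort_structs` is a nested helper, so `ignore_nullability` comes from the enclosing scope. Its intent, illustrated on a small pair of schemas (assuming `ignore_nullability=True` and an unlimited `ignore_order_depth` such as -1):

from pyspark.sql import types as T

# Two logically equal schemas that differ only in field order and nullability.
left = T.StructType([
    T.StructField('b', T.IntegerType(), nullable=False),
    T.StructField('a', T.MapType(T.StringType(), T.LongType())),
])
right = T.StructType([
    T.StructField('a', T.MapType(T.StringType(), T.LongType())),
    T.StructField('b', T.IntegerType(), nullable=True),
])

# After normalization the two compare equal:
# _sort_structs(left, -1) == _sort_structs(right, -1)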
Example #9
def task_a_2_step_1_final(spark):
    a2_struct = T.StructType([
        T.StructField("datetime_start", T.TimestampType()),
        T.StructField("datetime_end", T.TimestampType()),
        T.StructField("map_topics", T.MapType(
            T.StringType(),
            T.ArrayType(T.StringType())
        ))
    ])

    result = kafka_source(spark, config.BOOTSTRAP_SERVERS, "topics-by-state_step-0").parse_json(a2_struct) \
        .withWatermark("datetime_end", "1 minute").groupBy(
        F.window("datetime_end", "3 hour", "1 hour")
    ) \
        .agg(
        F.first("window.start").alias("timestamp_start"),
        F.first("window.end").alias("timestamp_end"),
        F.collect_list("map_topics").alias("statistics")
    ) \
        .select(
        F.struct(
            F.concat(F.hour('timestamp_start'), lit(":"), F.minute('timestamp_start')).alias("time_start"),
            F.concat(F.hour('timestamp_end'), lit(":"), F.minute('timestamp_end')).alias("time_end"),
            concat_maps_udf(col('statistics')).alias("statistics")
        ).alias("res")
    ).send_to_kafka(config.BOOTSTRAP_SERVERS, "topics-by-state", config.LOG_PREFIX)

    return result
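`concat_maps_udf` is defined elsewhere in that project; a plausible sketch, assuming it merges the collected list of map<string, array<string>> values into one map by concatenating the per-key lists:

from collections import defaultdict
import pyspark.sql.functions as F
import pyspark.sql.types as T

@F.udf(T.MapType(T.StringType(), T.ArrayType(T.StringType())))
def concat_maps_udf(maps):
    # Merge a list of maps by concatenating the value lists per key.
    merged = defaultdict(list)
    for m in maps or []:
        for key, values in m.items():
            merged[key].extend(values or [])
    return dict(merged)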
Example #10
def main(args):
    spark = sql.SparkSession.builder.appName('update-mutator').getOrCreate()

    msg_struct = types.StructType([
        types.StructField('text', types.StringType(), True),
        types.StructField('user_id', types.StringType(), True),
        types.StructField('update_id', types.StringType(), True)
    ])

    sentiments_struct = types.ArrayType(
        types.MapType(types.StringType(), types.FloatType(), False))

    analyzer = vader.SentimentIntensityAnalyzer()
    analyzer_bcast = spark.sparkContext.broadcast(analyzer)

    def sentiment_generator_impl(text):
        va = analyzer_bcast.value
        english = SpacyMagic.get('en_core_web_sm')
        result = english(text)
        sents = [str(sent) for sent in result.sents]
        sentiment = [va.polarity_scores(str(s)) for s in sents]
        return sentiment

    sentiment_generator = functions.udf(sentiment_generator_impl,
                                        sentiments_struct)

    def json_converter_impl(user_id, update_id, text, sentiments):
        obj = dict(user_id=user_id,
                   update_id=update_id,
                   text=text,
                   sentiments=sentiments)
        return json.dumps(obj)

    json_converter = functions.udf(json_converter_impl, types.StringType())

    records = (spark.readStream.format('kafka').option(
        'kafka.bootstrap.servers',
        args.brokers).option('subscribe', args.intopic).load().select(
            functions.column('value').cast(types.StringType()).alias('value')
        ).select(
            functions.from_json(
                functions.column('value'), msg_struct).alias('json')).select(
                    functions.column('json.user_id'),
                    functions.column('json.update_id'),
                    functions.column('json.text'),
                    sentiment_generator(functions.column('json.text')).alias(
                        'sentiments')).select(
                            json_converter(functions.column('user_id'),
                                           functions.column('update_id'),
                                           functions.column('text'),
                                           functions.column('sentiments')).
                            alias('value')).writeStream.format('kafka').option(
                                'kafka.bootstrap.servers',
                                args.brokers).option('topic',
                                                     args.outtopic).option(
                                                         'checkpointLocation',
                                                         '/tmp').start())

    records.awaitTermination()
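`SpacyMagic` is not part of this excerpt; it most likely follows the common pattern of loading and caching a spaCy pipeline at most once per executor process, roughly:

import spacy

class SpacyMagic:
    """Hypothetical helper: load each spaCy model at most once per process."""
    _models = {}

    @classmethod
    def get(cls, lang):
        if lang not in cls._models:
            cls._models[lang] = spacy.load(lang)
        return cls._models[lang]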
Example #11
 def get_title_er_schema():
     return types.StructType([
         types.StructField('id', types.LongType(), nullable=False),
         types.StructField('O*NET-SOC Code',
                           types.StringType(),
                           nullable=False),
         types.StructField('Title', types.StringType(), nullable=False),
         types.StructField('Alternate Title',
                           # MapType requires key and value types;
                           # string keys and values are an assumption here
                           types.ArrayType(
                               types.MapType(types.StringType(),
                                             types.StringType())),
                           nullable=False),
     ])
Example #12
def _rec_build_types(t):
    if type(t) == list:
        return T.ArrayType(_rec_build_types(t[0]))
    elif type(t) == dict:
        k, v = list(t.items())[0]
        return T.MapType(_rec_build_types(k), _rec_build_types(v), True)
    elif type(t) == tuple:
        return T.StructType([T.StructField("v_" + str(i), _rec_build_types(f), True) for i, f in enumerate(t)])
    elif t in T._type_mappings:
        return T._type_mappings[t]()
    else:
        raise TypeError(repr(t) + " is not supported")
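A quick illustration of the spec syntax `_rec_build_types` accepts. Note that it relies on PySpark's private `T._type_mappings` table, which in recent versions maps `int` to `LongType` and `float` to `DoubleType`:

import pyspark.sql.types as T

# {str: [float]}  ->  map<string, array<double>>
assert _rec_build_types({str: [float]}) == \
    T.MapType(T.StringType(), T.ArrayType(T.DoubleType()), True)

# (int, str)  ->  struct<v_0: bigint, v_1: string>
assert _rec_build_types((int, str)) == T.StructType([
    T.StructField("v_0", T.LongType(), True),
    T.StructField("v_1", T.StringType(), True),
])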
Example #13
def get_cache_schema():
    return T.StructType([
        T.StructField("id", T.IntegerType(), False),
        T.StructField("target", T.StringType(), False),
        T.StructField("ip", T.StringType(), False),
        T.StructField("first_ever_request", T.TimestampType(), True),
        T.StructField("old_subset_count", T.IntegerType(), True),
        T.StructField("old_features", T.MapType(T.StringType(),
                                                T.DoubleType()), True),
        T.StructField("old_num_requests", T.IntegerType(), True),
        T.StructField("updated_at", T.TimestampType(), True)
    ])
Example #14
 def _initialize_results(self, scaffolds):
     data = [
         ps.Row(smiles=scaffold, scaffold=scaffold, decorations={}, count=1)
         for scaffold in scaffolds
     ]
     data_schema = pst.StructType([
         pst.StructField("smiles", pst.StringType()),
         pst.StructField("scaffold", pst.StringType()),
         pst.StructField("decorations",
                         pst.MapType(pst.IntegerType(), pst.StringType())),
         pst.StructField("count", pst.IntegerType())
     ])
     return SPARK.createDataFrame(data, schema=data_schema)
Example #15
def infer_complex_spark_type(typeclass):
    if typeclass.__origin__ in {list, List}:
        co_T, *_ = typeclass.__args__
        is_nullable, py_type = maybe_unlift_optional(co_T)
        return t.ArrayType(infer_spark_type(py_type), is_nullable)
    elif typeclass.__origin__ in {dict, Dict}:
        k_T, v_T, *_ = typeclass.__args__
        is_nullable_key, py_key_type = maybe_unlift_optional(k_T)
        is_nullable_value, py_value_type = maybe_unlift_optional(v_T)
        if is_nullable_key:
            raise TypeError(f"Nullable keys of type {py_key_type} don't allowed in {typeclass}")
        return t.MapType(infer_spark_type(py_key_type), infer_spark_type(py_value_type), is_nullable_value)
    else:
        raise TypeError(f"Don't know how to represent {typeclass} in Spark")
Example #16
    def _transform(self, dataset):
        @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
        def cross(*values):
            if not values:
                return {}
            hashVector = defaultdict(float)
            for d1, d2 in combinations(values, 2):
                if not d1 or not d2:
                    continue
                for (k1, v1), (k2, v2) in product(d1.items(), d2.items()):
                    h = (k1 ^ k2)
                    hashVector[h] += v1 * v2
            return dict(hashVector)

        return dataset.withColumn(self.getOutputCol(),
                                  cross(*dataset[self.getInputCols()]))
Example #17
    def _transform(self, dataset):
        inputCol = self.getInputCol()
        dataType = dataset.schema[inputCol].dataType
        assert isinstance(dataType, T.ArrayType)
        assert isinstance(dataType.elementType, T.StringType)
        seed = _mh3(inputCol, seed=self.getSeed())

        @F.udf(T.MapType(T.IntegerType(), T.FloatType()))
        def hash_(v):
            if not v:
                return {}
            hashVector = defaultdict(float)
            for x in v:
                h = _mh3(x, seed=seed)
                hashVector[h] += 1.
            return dict(hashVector)

        return dataset.withColumn(self.getOutputCol(),
                                  hash_(dataset[inputCol]))
Example #18
 def get_ret_type(value):
     if isinstance(value, float):
         return T.FloatType()
     if isinstance(value, int):
         return T.IntegerType()
     if isinstance(value, str):
         return T.StringType()
     if isinstance(value, list):
         if len(value) == 0:
             raise Exception(
                 "Python Dataset Wrapper: Failed to parser return type for list"
             )
         return T.ArrayType(DataSet.get_ret_type(value[0]))
     if isinstance(value, Mapping):
         if len(value) == 0:
             raise Exception(
                 "Python Dataset Wrapper: Failed to parser return type for dict"
             )
         x, y = value.popitem()
         return T.MapType(DataSet.get_ret_type(x), DataSet.get_ret_type(y))
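Two example inferences, assuming `get_ret_type` is a static helper on `DataSet` (note that the `Mapping` branch calls `popitem()`, so the sample dict passed in is mutated):

# A float-valued dict infers to map<string, float>
DataSet.get_ret_type({"score": 1.0})   # MapType(StringType(), FloatType())

# Nested lists recurse on the first element
DataSet.get_ret_type([[1, 2], [3]])    # ArrayType(ArrayType(IntegerType()))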
Example #19
 def test_maps_nested_subset(self):
     schema_has(
         T.MapType(
             T.StringType(),
             T.MapType(
                 T.StringType(),
                 T.StructType([
                     T.StructField('f1', T.MapType(T.StringType(), T.LongType())),
                     T.StructField('f2', T.MapType(T.StringType(), T.IntegerType())),
                 ]),
             ),
         ),
         T.MapType(
             T.StringType(),
             T.MapType(
                 T.StringType(),
                 T.StructType([
                     T.StructField('f1', T.MapType(T.StringType(), T.LongType())),
                 ]),
             ),
         ),
     )
Example #20
    def _join_results_single(self, scaffolds_df, sampled_df):
        def _join_scaffold(scaff, decs):
            mol = usc.join_joined_attachments(scaff, decs)
            if mol:
                return usc.to_smiles(mol)

        join_scaffold_udf = psf.udf(_join_scaffold, pst.StringType())

        def _create_decorations_map(decorations_smi, attachment_points):
            decorations = decorations_smi.split(usc.ATTACHMENT_SEPARATOR_TOKEN)
            return {
                idx: _cleanup_decoration(dec)
                for dec, idx in zip(decorations, attachment_points)
            }

        create_decorations_map_udf = psf.udf(
            _create_decorations_map,
            pst.MapType(pst.IntegerType(), pst.StringType()))

        return sampled_df.join(scaffolds_df, on="id")\
            .select(
                join_scaffold_udf("randomized_scaffold", "decoration_smi").alias("smiles"),
                create_decorations_map_udf("decoration_smi", "attachment_points").alias("decorations"),
                "scaffold")
Example #21
def raw_spark_data_flow():
    # Register all the UDFs used below
    get_xedk_udf = fun.udf(get_xedk, tp.IntegerType())
    get_rzdb_udf = fun.udf(get_rzdb, tp.IntegerType())
    get_jycs_udf = fun.udf(get_jycs, tp.IntegerType())
    get_smjj_udf = fun.udf(get_smjj, tp.IntegerType())
    get_wljd_udf = fun.udf(get_wljd, tp.IntegerType())
    get_xxjr_udf = fun.udf(get_xxjr, tp.IntegerType())

    get_zdgz_num_udf = fun.udf(get_zdgz_num, tp.IntegerType())
    get_cxjk_num_udf = fun.udf(get_cxjk_num, tp.IntegerType())

    get_change_info_2_udf = fun.udf(
        get_change_info_2, tp.MapType(tp.StringType(), tp.IntegerType()))
    get_change_info_udf = fun.udf(get_change_info, tp.IntegerType())

    # Read the raw inputs
    old_df = spark.read.parquet(
        ("{path}"
         "/all_company_info/{version}").format(path=IN_PATH,
                                               version=OLD_VERSION)).fillna({
                                                   'city':
                                                   u'无',
                                                   'county':
                                                   u'无',
                                                   'province':
                                                   u'无'
                                               })
    new_df = spark.read.parquet(
        ("{path}"
         "/all_company_info/{version}").format(path=IN_PATH,
                                               version=NEW_VERSION)).fillna({
                                                   'city':
                                                   u'无',
                                                   'county':
                                                   u'无',
                                                   'province':
                                                   u'无'
                                               })

    # Number of high-risk companies
    high_risk_count_df = new_df.select(
        'province', 'city', 'county',
        'bbd_qyxx_id').where(new_df.risk_rank == u'高危预警').groupBy([
            'province', 'city', 'county'
        ]).count().withColumnRenamed('count', 'high_risk_num').fillna({
            'city':
            u'无',
            'county':
            u'无',
            'province':
            u'无'
        }).cache()

    # Number of key-focus companies
    focus_on_count_df = new_df.select(
        'province', 'city', 'county',
        'bbd_qyxx_id').where(new_df.risk_rank == u'重点关注').groupBy([
            'province', 'city', 'county'
        ]).count().withColumnRenamed('count', 'focus_on_num').fillna({
            'city':
            u'无',
            'county':
            u'无',
            'province':
            u'无'
        }).cache()

    # Number of continuously monitored companies
    constantly_monitor_count_df = new_df.select(
        'province', 'city', 'county',
        'bbd_qyxx_id').where(new_df.risk_rank == u'持续监控').groupBy([
            'province', 'city', 'county'
        ]).count().withColumnRenamed('count',
                                     'constantly_monitor_num').fillna({
                                         'city':
                                         u'无',
                                         'county':
                                         u'无',
                                         'province':
                                         u'无'
                                     }).cache()

    # Number of monitored companies
    supervise_count_df = new_df.select('province', 'city', 'county',
                                       'bbd_qyxx_id').groupBy([
                                           'province', 'city', 'county'
                                       ]).count().withColumnRenamed(
                                           'count', 'supervise_num').fillna({
                                               'city':
                                               u'无',
                                               'county':
                                               u'无',
                                               'province':
                                               u'无'
                                           }).cache()

    # Emerging finance, online lending, private funds, trading venues
    raw_types_num_df = new_df.select(
        'province', 'city', 'county',
        'company_type').groupBy(['province', 'city', 'county',
                                 'company_type']).count()
    tid_types_num_df = raw_types_num_df.select(
        'province', 'city', 'county',
        fun.concat_ws(':', 'company_type',
                      'count').alias('company_type_merge')).groupBy([
                          'province', 'city', 'county'
                      ]).agg({
                          'company_type_merge': 'collect_list'
                      }).withColumnRenamed('collect_list(company_type_merge)',
                                           'company_type_merge').fillna({
                                               'city':
                                               u'无',
                                               'county':
                                               u'无',
                                               'province':
                                               u'无'
                                           }).cache()

    # Newly added and no-longer high-risk companies
    tmp_new_df = new_df.select(
        'province', 'city', 'county', 'bbd_qyxx_id', 'company_type',
        'data_version').where(new_df.risk_rank == u'高危预警')
    tmp_old_df = old_df.select(
        'province', 'city', 'county', 'bbd_qyxx_id', 'company_type',
        'data_version').where(old_df.risk_rank == u'高危预警')
    tmp_new_2_df = tmp_new_df.union(tmp_old_df).groupBy(
        ['province', 'city', 'county', 'bbd_qyxx_id']).agg({
            'data_version':
            'collect_list'
        }).select(
            'province', 'city', 'county', 'bbd_qyxx_id',
            'collect_list(data_version)',
            get_change_info_udf('collect_list(data_version)').alias(
                'risk_change')).groupBy(['province', 'city', 'county']).agg({
                    'risk_change':
                    'collect_list'
                }).select(
                    'province', 'city', 'county',
                    get_change_info_2_udf('collect_list(risk_change)').alias(
                        'risk_change_num'))
    tmp_new_3_df = tmp_new_2_df.select(
        'province', 'city', 'county',
        tmp_new_2_df.risk_change_num.getItem('decline').alias(
            'risk_decline_num'),
        tmp_new_2_df.risk_change_num.getItem('rise').alias(
            'risk_rise_num')).fillna({
                'city': u'无',
                'county': u'无',
                'province': u'无'
            }).cache()

    # Newly added and no-longer high-risk companies per industry:
    # emerging finance, online lending, private funds, trading venues, financing guarantee, petty loans
    tmp_new_6_df = tmp_new_df.union(tmp_old_df).groupBy(
        ['province', 'city', 'county', 'bbd_qyxx_id', 'company_type']).agg({
            'data_version':
            'collect_list'
        }).select(
            'province', 'city', 'county', 'bbd_qyxx_id', 'company_type',
            'collect_list(data_version)',
            get_change_info_udf('collect_list(data_version)').alias(
                'risk_change')).groupBy([
                    'province', 'city', 'county', 'company_type'
                ]).agg({
                    'risk_change': 'collect_list'
                }).select(
                    'province', 'city', 'county', 'company_type',
                    get_change_info_2_udf('collect_list(risk_change)').alias(
                        'risk_change_num'))

    tmp_new_7_df = tmp_new_6_df.select(
        'province', 'city', 'county', 'company_type',
        tmp_new_6_df.risk_change_num.getItem('decline').alias(
            'risk_decline_num'),
        tmp_new_6_df.risk_change_num.getItem('rise').alias(
            'risk_rise_num')).fillna({
                'city': u'无',
                'county': u'无',
                'province': u'无'
            }).cache()

    # Select each industry separately
    os.system(("hadoop fs -rmr " "{path}").format(path=TMP_PATH))
    tmp_new_7_df.where(
        tmp_new_7_df.company_type == u'新兴金融').coalesce(10).write.parquet(
            "{path}/tmp_xxjr_change_df/"
            "{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_7_df.where(
        tmp_new_7_df.company_type == u'网络借贷').coalesce(10).write.parquet(
            "{path}/tmp_wljd_change_df/"
            "{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_7_df.where(
        tmp_new_7_df.company_type == u'私募基金').coalesce(10).write.parquet(
            "{path}/tmp_smjj_change_df/"
            "{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_7_df.where(
        tmp_new_7_df.company_type == u'交易场所').coalesce(10).write.parquet(
            "{path}/tmp_jycs_change_df/"
            "{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_7_df.where(
        tmp_new_7_df.company_type == u'融资担保').coalesce(10).write.parquet(
            "{path}/tmp_rzdb_change_df/"
            "{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_7_df.where(
        tmp_new_7_df.company_type == u'小额贷款').coalesce(10).write.parquet(
            "{path}/tmp_xedk_change_df/"
            "{version}".format(path=TMP_PATH, version=NEW_VERSION))

    tmp_xxjr_change_df = spark.read.parquet("{path}/tmp_xxjr_change_df/"
                                            "{version}".format(
                                                path=TMP_PATH,
                                                version=NEW_VERSION))
    tmp_wljd_change_df = spark.read.parquet("{path}/tmp_wljd_change_df/"
                                            "{version}".format(
                                                path=TMP_PATH,
                                                version=NEW_VERSION))
    tmp_smjj_change_df = spark.read.parquet("{path}/tmp_smjj_change_df/"
                                            "{version}".format(
                                                path=TMP_PATH,
                                                version=NEW_VERSION))
    tmp_jycs_change_df = spark.read.parquet("{path}/tmp_jycs_change_df/"
                                            "{version}".format(
                                                path=TMP_PATH,
                                                version=NEW_VERSION))
    tmp_rzdb_change_df = spark.read.parquet("{path}/tmp_rzdb_change_df/"
                                            "{version}".format(
                                                path=TMP_PATH,
                                                version=NEW_VERSION))
    tmp_xedk_change_df = spark.read.parquet("{path}/tmp_xedk_change_df/"
                                            "{version}".format(
                                                path=TMP_PATH,
                                                version=NEW_VERSION))

    # Changes among monitored companies
    tmp_new_df = new_df.select('province', 'city', 'county', 'bbd_qyxx_id',
                               'company_type', 'data_version')
    tmp_old_df = old_df.select('province', 'city', 'county', 'bbd_qyxx_id',
                               'company_type', 'data_version')
    tmp_new_4_df = tmp_new_df.union(tmp_old_df).groupBy(
        ['province', 'city', 'county', 'bbd_qyxx_id']).agg({
            'data_version':
            'collect_list'
        }).select(
            'province', 'city', 'county', 'bbd_qyxx_id',
            'collect_list(data_version)',
            get_change_info_udf('collect_list(data_version)').alias(
                'risk_change')).groupBy(['province', 'city', 'county']).agg({
                    'risk_change':
                    'collect_list'
                }).select(
                    'province', 'city', 'county',
                    get_change_info_2_udf('collect_list(risk_change)').alias(
                        'risk_change_num'))
    tmp_new_5_df = tmp_new_4_df.select(
        'province', 'city', 'county',
        tmp_new_4_df.risk_change_num.getItem('decline').alias(
            'all_decline_num'),
        tmp_new_4_df.risk_change_num.getItem('rise').alias(
            'all_rise_num')).fillna({
                'city': u'无',
                'county': u'无',
                'province': u'无'
            }).cache()

    # Changes among monitored companies per industry:
    # emerging finance, online lending, private funds, trading venues, financing guarantee, petty loans
    tmp_new_8_df = tmp_new_df.union(tmp_old_df).groupBy(
        ['province', 'city', 'county', 'bbd_qyxx_id', 'company_type']).agg({
            'data_version':
            'collect_list'
        }).select(
            'province', 'city', 'county', 'bbd_qyxx_id', 'company_type',
            'collect_list(data_version)',
            get_change_info_udf('collect_list(data_version)').alias(
                'risk_change')).groupBy([
                    'province', 'city', 'county', 'company_type'
                ]).agg({
                    'risk_change': 'collect_list'
                }).select(
                    'province', 'city', 'county', 'company_type',
                    get_change_info_2_udf('collect_list(risk_change)').alias(
                        'risk_change_num'))
    tmp_new_9_df = tmp_new_8_df.select(
        'province', 'city', 'county', 'company_type',
        tmp_new_8_df.risk_change_num.getItem('decline').alias(
            'all_decline_num'),
        tmp_new_8_df.risk_change_num.getItem('rise').alias(
            'all_rise_num')).fillna({
                'city': u'无',
                'county': u'无',
                'province': u'无'
            }).cache()

    # Select each industry separately
    tmp_new_9_df.where(
        tmp_new_9_df.company_type == u'新兴金融').coalesce(10).write.parquet(
            "{path}/tmp_xxjr_overall_df/"
            "{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_9_df.where(
        tmp_new_9_df.company_type == u'网络借贷').coalesce(10).write.parquet(
            "{path}/tmp_wljd_overall_df/"
            "{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_9_df.where(
        tmp_new_9_df.company_type == u'私募基金').coalesce(10).write.parquet(
            "{path}/tmp_smjj_overall_df/"
            "{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_9_df.where(
        tmp_new_9_df.company_type == u'交易场所').coalesce(10).write.parquet(
            "{path}/tmp_jycs_overall_df/"
            "{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_9_df.where(
        tmp_new_9_df.company_type == u'融资担保').coalesce(10).write.parquet(
            "{path}/tmp_rzdb_overall_df/"
            "{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_9_df.where(
        tmp_new_9_df.company_type == u'小额贷款').coalesce(10).write.parquet(
            "{path}/tmp_xedk_overall_df/"
            "{version}".format(path=TMP_PATH, version=NEW_VERSION))

    tmp_xxjr_overall_df = spark.read.parquet("{path}/tmp_xxjr_overall_df/"
                                             "{version}".format(
                                                 path=TMP_PATH,
                                                 version=NEW_VERSION))
    tmp_wljd_overall_df = spark.read.parquet("{path}/tmp_wljd_overall_df/"
                                             "{version}".format(
                                                 path=TMP_PATH,
                                                 version=NEW_VERSION))
    tmp_smjj_overall_df = spark.read.parquet("{path}/tmp_smjj_overall_df/"
                                             "{version}".format(
                                                 path=TMP_PATH,
                                                 version=NEW_VERSION))
    tmp_jycs_overall_df = spark.read.parquet("{path}/tmp_jycs_overall_df/"
                                             "{version}".format(
                                                 path=TMP_PATH,
                                                 version=NEW_VERSION))
    tmp_rzdb_overall_df = spark.read.parquet("{path}/tmp_rzdb_overall_df/"
                                             "{version}".format(
                                                 path=TMP_PATH,
                                                 version=NEW_VERSION))
    tmp_xedk_overall_df = spark.read.parquet("{path}/tmp_xedk_overall_df/"
                                             "{version}".format(
                                                 path=TMP_PATH,
                                                 version=NEW_VERSION))

    # Continuously monitored and key-focus companies per industry:
    # emerging finance, online lending, private funds, trading venues, financing guarantee, petty loans
    tmp_new_10_df = new_df.select(
        'province', 'city', 'county', 'company_type', 'risk_rank').groupBy([
            'province', 'city', 'county', 'company_type'
        ]).agg({
            'risk_rank': 'collect_list'
        }).withColumnRenamed('collect_list(risk_rank)',
                             'risk_rank').withColumn(
                                 'zdgz_num',
                                 get_zdgz_num_udf('risk_rank')).withColumn(
                                     'cxjk_num',
                                     get_cxjk_num_udf('risk_rank')).fillna({
                                         'city':
                                         u'无',
                                         'county':
                                         u'无',
                                         'province':
                                         u'无'
                                     }).cache()

    # Select each industry separately
    tmp_new_10_df.where(
        tmp_new_10_df.company_type == u'新兴金融').coalesce(10).write.parquet(
            "{path}/tmp_xxjr_monitoring_df/"
            "{version}".format(path=TMP_PATH, version=NEW_VERSION))

    tmp_new_10_df.where(
        tmp_new_10_df.company_type == u'网络借贷').coalesce(10).write.parquet(
            "{path}/tmp_wljd_monitoring_df/"
            "{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_10_df.where(
        tmp_new_10_df.company_type == u'私募基金').coalesce(10).write.parquet(
            "{path}/tmp_smjj_monitoring_df/"
            "{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_10_df.where(
        tmp_new_10_df.company_type == u'交易场所').coalesce(10).write.parquet(
            "{path}/tmp_jycs_monitoring_df/"
            "{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_10_df.where(
        tmp_new_10_df.company_type == u'融资担保').coalesce(10).write.parquet(
            "{path}/tmp_rzdb_monitoring_df/"
            "{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_new_10_df.where(
        tmp_new_10_df.company_type == u'小额贷款').coalesce(10).write.parquet(
            "{path}/tmp_xedk_monitoring_df/"
            "{version}".format(path=TMP_PATH, version=NEW_VERSION))

    tmp_xxjr_monitoring_df = spark.read.parquet(
        "{path}/tmp_xxjr_monitoring_df/"
        "{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_wljd_monitoring_df = spark.read.parquet(
        "{path}/tmp_wljd_monitoring_df/"
        "{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_smjj_monitoring_df = spark.read.parquet(
        "{path}/tmp_smjj_monitoring_df/"
        "{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_jycs_monitoring_df = spark.read.parquet(
        "{path}/tmp_jycs_monitoring_df/"
        "{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_rzdb_monitoring_df = spark.read.parquet(
        "{path}/tmp_rzdb_monitoring_df/"
        "{version}".format(path=TMP_PATH, version=NEW_VERSION))
    tmp_xedk_monitoring_df = spark.read.parquet(
        "{path}/tmp_xedk_monitoring_df/"
        "{version}".format(path=TMP_PATH, version=NEW_VERSION))

    # Combine all the fields
    tid_new_df = new_df.dropDuplicates(['province', 'city', 'county']).join(
        high_risk_count_df, ['province', 'city', 'county'],
        'left_outer').join(focus_on_count_df, [
            'province', 'city', 'county'
        ], 'left_outer').join(constantly_monitor_count_df, [
            'province', 'city',
            'county'
        ], 'left_outer').join(supervise_count_df, [
            'province',
            'city',
            'county'
        ], 'left_outer').join(tid_types_num_df, [
            'province',
            'city',
            'county'
        ], 'left_outer').join(tmp_new_3_df, [
            'province',
            'city',
            'county'
        ], 'left_outer').join(tmp_new_5_df, [
            'province',
            'city',
            'county'
        ], 'left_outer').join(tmp_xxjr_change_df, [
            'province',
            'city',
            'county'
        ], 'left_outer').join(tmp_wljd_change_df, [
            'province',
            'city',
            'county'
        ], 'left_outer').join(tmp_smjj_change_df, [
            'province',
            'city',
            'county'
        ], 'left_outer').join(tmp_jycs_change_df, [
            'province',
            'city', 'county'
        ], 'left_outer').join(tmp_rzdb_change_df, [
            'province', 'city',
            'county'
        ], 'left_outer').join(tmp_xedk_change_df, [
            'province', 'city',
            'county'
        ], 'left_outer').join(tmp_xxjr_overall_df, [
            'province', 'city',
            'county'
        ], 'left_outer').join(tmp_wljd_overall_df, [
            'province', 'city',
            'county'
        ], 'left_outer').join(
            tmp_smjj_overall_df, ['province', 'city', 'county'],
            'left_outer').join(
                tmp_jycs_overall_df,
                ['province', 'city', 'county'
                 ], 'left_outer').join(tmp_rzdb_overall_df, [
                     'province', 'city', 'county'
                 ], 'left_outer').join(tmp_xedk_overall_df, [
                     'province', 'city', 'county'
                 ], 'left_outer').join(tmp_xxjr_monitoring_df, [
                     'province', 'city', 'county'
                 ], 'left_outer').join(tmp_wljd_monitoring_df, [
                     'province', 'city', 'county'
                 ], 'left_outer').join(tmp_smjj_monitoring_df, [
                     'province', 'city', 'county'
                 ], 'left_outer').join(tmp_jycs_monitoring_df, [
                     'province', 'city', 'county'
                 ], 'left_outer').join(tmp_rzdb_monitoring_df, [
                     'province',
                     'city', 'county'
                 ], 'left_outer').join(tmp_xedk_monitoring_df, [
                     'province',
                     'city',
                     'county'
                 ], 'left_outer').select(
                     new_df.province,
                     new_df.city,
                     new_df.county, high_risk_count_df.high_risk_num,
                     focus_on_count_df.focus_on_num,
                     constantly_monitor_count_df.constantly_monitor_num,
                     supervise_count_df.supervise_num,
                     get_xxjr_udf(
                         tid_types_num_df.company_type_merge).alias('xxjr'),
                     get_smjj_udf(
                         tid_types_num_df.company_type_merge).alias('smjj'),
                     get_wljd_udf(
                         tid_types_num_df.company_type_merge).alias('wljd'),
                     get_jycs_udf(
                         tid_types_num_df.company_type_merge).alias('jycs'),
                     get_rzdb_udf(
                         tid_types_num_df.company_type_merge).alias('rzdb'),
                     get_xedk_udf(
                         tid_types_num_df.company_type_merge).alias('xedk'),
                     tmp_new_3_df.risk_decline_num, tmp_new_3_df.risk_rise_num,
                     tmp_new_5_df.all_decline_num, tmp_new_5_df.all_rise_num,
                     tmp_xxjr_change_df.risk_decline_num.alias(
                         'other_lessen_high_risk'),
                     tmp_xxjr_change_df.risk_rise_num.alias(
                         'other_add_high_risk'),
                     tmp_wljd_change_df.risk_decline_num.alias(
                         'net_lessen_high_risk'),
                     tmp_wljd_change_df.risk_rise_num.alias(
                         'net_add_high_risk'),
                     tmp_smjj_change_df.risk_decline_num.alias(
                         'private_fund_lessen_high_risk'),
                     tmp_smjj_change_df.risk_rise_num.alias(
                         'private_fund_add_high_risk'),
                     tmp_jycs_change_df.risk_decline_num.alias(
                         'trade_place_lessen_high_risk'),
                     tmp_jycs_change_df.risk_rise_num.alias(
                         'trade_place_add_high_risk'),
                     tmp_rzdb_change_df.risk_decline_num.alias(
                         'financing_guarantee_lessen_high_risk'),
                     tmp_rzdb_change_df.risk_rise_num.alias(
                         'financing_guarantee_add_high_risk'),
                     tmp_xedk_change_df.risk_decline_num.alias(
                         'petty_loan_lessen_high_risk'),
                     tmp_xedk_change_df.risk_rise_num.alias(
                         'petty_loan_add_high_risk'),
                     tmp_xxjr_overall_df.all_decline_num.alias(
                         'other_lessen_monitor'),
                     tmp_xxjr_overall_df.all_rise_num.alias(
                         'other_add_monitor'),
                     tmp_wljd_overall_df.all_decline_num.alias(
                         'net_lessen_monitor'),
                     tmp_wljd_overall_df.all_rise_num.alias('net_add_monitor'),
                     tmp_smjj_overall_df.all_decline_num.alias(
                         'private_fund_lessen_monitor'),
                     tmp_smjj_overall_df.all_rise_num.alias(
                         'private_fund_add_monitor'),
                     tmp_jycs_overall_df.all_decline_num.alias(
                         'trade_place_lessen_monitor'),
                     tmp_jycs_overall_df.all_rise_num.alias(
                         'trade_place_add_monitor'),
                     tmp_rzdb_overall_df.all_decline_num.alias(
                         'financing_guarantee_lessen_monitor'),
                     tmp_rzdb_overall_df.all_rise_num.alias(
                         'financing_guarantee_add_monitor'),
                     tmp_xedk_overall_df.all_decline_num.alias(
                         'petty_loan_lessen_monitor'),
                     tmp_xedk_overall_df.all_rise_num.alias(
                         'petty_loan_add_monitor'),
                     tmp_xxjr_monitoring_df.zdgz_num.alias('other_focus_on'),
                     tmp_xxjr_monitoring_df.cxjk_num.alias(
                         'other_sustain_monitor'),
                     tmp_wljd_monitoring_df.zdgz_num.alias('net_focus_on'),
                     tmp_wljd_monitoring_df.cxjk_num.alias(
                         'net_sustain_monitor'),
                     tmp_smjj_monitoring_df.zdgz_num.alias(
                         'private_fund_focus_on'),
                     tmp_smjj_monitoring_df.cxjk_num.alias(
                         'private_fund_sustain_monitor'),
                     tmp_jycs_monitoring_df.zdgz_num.alias(
                         'trade_place_focus_on'),
                     tmp_jycs_monitoring_df.cxjk_num.alias(
                         'trade_place_sustain_monitor'),
                     tmp_rzdb_monitoring_df.zdgz_num.alias(
                         'financing_guarantee_focus_on'),
                     tmp_rzdb_monitoring_df.cxjk_num.alias(
                         'financing_guarantee_sustain_monitor'),
                     tmp_xedk_monitoring_df.zdgz_num.alias(
                         'petty_loan_focus_on'),
                     tmp_xedk_monitoring_df.cxjk_num.alias(
                         'petty_loan_sustain_monitor'),
                     fun.current_timestamp().alias('gmt_create'),
                     fun.current_timestamp().alias('gmt_update')).cache()

    return tid_new_df
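The change-detection UDFs used above, `get_change_info` and `get_change_info_2`, are defined elsewhere; a hedged sketch of the logic they appear to implement, given that each company contributes the list of `data_version` values it occurs under across the old and new snapshots:

def get_change_info(data_versions):
    # Hypothetical: present only in the new snapshot -> rise (1),
    # only in the old snapshot -> decline (-1), otherwise unchanged (0).
    versions = set(data_versions)
    if versions == {NEW_VERSION}:
        return 1
    if versions == {OLD_VERSION}:
        return -1
    return 0


def get_change_info_2(changes):
    # Hypothetical: aggregate per-company change flags into counts,
    # matching the MapType(StringType(), IntegerType()) return type above.
    return {
        'rise': sum(1 for c in changes if c == 1),
        'decline': sum(1 for c in changes if c == -1),
    }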
Example #22
import sys
import numpy as np
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession, functions, types
from pyspark.sql.functions import lower
from pyspark.ml import Pipeline

# add more functions as necessary
schema = types.StructType([
    types.StructField('lat', types.DoubleType(), nullable=False),
    types.StructField('lon', types.DoubleType(), nullable=False),
    types.StructField('timestamp', types.TimestampType(), nullable=False),
    types.StructField('amenity', types.StringType(), nullable=False),
    types.StructField('name', types.StringType(), nullable=True),
    types.StructField('tags',
                      types.MapType(types.StringType(), types.StringType()),
                      nullable=False),
])
entertainments = [
    'arts_centre', 'bistro', 'nightclub', 'bbq', 'car_rental', 'leisure',
    'park', 'restaurant', 'bar', 'casino', 'gambling', 'cafe', 'theatre',
    'stripclub', 'pub'
]


def main():
    # main logic starts here
    data = spark.read.json("../amenities-vancouver.json.gzip", schema=schema)
    entertainments_data = data.filter(data.amenity.isin(entertainments))
    #entertainments_data.show()
    entertainments_data.write.json("../entertainments-vancouver",
Example #23
    T.StructField('split_name', T.StringType(), nullable=False),
    T.StructField('path', T.StringType(), nullable=False),
    T.StructField('fold_id', T.IntegerType(), nullable=False),
])

ModelParameters = T.StructType([
    T.StructField('run_id', T.StringType(), nullable=False),
    T.StructField('parent_run_id', T.StringType(), nullable=True),
    T.StructField('wikiid', T.StringType(), nullable=False),
    T.StructField('started_at', T.TimestampType(), nullable=False),
    T.StructField('completed_at', T.TimestampType(), nullable=False),
    T.StructField('algorithm', T.StringType(), nullable=False),
    T.StructField('objective', T.StringType(), nullable=False),
    T.StructField('loss', T.DoubleType(), nullable=False),
    T.StructField('params',
                  T.MapType(T.StringType(), T.StringType(), False),
                  nullable=False),
    T.StructField('folds', T.ArrayType(TrainingFiles), nullable=False),
    T.StructField(
        'metrics',
        T.ArrayType(
            T.StructType([
                T.StructField('key', T.StringType(), nullable=False),
                T.StructField('value', T.DoubleType(), nullable=False),
                T.StructField('step', T.IntegerType(), nullable=False),
                T.StructField('fold_id', T.IntegerType(), nullable=False),
                T.StructField('split', T.StringType(), nullable=False),
            ]))),
    T.StructField('artifacts',
                  T.MapType(T.StringType(), T.StringType(), False),
                  nullable=False),
Example #24
)

keyvalue_msg_schema = types.StructType(
    [
        types.StructField("key", types.StringType()),
        types.StructField("value", types.IntegerType()),
    ]
)

map_msg_schema = types.StructType(
    [
        types.StructField(
            "repeated_keyvalue_field", types.ArrayType(keyvalue_msg_schema)
        ),
        types.StructField(
            "map_field", types.MapType(types.StringType(), types.IntegerType())
        ),
    ]
)

# TODO: revise fake test
pb_duration_schema = proto3_message_type_to_spark_schema(Duration)
pb_timestamp_schema = proto3_message_type_to_spark_schema(Timestamp)

time_msg_schema = types.StructType(
    [
        types.StructField("start", pb_timestamp_schema),
        types.StructField("end", pb_timestamp_schema),
        types.StructField("duration", pb_duration_schema),
    ]
)
Example #25
def spark_data_flow():
    json_to_obj_udf = fun.udf(json_to_obj,
                              tp.MapType(tp.StringType(), tp.FloatType()))
    get_float_udf = fun.udf(get_float, tp.FloatType())
    get_claim_transfer_udf = fun.udf(partial(get_keyword, u'不可转让'),
                                     tp.FloatType())
    get_bank_custody_udf = fun.udf(partial(get_keyword, u'无存管'),
                                   tp.FloatType())
    get_risk_reserve_udf = fun.udf(partial(get_keyword, u'无存管'),
                                   tp.FloatType())
    get_unique_string_udf = fun.udf(get_unique_string, tp.StringType())

    raw_wdzj_df = spark.sql('''
        SELECT
        bbd_qyxx_id,
        company_name,
        platform_name,
        automatic_bidding,
        claim_transfer,
        bank_custody,
        platform_state,
        risk_reserve
        FROM
        dw.qyxg_wdzj
        WHERE
        dt='{version}'
        '''.format(version=WDZJ_VERSION))
    tid_wdzj_df = raw_wdzj_df.select(
        'bbd_qyxx_id', 'company_name', 'platform_name', 'platform_state',
        get_claim_transfer_udf('claim_transfer').alias('p2p_feature_18'),
        get_bank_custody_udf('bank_custody').alias('p2p_feature_19'),
        get_risk_reserve_udf('risk_reserve').alias('p2p_feature_20'))

    platform_df = spark.sql('''
        SELECT
        bbd_qyxx_id
        ,company_name
        ,platform_name
        ,platform_state
        ,regcap
        ,per_lending_amount
        ,avg_soldout_time 
        ,total_num_of_lender 
        ,total_turnover 
        ,total_deal_volume 
        ,monthly_deal_data 
        ,per_lending_num 
        ,avg_lend_time 
        ,per_borrowing_num 
        ,loan_balance 
        ,per_borrowing_amount 
        ,borrowing_dispersion 
        ,total_num_of_borrower
        FROM
        dw.qyxg_platform_data
        WHERE
        dt = '{version}'
        '''.format(version=PLATFORM_VERSION))
    tid_platform_df = platform_df.select(
        'bbd_qyxx_id', 'company_name', 'platform_name', 'platform_state',
        json_to_obj_udf('monthly_deal_data').getItem('turnover').alias(
            'p2p_feature_1'),
        json_to_obj_udf('monthly_deal_data').getItem('num_of_lender').alias(
            'p2p_feature_2'),
        fun.when(platform_df.platform_state == u'异常',
                 0).when(platform_df.platform_state == u'正常',
                         2).otherwise(1).alias('p2p_feature_3'),
        get_float_udf('borrowing_dispersion').alias('p2p_feature_4'),
        get_float_udf('per_lending_amount').alias('p2p_feature_5'),
        get_float_udf('regcap').alias('p2p_feature_6'),
        get_float_udf('avg_soldout_time').alias('p2p_feature_7'),
        get_float_udf('total_num_of_lender').alias('p2p_feature_8'),
        get_float_udf('total_turnover').alias('p2p_feature_9'),
        get_float_udf('total_deal_volume').alias('p2p_feature_10'),
        get_float_udf('per_lending_num').alias('p2p_feature_11'),
        get_float_udf('avg_lend_time').alias('p2p_feature_12'),
        get_float_udf('per_borrowing_num').alias('p2p_feature_13'),
        get_float_udf('per_borrowing_amount').alias('p2p_feature_14'),
        get_float_udf('loan_balance').alias('p2p_feature_15'),
        json_to_obj_udf('monthly_deal_data').getItem(
            'nominal_interest_rate').alias('p2p_feature_16'),
        get_float_udf('total_num_of_borrower').alias('p2p_feature_17'))

    prd_platform_df = tid_platform_df.join(tid_wdzj_df, [
        tid_platform_df.platform_name == tid_wdzj_df.platform_name,
        tid_platform_df.company_name == tid_wdzj_df.company_name
    ], 'outer').select(
        get_unique_string_udf(tid_platform_df.bbd_qyxx_id,
                              tid_wdzj_df.bbd_qyxx_id).alias('bbd_qyxx_id'),
        get_unique_string_udf(tid_platform_df.company_name,
                              tid_wdzj_df.company_name).alias('company_name'),
        get_unique_string_udf(
            tid_platform_df.platform_name,
            tid_wdzj_df.platform_name).alias('platform_name'),
        get_unique_string_udf(
            tid_platform_df.platform_state,
            tid_wdzj_df.platform_state).alias('platform_state'),
        'p2p_feature_1', 'p2p_feature_2', 'p2p_feature_3', 'p2p_feature_4',
        'p2p_feature_5', 'p2p_feature_6', 'p2p_feature_7', 'p2p_feature_8',
        'p2p_feature_9', 'p2p_feature_10', 'p2p_feature_11', 'p2p_feature_12',
        'p2p_feature_13', 'p2p_feature_14', 'p2p_feature_15', 'p2p_feature_16',
        'p2p_feature_17', tid_wdzj_df.p2p_feature_18,
        tid_wdzj_df.p2p_feature_19, tid_wdzj_df.p2p_feature_20).dropDuplicates(
            ['bbd_qyxx_id', 'platform_name']).fillna(0.)

    return prd_platform_df
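
# Hedged sketch (not the original implementation): the outer join above relies on a
# get_unique_string helper to reconcile the two join sides; a minimal version, assuming
# it simply prefers the first non-empty value, could look like this.
def get_unique_string_sketch(left_value, right_value):
    """Return whichever of the two join-side strings is non-empty (assumed behaviour)."""
    return left_value if left_value else right_value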
Example #26
0
               when(F.col("naics_code").isin(722511), "full_service_restaurants").\
               when(F.col("naics_code").isin(722513), "limited_service_restaurants").\
               when(F.col("naics_code").isin(446110, 446191), "pharmacies_and_drug_stores").\
               when(F.col("naics_code").isin(311811,722515), "snack_and_bakeries").\
               when(F.col("naics_code").isin(445210,445220,445230,445291,445292,445299), "specialty_food_stores").\
               when(F.col("naics_code").isin(445110), "supermarkets_except_convenience_stores")).\
    select("placekey","safegraph_place_id","naics_code","file_name")

    def explodeVisits(date_range_start, visit_by_day):
        start = datetime.datetime(*map(int, date_range_start[:10].split('-')))
        return {(start + datetime.timedelta(days=days)): visits
                for days, visits in enumerate(json.loads(visit_by_day))}

    # Credit to the professor; I leveraged this piece of code from class

    udfExpand = F.udf(explodeVisits, T.MapType(T.DateType(), T.IntegerType()))
    df = spark.read.csv("hdfs:///data/share/bdm/weekly-patterns-nyc-2019-2020/*", header=True) \
           .select("placekey", "safegraph_place_id",
              F.explode(udfExpand('date_range_start', 'visits_by_day')) \
                 .alias('date', 'visits'))

    # .where(f"date=='{date}'")
    # Credit to the professor; I leveraged this piece of code from class


    def find_median(values_list):
        try:
            median = np.median(values_list)
            return round(float(median), 2)
        except Exception:
            return None
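
    # Hedged usage sketch (not part of the original snippet): find_median would typically be
    # wrapped as a UDF so the exploded daily visit counts can be reduced to a median per
    # place; the grouping columns below are illustrative assumptions.
    udfMedian = F.udf(find_median, T.FloatType())
    median_df = df.groupBy('placekey', 'date') \
                  .agg(F.collect_list('visits').alias('visits_list')) \
                  .select('placekey', 'date',
                          udfMedian('visits_list').alias('median_visits'))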
Example #27
0
def ibis_map_dtype_to_spark_dtype(ibis_dtype_obj):
    key_type = spark_dtype(ibis_dtype_obj.key_type)
    value_type = spark_dtype(ibis_dtype_obj.value_type)
    value_contains_null = ibis_dtype_obj.value_type.nullable
    return pt.MapType(key_type, value_type, value_contains_null)
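
# Hedged usage sketch (pt is assumed to be pyspark.sql.types, as the function above
# suggests): an ibis map<string, double> with nullable values should convert to the
# equivalent of this hand-built Spark type.
import pyspark.sql.types as pt

expected_type = pt.MapType(pt.StringType(), pt.DoubleType(), valueContainsNull=True)
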
def run_job(spark_context,
            sql_context,
            submission_date_range,
            use_test_data=False):
    """
    Compute crash aggregates for the specified submission date range,
    and upload the result to S3.
    """
    start_date = datetime.strptime(submission_date_range[0], "%Y%m%d").date()
    end_date = datetime.strptime(submission_date_range[1], "%Y%m%d").date()

    schema = types.StructType([
        types.StructField("activity_date", types.StringType(), nullable=False),
        types.StructField("dimensions",
                          types.MapType(types.StringType(), types.StringType(),
                                        True),
                          nullable=False),
        types.StructField("stats",
                          types.MapType(types.StringType(), types.DoubleType(),
                                        True),
                          nullable=False),
    ])

    current_date = start_date
    while current_date <= end_date:
        # useful statements for testing the program
        if use_test_data:
            # use test pings; very good for debugging the uploading process
            sys.path.append(
                os.path.join(os.path.dirname(os.path.abspath(__file__)), "..",
                             "test"))
            import dataset
            pings = spark_context.parallelize(list(dataset.generate_pings()))
        else:
            pings = retrieve_crash_data(spark_context,
                                        current_date.strftime("%Y%m%d"),
                                        COMPARABLE_DIMENSIONS, FRACTION)

        (result, main_processed_count, main_ignored_count,
         crash_processed_count, crash_ignored_count) = compare_crashes(
             spark_context, pings, COMPARABLE_DIMENSIONS, DIMENSION_NAMES)
        result = result.coalesce(1)  # put everything into a single partition
        df = sql_context.createDataFrame(result, schema)
        print("SUCCESSFULLY COMPUTED CRASH AGGREGATES FOR {}".format(
            current_date))

        # upload the dataframe as Parquet to S3
        s3_result_url = (
            "s3n://telemetry-parquet/crash_aggregates/v1/submission_date={}".
            format(current_date))
        df.write.parquet(s3_result_url)

        print("SUCCESSFULLY UPLOADED CRASH AGGREGATES FOR {} TO S3:".format(
            current_date))
        print("{} main pings processed, {} main pings ignored".format(
            main_processed_count.value, main_ignored_count.value))
        print("{} crash pings processed, {} crash pings ignored".format(
            crash_processed_count.value, crash_ignored_count.value))

        current_date += timedelta(days=1)

    print("========================================")
    print("JOB COMPLETED SUCCESSFULLY")
    print("========================================")
class PA2Data(object):
    review_schema = T.StructType([
        T.StructField('reviewerID', T.StringType(), False),
        T.StructField('asin', T.StringType(), False),
        T.StructField('overall', T.FloatType(), False)
    ])
    product_schema = T.StructType([
        T.StructField('asin', T.StringType()),
        T.StructField('salesRank', T.StringType()),
        T.StructField('categories', T.StringType()),
        T.StructField('title', T.StringType()),
        T.StructField('price', T.FloatType()),
        T.StructField('related', T.StringType())
    ])
    product_processed_schema = T.StructType([
        T.StructField('asin', T.StringType()),
        T.StructField('title', T.StringType()),
        T.StructField('category', T.StringType())
    ])
    salesRank_schema = T.MapType(T.StringType(), T.IntegerType())
    categories_schema = T.ArrayType(T.ArrayType(T.StringType()))
    related_schema = T.MapType(T.StringType(), T.ArrayType(T.StringType()))
    schema = {
        'review': review_schema,
        'product': product_schema,
        'product_processed': product_processed_schema
    }
    metadata_schema = {
        'salesRank': salesRank_schema,
        'categories': categories_schema,
        'related': related_schema
    }

    def __init__(self,
                 spark,
                 path_dict,
                 output_root,
                 deploy,
                 input_format='dataframe'):
        self.spark = spark
        self.path_dict = path_dict
        self.output_root = output_root
        self.deploy = deploy
        self.input_format = input_format

    def load(self, name, path, infer_schema=False):
        if name in ['ml_features_train', 'ml_features_test']:
            data = self.spark.read.parquet(path)
        else:
            schema = self.schema[name] if not infer_schema else None
            data = self.spark.read.csv(path,
                                       schema=schema,
                                       escape='"',
                                       quote='"',
                                       inferSchema=infer_schema,
                                       header=True)
        if name == 'product':
            for column, column_schema in self.metadata_schema.items():
                if column in data.columns:
                    data = data.withColumn(
                        column, F.from_json(F.col(column), column_schema))
        return data

    def load_all(self, input_format='dataframe', no_cache=False):
        self.input_format = input_format
        print("Loading datasets ...", end='')  # noqa
        data_dict = {}
        count_dict = {}
        for name, path in self.path_dict.items():

            data = self.load(name, path)
            if input_format == 'rdd':
                data = data.rdd
            elif input_format == 'koalas':
                data = data.to_koalas()
            if self.deploy and not no_cache:
                data = data.cache()
            data_dict[name] = data
            count_dict[name] = data.count() if not no_cache else None
        print("Done")
        return data_dict, count_dict

    def cache_switch(self, data_dict, part):
        count_dict = {}
        if self.input_format == 'koalas':
            print('cache_switch() has no effect on Koalas')
        else:
            part_1_data = ['product', 'review', 'product_processed']
            part_2_data = ['ml_features_train', 'ml_features_test']
            if part == 'part_1':
                data_dict, count_dict = self.switch(data_dict, part_1_data,
                                                    part_2_data)
            elif part == 'part_2':
                data_dict, count_dict = self.switch(data_dict, part_2_data,
                                                    part_1_data)
            else:
                raise ValueError
        return data_dict, count_dict

    def switch(self, data_dict, to_persist, to_unpersist):
        count_dict = {}
        for name in to_unpersist:
            try:
                data_dict[name].unpersist()
            except Exception:
                # ignore datasets that were never cached
                pass
        for name in to_persist:
            data_dict[name] = data_dict[name].cache()
            count_dict[name] = data_dict[name].count()
        return data_dict, count_dict

    def save(self, res, task_name, filename=None):
        if task_name in TASK_NAMES or task_name in ['task_0', 'summary']:
            if not filename:
                filename = task_name
            if isinstance(res, Mapping):
                df = self.spark.createDataFrame([res])
            else:
                df = self.spark.createDataFrame(res)
            output_path = 'file://' + os.path.join(self.output_root,
                                                   filename + EXT)
            df.coalesce(1).write.mode('overwrite').json(output_path)
        else:
            raise ValueError
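
# Hedged usage sketch (the literal JSON value is made up; `spark` and `F` are assumed to be
# the active session and pyspark.sql.functions): the metadata_schema entries above drive
# F.from_json in load(), e.g. a salesRank string parses into a map<string, int>.
demo = spark.range(1).select(
    F.from_json(F.lit('{"Books": 12345}'), PA2Data.salesRank_schema).alias('salesRank'))
# demo.first().salesRank == {'Books': 12345}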
Example #30
0
def spark_data_flow(smjj_version):
    # private fund information
    smjj_df = spark.sql(
        '''
        SELECT
        *
        FROM
        dw.qyxg_jijin_simu
        WHERE
        dt = '{version}'
        '''.format(
            version=smjj_version
        )
    )
    tid_df = smjj_df.select(
        'bbd_qyxx_id',
        smjj_df.fund_manager_chinese.alias('company_name'),
        'managed_fund_type',
        'pic_millon',
        'regcap_paidpro',
        'law_firm',
        'no_qualification',
        'employees',
        'entitled_way',
        'ifcareer_qualification',
        'vip_type',
        'interim_after_fund',
        'interim_before_fund',
        'integrity_info',
        'special_message',
        'legal_opinion'
    ).dropDuplicates(
        ['company_name']
    ).cache()
    
    udf_return_type = tp.FloatType()
    prd_df = tid_df.select(
        'bbd_qyxx_id',
        'company_name',
        SparkUdf.define_spark_udf(
            1, udf_return_type)('managed_fund_type').alias('pe_feature_1'),
        SparkUdf.define_spark_udf(
            2, udf_return_type)('pic_millon').alias('pe_feature_2'),
        SparkUdf.define_spark_udf(
            3, udf_return_type)('regcap_paidpro').alias('pe_feature_3'),
        SparkUdf.define_spark_udf(
            4, udf_return_type)('law_firm').alias('pe_feature_4'),
        SparkUdf.define_spark_udf(
            5, udf_return_type)('no_qualification').alias('pe_feature_5'),
        SparkUdf.define_spark_udf(
            6, udf_return_type)('employees').alias('pe_feature_6'),
        SparkUdf.define_spark_udf(
            7, udf_return_type)('entitled_way').alias('pe_feature_7'),
        SparkUdf.define_spark_udf(
            8, udf_return_type)('ifcareer_qualification').alias('pe_feature_8'),
        SparkUdf.define_spark_udf(
            9, udf_return_type)('vip_type').alias('pe_feature_9'),    
        SparkUdf.define_spark_udf(
            10, udf_return_type)('interim_after_fund').alias('pe_feature_10'), 
        SparkUdf.define_spark_udf(
            11, udf_return_type)('interim_before_fund').alias('pe_feature_11'), 
        SparkUdf.define_spark_udf(
            12, tp.MapType(tp.StringType(), tp.StringType())
        )('integrity_info').alias('pe_feature_12'),
        SparkUdf.define_spark_udf(
            13, tp.MapType(tp.StringType(), tp.StringType())
        )('integrity_info', 'special_message').alias('pe_feature_13'),
        SparkUdf.define_spark_udf(
            14, udf_return_type)('legal_opinion').alias('pe_feature_14')
    )
    
    return prd_df
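
# Hedged sketch (not the project's actual SparkUdf class; `fun` is assumed to be
# pyspark.sql.functions): define_spark_udf appears to map a feature number to a
# transformation function and wrap it as a Spark UDF with the requested return type;
# a minimal factory along those lines might look like this.
class SparkUdfSketch(object):
    _feature_funcs = {}  # assumed registry, e.g. {1: parse_managed_fund_type, ...}

    @classmethod
    def define_spark_udf(cls, feature_no, return_type):
        feature_func = cls._feature_funcs[feature_no]
        return fun.udf(feature_func, return_type)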