Code example #1
    def _get_mon_metric_json_schema():
        """get the schema of the incoming monasca metric."""

        metric_struct_field = StructField(
            "metric",
            StructType([
                StructField("dimensions",
                            MapType(StringType(), StringType(), True), True),
                StructField("value_meta",
                            MapType(StringType(), StringType(), True), True),
                StructField("name", StringType(), True),
                StructField("timestamp", StringType(), True),
                StructField("value", StringType(), True)
            ]), True)

        meta_struct_field = StructField(
            "meta", MapType(StringType(), StringType(), True), True)

        creation_time_struct_field = StructField("creation_time", StringType(),
                                                 True)

        schema = StructType([
            creation_time_struct_field, meta_struct_field, metric_struct_field
        ])
        return schema
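
One way to exercise a schema like this is with from_json on a column holding the raw JSON payload. A minimal usage sketch (not from the monasca-transform source), assuming the builder above is callable as a standalone function and the pyspark type imports are in scope:

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col

spark = SparkSession.builder.getOrCreate()
raw = spark.createDataFrame(
    [('{"creation_time": "1600000000", "meta": {"region": "useast"}, '
      '"metric": {"name": "cpu.idle_perc", "timestamp": "1600000000", '
      '"value": "97.5", "dimensions": {"hostname": "node1"}, "value_meta": {}}}',)],
    ["json_value"])
parsed = raw.select(from_json(col("json_value"), _get_mon_metric_json_schema()).alias("m"))
parsed.select("m.metric.name", "m.metric.dimensions").show(truncate=False)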
Code example #2
File: main.py, Project: xu-weiyuan/blue-marlin
def add_ucdoc_bb_allocation_map(cfg, df, bookings_map):
    def helper(ands, minus, amount, day, allocated):
        es_client_predictions = ESClient(cfg['es_host'], cfg['es_port'],
                                         cfg['es_predictions_index'], cfg['es_predictions_type'])
        _, result = get_ucdoc_prediction_count(
            ands, minus, bookings_map, day, es_client_predictions)

        # apply tbr on prediction values
        prediction_inventory = sum(result.values())
        tbr_ratio = amount * 1.0 / prediction_inventory
        result.update((x, int(y * tbr_ratio)) for x, y in result.items())

        # {'magazinelock,3,5G,g_x,2,pt,1004,icc': 788, 'minusonepage,1,5G,g_f,4,pt,1003,icc': 5017}
        resources = result

        # {'b2': 800, 'b3': 1000, 'b1': 500}
        demands = allocated

        # the ordering of the bookings here is arbitrary
        allocation_map = hwm_generic_allocation(
            resources, resources.keys(), demands, demands.keys())

        return allocation_map

    _map_type = MapType(StringType(), IntegerType())
    df = df.withColumn('allocation_map', udf(helper, MapType(StringType(), _map_type))(
        df.ands, df.minus, df.amount, df.day, df.allocated))

    return df
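
The allocation_map column is declared as a map of maps, MapType(StringType(), MapType(StringType(), IntegerType())). A hypothetical follow-up, not part of the project code, showing one way to flatten such a column with two explode calls; which level carries the uckey and which the booking id depends on hwm_generic_allocation, and cfg, df, bookings_map are assumed to be in scope:

from pyspark.sql import functions as F

df_alloc = add_ucdoc_bb_allocation_map(cfg, df, bookings_map)
flat = (df_alloc
        .select('day', F.explode('allocation_map').alias('outer_key', 'inner_map'))
        .select('day', 'outer_key', F.explode('inner_map').alias('inner_key', 'allocated_count')))
flat.show(truncate=False)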
Code example #3
def transfrom_comment_degrees(spark, database, comments_degree_collection):
    def deserialize(d):
        d.pop('_id')
        for k, v in d.items():
            if k == 'cDegrees':
                d[k] = json.loads(v)
                for sd in d[k]:
                    for i in sd['sDegrees']:
                        i['negAdv'] = {} if not i['negAdv'] else i['negAdv']
            else:
                d[k] = v

        return d

    degrees_df = spark.read.format("com.mongodb.spark.sql.DefaultSource"). \
        option("uri", "mongodb://127.0.0.1/").option("database", database).option("collection",
                                                                                  comments_degree_collection).\
        load()

    degrees_rdd = degrees_df.rdd.map(lambda y: y.asDict(recursive=True)). \
        map(lambda x: deserialize(x))

    fields = [
        StructField('pId', LongType(), False),
        StructField('cId', LongType(), False),
        StructField('cDegValue', DoubleType(), False),
        StructField(
            'cDegrees',
            ArrayType(
                StructType([
                    StructField('sId', IntegerType(), True),
                    StructField(
                        'sDegrees',
                        ArrayType(
                            StructType([
                                StructField(
                                    'feature',
                                    MapType(StringType(), StringType(), True),
                                    True),
                                StructField(
                                    'sentiment',
                                    MapType(StringType(), StringType(), True),
                                    True),
                                StructField(
                                    'negAdv',
                                    MapType(StringType(), StringType(), True),
                                    True),
                                StructField('relate', StringType(), False),
                                StructField('degValue', IntegerType(), True)
                            ]), True), True)
                ]), True), True)
    ]

    schema = StructType(fields)
    temp = spark.createDataFrame(degrees_rdd, schema)
    temp.write.format("com.mongodb.spark.sql.DefaultSource").mode("overwrite"). \
        option("uri", "mongodb://127.0.0.1/").option("database", database).option("collection",
                                                                                  'temp3'). \
        save()
Code example #4
File: stream.py, Project: rajrohith/lambda-arch1
def classify_tweets(time, rdd):
    # Get the singleton instance of SparkSession
    spark = utils.get_spark_session_instance(rdd.context.getConf())
    sql_context = SQLContext(spark.sparkContext)

    # Filter tweets without text
    row_rdd = rdd.map(lambda tweet: Row(
            id_str=tweet["id_str"],
            text=tweet["text"],
            timestamp_ms=tweet["timestamp_ms"],
            created_at=tweet["created_at"],
            user=tweet["user"],
            sentiment=tweet["sentiment"]
    )).filter(lambda tweet: tweet["text"])
    print(row_rdd.take(5))

    schema = StructType([
        StructField("id_str", StringType(), True),
        StructField("text", StringType(), True),
        StructField("timestamp_ms", StringType(), True),
        StructField("created_at", StringType(), True),
        StructField("user", MapType(StringType(), StringType()), True),
        StructField("sentiment", MapType(StringType(), FloatType()), True),
    ])
    tweets_df = spark.createDataFrame(row_rdd, schema=schema)

    # Fit the texts in the LDA model and get the topics
    try:
        custom_stop_words = []
        pipeline = ml_utils.set_pipeline(custom_stop_words)
        model = pipeline.fit(tweets_df)

        result = model.transform(tweets_df)

        lda_model = LocalLDAModel.load("s3a://current-models/LDAModel")

        prediction = lda_model.transform(result)
        prediction.show(truncate=True)

        tweets_with_prediction = prediction.rdd.map(lambda tweet: Row(
            id_str=tweet["id_str"],
            text=tweet["text"],
            timestamp_ms=int(tweet["timestamp_ms"]),
            created_at=tweet["created_at"],
            user=tweet["user"],
            sentiment=tweet["sentiment"],
            topic_distribution=topic_distibution_to_dict(tweet["topicDistribution"])
        ))
        print(tweets_with_prediction.take(5))

        save_to_elastic(tweets_with_prediction)

        tweets_with_prediction_df = sql_context.createDataFrame(tweets_with_prediction)
        tweets_with_prediction_df.registerTempTable("tweets")
        exploded_tweets = sql_context.sql("select id_str, timestamp_ms, explode(topic_distribution) from tweets")
        exploded_tweets.foreachPartition(write_to_kafka)
    except Exception:
        print(sys.exc_info())
Code example #5
def create_user_df (hive_context, num_users, num_time_intervals, num_did_buckets):
    time_intervals = [1586822400 - i*86400 for i in range(num_time_intervals)]
    keyword_list = [
        "education", "entertainment", "game-act", "game-avg", "game-cnc", 
        "game-ent", "game-fishing", "game-moba", "game-mon", "game-rpg", 
        "game-sim", "game-slg", "health", "info", "living-car", 
        "living-food", "living-house", "living-insurance", "living-makeup", "living-map", 
        "living-mon", "living-photo", "other", "reading", "shopping", 
        "social", "sports", "travel", "video", 
    ]
    keyword_index_map = {
        "education": 1, "entertainment": 2, "game-act": 3, "game-avg": 4, "game-cnc": 5, 
        "game-ent": 6, "game-fishing": 7, "game-moba": 8, "game-mon": 9, "game-rpg": 10, 
        "game-sim": 11, "game-slg": 12, "health": 13, "info": 14, "living-car": 15, 
        "living-food": 16, "living-house": 17, "living-insurance": 18, "living-makeup": 19, "living-map": 20, 
        "living-mon": 21, "living-photo": 22, "other": 23, "reading": 24, "shopping": 25, 
        "social": 26, "sports": 27, "travel": 28, "video": 29, 
    }

    user_data = []
    for i in range(num_users):
        keyword_samples = [random.sample(keyword_list, random.randint(1,3)) for _ in range(num_time_intervals)]
        kws_sample = random.sample(keyword_list, int(0.9*len(keyword_list)))
        show_counts = [[int(random.expovariate(1))+1 for _ in keywords] for keywords in keyword_samples]
        user_data.append(Row(
            age = random.randint(1, 6), 
            gender = random.randint(0, 1), 
            did = hex(random.getrandbits(256)).lstrip('0x').rstrip('L'),
            did_index = i, 
            interval_start_time = time_intervals,
            interval_keywords = [','.join(keyword) for keyword in keyword_samples],
            kwi = [','.join([str(keyword_index_map[keyword]) for keyword in keywords]) for keywords in keyword_samples], 
            kwi_show_counts = [','.join('{}:{}'.format(keyword_index_map[keyword], count) for count, keyword in zip(show_count, keywords)) for show_count, keywords in zip(show_counts, keyword_samples)], 
            kwi_click_counts = [','.join('{}:{}'.format(keyword_index_map[keyword], count - random.randint(0, count)) for count, keyword in zip(show_count, keywords)) for show_count, keywords in zip(show_counts, keyword_samples)], 
            kws = {keyword: random.random() for keyword in kws_sample},
            kws_norm = {keyword: random.random() for keyword in kws_sample},
            did_bucket = random.randrange(num_did_buckets)
        ))

    schema = StructType([        
        StructField("age", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("did", StringType(), True),
        StructField("did_index", IntegerType(), True),
        StructField("interval_start_time", ArrayType(StringType()), True),
        StructField("interval_keywords", ArrayType(StringType()), True),
        StructField("kwi", ArrayType(StringType()), True),
        StructField("kwi_show_counts", ArrayType(StringType()), True),
        StructField("kwi_click_counts", ArrayType(StringType()), True),
        StructField("kws", MapType(StringType(), FloatType()), True),
        StructField("kws_norm", MapType(StringType(), FloatType()), True),
        StructField("did_bucket", IntegerType(), True),
    ])

    return hive_context.createDataFrame(user_data, schema)
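
A hypothetical inspection of the generated frame (assuming a hive_context is available and Spark 2.3+ for map_keys), looking up one keyword weight in the kws map column:

from pyspark.sql import functions as F

users = create_user_df(hive_context, num_users=10, num_time_intervals=5, num_did_buckets=4)
users.select('did',
             F.col('kws').getItem('game-moba').alias('game_moba_weight'),
             F.map_keys('kws').alias('keywords')).show(5, truncate=False)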
Code example #6
    def num_featurizer(self,
                       data_frame,
                       ref_df=None,
                       featurize_process=["summary_stat", "sustainment_q"],
                       inputCol="VALUE",
                       labelCol="ITEMID",
                       outputCol="num_features",
                       REPARTITION_CONST=None):
        from pyspark.sql.functions import udf, array
        from pyspark.sql.types import StringType, DoubleType, MapType
        if not data_frame:
            return
        ret_data_frame = self.value_aggregator(data_frame)
        if REPARTITION_CONST is not None:
            ret_data_frame = ret_data_frame.checkpoint()
            self.logger.debug(
                "[NUM_FEATURIZER] ret_dataframe checkpointed:{0}".format(
                    ret_data_frame.count()))
        if "summary_stat" in featurize_process:
            udf_summary_stat = udf(preprocessor_gen.calc_summary_stat,
                                   MapType(StringType(), DoubleType()))
            ret_data_frame = ret_data_frame.withColumn(
                "summary_stat", udf_summary_stat(inputCol + "_LIST", labelCol))
            if REPARTITION_CONST is not None:
                ret_data_frame = ret_data_frame.checkpoint()
                self.logger.debug(
                    "[NUM_FEATURIZER] summary_stat, ret_dataframe checkpointed:{0}"
                    .format(ret_data_frame.count()))

        if "sustainment_q" in featurize_process:
            udf_sustainment_quant = udf(
                preprocessor_gen.sustainment_quantifier,
                MapType(StringType(), DoubleType()))
            ret_data_frame = ret_data_frame.join(ref_df, labelCol).withColumn(
                "sustainment_q",
                udf_sustainment_quant("summary_stat", labelCol, "ref_avg",
                                      "ref_std",
                                      "ref_count")).drop("ref_avg").drop(
                                          "ref_std").drop("ref_count")
            if REPARTITION_CONST is not None:
                ret_data_frame = ret_data_frame.checkpoint()
                self.logger.debug(
                    "[NUM_FEATURIZER] sustainment_q, ret_dataframe checkpointed:{0}"
                    .format(ret_data_frame.count()))
        udf_merge_dict_all = udf(preprocessor_gen.merge_dict_all,
                                 MapType(StringType(), DoubleType()))
        ret_data_frame = ret_data_frame.withColumn(
            outputCol, udf_merge_dict_all(array(featurize_process)))
        return ret_data_frame
Code example #7
    def convert(self, ma_field: ma_fields.Dict) -> DataType:
        key_field_converter = self.converter_map.get(type(ma_field.key_field), StringConverter)
        value_field_converter = self.converter_map.get(type(ma_field.value_field), StringConverter)
        return MapType(
            key_field_converter(self.converter_map).convert(ma_field.key_field),
            value_field_converter(self.converter_map).convert(ma_field.value_field)
        )
Code example #8
    def test_data_type_ops(self):
        _mock_spark_type = DataType()
        _mock_dtype = ExtensionDtype()
        _mappings = (
            (CategoricalDtype(), _mock_spark_type, CategoricalOps),
            (_mock_dtype, DecimalType(), DecimalOps),
            (_mock_dtype, FractionalType(), FractionalOps),
            (_mock_dtype, IntegralType(), IntegralOps),
            (_mock_dtype, StringType(), StringOps),
            (_mock_dtype, BooleanType(), BooleanOps),
            (_mock_dtype, TimestampType(), DatetimeOps),
            (_mock_dtype, TimestampNTZType(), DatetimeNTZOps),
            (_mock_dtype, DateType(), DateOps),
            (_mock_dtype, DayTimeIntervalType(), TimedeltaOps),
            (_mock_dtype, BinaryType(), BinaryOps),
            (_mock_dtype, ArrayType(StringType()), ArrayOps),
            (_mock_dtype, MapType(StringType(), IntegralType()), MapOps),
            (_mock_dtype, StructType(), StructOps),
            (_mock_dtype, NullType(), NullOps),
            (_mock_dtype, UserDefinedType(), UDTOps),
        )
        for _dtype, _spark_type, _ops in _mappings:
            self.assertIsInstance(DataTypeOps(_dtype, _spark_type), _ops)

        _unknow_spark_type = _mock_spark_type
        self.assertRaises(TypeError, DataTypeOps, BooleanType(),
                          _unknow_spark_type)
Code example #9
def calculate_factdata_traffic(hive_context, factdata_table, bucket_id, day):
    def _list_to_map(count_array):
        count_map = {}
        for item in count_array:
            key_value = item.split(':')
            count_map[key_value[0]] = key_value[1]
        return count_map

    command = """
        SELECT
        FACTDATA.count_array,
        FACTDATA.day,
        FACTDATA.hour,
        FACTDATA.uckey
        FROM {} AS FACTDATA
        WHERE FACTDATA.bucket_id='{}' AND day='{}'
        """.format(factdata_table, str(bucket_id), str(day))

    df = hive_context.sql(command)
    list_to_map_udf = fn.udf(_list_to_map,
                             MapType(StringType(), StringType(), False))
    df = df.withColumn('count_map', list_to_map_udf(df.count_array))
    df = df.select('uckey', 'day', 'hour',
                   fn.explode(df.count_map)).withColumnRenamed(
                       "key", "price_cat").withColumnRenamed("value", "count")
    # [Row(uckey='native,72bcd2720e5011e79bc8fa163e05184e,WIFI,g_m,5,CPM,15,76', day='2019-11-02', hour=19, price_cat='3', count='4')]

    return df.groupby().agg(fn.sum('count').alias('count')).take(1)[0]['count']
Code example #10
    def test_apply_schema(self):
        from datetime import date, datetime
        rdd = self.sc.parallelize([(127, -128, -32768, 32767, 2147483647, 1.0,
                                    date(2010, 1, 1), datetime(2010, 1, 1, 1, 1, 1),
                                    {"a": 1}, (2,), [1, 2, 3], None)])
        schema = StructType([
            StructField("byte1", ByteType(), False),
            StructField("byte2", ByteType(), False),
            StructField("short1", ShortType(), False),
            StructField("short2", ShortType(), False),
            StructField("int1", IntegerType(), False),
            StructField("float1", FloatType(), False),
            StructField("date1", DateType(), False),
            StructField("time1", TimestampType(), False),
            StructField("map1", MapType(StringType(), IntegerType(), False), False),
            StructField("struct1", StructType([StructField("b", ShortType(), False)]), False),
            StructField("list1", ArrayType(ByteType(), False), False),
            StructField("null1", DoubleType(), True)])
        df = self.spark.createDataFrame(rdd, schema)
        results = df.rdd.map(lambda x: (x.byte1, x.byte2, x.short1, x.short2, x.int1, x.float1,
                             x.date1, x.time1, x.map1["a"], x.struct1.b, x.list1, x.null1))
        r = (127, -128, -32768, 32767, 2147483647, 1.0, date(2010, 1, 1),
             datetime(2010, 1, 1, 1, 1, 1), 1, 2, [1, 2, 3], None)
        self.assertEqual(r, results.first())

        with self.tempView("table2"):
            df.createOrReplaceTempView("table2")
            r = self.spark.sql("SELECT byte1 - 1 AS byte1, byte2 + 1 AS byte2, " +
                               "short1 + 1 AS short1, short2 - 1 AS short2, int1 - 1 AS int1, " +
                               "float1 + 1.5 as float1 FROM table2").first()

            self.assertEqual((126, -127, -32767, 32766, 2147483646, 2.5), tuple(r))
Code example #11
File: main.py, Project: KChalk/RedditProject
def filterPosts(fileList, sc, ss, subs=set(), minwords='100'):
    tokensUDF = udf(tokenize, MapType(StringType(),IntegerType()))
    countUDF = udf(sumCounter, IntegerType())

    firstFile=True
    for filename in fileList:
        month=filename[-9:-4]
        print('\n\n\n reading', month, filename)
        monthData = ss.read.json(filename)

        if subs!=set():
            monthData=monthData.filter(monthData.subreddit.isin(subs))

        filtered= monthData \
            .filter(monthData['is_self'] == True) 	\
            .select('id','subreddit', tokensUDF('selftext').alias('counter'))	\
            .withColumn('wordcount', countUDF('counter'))	\
            .filter('wordcount >='+minwords) \
            .select('id','subreddit','counter', 'wordcount') \
            .withColumn('month', lit(month))
        print('\n\n\n saving', month)
        filtered.write.parquet('filtered_'+month+'.parquet', mode='overwrite')
        if firstFile:
            alldata=filtered
            firstFile=False
        else:
            alldata=alldata.union(filtered)

    return alldata
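
The tokenize and sumCounter helpers are referenced but not included in this snippet; minimal stand-ins, assuming whitespace tokenization, that line up with the declared UDF return types:

from collections import Counter

def tokenize(text):
    # dict of word -> count, matching MapType(StringType(), IntegerType())
    return dict(Counter((text or '').lower().split()))

def sumCounter(counter):
    # total word count for the post, matching IntegerType()
    return sum(counter.values()) if counter else 0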
Code example #12
    def _get_instance_usage_schema():
        """get instance usage schema."""

        # Initialize columns for all string fields
        columns = [
            "tenant_id", "user_id", "resource_uuid", "geolocation", "region",
            "zone", "host", "project_id", "aggregated_metric_name",
            "firstrecord_timestamp_string", "lastrecord_timestamp_string",
            "service_group", "service_id", "usage_date", "usage_hour",
            "usage_minute", "aggregation_period", "namespace", "pod_name",
            "app", "container_name", "interface", "deployment", "daemon_set"
        ]

        columns_struct_fields = [
            StructField(field_name, StringType(), True)
            for field_name in columns
        ]

        # Add columns for non-string fields
        columns_struct_fields.append(
            StructField("firstrecord_timestamp_unix", DoubleType(), True))
        columns_struct_fields.append(
            StructField("lastrecord_timestamp_unix", DoubleType(), True))
        columns_struct_fields.append(
            StructField("quantity", DoubleType(), True))
        columns_struct_fields.append(
            StructField("record_count", DoubleType(), True))

        columns_struct_fields.append(
            StructField("processing_meta",
                        MapType(StringType(), StringType(), True), True))
        schema = StructType(columns_struct_fields)

        return schema
Code example #13
def process_corpus(raw_corpus,
                   normalizer=normalize,
                   tokenizer=tokenize,
                   ngram_counter=ngram_counts):
    """
    :param raw_corpus: RDD[Tuple[int, str]] as returned from load_raw_corpus
    :param normalizer: Callable[[str], str] preprocessing function
    :param tokenizer: Callable[[str], Iterable[str]]
    :param ngram_counter: Callable[[Iterable[str]], Tuple[int, Dict[str, int]]]
    :return: DataFrame[document_index: bigint, wc: bigint, token_counts: map<string,int>]
    """
    schema = StructType([
        StructField("document_index", LongType()),
        StructField(
            "data",
            StructType([
                StructField("wc", LongType()),
                StructField("token_counts", MapType(StringType(),
                                                    IntegerType())),
            ]),
        ),
    ])
    normalized = raw_corpus.mapValues(normalizer)
    return (normalized.mapValues(tokenizer).mapValues(ngram_counter).toDF(
        schema).select("document_index", "data.wc",
                       "data.token_counts"), normalized)
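
The default callables normalize, tokenize and ngram_counts are defined elsewhere; minimal stand-ins, assuming each per-document value must match the nested "data" struct (wc: bigint, token_counts: map<string,int>):

from collections import Counter

def normalize(text):
    return text.lower()

def tokenize(text):
    return text.split()

def ngram_counts(tokens):
    # returns (wc, token_counts) so the value lines up with the "data" struct
    tokens = list(tokens)
    return (len(tokens), dict(Counter(tokens)))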
Code example #14
def parametric_action_preprocessing(
    df,
    actions: List[str],
    multi_steps: Optional[int] = None,
    include_possible_actions: bool = True,
):
    assert (not include_possible_actions
            ), "current we don't support include_possible_actions"

    next_map_udf = make_next_udf(multi_steps, MapType(LongType(), FloatType()))
    df = df.withColumn("next_action", next_map_udf("next_action"))

    def make_not_terminal_udf():
        """ Return true iff next_action is an empty map """
        def get_not_terminal(next_action):
            return len(next_action) > 0

        return udf(get_not_terminal, BooleanType())

    not_terminal_udf = make_not_terminal_udf()
    df = df.withColumn("not_terminal", not_terminal_udf("next_action"))

    df = make_sparse2dense(df, "action", actions)
    df = make_sparse2dense(df, "next_action", actions)
    return df
Code example #15
    def _transform(self, dataframe):

        out_col = self.getOutputCol()
        in_col = self.getInputCol()

        def get_content(data):
            contents = {}
            lines = data.splitlines(keepends=False)

            for line in lines:
                json_line = json.loads(line)
                feature_array = json_line.get('features')

                for element in feature_array:
                    name = element.get('name')
                    value = element.get('value')
                    if name in contents:
                        contents[name].append(value)
                    else:
                        contents[name] = [value]

            return contents

        get_cntn = udf(get_content,
                       MapType(StringType(), ArrayType(DoubleType())))
        return dataframe.withColumn(out_col, get_cntn(in_col))
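
For illustration only (not part of the transformer), a condensed standalone copy of the nested helper shows the expected JSON-lines input and the map-of-arrays it produces:

import json

def get_content(data):
    contents = {}
    for line in data.splitlines():
        for element in json.loads(line).get('features'):
            contents.setdefault(element.get('name'), []).append(element.get('value'))
    return contents

sample = '\n'.join([
    '{"features": [{"name": "f1", "value": 0.5}, {"name": "f2", "value": 1.0}]}',
    '{"features": [{"name": "f1", "value": 0.7}]}',
])
print(get_content(sample))  # {'f1': [0.5, 0.7], 'f2': [1.0]}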
Code example #16
File: posts.py, Project: MichalTrojanowski/ZTNBD-ZAD
    def _transform(self, dataframe):
        out_col = self.getOutputCol()
        in_col = self.getInputCol()

        def tags_sum_by_key(tags):
            types = {}

            for tag in tags:
                if tag[1] in types:
                    types[tag[1]] += 1
                else:
                    types[tag[1]] = 1

            return types

        def extract_speech_parts(data):
            tags = []
            for post in data:
                tags.extend(TextBlob(post).tags)
            speech_parts = tags_sum_by_key(tags)
            return speech_parts

        ext_speech_parts = udf(extract_speech_parts,
                               MapType(StringType(), IntegerType()))
        return dataframe.withColumn(out_col, ext_speech_parts(in_col))
Code example #17
File: test_arrow.py, Project: Swidasya/spark-research
    def test_toPandas_fallback_enabled(self):
        with self.sql_conf(
                {"spark.sql.execution.arrow.pyspark.fallback.enabled": True}):
            schema = StructType([
                StructField("map", MapType(StringType(), IntegerType()), True)
            ])
            df = self.spark.createDataFrame([({u'a': 1}, )], schema=schema)
            with QuietTest(self.sc):
                with self.warnings_lock:
                    with warnings.catch_warnings(record=True) as warns:
                        # we want the warnings to appear even if this test is run from a subclass
                        warnings.simplefilter("always")
                        pdf = df.toPandas()
                        # Catch and check the last UserWarning.
                        user_warns = [
                            warn.message for warn in warns
                            if isinstance(warn.message, UserWarning)
                        ]
                        self.assertTrue(len(user_warns) > 0)
                        self.assertTrue("Attempting non-optimization" in str(
                            user_warns[-1]))
                        assert_frame_equal(pdf,
                                           pd.DataFrame({u'map': [{
                                               u'a': 1
                                           }]}))
Code example #18
File: posts.py, Project: pkprzekwas/post_extractor
    def _transform(self, dataframe):
        out_col = self.getOutputCol()
        in_col = self.getInputCol()

        def extract_speech_parts(data):
            speech_parts = {}

            for tag in load('help/tagsets/upenn_tagset.pickle').keys():
                if any(c.isalpha() for c in tag):
                    speech_parts[tag] = []

            for data_line in data:
                data_line_tags = {}

                for tag_tuple in TextBlob(data_line).tags:
                    if tag_tuple[1] in data_line_tags:
                        data_line_tags[tag_tuple[1]] += 1
                    else:
                        data_line_tags[tag_tuple[1]] = 1

                for tag in speech_parts.keys():
                    if tag not in data_line_tags.keys():
                        speech_parts[tag].append(0)
                    else:
                        speech_parts[tag].append(data_line_tags[tag])
            return speech_parts

        ext_speech_parts = udf(extract_speech_parts,
                               MapType(StringType(), ArrayType(IntegerType())))
        return dataframe.withColumn(out_col, ext_speech_parts(in_col))
Code example #19
File: test_types.py, Project: zwj0110/spark
    def test_parse_datatype_string(self):
        from pyspark.sql.types import _all_atomic_types, _parse_datatype_string
        for k, t in _all_atomic_types.items():
            if t != NullType:
                self.assertEqual(t(), _parse_datatype_string(k))
        self.assertEqual(IntegerType(), _parse_datatype_string("int"))
        self.assertEqual(DecimalType(1, 1),
                         _parse_datatype_string("decimal(1  ,1)"))
        self.assertEqual(DecimalType(10, 1),
                         _parse_datatype_string("decimal( 10,1 )"))
        self.assertEqual(DecimalType(11, 1),
                         _parse_datatype_string("decimal(11,1)"))
        self.assertEqual(ArrayType(IntegerType()),
                         _parse_datatype_string("array<int >"))
        self.assertEqual(MapType(IntegerType(), DoubleType()),
                         _parse_datatype_string("map< int, double  >"))
        self.assertEqual(
            StructType([
                StructField("a", IntegerType()),
                StructField("c", DoubleType())
            ]), _parse_datatype_string("struct<a:int, c:double >"))
        self.assertEqual(
            StructType([
                StructField("a", IntegerType()),
                StructField("c", DoubleType())
            ]), _parse_datatype_string("a:int, c:double"))
        self.assertEqual(
            StructType([
                StructField("a", IntegerType()),
                StructField("c", DoubleType())
            ]), _parse_datatype_string("a INT, c DOUBLE"))
Code example #20
def to_schema_type(typ, elem):
    if typ is None:
        return hint_to_schema_type('None')
    if issubclass(typ, basestring):
        return hint_to_schema_type('str')
    if issubclass(typ, bool):
        return hint_to_schema_type('bool')
    if issubclass(typ, float):
        return hint_to_schema_type('float')
    if issubclass(typ, (int, long)):
        # Some integers cannot be stored in long, but we cannot tell this
        #  from the column type.  Let it fail in spark.
        return hint_to_schema_type('int')
    if issubclass(typ, datetime.datetime):
        return hint_to_schema_type('datetime')
    if issubclass(typ, list):
        if elem is None or len(elem) == 0:
            raise ValueError('Schema type cannot be determined.')
        elem_type = to_schema_type(type(elem[0]), None)
        if elem_type is None:
            raise TypeError('Element type cannot be determined')
        return ArrayType(elem_type)
    if issubclass(typ, dict):
        if elem is None or len(elem) == 0:
            raise ValueError('Schema type cannot be determined.')
        key_type = to_schema_type(type(elem.keys()[0]), None)
        if key_type is None:
            raise TypeError('Key type cannot be determined')
        val_type = to_schema_type(type(elem.values()[0]), None)
        if val_type is None:
            raise TypeError('Value type cannot be determined')
        return MapType(key_type, val_type)
    if issubclass(typ, types.NoneType):
        return None
    return hint_to_schema_type('str')
Code example #21
def as_pings_subset_df(as_df, date_start, total_period, slug=None):
    """
    get subset of activity stream pings with some columns standardized

    providing an experiment slug will add a branch field and remove pings
    without the slug

    table schema
      client_id
      activity_dt
      branch (optional)
      as cols

    """

    # function can take datetime or the s3 date string or as date string
    if type(date_start) == str:
        if '-' in date_start:
            date_start = string_to_date(date_start, '%Y-%m-%d')
        else:
            date_start = string_to_date(date_start)

    # get date end of maximum possible observation period
    date_obs_end = date_plus_N(date_start, total_period)

    # convert everything into as date string format
    date_start_str = date_to_string(date_start, '%Y-%m-%d')
    date_obs_end_str = date_to_string(date_obs_end, '%Y-%m-%d')

    # if we're looking back, switch start and end
    if date_obs_end < date_start:
        date_start_str = date_to_string(date_obs_end, '%Y-%m-%d')
        date_obs_end_str = date_to_string(date_start, '%Y-%m-%d')

    # ----------------- subset dates -----------------

    as_df = as_df.filter("date >= '%s'" % date_start_str)
    as_df = as_df.filter("date <= '%s'" % date_obs_end_str)

    # ----------------- tagged only if slug provided -----------------

    if slug is not None:
        # set up udf for parsing activity stream experiment field
        schema = MapType(StringType(), StringType())
        as_experiment_field_udf = udf(as_experiment_field, schema)

        # get experiments field into standard format
        as_df = as_df.withColumn('experiments',
                                 as_experiment_field_udf(F.col('shield_id')))

        # keep only data tagged with experiment and get branch column
        as_df = as_df.filter("experiments['%s'] is not null" % slug)
        as_df = as_df.withColumn('branch', F.col('experiments')[slug])
        as_df = as_df.drop('experiments')

    as_df = as_df.withColumn('activity_dt', F.col('date'))
    as_df = as_df.drop('shield_id').drop('date')

    return as_df
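
The as_experiment_field parser is defined elsewhere in the project; a purely hypothetical stand-in, assuming shield_id carries a JSON object mapping experiment slug to branch, returning the dict shape expected by MapType(StringType(), StringType()):

import json

def as_experiment_field(shield_id):
    # hypothetical parser; the real shield_id format may differ
    try:
        parsed = json.loads(shield_id) if shield_id else {}
        return {str(k): str(v) for k, v in parsed.items()}
    except (ValueError, AttributeError):
        return {}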
Code example #22
def transfrom_fea_sen_pairs(spark, database, fea_sen_pairs_collection):
    def deserialize(d):
        d.pop('_id')
        for k, v in d.items():
            d[k] = json.loads(v) if k == 'cFeaSenPairs' else v
        return d

    fea_sen_pairs_df = spark.read.format("com.mongodb.spark.sql.DefaultSource"). \
        option("uri", "mongodb://127.0.0.1/").option("database", database).option("collection",
                                                                                  fea_sen_pairs_collection). \
        load()

    fea_sen_pairs_rdd = fea_sen_pairs_df.rdd.map(lambda y: y.asDict(recursive=True)). \
        map(lambda x: deserialize(x))

    fields = [
        StructField('pId', LongType(), False),
        StructField('cId', LongType(), False),
        StructField(
            'cFeaSenPairs',
            ArrayType(
                StructType([
                    StructField('sId', IntegerType(), True),
                    StructField(
                        'sFeaSenPairs',
                        ArrayType(
                            StructType([
                                StructField(
                                    'feature',
                                    MapType(StringType(), StringType(), True),
                                    True),
                                StructField(
                                    'sentiment',
                                    MapType(StringType(), StringType(), True),
                                    True),
                                StructField('relate', StringType(), False),
                            ]), True), True)
                ]), True), True)
    ]

    schema = StructType(fields)
    temp = spark.createDataFrame(fea_sen_pairs_rdd, schema)
    temp.write.format("com.mongodb.spark.sql.DefaultSource").mode("overwrite"). \
        option("uri", "mongodb://127.0.0.1/").option("database", database).option("collection",
                                                                                  'temp'). \
        save()
Code example #23
File: test_arrow.py, Project: Swidasya/spark-research
    def test_toPandas_fallback_disabled(self):
        schema = StructType(
            [StructField("map", MapType(StringType(), IntegerType()), True)])
        df = self.spark.createDataFrame([(None, )], schema=schema)
        with QuietTest(self.sc):
            with self.warnings_lock:
                with self.assertRaisesRegexp(Exception, 'Unsupported type'):
                    df.toPandas()
Code example #24
File: ikea_assignment.py, Project: raalesir/ikea
    def get_schema(self):
        """
        returns the  data schema. In case the schema changes  in future, one can redefine the method

        :return: data schema
        """
        schema = StructType([
            StructField("user", StringType(), True),
            StructField("timestamp", TimestampType(), True),
            StructField(
                "items",
                ArrayType(
                    MapType(StringType(), MapType(StringType(),
                                                  DoubleType()))), True),
        ])

        return schema
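
An illustrative row matching this schema (not from the project): each entry in items is a map from product name to a map of numeric attributes, so the inner values must be doubles:

from datetime import datetime

sample = [{
    "user": "u123",
    "timestamp": datetime(2021, 5, 1, 12, 30),
    "items": [
        {"BILLY": {"price": 49.99, "quantity": 2.0}},
        {"POANG": {"price": 89.0, "quantity": 1.0}},
    ],
}]
# df = spark.createDataFrame(sample, schema=self.get_schema())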
Code example #25
def generate_idx_for_df(df: DataFrame, col_name: str, col_schema):
    idx_udf = udf(lambda x: udf_array_to_map(x),
                  MapType(IntegerType(), col_schema, True))
    df = df.withColumn("map", idx_udf(col(col_name)))
    df = df.select("problem_type", "user_id", "oms_protected", "problem_id",
                   "create_at",
                   explode("map").alias("item_id", "answer"))
    return df
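
udf_array_to_map is not included in this snippet; a minimal stand-in, assuming it maps each array element to its position so that explode can emit (item_id, answer) pairs:

def udf_array_to_map(array):
    # {0: first element, 1: second element, ...}, matching MapType(IntegerType(), col_schema)
    if array is None:
        return {}
    return dict(enumerate(array))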
Code example #26
    def _get_record_store_df_schema():
        """get instance usage schema."""

        columns = ["event_timestamp_string",
                   "event_type", "event_quantity_name",
                   "event_status", "event_version",
                   "record_type", "resource_uuid", "tenant_id",
                   "user_id", "region", "zone",
                   "host", "project_id",
                   "event_date", "event_hour", "event_minute",
                   "event_second", "metric_group", "metric_id"]

        columns_struct_fields = [StructField(field_name, StringType(), True)
                                 for field_name in columns]

        # Add columns for the non-string fields
        columns_struct_fields.insert(0,
                                     StructField("event_timestamp_unix",
                                                 DoubleType(), True))
        columns_struct_fields.insert(0,
                                     StructField("event_quantity",
                                                 DoubleType(), True))

        # map to metric meta
        columns_struct_fields.append(StructField("meta",
                                                 MapType(StringType(),
                                                         StringType(),
                                                         True),
                                                 True))
        # map to dimensions
        columns_struct_fields.append(StructField("dimensions",
                                                 MapType(StringType(),
                                                         StringType(),
                                                         True),
                                                 True))
        # map to value_meta
        columns_struct_fields.append(StructField("value_meta",
                                                 MapType(StringType(),
                                                         StringType(),
                                                         True),
                                                 True))

        schema = StructType(columns_struct_fields)

        return schema
Code example #27
def get_common_pyspark_schema():
    schema = StructType([
        StructField('day', StringType(), True),
        StructField('ands', ArrayType(StringType()), True),
        StructField('minus', ArrayType(StringType()), True),
        StructField('allocated', MapType(StringType(), IntegerType()), True),
        StructField('amount', IntegerType(), True)
    ])
    return schema
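
A hypothetical usage sketch: building a one-row DataFrame against this schema, with the allocated column carrying a booking-id-to-amount map (a plain tuple is used so the values bind positionally to the schema fields):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [('2020-01-01', ['pt', '5G'], [], {'b1': 500, 'b2': 800}, 1300)],
    schema=get_common_pyspark_schema())
df.show(truncate=False)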
Code example #28
def force_decimal_precision_scale(dt: DataType,
                                  precision: int = 38,
                                  scale: int = 18) -> DataType:
    """
    Returns a data type with a fixed decimal type.

    The precision and scale of the decimal type are fixed with the given values.

    Examples
    --------
    >>> from pyspark.sql.types import *
    >>> force_decimal_precision_scale(StructType([
    ...     StructField("A", DecimalType(10, 0), True),
    ...     StructField("B", DecimalType(14, 7), False)]))  # doctest: +NORMALIZE_WHITESPACE
    StructType(List(StructField(A,DecimalType(38,18),true),StructField(B,DecimalType(38,18),false)))

    >>> force_decimal_precision_scale(StructType([
    ...     StructField("A",
    ...         StructType([
    ...             StructField('a',
    ...                 MapType(DecimalType(5, 0),
    ...                 ArrayType(DecimalType(20, 0), False), False), False),
    ...             StructField('b', StringType(), True)])),
    ...     StructField("B", DecimalType(30, 15), False)]),
    ...     precision=30, scale=15)  # doctest: +NORMALIZE_WHITESPACE
    StructType(List(StructField(A,StructType(List(StructField(a,MapType(DecimalType(30,15),\
ArrayType(DecimalType(30,15),false),false),false),StructField(b,StringType,true))),true),\
StructField(B,DecimalType(30,15),false)))
    """
    if isinstance(dt, StructType):
        new_fields = []
        for field in dt.fields:
            new_fields.append(
                StructField(
                    field.name,
                    force_decimal_precision_scale(field.dataType, precision,
                                                  scale),
                    nullable=field.nullable,
                    metadata=field.metadata,
                ))
        return StructType(new_fields)
    elif isinstance(dt, ArrayType):
        return ArrayType(
            force_decimal_precision_scale(dt.elementType, precision, scale),
            containsNull=dt.containsNull,
        )
    elif isinstance(dt, MapType):
        return MapType(
            force_decimal_precision_scale(dt.keyType, precision, scale),
            force_decimal_precision_scale(dt.valueType, precision, scale),
            valueContainsNull=dt.valueContainsNull,
        )
    elif isinstance(dt, DecimalType):
        return DecimalType(precision=precision, scale=scale)
    else:
        return dt
Code example #29
def processGraphDFLogsV2(filepath, df_logs, spark, sc):
  
  def reduce_ccf(key, values):
    min_value = values.pop(values.index(min(values)))
    ret = {}
    ret[key] = min_value
    if min_value < key:
      for value in values:
        acc.add(1)
        ret[value] = min_value
    else:
      ret = None
    return ret

  reducer = F.udf(lambda x, y: reduce_ccf(x, y), MapType(IntegerType(), IntegerType()))

  schema = StructType([
      StructField("key", IntegerType(), True),
      StructField("value", IntegerType(), True)])

  df = spark.read.format('csv').load(filepath, header=False, delimiter='\t', schema=schema)
  df = df.na.drop()
  df_logs = add_log(df_logs, filepath, "python-df", 0, "start", datetime.now(), 0, 0)

  acc = sc.accumulator(1)
  loop_counter = 1
  
  while acc.value != 0:
    
    acc.value = 0
    print(f"----------\nStart loop at {datetime.now()}, accumulator value is {acc.value}")
    
    # CCF-Iterate
    df_inverter = df.select(F.col('value').alias('key'), F.col('key').alias('value'))
    df = df.union(df_inverter)
    
#     if logs==True:
#       acc.value = 0
#       debug = df.collect()
#       df_logs_ret = add_log(df_logs, filepath, "python-df", loop_counter, "it-map", datetime.now(), acc.value, len(debug)) 

    df = df.groupBy('key').agg(F.collect_list('value').alias('value'))
    df = df.withColumn('reducer', reducer('key', 'value')).select('reducer')
    df = df.select(F.explode('reducer'))
    df = df.na.drop()
    
    # CCF - Dedup
    df = df.distinct()
    collected = df.collect()
    df = spark.createDataFrame(sc.parallelize(collected), schema)
    print(f"End loop at {datetime.now()}, final value is {acc.value}")
    df_logs = add_log(df_logs, filepath, "python-df", loop_counter, "ded_reduce", datetime.now(), acc.value, len(collected))
    loop_counter += 1
  
  print(f"----------\nProcessed file at {datetime.now()}\n----------")
  return df_logs
Code example #30
def load_sql_user_functions(sc, sqlContext):
    """Load our custom UDAFs into a sql context."""
    sqlContext.udf.register('format_id', format_id, StringType())
    sqlContext.udf.register('format_metrics', format_metrics,
                            MapType(StringType(), IntegerType()))

    # custom aggregation function. Needs a jar provided in runner script.
    agg_counter = sc._jvm.com.jbennet.daskvsspark.udafs.AggregateCounter()
    sqlContext.sparkSession._jsparkSession.udf().register(
        'count_values', agg_counter)
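
format_id and format_metrics are defined elsewhere in the daskvsspark project and may take different arguments; purely hypothetical stand-ins that match the registered return types:

def format_id(raw_id):
    # StringType result
    return None if raw_id is None else str(raw_id)

def format_metrics(metrics):
    # MapType(StringType(), IntegerType()) result
    return {str(k): int(v) for k, v in (metrics or {}).items()}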