Example 1
 def generateExpr(columnName, listIntervals):
     if (len(listIntervals) == 1):
         return when(col(columnName).between(listIntervals[0][0], listIntervals[0][1]), 0).otherwise(None)
     else:
         return (when((col(columnName) >= listIntervals[0][0]) & (col(columnName) < listIntervals[0][1]),
                      len(listIntervals) - 1)
                 .otherwise(generateExpr(columnName, listIntervals[1:])))
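 # A minimal usage sketch (an assumption, not part of the original snippet): it presumes
 # an existing SparkSession `spark` and `from pyspark.sql.functions import col, when`,
 # which generateExpr itself needs. It tags each value of `v` with a bucket id.
 intervals = [(0, 10), (10, 20), (20, 30)]
 df_buckets = spark.range(25).toDF('v')
 df_buckets.withColumn('bucket', generateExpr('v', intervals)).show()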
Example 2
 def data(self):
     from pyspark.sql.functions import array, explode, col, lit
     return self.spark.range(10).toDF('id') \
         .withColumn("vs", array([lit(i * 1.0) + col('id') for i in range(20, 30)])) \
         .withColumn("v", explode(col('vs'))) \
         .drop('vs') \
         .withColumn('w', lit(1.0))
Example 3
    def test_mixed_sql_and_udf(self):
        df = self.data
        w = self.unbounded_window
        ow = self.ordered_window
        max_udf = self.pandas_agg_max_udf
        min_udf = self.pandas_agg_min_udf

        result1 = df.withColumn('v_diff', max_udf(df['v']).over(w) - min_udf(df['v']).over(w))
        expected1 = df.withColumn('v_diff', max(df['v']).over(w) - min(df['v']).over(w))

        # Test mixing sql window function and window udf in the same expression
        result2 = df.withColumn('v_diff', max_udf(df['v']).over(w) - min(df['v']).over(w))
        expected2 = expected1

        # Test chaining sql aggregate function and udf
        result3 = df.withColumn('max_v', max_udf(df['v']).over(w)) \
                    .withColumn('min_v', min(df['v']).over(w)) \
                    .withColumn('v_diff', col('max_v') - col('min_v')) \
                    .drop('max_v', 'min_v')
        expected3 = expected1

        # Test mixing sql window function and udf
        result4 = df.withColumn('max_v', max_udf(df['v']).over(w)) \
                    .withColumn('rank', rank().over(ow))
        expected4 = df.withColumn('max_v', max(df['v']).over(w)) \
                      .withColumn('rank', rank().over(ow))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
        self.assertPandasEqual(expected2.toPandas(), result2.toPandas())
        self.assertPandasEqual(expected3.toPandas(), result3.toPandas())
        self.assertPandasEqual(expected4.toPandas(), result4.toPandas())
Example 4
 def test_vectorized_udf_string_in_udf(self):
     import pandas as pd
     df = self.spark.range(10)
     str_f = pandas_udf(lambda x: pd.Series(map(str, x)), StringType())
     actual = df.select(str_f(col('id')))
     expected = df.select(col('id').cast('string'))
     self.assertEquals(expected.collect(), actual.collect())
Example 5
 def setup_method(self, method):
     sparkConf = create_spark_conf().setMaster("local[4]")\
         .setAppName("test wide and deep")
     self.sc = init_nncontext(sparkConf)
     self.sqlContext = SQLContext(self.sc)
     data_path = os.path.join(os.path.split(__file__)[0], "../../resources/recommender")
     categorical_gender_udf = udf(lambda gender:
                                  categorical_from_vocab_list(gender, ["F", "M"], start=1))
     bucket_udf = udf(lambda feature1, feature2:
                      hash_bucket(str(feature1) + "_" + str(feature2), bucket_size=100))
     self.data_in = self.sqlContext.read.parquet(data_path) \
         .withColumn("gender", categorical_gender_udf(col("gender")).cast("int")) \
         .withColumn("occupation-gender",
                     bucket_udf(col("occupation"), col("gender")).cast("int"))
     self.column_info = ColumnFeatureInfo(
         wide_base_cols=["occupation", "gender"],
         wide_base_dims=[21, 3],
         wide_cross_cols=["occupation-gender"],
         wide_cross_dims=[100],
         indicator_cols=["occupation", "gender"],
         indicator_dims=[21, 3],
         embed_cols=["userId", "itemId"],
         embed_in_dims=[100, 100],
         embed_out_dims=[20, 20],
         continuous_cols=["age"])
Example 6
    def test_basic(self):
        df = self.data
        weighted_mean_udf = self.pandas_agg_weighted_mean_udf

        # Groupby one column and aggregate one UDF with literal
        result1 = df.groupby('id').agg(weighted_mean_udf(df.v, lit(1.0))).sort('id')
        expected1 = df.groupby('id').agg(mean(df.v).alias('weighted_mean(v, 1.0)')).sort('id')
        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())

        # Groupby one expression and aggregate one UDF with literal
        result2 = df.groupby((col('id') + 1)).agg(weighted_mean_udf(df.v, lit(1.0)))\
            .sort(df.id + 1)
        expected2 = df.groupby((col('id') + 1))\
            .agg(mean(df.v).alias('weighted_mean(v, 1.0)')).sort(df.id + 1)
        self.assertPandasEqual(expected2.toPandas(), result2.toPandas())

        # Groupby one column and aggregate one UDF without literal
        result3 = df.groupby('id').agg(weighted_mean_udf(df.v, df.w)).sort('id')
        expected3 = df.groupby('id').agg(mean(df.v).alias('weighted_mean(v, w)')).sort('id')
        self.assertPandasEqual(expected3.toPandas(), result3.toPandas())

        # Groupby one expression and aggregate one UDF without literal
        result4 = df.groupby((col('id') + 1).alias('id'))\
            .agg(weighted_mean_udf(df.v, df.w))\
            .sort('id')
        expected4 = df.groupby((col('id') + 1).alias('id'))\
            .agg(mean(df.v).alias('weighted_mean(v, w)'))\
            .sort('id')
        self.assertPandasEqual(expected4.toPandas(), result4.toPandas())
Example 7
    def test_column_getitem(self):
        from pyspark.sql.functions import col

        self.assertIsInstance(col("foo")[1:3], Column)
        self.assertIsInstance(col("foo")[0], Column)
        self.assertIsInstance(col("foo")["bar"], Column)
        self.assertRaises(ValueError, lambda: col("foo")[0:10:2])
Example 8
def scalar_pandas_udf_example(spark):
    # $example on:scalar_pandas_udf$
    import pandas as pd

    from pyspark.sql.functions import col, pandas_udf
    from pyspark.sql.types import LongType

    # Declare the function and create the UDF
    def multiply_func(a, b):
        return a * b

    multiply = pandas_udf(multiply_func, returnType=LongType())

    # The function for a pandas_udf should be able to execute with local Pandas data
    x = pd.Series([1, 2, 3])
    print(multiply_func(x, x))
    # 0    1
    # 1    4
    # 2    9
    # dtype: int64

    # Create a Spark DataFrame, 'spark' is an existing SparkSession
    df = spark.createDataFrame(pd.DataFrame(x, columns=["x"]))

    # Execute function as a Spark vectorized UDF
    df.select(multiply(col("x"), col("x"))).show()
Example 9
    def test_smvPlusDateTime(self):
        df = self.createDF("t:Timestamp[yyyyMMdd]", "19760131;20120229")
        r1 = df.select(col("t").smvPlusDays(-10).alias("ts"))
        r2 = df.select(col("t").smvPlusMonths(1).alias("ts"))
        r3 = df.select(col("t").smvPlusWeeks(3).alias("ts"))
        r4 = df.select(col("t").smvPlusYears(2).alias("ts"))
        r5 = df.select(col("t").smvPlusYears(4).alias("ts"))

        s = "ts: Timestamp[yyyy-MM-dd hh:mm:ss.S]"
        e1 = self.createDF(
            s,
            "1976-01-21 00:00:00.0;" +
            "2012-02-19 00:00:00.0")
        e2 = self.createDF(
            s,
            "1976-02-29 00:00:00.0;" +
            "2012-03-29 00:00:00.0")
        e3 = self.createDF(
            s,
            "1976-02-21 00:00:00.0;" +
            "2012-03-21 00:00:00.0")
        e4 = self.createDF(
            s,
            "1978-01-31 00:00:00.0;" +
            "2014-02-28 00:00:00.0")
        e5 = self.createDF(
            s,
            "1980-01-31 00:00:00.0;" +
            "2016-02-29 00:00:00.0")

        self.should_be_same(e1, r1)
        self.should_be_same(e2, r2)
        self.should_be_same(e3, r3)
        self.should_be_same(e4, r4)
        self.should_be_same(e5, r5)
Example 10
    def test_vectorized_udf_dates(self):
        schema = StructType().add("idx", LongType()).add("date", DateType())
        data = [(0, date(1969, 1, 1),),
                (1, date(2012, 2, 2),),
                (2, None,),
                (3, date(2100, 4, 4),),
                (4, date(2262, 4, 12),)]
        df = self.spark.createDataFrame(data, schema=schema)

        date_copy = pandas_udf(lambda t: t, returnType=DateType())
        df = df.withColumn("date_copy", date_copy(col("date")))

        @pandas_udf(returnType=StringType())
        def check_data(idx, date, date_copy):
            msgs = []
            is_equal = date.isnull()
            for i in range(len(idx)):
                if (is_equal[i] and data[idx[i]][1] is None) or \
                        date[i] == data[idx[i]][1]:
                    msgs.append(None)
                else:
                    msgs.append(
                        "date values are not equal (date='%s': data[%d][1]='%s')"
                        % (date[i], idx[i], data[idx[i]][1]))
            return pd.Series(msgs)

        result = df.withColumn("check_data",
                               check_data(col("idx"), col("date"), col("date_copy"))).collect()

        self.assertEquals(len(data), len(result))
        for i in range(len(result)):
            self.assertEquals(data[i][1], result[i][1])  # "date" col
            self.assertEquals(data[i][1], result[i][2])  # "date_copy" col
            self.assertIsNone(result[i][3])  # "check_data" col
Example 11
    def test_udf_with_filter_function(self):
        df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
        from pyspark.sql.functions import udf, col
        from pyspark.sql.types import BooleanType

        my_filter = udf(lambda a: a < 2, BooleanType())
        sel = df.select(col("key"), col("value")).filter((my_filter(col("key"))) & (df.value < "2"))
        self.assertEqual(sel.collect(), [Row(key=1, value='1')])
Example 12
    def get_latest_data(self):
        from pyspark.sql import SparkSession
        import config
        import pandas as pd
        # initialise sparkContext
        spark1 = SparkSession.builder \
            .master(config.sp_master) \
            .appName(config.sp_appname) \
            .config('spark.executor.memory', config.sp_memory) \
            .config("spark.cores.max", config.sp_cores) \
            .getOrCreate()

        sc = spark1.sparkContext

        # using SQLContext to read parquet file
        from pyspark.sql import SQLContext
        sqlContext = SQLContext(sc)

        from datetime import datetime
        t1 = datetime.now()
        df = sqlContext.read.parquet(config.proj_path+'/datas/appid_datapoint_parquet1')
        df2 =  sqlContext.read.parquet(config.proj_path+'/datas/appid_attribute_parquet')
        df2 = df2[['attribute_id','source','target_address','location']]

        # renaming the columns
        from pyspark.sql.functions import col

        df2 = df2.select(col("attribute_id").alias("target_attribute_id"),
                         col("source").alias("source_y"),
                         col("target_address").alias("target_address_y"),
                         col("location").alias("location"),
                         )
        # merging the dfs

        df_merge = df.join(df2,how='left',on='target_attribute_id')

        # Needed data extraction
        t1 = datetime.now()
        data = df_merge.registerTempTable('dummy')
        data = sqlContext.sql('select sum(byte_count) as byte_count_sum  , time_stamp, location from dummy group by location, time_stamp')
        data = data[data.byte_count_sum > 0]

        # data cleaning
        self.p7_df = data.toPandas()
        t2 = datetime.now()
        time_to_fetch = str(t2 - t1)

        self.p7_df['bw'] = self.p7_df['byte_count_sum'] / (8 * 3600)
        self.p7_df = self.p7_df.sort_values(by='location', ascending=True)
        dates_outlook = pd.to_datetime(pd.Series(self.p7_df.time_stamp), unit='ms')
        self.p7_df.index = dates_outlook

        self.p7_df['date'] = self.p7_df.index.date
        self.p7_df = self.p7_df.sort_values(by='time_stamp')

        t2 = datetime.now()
        time_to_fetch = str(t2 - t1)
Example 13
    def test_cast_to_string_with_udt(self):
        from pyspark.sql.functions import col
        row = (ExamplePoint(1.0, 2.0), PythonOnlyPoint(3.0, 4.0))
        schema = StructType([StructField("point", ExamplePointUDT(), False),
                             StructField("pypoint", PythonOnlyUDT(), False)])
        df = self.spark.createDataFrame([row], schema)

        result = df.select(col('point').cast('string'), col('pypoint').cast('string')).head()
        self.assertEqual(result, Row(point=u'(1.0, 2.0)', pypoint=u'[3.0, 4.0]'))
Example 14
    def test_smvRenameField_preserve_meta_for_unrenamed_fields(self):
        df = self.createDF("a:Integer; b:String", "1,abc;1,def;2,ghij")
        desc = "c description"
        res1 = df.groupBy(col("a")).agg(count(col("a")).alias("c"))\
                 .smvDesc(("c", desc))
        self.assertEqual(res1.smvGetDesc(), [("a", ""), ("c", desc)])

        res2 = res1.smvRenameField(("a", "d"))
        self.assertEqual(res2.smvGetDesc(), [("d", ""), ("c", desc)])
Example 15
    def test_smvDayMonth70(self):
        df = self.createDF("t:Timestamp[yyyyMMdd]", "19760131;20120229")
        r1 = df.select(col("t").smvDay70().alias("t_day70"))
        r2 = df.select(col("t").smvMonth70().alias("t_month70"))

        e1 = self.createDF("t_day70: Integer", "2221;15399")
        e2 = self.createDF("t_month70: Integer", "72;505")

        self.should_be_same(e1, r1)
        self.should_be_same(e2, r2)
Example 16
    def create_hist_data(df, column, minim, maxim, bins=10):

        def create_all_conditions(current_col, column, left_edges, count=1):
            """
            Recursive function that exploits the
            ability to call the Spark SQL Column method
            .when() in a recursive way.
            """
            left_edges = left_edges[:]
            if len(left_edges) == 0:
                return current_col
            if len(left_edges) == 1:
                next_col = current_col.when(col(column) >= float(left_edges[0]), count)
                left_edges.pop(0)
                return create_all_conditions(next_col, column, left_edges[:], count+1)
            next_col = current_col.when((float(left_edges[0]) <= col(column))
                                        & (col(column) < float(left_edges[1])), count)
            left_edges.pop(0)
            return create_all_conditions(next_col, column, left_edges[:], count+1)

        num_range = maxim - minim
        bin_width = num_range / float(bins)
        left_edges = [minim]
        for _bin in range(bins):
            left_edges = left_edges + [left_edges[-1] + bin_width]
        left_edges.pop()
        expression_col = when((float(left_edges[0]) <= col(column))
                              & (col(column) < float(left_edges[1])), 0)
        left_edges_copy = left_edges[:]
        left_edges_copy.pop(0)
        bin_data = (df.select(col(column))
                    .na.drop()
                    .select(col(column),
                            create_all_conditions(expression_col,
                                                  column,
                                                  left_edges_copy
                                                 ).alias("bin_id")
                           )
                    .groupBy("bin_id").count()
                   ).toPandas()

        # If no data goes into one bin, it won't 
        # appear in bin_data; so we should fill
        # in the blanks:
        bin_data.index = bin_data["bin_id"]
        new_index = list(range(bins))
        bin_data = bin_data.reindex(new_index)
        bin_data["bin_id"] = bin_data.index
        bin_data = bin_data.fillna(0)

        # We add the left edges and bin width:
        bin_data["left_edge"] = left_edges
        bin_data["width"] = bin_width

        return bin_data
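    # A minimal usage sketch (an assumption, not part of the original snippet): it presumes
    # an existing SparkSession `spark` and that `col` and `when` are imported from
    # pyspark.sql.functions, exactly as create_hist_data already requires.
    df_hist = spark.createDataFrame([(float(v),) for v in range(100)], ['v'])
    hist = create_hist_data(df_hist, 'v', minim=0.0, maxim=100.0, bins=10)
    print(hist[['bin_id', 'left_edge', 'width', 'count']])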
Example 17
def spark_timestamp_split(
    data,
    ratio=0.75,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Spark timestamp based splitter
    The splitter splits the data into sets by timestamps without stratification on either
    user or item.
    The ratios are applied on the timestamp column which is divided accordingly into
    several partitions.

    Args:
        data (spark.DataFrame): Spark DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number
            it splits data into two sets and the ratio argument indicates the ratio of
            training data set; if it is a list of float numbers, the splitter splits
            data into several portions corresponding to the split ratios. If a list is
            provided and the ratios are not summed to 1, they will be normalized.
            Earlier indexed splits will have earlier times
            (e.g the latest time in split[0] <= the earliest time in split[1])
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps. Float number represented in
        seconds since Epoch.

    Returns:
        list: Splits of the input data as spark.DataFrame.
    """
    multi_split, ratio = process_split_ratio(ratio)

    ratio = ratio if multi_split else [ratio, 1 - ratio]
    ratio_index = np.cumsum(ratio)

    window_spec = Window.orderBy(col(col_timestamp))
    rating = data.withColumn("rank", row_number().over(window_spec))

    data_count = rating.count()
    rating_rank = rating.withColumn("rank", row_number().over(window_spec) / data_count)

    splits = []
    for i, _ in enumerate(ratio_index):
        if i == 0:
            rating_split = rating_rank.filter(col("rank") <= ratio_index[i]).drop(
                "rank"
            )
        else:
            rating_split = rating_rank.filter(
                (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1])
            ).drop("rank")

        splits.append(rating_split)

    return splits
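# A minimal usage sketch (an assumption, not part of the original function): it presumes a
# SparkSession `spark` plus the module's own imports (process_split_ratio, Window,
# row_number, col, np); the toy column names below are illustrative.
interactions = spark.createDataFrame(
    [(1, 10, 3.0, 1546300800.0), (1, 11, 4.0, 1546387200.0),
     (2, 10, 5.0, 1546473600.0), (2, 12, 2.0, 1546560000.0)],
    ["userID", "itemID", "rating", "timestamp"],
)
train, test = spark_timestamp_split(interactions, ratio=0.75, col_timestamp="timestamp")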
Example 18
 def test_string_functions(self):
     from pyspark.sql.functions import col, lit
     df = self.spark.createDataFrame([['nick']], schema=['name'])
     self.assertRaisesRegexp(
         TypeError,
         "must be the same type",
         lambda: df.select(col('name').substr(0, lit(1))))
     if sys.version_info.major == 2:
         self.assertRaises(
             TypeError,
             lambda: df.select(col('name').substr(long(0), long(1))))
Example 19
    def test_self_join_with_pandas(self):
        @pandas_udf('key long, col string', PandasUDFType.GROUPED_MAP)
        def dummy_pandas_udf(df):
            return df[['key', 'col']]

        df = self.spark.createDataFrame([Row(key=1, col='A'), Row(key=1, col='B'),
                                         Row(key=2, col='C')])
        df_with_pandas = df.groupBy('key').apply(dummy_pandas_udf)

        # this was throwing an AnalysisException before SPARK-24208
        res = df_with_pandas.alias('temp0').join(df_with_pandas.alias('temp1'),
                                                 col('temp0.key') == col('temp1.key'))
        self.assertEquals(res.count(), 5)
Example 20
    def test_smvTimestampToStr(self):
        df = self.createDF("ts:Timestamp[yyyyMMdd'T'HHmmssZ];tz:String", "20180428T025800+1000,+0000;,America/Los_Angeles;20180428T025800+1000,Australia/Sydney")
        # Use `Z` (RFC 822 time zone) in the SimpleDateFormat because it has only a single valid way to represent a given offset.
        # Avoid using `z` (General Time Zone) because it may produce different results on different platforms (e.g. UTC vs. +00:00).
        # Details in https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html
        r1 = df.select(col("ts").smvTimestampToStr("+10:00","yyyyMMdd:HHmmssZ").alias("localDT"))
        r2 = df.select(col("ts").smvTimestampToStr(col("tz"),"yyyy-MM-dd HH:mm:ssZ").alias("localDT2"))

        e1 = self.createDF("localDT: String", "20180428:025800+1000;;20180428:025800+1000")
        e2 = self.createDF("localDT2: String", "2018-04-27 16:58:00+0000;;2018-04-28 02:58:00+1000")

        self.should_be_same(e1, r1)
        self.should_be_same(e2, r2)
Example 21
    def create_tag_frequencies(self, dataframe):
        """Produces a PySpark dataframe containing a column representing the total frequency of the tags by record.

        The frequency of tags is determined by their proportion of the total number of tags in the dataframe.

        :param dataframe: the PySpark dataframe
        :returns: the PySpark dataframe containing the tag frequency field and all fields in the supplied dataframe
        """
        df_tags = dataframe.selectExpr("tag1 AS tag").union(dataframe.selectExpr("tag2 AS tag")).union(dataframe.selectExpr("tag3 AS tag")) \
                           .union(dataframe.selectExpr("tag4 AS tag")).union(dataframe.selectExpr("tag5 AS tag"))
        df_tags = df_tags.na.drop(subset=["tag"])
        tags_total_count = df_tags.count()
        print("Total number of tags used, including duplicates:",tags_total_count)
        df_tag_freq = df_tags.groupBy("tag").count().orderBy(desc("count"))
        df_tag_freq = df_tag_freq.withColumn("frequency", col("count")/tags_total_count)
        df_tag_freq.orderBy(desc("frequency")).show(10)

        def one_hot_encode_top_n_tags(dataframe,n):
            """Produces a PySpark dataframe containing columns indicating whether each of the top n tags are present.

            :param dataframe: the PySpark dataframe 
            :param n: the number of the top ranked tags to return as tag fields
            :returns: the PySpark dataframe containing the top n tag fields and all fields in the supplied dataframe
            """
            top_n = [t.tag for t in df_tag_freq.orderBy(desc("frequency")).select("tag").limit(n).collect()]
            for tag in top_n:
                # replace tag name ".net" with "dotnet", for example, to avoid problems with periods in tag names
                tag_column_name = ("tag_"+tag).replace(".","dot")
                dataframe = dataframe.withColumn(tag_column_name, array_contains(dataframe.tags_split, tag).cast("int"))
            return dataframe

        dataframe = one_hot_encode_top_n_tags(dataframe,20)
        tag_columns = [col for col in dataframe.columns if col.startswith('tag')]

        print("Tag-related columns")
        dataframe.select(tag_columns).show(10,False)

        dataframe.createOrReplaceTempView('df')
        df_tag_freq.createOrReplaceTempView('df_tag_freq')

        for n in range(1,6):
            dataframe = self.sqlContext.sql("SELECT df.*, df_tag_freq.frequency AS frequency_tag{} FROM df LEFT JOIN df_tag_freq ON df.tag{} = df_tag_freq.tag".format(n,n))
            dataframe = dataframe.na.fill({"frequency_tag{}".format(n): 0})
            dataframe.createOrReplaceTempView('df')

        dataframe = dataframe.withColumn("frequency_sum", col("frequency_tag1")+col("frequency_tag2")+col("frequency_tag3")+col("frequency_tag4")+col("frequency_tag5"))

        # Remove temporary columns
        dataframe = dataframe.select([c for c in dataframe.columns if c not in {"tags_split","tag1","tag2","tag3","tag4","tag5","frequency_tag1","frequency_tag2", \
                                      "frequency_tag3","frequency_tag4","frequency_tag5"}])
        return(dataframe)
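    # A hypothetical usage sketch (an assumption, not part of the original method):
    # `prep` stands for an instance of the enclosing class, and `posts_df` is a DataFrame
    # that already carries the tags_split and tag1..tag5 columns this method expects.
    posts_with_tags = prep.create_tag_frequencies(posts_df)
    posts_with_tags.select('frequency_sum').show(5)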
Example 22
def parse_dates(df, format):
    """
    Parses dateinto year,month,day
    :param df: input df
    :param format: the format of the timestamp
    :return: dataframe
    """
    return df.withColumn('parsed_date',
                         f.to_timestamp(f.col('transaction_date'), format)) \
        .withColumn("year", f.year(f.col('parsed_date'))) \
        .withColumn("month", f.month(f.col('parsed_date'))) \
        .withColumn("day", f.dayofmonth(f.col('parsed_date'))) \
        .withColumn("unix_ts", f.unix_timestamp('parsed_date')) \
        .drop("transaction_date")
Example 23
 def test_register_vectorized_udf_basic(self):
     df = self.spark.range(10).select(
         col('id').cast('int').alias('a'),
         col('id').cast('int').alias('b'))
     original_add = pandas_udf(lambda x, y: x + y, IntegerType())
     self.assertEqual(original_add.deterministic, True)
     self.assertEqual(original_add.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF)
     new_add = self.spark.catalog.registerFunction("add1", original_add)
     res1 = df.select(new_add(col('a'), col('b')))
     res2 = self.spark.sql(
         "SELECT add1(t.a, t.b) FROM (SELECT id as a, id as b FROM range(10)) t")
     expected = df.select(expr('a + b'))
     self.assertEquals(expected.collect(), res1.collect())
     self.assertEquals(expected.collect(), res2.collect())
Example 24
 def test_smvDedupByKeyWithOrder_with_column(self):
     schema = "a:Integer; b:Double; c:String"
     df = self.createDF(
         schema,
         """1,2.0,hello;
         1,3.0,hello;
         2,10.0,hello2;
         2,11.0,hello3"""
     )
     r1 = df.smvDedupByKeyWithOrder(col("a"))(col("b").desc())
     expect = self.createDF(
         schema,
         """1,3.0,hello;
         2,11.0,hello3"""
     )
     self.should_be_same(expect, r1)
Example 25
    def splitStrCol(self, column, featureNames, mark):
        """This functions split a column into different ones. In the case of this method, the column provided should
        be a string of the following form 'word,foo'.

        :param column       Name of the target column, this column is going to be replaced.
        :param featureNames     List of strings of the new column names after splitting the strings.
        :param mark         String that specifies the splitting mark of the string, this frequently is ',' or ';'.
        """

        # Check if column argument is a string datatype:
        self.__assertTypeStr(column, "column")

        # Check if mark argument is a string datatype:
        self.__assertTypeStr(mark, "mark")

        assert (column in self.__df.columns), "Error: column specified does not exist in dataFrame."

        assert (type(featureNames) == type([])), "Error: featureNames must be a list of strings."

        # Set up a udf that splits the string into a list of strings.
        # e.g. "word,foo" ----> ["word", "foo"]
        func = udf(lambda x: x.split(mark), ArrayType(StringType()))

        self.__df = self.__df.withColumn(column, func(col(column)))
        self.undoVecAssembler(column=column, featureNames=featureNames)
        self.__addTransformation()  # checkpoint in case

        return self
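    # A hypothetical usage sketch (an assumption, not part of the original method):
    # `transformer` stands for an instance of the enclosing class whose DataFrame holds a
    # string column "pair" with values such as "word,foo".
    transformer.splitStrCol(column="pair", featureNames=["word", "foo"], mark=",")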
Example 26
 def test_smvExpandStruct(self):
     schema = "id:String;a:Double;b:Double"
     df1 = self.createDF(schema, "a,1.0,10.0;a,2.0,20.0;b,3.0,30.0")
     df2 = df1.select(col("id"), struct("a", "b").alias("c"))
     res = df2.smvExpandStruct("c")
     expect = self.createDF(schema, "a,1.0,10.0;a,2.0,20.0;b,3.0,30.0")
     self.should_be_same(expect, res)
Example 27
 def test_vectorized_udf_null_double(self):
     data = [(3.0,), (5.0,), (-1.0,), (None,)]
     schema = StructType().add("double", DoubleType())
     df = self.spark.createDataFrame(data, schema)
     double_f = pandas_udf(lambda x: x, DoubleType())
     res = df.select(double_f(col('double')))
     self.assertEquals(df.collect(), res.collect())
Example 28
 def test_vectorized_udf_null_string(self):
     data = [("foo",), (None,), ("bar",), ("bar",)]
     schema = StructType().add("str", StringType())
     df = self.spark.createDataFrame(data, schema)
     str_f = pandas_udf(lambda x: x, StringType())
     res = df.select(str_f(col('str')))
     self.assertEquals(df.collect(), res.collect())
Example 29
 def test_vectorized_udf_null_array(self):
     data = [([1, 2],), (None,), (None,), ([3, 4],), (None,)]
     array_schema = StructType([StructField("array", ArrayType(IntegerType()))])
     df = self.spark.createDataFrame(data, schema=array_schema)
     array_f = pandas_udf(lambda x: x, ArrayType(IntegerType()))
     result = df.select(array_f(col('array')))
     self.assertEquals(df.collect(), result.collect())
Example 30
 def test_vectorized_udf_null_decimal(self):
     data = [(Decimal(3.0),), (Decimal(5.0),), (Decimal(-1.0),), (None,)]
     schema = StructType().add("decimal", DecimalType(38, 18))
     df = self.spark.createDataFrame(data, schema)
     decimal_f = pandas_udf(lambda x: x, DecimalType(38, 18))
     res = df.select(decimal_f(col('decimal')))
     self.assertEquals(df.collect(), res.collect())
Example 31
def main():

    # get dynamic frame source

    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today_second =  long(today.strftime("%s"))
    print('today_id: ', today_second)


    #------------------------------------------------------------------------------------------------------------------#


    # ------------------------------------------------------------------------------------------------------------------#
    # weekly test scores
    dyf_top_topica_question_mark_week = glueContext.create_dynamic_frame.from_catalog(database="moodle",
                                                                        table_name="top_topica_question_mark_week")

    dyf_top_topica_question_mark_week = dyf_top_topica_question_mark_week\
        .select_fields(['attemptid', 'grade', 'quiz_name'])\
        .rename_field('attemptid', 'attemptid_mark_week') \
        .rename_field('grade', 'grade_mark_week')\
        .rename_field('quiz_name', 'quiz_name_week')

    dyf_top_topica_question_mark_week = dyf_top_topica_question_mark_week\
        .resolveChoice(specs=[('attemptid_mark_week', 'cast:long'),
                              ('grade_mark_week', 'cast:float')])

    df_top_topica_question_mark_week = dyf_top_topica_question_mark_week.toDF()
    df_top_topica_question_mark_week = df_top_topica_question_mark_week.dropDuplicates(['attemptid_mark_week'])

    if is_dev:
        print ('df_top_topica_question_mark_week')
        df_top_topica_question_mark_week.printSchema()

    # monthly exam scores
    dyf_top_topica_question_marks = glueContext.create_dynamic_frame.from_catalog(database="moodle",
                                                                        table_name="top_topica_question_marks")

    dyf_top_topica_question_marks = dyf_top_topica_question_marks \
        .select_fields(['attemptid', 'marks']) \
        .rename_field('attemptid', 'attemptid_mark')\
        .rename_field('marks', 'marks_month')

    dyf_top_topica_question_marks = dyf_top_topica_question_marks \
        .resolveChoice(specs=[('attemptid_mark', 'cast:long')])

    df_top_topica_question_marks = dyf_top_topica_question_marks.toDF()
    df_top_topica_question_marks = df_top_topica_question_marks.dropDuplicates(['attemptid_mark'])

    if is_dev:
        print ('df_top_topica_question_marks')
        df_top_topica_question_marks.printSchema()

    # ------------------------------------------------------------------------------------------------------------------#
    # dyf_student_package = glueContext.create_dynamic_frame.from_catalog(database="od_student_behavior",
    #                                                                     table_name="student_package")
    # 
    # print('dyf_student_package__0')
    # dyf_student_package.printSchema()
    # 
    # dyf_student_package = dyf_student_package \
    #     .select_fields(['student_id', 'package_code', 'start_time', 'end_time']) \
    #     .rename_field('student_id', 'student_id_pk')
    # 
    # dyf_student_package = dyf_student_package.resolveChoice(
    #     specs=[('start_time', 'cast:long'), ('end_time', 'cast:long')])
    # 
    # df_student_package = dyf_student_package.toDF()
    # df_student_package = df_student_package.drop_duplicates()

    # ------------------------------------------------------------------------------------------------------------------#
    # dyf_student_package_status = glueContext.create_dynamic_frame.from_catalog(database="od_student_behavior",
    #                                                                            table_name="student_status")
    #
    # dyf_student_package_status = dyf_student_package_status \
    #     .select_fields(['contact_id', 'status_code', 'start_date', 'end_date']) \
    #     .rename_field('contact_id', 'contact_id_ps')
    #
    # print('dyf_student_package_status::drop_duplicates')
    #
    # df_student_package_status = dyf_student_package_status.toDF()
    # df_student_package_status = df_student_package_status.drop_duplicates()
    # ------------------------------------------------------------------------------------------------------------------#
    dyf_result_ai = glueContext.create_dynamic_frame.from_catalog(
        database="moodle",
        table_name="top_result_ai"
    )
    dyf_result_ai = dyf_result_ai.select_fields(
        ['id', 'answer', 'speech_result', 'right_word', 'wrong_word', 'result', 'attempt_id'])\
        .rename_field('attempt_id', 'attempt_id_result_ai')

    dyf_result_ai = dyf_result_ai.resolveChoice(specs=[('attempt_id_result_ai', 'cast:long')])
    df_result_ai = dyf_result_ai.toDF()
    df_result_ai = df_result_ai.drop_duplicates(['attempt_id_result_ai'])


    # ------------------------------------------------------------------------------------------------------------------#
    dyf_moodle_top_user = glueContext.create_dynamic_frame.from_catalog(database="moodle",
                                                                        table_name="top_user")

    # Select the necessary fields
    dyf_moodle_top_user = dyf_moodle_top_user.select_fields(
        ['id', 'username', 'levelstudy'])

    df_moodle_top_user = dyf_moodle_top_user.toDF()
    #------------------------------------------------------------------------------------------------------------------#
    dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(database="tig_advisor",
                                                                        table_name="student_contact")

    dyf_student_contact = dyf_student_contact.select_fields(
        ['contact_id', 'student_id', 'user_name'])

    dyf_student_contact = Filter.apply(frame=dyf_student_contact,
                                      f=lambda x: x["contact_id"] is not None and x["contact_id"] != ''
                                                  and x["student_id"] is not None and x["student_id"] != ''
                                                  and x["user_name"] is not None and x["user_name"] != '')


    df_student_contact = dyf_student_contact.toDF()

    # -------------------------------------------------------------------------------------------------------------------#

    dyf_moodle_question_attempts = glueContext.create_dynamic_frame.from_catalog(
        database="moodle",
        table_name="top_question_attempts"
    )

    dyf_moodle_question_attempts = dyf_moodle_question_attempts.select_fields(
        ['id', 'rightanswer', 'responsesummary', 'timemodified', 'maxmark', 'questionusageid',
         'questionid']).rename_field('id', 'question_attempt_id')

    dyf_moodle_question_attempts = Filter.apply(frame=dyf_moodle_question_attempts,
                                                f=lambda x: x["questionusageid"] is not None and x["questionusageid"] != '')

    df_moodle_question_attempts = dyf_moodle_question_attempts.toDF()
    df_moodle_question_attempts = df_moodle_question_attempts.dropDuplicates(['questionusageid'])
    # -------------------------------------------------------------------------------------------------------------------#

    dyf_top_quiz = glueContext.create_dynamic_frame.from_catalog(
        database="moodle",
        table_name="top_quiz"
    )
    dyf_top_quiz = dyf_top_quiz.select_fields(['name', 'id']).rename_field('id', 'quiz_id')

    df_top_quiz = dyf_top_quiz.toDF()
    # -------------------------------------------------------------------------------------------------------------------#
    dyf_moodle_question_steps = glueContext.create_dynamic_frame.from_catalog(
        database="moodle",
        table_name="top_question_attempt_steps"
    )
    dyf_moodle_question_steps = dyf_moodle_question_steps\
        .select_fields(['id', 'state', 'questionattemptid', 'timecreated'])

    df_moodle_question_steps = dyf_moodle_question_steps.toDF()
    df_moodle_question_steps = df_moodle_question_steps.dropDuplicates(['id'])

    # get latest question_steps state
    w2 = Window.partitionBy("questionattemptid").orderBy(f.col("timecreated").desc())
    df_moodle_question_steps = df_moodle_question_steps.withColumn("row", f.row_number().over(w2)) \
        .where(f.col('row') <= 1)
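    # row_number() over w2 numbers each attempt's steps starting from the most recent
    # (timecreated desc), so keeping row <= 1 retains only the latest state per questionattemptid.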

    df_moodle_question_steps.cache()
    if is_dev:
        print('df_moodle_question_steps after getting latest question_steps state')
        df_moodle_question_steps.show(2)

    df_moodle_question_steps = df_moodle_question_steps.drop('row', 'timecreated')

    df_moodle_question_steps.cache()

    # -------------------------------------------------------------------------------------------------------------------#
    # dyf_mapping_grammar_lo = glueContext.create_dynamic_frame.from_catalog(
    #     database="moodle",
    #     table_name="mapping_grammar_lo"
    # )

    dyf_mapping_grammar_lo = glueContext.create_dynamic_frame.from_options(
        connection_type="redshift",
        connection_options={
            "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin",
            "user": "******",
            "password": "******",
            "dbtable": "mapping_grammar_lo",
            "redshiftTmpDir": "s3://dtsodin/temp/mapping_grammar_lo/v9"
        }
    )



    dyf_mapping_grammar_lo = dyf_mapping_grammar_lo \
        .select_fields(['question', 'lo', 'lc'])\
        .rename_field('question', 'question_grammar_id')

    df_mapping_grammar_lo = dyf_mapping_grammar_lo.toDF()
    # -------------------------------------------------------------------------------------------------------------------#

    dyf_moodle_quiz_attempts = glueContext.create_dynamic_frame.from_catalog(database="moodle",
                                                                        table_name="top_quiz_attempts")

    # Select the necessary fields
    if is_dev:
        print('dyf_moodle_quiz_attempts::original')
        dyf_moodle_quiz_attempts.printSchema()

    # try:
    #     df_flag = spark.read.parquet("s3a://toxd-olap/transaction_log/flag/sb_native_test/sb_native_test.parquet")
    #     read_from_index = df_flag.collect()[0]['flag']
    #     print('read from index: ', read_from_index)
    #     dyf_moodle_quiz_attempts = Filter.apply(frame=dyf_moodle_quiz_attempts,
    #                                            f=lambda x: x["_key"] > read_from_index)
    # except:
    #     print('read flag file error ')

    dyf_moodle_quiz_attempts = dyf_moodle_quiz_attempts.select_fields(
        ['id', '_key', 'quiz', 'userid', 'sumgrades', 'uniqueid', 'timestart']) \
        .rename_field('id', 'attempt_id')\
        .rename_field('timestart', 'testing_time')




    # -------------------------------------------------------------------------------------------------------------------#
    if is_dev:
        print('df_moodle_question_attempts')
        df_moodle_question_attempts.printSchema()

        print('df_moodle_question_steps')
        df_moodle_question_steps.printSchema()

        print('df_top_quiz')
        df_top_quiz.printSchema()

        print('dyf_moodle_quiz_attempts')
        dyf_moodle_quiz_attempts.printSchema()

        print('df_moodle_top_user')
        df_moodle_top_user.printSchema()

        print('df_student_contact')
        df_student_contact.printSchema()

        # print ('df_student_package_status')
        # df_student_package_status.printSchema()


    #-------------------------------------------------------------------------------------------------------------------#
    dyf_moodle_quiz_attempts = Filter.apply(frame=dyf_moodle_quiz_attempts, f=lambda x: x['userid'] is not None
                                                                and x['quiz'] is not None
                                                                and x['testing_time'] > START_LOAD_DATE
                                            )

    if is_just_monthly_exam:
        dyf_moodle_quiz_attempts = Filter.apply(frame=dyf_moodle_quiz_attempts,
                                                f=lambda x: x['userid'] is not None
                                                and x['quiz'] in [6L, 7L, 9L, 918L])
    else:
        dyf_moodle_quiz_attempts = Filter.apply(frame=dyf_moodle_quiz_attempts,
                                                f=lambda x: x['userid'] is not None)

    df_moodle_quiz_attempts = dyf_moodle_quiz_attempts.toDF()
    df_moodle_quiz_attempts = df_moodle_quiz_attempts.dropDuplicates(['attempt_id'])
    df_moodle_quiz_attempts.cache()

    moodle_quiz_attempts_number = df_moodle_quiz_attempts.count()

    if is_dev:
        print ('moodle_quiz_attempts_number: ', moodle_quiz_attempts_number)

    if moodle_quiz_attempts_number < 1:
        return

    df_student_level = get_df_student_level(glueContext)
    df_student_level.cache()
    df_student_package = get_df_student_package(glueContext)
    df_student_package.cache()
    df_student_advisor = get_df_student_advisor(glueContext)
    df_student_advisor.cache()

    # Step 1: get user info, package_code, level, status
    df_quiz_student = df_moodle_quiz_attempts\
        .join(df_moodle_top_user, df_moodle_quiz_attempts.userid == df_moodle_top_user.id)\
        .join(df_student_contact, df_moodle_top_user.username == df_student_contact.user_name)

    package_endtime_unavailable = 99999999999L
    package_starttime_unavailable = 0L
    package_code_unavailable = 'UNAVAILABLE'
    student_level_code_unavailable = 'UNAVAILABLE'
    package_status_code_unavailable = 'UNAVAILABLE'

    df_quiz_student_original = df_quiz_student.select(

        'id',
        'uniqueid',
        'quiz',
        'levelstudy',
        'attempt_id',

        df_quiz_student.testing_time.alias('student_behavior_date'),
        getBehaviorIdByQuiz(df_quiz_student.quiz).alias('behavior_id'),
        df_quiz_student.student_id.cast('long').alias('student_id'),
        'contact_id',

        # f.lit(package_code_unavailable).alias('package_code'),
        # f.lit(package_endtime_unavailable).alias('package_endtime'),
        # f.lit(package_starttime_unavailable).alias('package_starttime'),

        # f.lit(student_level_code_unavailable).alias('student_level_code'),
        # f.lit(package_status_code_unavailable).alias('package_status_code'),

        f.lit(today_second).alias('transformed_at'),

        f.from_unixtime('testing_time', format="yyyyMM").alias('year_month_id')
    )
    df_quiz_student_original = df_quiz_student_original \
        .join(df_student_advisor,
              (df_quiz_student_original.contact_id == df_student_advisor.contact_id_advisor)
              & (df_quiz_student_original.student_behavior_date >= df_student_advisor.start_date)
              & (df_quiz_student_original.student_behavior_date < df_student_advisor.end_date),
              'left'
              ) \
        .join(df_student_package,
              (df_quiz_student_original.contact_id == df_student_package.contact_id_package)
              & (df_quiz_student_original.student_behavior_date >= df_student_package.package_start_time)
              & (df_quiz_student_original.student_behavior_date < df_student_package.package_end_time),
              'left'
              ) \
        .join(df_student_level,
              (df_quiz_student_original.contact_id == df_student_level.contact_id_level)
              & (df_quiz_student_original.student_behavior_date >= df_student_level.start_date)
              & (df_quiz_student_original.student_behavior_date < df_student_level.end_date),
              'left'
              )

    df_quiz_student_original = df_quiz_student_original \
        .withColumn('student_behavior_id',
                 f.md5(concaText(
                     df_quiz_student_original.student_behavior_date,
                     df_quiz_student_original.behavior_id,
                     df_quiz_student_original.student_id,
                     df_quiz_student_original.contact_id,
                     df_quiz_student_original.package_code,
                     df_quiz_student_original.package_status_code,
                     df_quiz_student_original.student_level_code,
                     df_quiz_student_original.transformed_at)))

    df_quiz_student_original.persist(StorageLevel.DISK_ONLY_2)

    # | -- id: long(nullable=true)
    # | -- uniqueid: long(nullable=true)
    # | -- quiz: long(nullable=true)
    # | -- levelstudy: string(nullable=true)
    # | -- attempt_id: long(nullable=true)
    # | -- student_behavior_date: long(nullable=true)
    # | -- behavior_id: long(nullable=true)
    # | -- student_id: long(nullable=true)
    # | -- contact_id: string(nullable=true)
    # | -- package_code: string(nullable=false)
    # | -- package_endtime: long(nullable=false)
    # | -- package_starttime: long(nullable=false)
    # | -- student_level_code: string(nullable=false)
    # | -- package_status_code: string(nullable=false)
    # | -- transformed_at: long(nullable=false)
    # | -- student_behavior_id: string(nullable=true)

    if is_dev:
        print('df_quiz_student_original')
        df_quiz_student_original.printSchema()
        df_quiz_student_original.show(1)


    # get data for getting testing detail
    df_quiz_student = df_quiz_student_original

    # 1. save weekly native test for AI (speaking)

    # Step 2: Separate result AI (Speaking) and question attempt
    # Step 2.1 Get result AI
    df_quiz_student_ai = df_quiz_student\
        .join(df_result_ai, df_quiz_student.attempt_id == df_result_ai.attempt_id_result_ai, 'inner')\
        .join(df_top_quiz, df_quiz_student.quiz == df_top_quiz.quiz_id, 'left')

    if is_limit_test:
        df_quiz_student_ai = df_quiz_student_ai.limit(100)

    if is_dev:
        print('df_quiz_student_ai')
        df_quiz_student_ai.printSchema()
        print('df_quiz_student_ai::after:separate:: ', df_quiz_student_ai.count())

    source_system_native_test_ai = 'NATIVE_TEST_AI'
    source_system_native_test_simple = 'NATIVE_TEST_SIMPLE'
    source_system_native_test_grammar = 'NATIVE_TEST_GRAMMAR'

    current_step_unavailable = -1L
    total_step_unavailable = -1L
    learning_category_id_unavailable = -1L
    learning_unit_code_unavailable = 'UNAVAILABLE'
    learning_object_type_code_unavailable = 'UNAVAILABLE'
    learning_object_id_unavailable = -1L
    learning_object_unavailable = 'UNAVAILABLE'
    learning_category_code_unavailable = 'UNAVAILABLE'

    student_answer_detail_unavailable = 'UNAVAILABLE'

    duration_unavailable = -1L
    max_point_unavailable = -1L
    received_point_unavailable = -2L

    test_type_unavailable = 'UNAVAILABLE'

    right_answer_unavailable = 'UNAVAILABLE'
    wrong_answer_unavailable = 'UNAVAILABLE'
    #

    #
    #
    # #------------------------------------------------------------------------------------------------------------------#
    #
    #
    #
    # #------------------------------------------------------------------------------------------------------------------#
    #
    if is_dev:
        print('df_quiz_student_ai')
        df_quiz_student_ai.printSchema()
        df_quiz_student_ai.show(1)

    # Step 2.2 Get data for result AI
    df_quiz_student_ai_full = df_quiz_student_ai.select(
        'student_behavior_id',
        'student_behavior_date',
        'behavior_id',
        'student_id',
        'contact_id',

        # 'package_code',
        # df_quiz_student_ai.end_time.cast('long').alias('package_endtime'),
        # df_quiz_student_ai.start_time.cast('long').alias('package_starttime'),
        #
        # df_quiz_student_ai.levelstudy.alias('student_level_code'),
        # df_quiz_student_ai.status_code.alias('package_status_code'),

        'package_code',
        # 'package_endtime',
        # 'package_starttime',

        'student_level_code',
        'package_status_code',


        'transformed_at',

        'attempt_id',

        #for student_test_detail
        f.lit(source_system_native_test_ai).alias('source_system'),
        df_quiz_student_ai.name.alias('test_type'),
        df_quiz_student_ai.attempt_id_result_ai.cast('long').alias('attempt_step_id'),

        f.lit(current_step_unavailable).cast('long').alias('current_step'),
        f.lit(total_step_unavailable).cast('long').alias('total_step'),

        f.lit(learning_category_id_unavailable).cast('long').alias('learning_category_id'),
        f.lit(learning_category_code_unavailable).alias('learning_category_code'),

        f.lit(learning_unit_code_unavailable).cast('string').alias('learning_unit_code'),
        f.lit(learning_object_type_code_unavailable).cast('string').alias('learning_object_type_code'),
        f.lit(learning_object_id_unavailable).cast('long').alias('learning_object_id'),
        f.lit(learning_object_unavailable).cast('string').alias('learning_object'),

        df_quiz_student_ai.answer.cast('string').alias('correct_answer'),
        df_quiz_student_ai.speech_result.cast('string').alias('student_answer'),
        f.lit(student_answer_detail_unavailable).cast('string').alias('student_answer_detail'),

        'result',

        df_quiz_student_ai.right_word.cast('string').alias('right_answer'),
        df_quiz_student_ai.wrong_word.cast('string').alias('wrong_answer'),

        f.lit(duration_unavailable).cast('long').alias('duration'),
        f.lit(max_point_unavailable).cast('long').alias('max_point'),
        f.lit(received_point_unavailable).cast('long').alias('received_point'),

        'year_month_id'
    )

    if is_dev:
        print('df_quiz_student_ai_full')
        df_quiz_student_ai_full.printSchema()


    # # Step 3.1 Get data for question_attempts
    df_quiz_student_question = df_quiz_student\
        .join(df_moodle_question_attempts, df_quiz_student.uniqueid == df_moodle_question_attempts.questionusageid, 'inner')\
        .join(df_moodle_question_steps, df_moodle_question_attempts.question_attempt_id == df_moodle_question_steps.questionattemptid, 'left')\
        .join(df_mapping_grammar_lo,
              df_moodle_question_attempts.question_attempt_id == df_mapping_grammar_lo.question_grammar_id, 'left')

    if is_limit_test:
        df_quiz_student_question = df_quiz_student_question.limit(100)

    if is_dev:
        print('df_quiz_student_question')
        df_quiz_student_question.printSchema()
        print('df_quiz_student_question: ', df_quiz_student_question.count())


    def getSourceSystemByLC(lc):
        if lc in ['G01', 'G02', 'G03', 'G04', 'G05']:
            return source_system_native_test_grammar
        return source_system_native_test_simple

    getSourceSystemByLC = f.udf(getSourceSystemByLC, StringType())

    if is_dev:
        print('df_quiz_student_question')
        df_quiz_student_question.printSchema()
        df_quiz_student_question.show(1)

    #Step 3.2 Get data for question_attempts
    df_quiz_student_question_full = df_quiz_student_question.select(
        'student_behavior_id',
        'student_behavior_date',

        'behavior_id',
        'student_id',
        'contact_id',

        # 'package_code',
        # df_quiz_student_question.end_time.cast('long').alias('package_endtime'),
        # df_quiz_student_question.start_time.cast('long').alias('package_starttime'),
        #
        # df_quiz_student_question.levelstudy.alias('student_level_code'),
        # df_quiz_student_question.status_code.alias('package_status_code'),

        'package_code',
        # 'package_endtime',
        # 'package_starttime',

        'student_level_code',
        'package_status_code',

        'transformed_at',

        'attempt_id',

        # for student_test_detail
        getSourceSystemByLC(df_quiz_student_question.lc).alias('source_system'),

        f.lit(test_type_unavailable).alias('test_type'),
        df_quiz_student_question.question_attempt_id.cast('long').alias('attempt_step_id'),

        f.lit(current_step_unavailable).cast('long').alias('current_step'),
        f.lit(total_step_unavailable).cast('long').alias('total_step'),

        f.lit(learning_category_id_unavailable).cast('long').alias('learning_category_id'),
        df_quiz_student_question.lc.cast('string').alias('learning_category_code'),

        f.lit(learning_unit_code_unavailable).cast('string').alias('learning_unit_code'),

        f.lit(learning_object_type_code_unavailable).cast('string').alias('learning_object_type_code'),
        f.lit(learning_object_id_unavailable).cast('long').alias('learning_object_id'),
        df_quiz_student_question.lo.cast('string').alias('learning_object'),

        df_quiz_student_question.rightanswer.cast('string').alias('correct_answer'),
        df_quiz_student_question.responsesummary.cast('string').alias('student_answer'),
        f.lit(student_answer_detail_unavailable).cast('string').alias('student_answer_detail'),

        df_quiz_student_question.state.alias('result'),

        f.lit(right_answer_unavailable).alias('right_answer'),
        f.lit(wrong_answer_unavailable).alias('wrong_answer'),

        f.lit(duration_unavailable).cast('long').alias('duration'),
        f.lit(max_point_unavailable).cast('long').alias('max_point'),
        df_quiz_student_question.maxmark.cast('long').alias('received_point'),

        'year_month_id'
    )

    if is_dev:
        print ('df_quiz_student_question_full')
        df_quiz_student_question_full.printSchema()
        print('df_quiz_student_ai_full::before::union')
        print('df_quiz_student_ai_full::number: ', df_quiz_student_ai_full.count())
        print('df_quiz_student_question_full::number: ', df_quiz_student_question_full.count())
    #
    df_quiz_full = df_quiz_student_ai_full.union(df_quiz_student_question_full)
    if is_dev:
        print('df_quiz_full')
        df_quiz_full.printSchema()
        df_quiz_full.show(3)
    #
    #
    #
    #
    # #save to student behavior
    #
    dyf_quiz_full = DynamicFrame.fromDF(df_quiz_full, glueContext, 'dyf_quiz_full')
    #
    # #Save to
    apply_output_test_detail = ApplyMapping.apply(frame=dyf_quiz_full,
                                     mappings=[("student_behavior_id", "string", "student_behavior_id", "string"),

                                               ("attempt_id", "long", "attempt_id", "long"),
                                               ("source_system", "string", "source_system", "string"),
                                               ("test_type", "string", "test_type", "string"),
                                               ("attempt_step_id", "long", "attempt_step_id", "long"),

                                               ("current_step", "long", "current_step", "long"),
                                               ("total_step", "long", "total_step", "long"),

                                               ("learning_category_id", "long", "learning_category_id", "long"),
                                               ("learning_category_code", "string", "learning_category_code", "string"),

                                               ("learning_unit_code", "string", "learning_unit_code", "string"),

                                               ("learning_object_type_code", "string", "learning_object_type_code", "string"),
                                               ("learning_object_id", "long", "learning_object_id", "long"),
                                               ("learning_object", "string", "learning_object", "string"),

                                               ("correct_answer", "string", "correct_answer", "string"),
                                               ("student_answer", "string", "student_answer", "string"),
                                               ("student_answer_detail", "string", "student_answer_detail", "string"),

                                               ("result", "string", "result", "string"),

                                               ("right_answer", "string", "right_answer", "string"),
                                               ("wrong_answer", "string", "wrong_answer", "string"),

                                               ("duration", "long", "duration", "long"),
                                               ("max_point", "long", "max_point", "long"),
                                               ("received_point", "long", "received_point", "long"),

                                               ("student_behavior_date", "long", "created_at", "long"),

                                               ("behavior_id", "long", "behavior_id", "long"),

                                               ("year_month_id", "string", "year_month_id", "long")

                                               ])

    dfy_output_test = ResolveChoice.apply(frame=apply_output_test_detail, choice="make_cols", transformation_ctx="resolvechoice2")
    # # save to s3
    glueContext.write_dynamic_frame.from_options(
        frame=dfy_output_test,
        connection_type="s3",
        connection_options={"path": "s3://toxd-olap/transaction_log/student_behavior/sb_student_test_detail",
                            "partitionKeys": ["behavior_id", "year_month_id"]},
        format="parquet")

    # save to redshift
    glueContext.write_dynamic_frame.from_jdbc_conf(frame=dfy_output_test,
                                                   catalog_connection="glue_redshift",
                                                   connection_options={
                                                       "dbtable": "sb_student_test_detail",
                                                       "database": "transaction_log"
                                                   },
                                                   redshift_tmp_dir="s3n://datashine-dev-redshift-backup/translation_log/student_behavior/sb_student_test_detail",
                                                   transformation_ctx="datasink4")



    # #-------------------------------------------------------------------------------------------------------------------#
    # # for save behavior
    df_quiz_full = df_quiz_full.dropDuplicates(['student_behavior_id'])
    dyf_quiz_student_behavior = DynamicFrame.fromDF(df_quiz_student_original, glueContext, 'dyf_quiz_student_behavior')
    apply_output_behavior = ApplyMapping.apply(frame=dyf_quiz_student_behavior,
                                              mappings=[
                                                  ("student_behavior_id", "string", "student_behavior_id", "string"),
                                                  ("student_behavior_date", "long", "student_behavior_date", "long"),
                                                  ("behavior_id", "long", "behavior_id", "int"),
                                                  ("student_id", "long", "student_id", "long"),
                                                  ("contact_id", "string", "contact_id", "string"),

                                                  ("package_code", "string", "package_code", "string"),

                                                  ("student_level_code", "string", "student_level_code", "string"),
                                                  ("package_status_code", "string", "package_status_code", "string"),

                                                  ("advisor_id", "long", "advisor_id", "long"),

                                                  ("transformed_at", "long", "transformed_at", "long"),

                                                  ("year_month_id", "string", "year_month_id", "long")
                                                  ])

    dfy_output = ResolveChoice.apply(frame=apply_output_behavior, choice="make_cols",
                                     transformation_ctx="resolvechoice2")


    # save to s3
    glueContext.write_dynamic_frame.from_options(
        frame=dfy_output,
        connection_type="s3",
        connection_options={"path": "s3://toxd-olap/transaction_log/student_behavior/sb_student_behavior",
                            "partitionKeys": ["behavior_id", "year_month_id"]},
        format="parquet")

    # save to redshift
    glueContext.write_dynamic_frame.from_jdbc_conf(frame=dfy_output,
                                                   catalog_connection="glue_redshift",
                                                   connection_options={
                                                       "dbtable": "sb_student_behavior",
                                                       "database": "transaction_log"
                                                   },
                                                   redshift_tmp_dir="s3n://datashine-dev-redshift-backup/translation_log/student_behavior/sb_student_behavior",
                                                   transformation_ctx="datasink4")

    # # -------------------------------------------------------------------------------------------------------------------#
    #
    # # Step 5 - get marks
    if is_dev:
        print('Check before get marks')
        print('df_top_topica_question_marks')
        df_top_topica_question_marks.printSchema()

        print('df_top_topica_question_mark_week')
        df_top_topica_question_mark_week.printSchema()

    df_quiz_mark = df_quiz_student_original.select('student_behavior_id', 'behavior_id', 'attempt_id', 'year_month_id')

    dyf_quiz_mark = DynamicFrame.fromDF(df_quiz_mark, glueContext, 'dyf_quiz_mark')

    # check df_quiz_mark data before writing
    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dyf_quiz_mark,
                                                               catalog_connection="glue_redshift",
                                                               connection_options={
                                                                   "dbtable": "dyf_quiz_mark",
                                                                   "database": "dts_odin_checking"
                                                               },
                                                               redshift_tmp_dir="s3://dts-odin/dts_odin_checking/temp/dyf_quiz_mark",
                                                               transformation_ctx="datasink4")

    df_quiz_week = df_quiz_mark.where(df_quiz_mark.behavior_id == BEHAVIOR_ID_TEST_TUAN)
    df_quiz_month = df_quiz_mark.where(df_quiz_mark.behavior_id == BEHAVIOR_ID_TEST_THANG)
    #
    # # ------------------------------------------------------------------------------------------------------------------#
    df_quiz_week_marks = df_quiz_week.join(df_top_topica_question_mark_week,
                                     df_quiz_week.attempt_id == df_top_topica_question_mark_week.attemptid_mark_week,
                                     'left'
                                     )
    #
    if is_dev:
        print('df_quiz_week_marks::after_join_df_top_topica_question_mark_week')
        df_quiz_week_marks.printSchema()
        df_quiz_week_marks.show(10)
    #
    df_quiz_week_marks = df_quiz_week_marks.na.fill({'grade_mark_week': 0})
    #
    def convertIntegerToFloat(grade_mark_week):
        if grade_mark_week is None:
            return float(0.0)
        return float(grade_mark_week)

    convertIntegerToFloat = f.udf(convertIntegerToFloat, FloatType())
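    # Note: this UDF is registered but not applied in the select below; if a float grade
    # is needed it could be used as, for example (a sketch):
    #     convertIntegerToFloat(df_quiz_week_marks.grade_mark_week).alias('grade_t')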
    #
    #
    df_quiz_week_marks = df_quiz_week_marks.select(
        'behavior_id',
        'attempt_id',
        'student_behavior_id',
        df_quiz_week_marks.quiz_name_week.alias('question_category'),
        df_quiz_week_marks.grade_mark_week.alias('grade_t'),
        'year_month_id'
    )
    #
    if is_dev:
        print('df_quiz_week_marks')
        df_quiz_week_marks.printSchema()
        df_quiz_week_marks.show(2)
    #
    dyf_quiz_week_marks = DynamicFrame.fromDF(df_quiz_week_marks, glueContext, 'dyf_quiz_week_marks')

    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dyf_quiz_week_marks,
                                                               catalog_connection="glue_redshift",
                                                               connection_options={
                                                                   "dbtable": "dyf_quiz_week_marks",
                                                                   "database": "dts_odin_checking"
                                                               },
                                                               redshift_tmp_dir="s3://dts-odin/dts_odin_checking/temp/dyf_quiz_week_marks",
                                                               transformation_ctx="datasink4")

    #
    # #------------------------------------------------------------------------------------------------------------------#
    MARK_UNAVAILABLE = '-1'
    def getMapFromStringJson(str_value):
        if str_value is None:
            return {"VOCABULARY": MARK_UNAVAILABLE,
                    "CONVERSATIONAL_EXPRESSION": MARK_UNAVAILABLE,
                    "LISTENING": MARK_UNAVAILABLE,
                    "DICTATION": MARK_UNAVAILABLE,
                    "GRAMMAR": MARK_UNAVAILABLE,
                    "READING": MARK_UNAVAILABLE}
        str_value = str(str_value)
        json_value = json.loads(str_value)
        return json_value
    #
    getMapFromStringJson = f.udf(getMapFromStringJson, MapType(StringType(), StringType()))
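    # A minimal dev-only sketch (the sample JSON value is hypothetical, not production data)
    # showing how this UDF plus f.explode turns one marks string into (key, value) rows:
    if is_dev:
        df_marks_demo = spark.createDataFrame(
            [('{"VOCABULARY": "8", "GRAMMAR": "6"}',)], ['marks_month'])
        df_marks_demo.select(f.explode(getMapFromStringJson('marks_month'))).show()
        # expected: rows (VOCABULARY, 8) and (GRAMMAR, 6) in columns key / value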
    #
    df_quiz_month_marks = df_quiz_month.join(df_top_topica_question_marks,
                                  df_quiz_month.attempt_id == df_top_topica_question_marks.attemptid_mark,
                                  'inner')

    if is_dev:
        print('df_quiz_month_marks after join question marks')
        df_quiz_month_marks.printSchema()
        df_quiz_month_marks.show(5)

    df_quiz_month_marks = df_quiz_month_marks.select(
        'behavior_id',
        'attempt_id',
        'student_behavior_id',
        getMapFromStringJson(df_quiz_month_marks.marks_month).alias('marks_month_dict'),

        'year_month_id'
    )



    #
    if is_dev:
        print('df_quiz_month_marks after join question marks::after convert marks_month_dict')
        df_quiz_month_marks.printSchema()
        df_quiz_month_marks.show(5)

    df_quiz_month_marks = df_quiz_month_marks.select(
        'behavior_id',
        'attempt_id',
        'student_behavior_id',
        f.explode(df_quiz_month_marks.marks_month_dict),

        'year_month_id'
    )
    #
    if is_dev:
        print('df_quiz_month_marks after join question marks::after explode')
        df_quiz_month_marks.printSchema()
        df_quiz_month_marks.show(5)
    #
    df_quiz_month_marks = df_quiz_month_marks.select(
        'behavior_id',
        'attempt_id',
        'student_behavior_id',
        df_quiz_month_marks.key.alias('question_category'),
        df_quiz_month_marks.value.cast('float').alias('grade_t'),

        'year_month_id'
    )
    #
    if is_dev:
        print('df_quiz_month_marks::complete')
        df_quiz_month_marks.printSchema()
        df_quiz_month_marks.show(3)

    dyf_quiz_month_marks = DynamicFrame.fromDF(df_quiz_month_marks, glueContext, 'dyf_quiz_month_marks')

    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dyf_quiz_month_marks,
                                                               catalog_connection="glue_redshift",
                                                               connection_options={
                                                                   "dbtable": "dyf_quiz_month_marks",
                                                                   "database": "dts_odin_checking"
                                                               },
                                                               redshift_tmp_dir="s3://dts-odin/dts_odin_checking/temp/dyf_quiz_month_marks",
                                                               transformation_ctx="datasink4")
    # # ------------------------------------------------------------------------------------------------------------------#
    #
    df_quiz_month_marks_full = df_quiz_week_marks.union(df_quiz_month_marks)

    df_quiz_month_marks_full = df_quiz_month_marks_full.dropDuplicates(['attempt_id', 'question_category'])

    dyf_quiz_month_marks_full = DynamicFrame.fromDF(df_quiz_month_marks_full, glueContext, 'dyf_quiz_month_marks_full')

    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dyf_quiz_month_marks_full,
                                                               catalog_connection="glue_redshift",
                                                               connection_options={
                                                                   "dbtable": "dyf_quiz_month_marks_full",
                                                                   "database": "dts_odin_checking"
                                                               },
                                                               redshift_tmp_dir="s3://dts-odin/dts_odin_checking/temp/dyf_quiz_month_marks_full",
                                                               transformation_ctx="datasink4")
    #
    QUESTION_CATEGORY__UNAVAILABLE = 'UNAVAILABLE'
    df_quiz_month_marks_full = df_quiz_month_marks_full.na.fill(
        {'question_category': QUESTION_CATEGORY__UNAVAILABLE,
         # grade_t is a float column, so fill it with a numeric value;
         # a string here would be silently ignored by na.fill
         'grade_t': float(MARK_UNAVAILABLE)}
    )

    if is_dev:
        print('df_quiz_month_marks_full')
        df_quiz_month_marks_full.printSchema()
        df_quiz_month_marks_full.show(3)
    # #
    dyf_quiz_month_marks_full = DynamicFrame.fromDF(df_quiz_month_marks_full, glueContext, 'dyf_quiz_month_marks_full')
    # #
    apply_dyf_quiz_month_marks_full = ApplyMapping.apply(frame=dyf_quiz_month_marks_full,
                                                mappings=[("behavior_id", "long", "behavior_id", "long"),
                                                          ("attempt_id", "long", "attempt_id", "long"),
                                                          ("student_behavior_id", "string", "student_behavior_id", "string"),
                                                          ("question_category", "string", "question_category", "string"),
                                                          ("grade_t", "float", "grade", "float"),

                                                          ("year_month_id", "string", "year_month_id", "long")
                                                          ])
    #
    dyf_quiz_month_marks_full_output = ResolveChoice.apply(frame=apply_dyf_quiz_month_marks_full,
                                                           choice="make_cols", transformation_ctx="resolvechoice2")
    #
    # save to s3
    glueContext.write_dynamic_frame.from_options(
        frame=dyf_quiz_month_marks_full_output,
        connection_type="s3",
        connection_options={"path": "s3://toxd-olap/transaction_log/student_behavior/sb_student_test_mark",
                            "partitionKeys": ["behavior_id", "year_month_id"]},
        format="parquet")


    #save to redshift
    glueContext.write_dynamic_frame.from_jdbc_conf(frame=dyf_quiz_month_marks_full_output,
                                                   catalog_connection="glue_redshift",
                                                   connection_options={
                                                       "dbtable": "sb_student_test_mark",
                                                       "database": "transaction_log"
                                                   },
                                                   redshift_tmp_dir="s3n://datashine-dev-redshift-backup/translation_log/student_behavior/sb_student_test_mark",
                                                   transformation_ctx="datasink4")


    #
    # df_quiz_full.unpersist()
    # df_moodle_quiz_attempts.unpersist()
    # df_moodle_question_steps.unpersist()

    # save the max _key of the processed quiz attempts as a flag (high-water mark)
    # for the next incremental run
    df_mdl_logsservice_in_out = dyf_moodle_quiz_attempts.toDF()
    flag = df_mdl_logsservice_in_out.agg({"_key": "max"}).collect()[0][0]
    flag_data = [flag]
    df = spark.createDataFrame(flag_data, "long").toDF('flag')
    df.write.parquet("s3a://toxd-olap/transaction_log/flag/sb_native_test/sb_native_test.parquet",
                     mode="overwrite")

    df_quiz_student_original.unpersist()
Esempio n. 32
0
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
import os
os.environ["SPARK_HOME"] = "C:/spark-2.4.5-bin-hadoop2.7"
os.environ["HADOOP_HOME"] = "C:/winutils"
# Create spark session
spark = SparkSession.builder.appName("ICP 14").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
# Load data and select feature and label columns
data = spark.read.format("csv").option("header", True).option(
    "inferSchema",
    True).option("delimiter",
                 ",").load("C:/Users/Lalith Chandra A/Downloads/car.csv")
data = data.withColumn("label",
                       when(col("engine-location") == "front",
                            1).otherwise(0)).select("label", "length")
# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
# Fit the model
model = lr.fit(data)
# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))
# We can also use the multinomial family for binary classification
mlr = LogisticRegression(maxIter=10,
                         regParam=0.3,
                         elasticNetParam=0.8,
                         family="multinomial")
from pyspark.sql.functions import col, to_date
from pyspark.sql.types import IntegerType
#creating a dataframe
df3 = df2.withColumn('Weather_Station', df2['value'].substr(5, 6))\
.withColumn('WBAN', df2['value'].substr(11, 5))\
.withColumn('Observation_Date',to_date(df2['value'].substr(16,8),"yyyyMMdd"))\
.withColumn('Observation_Hour', df2['value'].substr(24, 4).cast(IntegerType()))\
.withColumn('Latitude', df2['value'].substr(29, 6).cast('float') / 1000)\
.withColumn('Longitude', df2['value'].substr(35, 7).cast('float') / 1000)\
.withColumn('Elevation', df2['value'].substr(47, 5).cast(IntegerType()))\
.withColumn('Wind_Direction', df2['value'].substr(61, 3).cast(IntegerType()))\
.withColumn('WD_Quality_Code', df2['value'].substr(64, 1).cast(IntegerType()))\
.withColumn('Sky_Ceiling_Height', df2['value'].substr(71, 5).cast(IntegerType()))\
.withColumn('SC_Quality_Code', df2['value'].substr(76, 1).cast(IntegerType()))\
.withColumn('Visibility_Distance', df2['value'].substr(79, 6).cast(IntegerType()))\
.withColumn('VD_Quality_Code', df2['value'].substr(86, 1).cast(IntegerType()))\
.withColumn('Air_Temperature', df2['value'].substr(88, 5).cast('float') /10)\
.withColumn('AT_Quality_Code', df2['value'].substr(93, 1).cast(IntegerType()))\
.withColumn('Dew_Point', df2['value'].substr(94, 5).cast('float'))\
.withColumn('DP_Quality_Code', df2['value'].substr(99, 1).cast(IntegerType()))\
.withColumn('Atmospheric_Pressure', df2['value'].substr(100, 5).cast('float')/ 10)\
.withColumn('AP_Quality_Code', df2['value'].substr(105, 1).cast(IntegerType()))
df3.show(10)

#filtering the air pressure data in which there's no 9999.9
df_AP_NoBadRecords = df3.filter(col("Atmospheric_Pressure") != 9999.9)

#writing the file
df_AP_NoBadRecords.write.format("csv").mode("overwrite").option(
    "header", "true").save(
        "hdfs://namenode/output/itmd-521/tdp/2001/valid-atmospheric-pressure")
	df_port_counts = df.filter(col('possible_HP')==True).select("DstAddr", "Dport").distinct().groupBy("Dport").count()
	df =  df.join(df_port_counts, ["Dport"],how="left")
	
	total_count = df.select('DstAddr').distinct().count()
	df = df.withColumn("total_count", lit(total_count))
	
	
	#place Honeypots
	df = df.withColumn("chosenToBeHP1", place_honeypots_udf_mix(df["Dport"], df["count"], df["total_count"]))
	df = df.withColumn("isHP1", col('chosenToBeHP1') & col('possible_HP'))
	df = df.withColumn("chosenToBeHP2", place_honeypots_udf_cznic(df["Dport"], df["count"], df["total_count"]))
	df = df.withColumn("isHP2", col('chosenToBeHP2') & col('possible_HP'))
	
	
	#filter out flows without honeypots
	df_att_det_mix = df.filter(df.isHP1).groupBy('SrcAddr').agg(F.min(F.col('timestamp')).alias("detectionTime_mix"))
	df_att_det_cznic = df.filter(df.isHP2).groupBy('SrcAddr').agg(F.min(F.col('timestamp')).alias("detectionTime_cznic"))
	df = df.join(df_att_det_mix, ['SrcAddr'], how="left")
	df = df.join(df_att_det_cznic, ['SrcAddr'], how="left")
	
	
	# count saved
	try:
		saved_mix = df.filter(col('detectionTime_mix').isNotNull()).filter(~df["possible_HP"]).filter(col('timestamp')>col('detectionTime_mix')).count()
		saved_cznic = df.filter(col('detectionTime_cznic').isNotNull()).filter(~df["possible_HP"]).filter(col('timestamp')>col('detectionTime_cznic')).count()
		print('In ', input_file, ' saved_mix ', str(saved_mix), ' saved_cznic ', str(saved_cznic))
	except py4j.protocol.Py4JJavaError:
		print('oops! Something went wrong.')
	sqlContext.clearCache()

print("DONE")
Esempio n. 35
0
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, StringType

schema = StructType([
    StructField("@metadata", StringType()),
    StructField("@timestamp", StringType()),
    StructField("name", StringType()),
    StructField("payload", StringType()),
    StructField("well_id", StringType())
])

print("Creating static df")
static_spark_reader = spark.read.format("kafka").option(
    "kafka.bootstrap.servers",
    "kafka-cluster-kafka-bootstrap.ddt-persistence.svc.cluster.local:9092"
).option("subscribe", "ddt").option("startingOffsets", "earliest").load()
#static_spark_reader.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").write.format("parquet").mode("Overwrite").save(output_path)
static_spark_reader.selectExpr("CAST(key AS STRING) as key", "CAST(value AS STRING) as value")\
    .select(from_json(col("value").cast("string"), schema).alias("value"))\
    .write.format("parquet")\
    .mode("append")\
    .option("checkpointLocation", checkpoint_path)\
    .option("path", output_path)\
    .save()
"""
spark.readStream.format("kafka")\
    .option("kafka.bootstrap.servers", "kafka-cluster-kafka-bootstrap.ddt-persistence.svc.cluster.local:9092")\
    .option("subscribe", "ddt").option("startingOffsets", "latest")\
    .load()\
    .selectExpr("CAST(key AS STRING) as key", "CAST(value AS STRING) as value")\
    .select(from_json(col("value").cast("string"), schema).alias("value"))\
    .writeStream.format("parquet")\
    .outputMode("append")\
    .option("path", output_path)\
Esempio n. 36
0
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder.appName("MostPopularSuperhero").getOrCreate()

schema = StructType([ \
                     StructField("id", IntegerType(), True), \
                     StructField("name", StringType(), True)])

names = spark.read.schema(schema).option(
    "sep", " ").csv("file:///SparkCourse/Marvel+Names.txt")

lines = spark.read.text("file:///SparkCourse/Marvel+Graph.txt")

# Small tweak vs. what's shown in the video: we trim each line of whitespace as that could
# throw off the counts.
connections = lines.withColumn("id", func.split(func.trim(func.col("value")), " ")[0]) \
    .withColumn("connections", func.size(func.split(func.trim(func.col("value")), " ")) - 1) \
    .groupBy("id").agg(func.sum("connections").alias("connections"))

minConnections = connections.agg(func.min("connections")).first()[0]

minConnectionHeroes = connections.filter(
    func.col("connections") == minConnections)

minConnectionHeroesNames = minConnectionHeroes.join(names, "id")

print("The following characters have only " + str(minConnections) +
      " co-appearances.")

minConnectionHeroesNames.select("name").show()
Esempio n. 37
0
    'Eritrea', 'Ethiopia', 'Gabon', 'Gambia', 'Ghana', 'Guinea',
    'Guinea-Bissau', 'Iran', 'Iraq', 'Israel', 'Ivory Coast', 'Jordan',
    'Kenya', 'Kuwait', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Madagascar',
    'Malawi', 'Mali', 'Mauritania', 'Morocco', 'Mozambique', 'Namibia',
    'Niger', 'Nigeria', 'Oman', 'Palestine', 'Republic of Congo', 'Rwanda',
    'Saudi Arabia', 'Senegal', 'Sierra Leone', 'Somalia', 'South Africa',
    'South Sudan', 'Sudan', 'Syria', 'Tanzania', 'Togo', 'Tunisia', 'Turkey',
    'Uganda', 'United Arab Emirates', 'Yemen', 'Zambia', 'Zimbabwe'
]

conflict_quads = ['Verbal Conflict', 'Material Conflict']

# COMMAND ----------

# DBTITLE 1,Select specified preprocessed data
gdelt2021 = preprocessedGDELT.filter(F.col('ActionGeo_FullName').isin(countries)) \
                             .filter(F.col('EventTimeDate') >= F.lit('2021-03-01'))

# add conflict, not binary column
gdelt2021 = gdelt2021.withColumn(
    'Conflict',
    F.when(F.col('QuadClassString').isin(conflict_quads),
           True).otherwise(False))
gdelt2021.limit(2).toPandas()

# COMMAND ----------

# DBTITLE 1,Create Initial Report Variables
# create function to calculate median
median_udf = F.udf(lambda x: float(np.quantile(x, 0.5)), FloatType())
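# Usage sketch (column names are assumptions, not from the original notebook): the UDF
# expects a list per row, so it is typically applied to F.collect_list output, e.g.
# gdelt2021.groupBy('ActionGeo_FullName') \
#          .agg(F.collect_list('GoldsteinScale').alias('gs_values')) \
#          .withColumn('gs_median', median_udf('gs_values'))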
Esempio n. 38
0
    def compute_hist(psdf, bins):
        # 'data' is a Spark DataFrame that selects one column.
        assert isinstance(bins, (np.ndarray, np.generic))

        sdf = psdf._internal.spark_frame
        scols = []
        input_column_names = []
        for label in psdf._internal.column_labels:
            input_column_name = name_like_string(label)
            input_column_names.append(input_column_name)
            scols.append(
                psdf._internal.spark_column_for(label).alias(
                    input_column_name))
        sdf = sdf.select(*scols)

        # 1. Make the bucket output flat to:
        #     +----------+-------+
        #     |__group_id|buckets|
        #     +----------+-------+
        #     |0         |0.0    |
        #     |0         |0.0    |
        #     |0         |1.0    |
        #     |0         |2.0    |
        #     |0         |3.0    |
        #     |0         |3.0    |
        #     |1         |0.0    |
        #     |1         |1.0    |
        #     |1         |1.0    |
        #     |1         |2.0    |
        #     |1         |1.0    |
        #     |1         |0.0    |
        #     +----------+-------+
        colnames = sdf.columns
        bucket_names = ["__{}_bucket".format(colname) for colname in colnames]

        output_df = None
        for group_id, (colname,
                       bucket_name) in enumerate(zip(colnames, bucket_names)):
            # creates a Bucketizer to get corresponding bin of each value
            bucketizer = Bucketizer(splits=bins,
                                    inputCol=colname,
                                    outputCol=bucket_name,
                                    handleInvalid="skip")

            bucket_df = bucketizer.transform(sdf)

            if output_df is None:
                output_df = bucket_df.select(
                    SF.lit(group_id).alias("__group_id"),
                    F.col(bucket_name).alias("__bucket"))
            else:
                output_df = output_df.union(
                    bucket_df.select(
                        SF.lit(group_id).alias("__group_id"),
                        F.col(bucket_name).alias("__bucket")))

        # 2. Calculate the count based on each group and bucket.
        #     +----------+-------+------+
        #     |__group_id|buckets| count|
        #     +----------+-------+------+
        #     |0         |0.0    |2     |
        #     |0         |1.0    |1     |
        #     |0         |2.0    |1     |
        #     |0         |3.0    |2     |
        #     |1         |0.0    |2     |
        #     |1         |1.0    |3     |
        #     |1         |2.0    |1     |
        #     +----------+-------+------+
        result = (output_df.groupby("__group_id", "__bucket").agg(
            F.count("*").alias("count")).toPandas().sort_values(
                by=["__group_id", "__bucket"]))

        # 3. Fill empty bins and calculate based on each group id. From:
        #     +----------+--------+------+
        #     |__group_id|__bucket| count|
        #     +----------+--------+------+
        #     |0         |0.0     |2     |
        #     |0         |1.0     |1     |
        #     |0         |2.0     |1     |
        #     |0         |3.0     |2     |
        #     +----------+--------+------+
        #     +----------+--------+------+
        #     |__group_id|__bucket| count|
        #     +----------+--------+------+
        #     |1         |0.0     |2     |
        #     |1         |1.0     |3     |
        #     |1         |2.0     |1     |
        #     +----------+--------+------+
        #
        # to:
        #     +-----------------+
        #     |__values1__bucket|
        #     +-----------------+
        #     |2                |
        #     |1                |
        #     |1                |
        #     |2                |
        #     |0                |
        #     +-----------------+
        #     +-----------------+
        #     |__values2__bucket|
        #     +-----------------+
        #     |2                |
        #     |3                |
        #     |1                |
        #     |0                |
        #     |0                |
        #     +-----------------+
        output_series = []
        for i, (input_column_name,
                bucket_name) in enumerate(zip(input_column_names,
                                              bucket_names)):
            current_bucket_result = result[result["__group_id"] == i]
            # generates a pandas DF with one row for each bin
            # we need this as some of the bins may be empty
            indexes = pd.DataFrame({"__bucket": np.arange(0, len(bins) - 1)})
            # merges the bins with counts on it and fills remaining ones with zeros
            pdf = indexes.merge(current_bucket_result,
                                how="left",
                                on=["__bucket"]).fillna(0)[["count"]]
            pdf.columns = [input_column_name]
            output_series.append(pdf[input_column_name])

        return output_series
Esempio n. 39
0
 def outliers(data, colname, lfence, ufence):
     # Builds expression to identify outliers
     expression = F.col("`%s`" % colname).between(lfence, ufence)
     # Creates a column to flag rows as outliers or not
     return data._psdf._internal.resolved_copy.spark_frame.withColumn(
         "__{}_outlier".format(colname), ~expression)
Esempio n. 40
0
from pyspark.sql.column import Column
from pyspark.sql.column import _to_java_column
from pyspark.sql.column import _to_seq
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType

if __name__ == "__main__":

    spark = SparkSession.builder.appName("SimpleApp").getOrCreate()

    sc = spark.sparkContext

    def udfIpToIntScalaWrapper(ipString):
        _ipToIntUDF = sc._jvm.CustomUDFs.ipToIntUDF()
        return Column(
            _ipToIntUDF.apply(_to_seq(sc, [ipString], _to_java_column)))

    df = spark.createDataFrame(["192.168.0.1"], "string").toDF("ip")

    df\
        .withColumn("ip_int_scala", udfIpToIntScalaWrapper(col("ip")))\
        .show()
Esempio n. 41
0
import sys

sys.path.append('/home/kanak/spark-2.4.7-bin-hadoop2.7/python')
sys.path.append(
    '/home/kanak/spark-2.4.7-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip')

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

if __name__ == "__main__":

    spark = SparkSession\
        .builder\
        .appName("Rate Streaming")\
        .getOrCreate()

    spark.sparkContext.setLogLevel("ERROR")

    df = spark.readStream.format("rate").option("rowsPerSecond", 3).load()

    resultDF = df.withColumn("newValue", col("value") + 1)

    query = resultDF\
        .writeStream\
        .outputMode('append')\
        .option("truncate", False)\
        .format('console')\
        .start()

    query.awaitTermination()
Esempio n. 42
0
    def value_counts(self,
                     normalize=False,
                     sort=True,
                     ascending=False,
                     bins=None,
                     dropna=True):
        """
        Return a Series containing counts of unique values.
        The resulting object will be in descending order so that the
        first element is the most frequently-occurring element.
        Excludes NA values by default.

        Parameters
        ----------
        normalize : boolean, default False
            If True then the object returned will contain the relative
            frequencies of the unique values.
        sort : boolean, default True
            Sort by values.
        ascending : boolean, default False
            Sort in ascending order.
        bins : Not Yet Supported
        dropna : boolean, default True
            Don't include counts of NaN.

        Returns
        -------
        counts : Series

        See Also
        --------
        Series.count: Number of non-NA elements in a Series.

        Examples
        --------
        >>> df = ks.DataFrame({'x':[0, 0, 1, 1, 1, np.nan]})
        >>> df.x.value_counts() # doctest: +NORMALIZE_WHITESPACE
        1.0    3
        0.0    2
        Name: x, dtype: int64

        With `normalize` set to `True`, returns the relative frequency by
        dividing all values by the sum of values.

        >>> df.x.value_counts(normalize=True) # doctest: +NORMALIZE_WHITESPACE
        1.0    0.6
        0.0    0.4
        Name: x, dtype: float64

        **dropna**
        With `dropna` set to `False` we can also see NaN index values.

        >>> df.x.value_counts(dropna=False) # doctest: +NORMALIZE_WHITESPACE
        1.0    3
        0.0    2
        NaN    1
        Name: x, dtype: int64
        """
        if bins is not None:
            raise NotImplementedError(
                "value_counts currently does not support bins")

        if dropna:
            sdf_dropna = self._kdf._sdf.filter(self.notna()._scol)
        else:
            sdf_dropna = self._kdf._sdf
        sdf = sdf_dropna.groupby(self._scol).count()
        if sort:
            if ascending:
                sdf = sdf.orderBy(F.col('count'))
            else:
                sdf = sdf.orderBy(F.col('count').desc())

        if normalize:
            sum = sdf_dropna.count()
            sdf = sdf.withColumn('count', F.col('count') / F.lit(sum))

        index_name = 'index' if self.name != 'index' else 'level_0'
        kdf = DataFrame(sdf)
        kdf.columns = [index_name, self.name]
        kdf._metadata = Metadata(column_fields=[self.name],
                                 index_info=[(index_name, None)])
        return _col(kdf)
Esempio n. 43
0
    def final_table(self):
        final_df = self.df_dict('empty_df')

        for module_name in self.config['module_names']:
            if (self.config['module_names'][module_name] == 'Y'):
                try:
                    module_df = self.sqlContext.table(
                        self.config_dict['output_db'] + '.' +
                        self.config_dict['output_prefix'] + '_' + module_name +
                        '_' + self.config_dict['output_suffix'])
                    module_df.cache()
                    module_df.show()
                except Exception as e:
                    print('Unable to read table : {}.{}_{}_{}'.format(
                        self.config_dict['output_db'],
                        self.config_dict['output_prefix'], module_name,
                        self.config_dict['output_suffix']))
                    print(e)

                if utils.valid_df(module_df):
                    module_df.cache()
                    module_df.show()

                    common = list(
                        set(module_df.columns).intersection(final_df.columns))
                    union = list(
                        set(module_df.columns).union(final_df.columns))

                    if len(common) > 0:
                        measures = list(set(union) - set(common))
                        diff_grouping_columns = list(
                            set(measures) - set(self.measures))
                        final_df = module_df.join(
                            final_df, common,
                            'outer').fillna('total',
                                            diff_grouping_columns).fillna(
                                                '0', self.measures)
                    else:
                        union_df = self.sqlContext.createDataFrame(
                            [[''] * len(union)],
                            union).filter(col(union[1]) != '')

                        final_df = functions.union_multi_df(
                            union_df,
                            final_df,
                            module_df,
                            column_sequence_df=1)
                        final_df.cache()
                        final_df.show()

                if utils.valid_df(final_df):
                    final_df.cache()
                    final_df.show()

                    final_df.registerTempTable('final_df_table')
                    self.sqlContext.sql("drop table if exists " +
                                        self.config_dict['output_db'] + "." +
                                        self.config_dict['output_prefix'] +
                                        "_" +
                                        self.config_dict['final_table_name'] +
                                        "_" +
                                        self.config_dict['output_suffix'])
                    self.sqlContext.sql("create table " +
                                        self.config_dict['output_db'] + "." +
                                        self.config_dict['output_prefix'] +
                                        "_" +
                                        self.config_dict['final_table_name'] +
                                        "_" +
                                        self.config_dict['output_suffix'] +
                                        " as select * from final_df_table")

#                     final_df.write.saveAsTable(
#                         self.output_db + '.' + self.output_prefix  + '_' + self.final_table_name + '_' + self.output_suffix,
#                         mode = self.write_mode
#                     )
        if (self.config_dict['flush_module_tables'] == 'Y'):
            self.flush_module_tables()
        return final_df
             StructField('middlename', StringType(), True),
             StructField('lastname', StringType(), True)
             ])),
         StructField('languages', ArrayType(StringType()), True),
         StructField('state', StringType(), True),
         StructField('gender', StringType(), True)
         ])


df = spark.createDataFrame(data = arrayStructureData, schema = arrayStructureSchema)
df.printSchema()
df.show(truncate=False)

df.filter(df.state == "OH") \
    .show(truncate=False)

df.filter(col("state") == "OH") \
    .show(truncate=False)    
    
df.filter("gender  == 'M'") \
    .show(truncate=False)    

df.filter( (df.state  == "OH") & (df.gender  == "M") ) \
    .show(truncate=False)        

df.filter(array_contains(df.languages,"Java")) \
    .show(truncate=False)        

df.filter(df.name.lastname == "Williams") \
    .show(truncate=False) 
    
Esempio n. 45
0
    # Step 4: Extract Insights About Cat And Dog Owners
    # Topic extraction with LDA
    docs_ddf = LDA_dataset_preparation(pet_owners_ddf.limit(20000)). \
        persist(StorageLevel.DISK_ONLY)
    # docs_ddf.show()
    topics = TopicExtraction(docs_ddf, topic_num=20)
    topics.process()

    # Step 5: Identify Creators With Cat And Dog Owners In The Audience
    creators_ddf = dataset_ddf.join(pet_owners_ddf, 'userid', 'inner'). \
        select('creator_name',
               pet_owners_ddf['userid'],
               'predict_dog_owner',
               'predict_cat_owner'). \
        groupby('creator_name').sum('predict_dog_owner', 'predict_cat_owner'). \
        withColumn('dog_count', col('sum(predict_dog_owner)')). \
        withColumn('cat_count', col('sum(predict_cat_owner)')). \
        selectExpr('creator_name',
                   'dog_count',
                   'cat_count',
                   'dog_count*dog_count AS dog_count2',
                   'cat_count*cat_count AS cat_count2')

    dog_m, dog2, cat_m, cat2 = creators_ddf.groupby().avg(
        'dog_count', 'dog_count2', 'cat_count', 'cat_count2').collect()[0]
    ranking_ddf = creators_ddf. \
        withColumn('dog_significance', significance_udf(dog_m, dog2)(col('dog_count'))). \
        withColumn('cat_significance', significance_udf(cat_m, cat2)(col('cat_count'))). \
        persist(StorageLevel.MEMORY_AND_DISK)
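
    # `significance_udf` is defined elsewhere in this project; one plausible definition
    # (an assumption, not the original code) is a UDF factory that z-scores a count using
    # the mean and mean-of-squares collected above:
    # def significance_udf(mean, mean_sq):
    #     std = (mean_sq - mean ** 2) ** 0.5
    #     return udf(lambda c: float((c - mean) / std) if std else 0.0, FloatType())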

    print('Top 10 creators with higher number of dog owners')
Esempio n. 46
0
        "kafka.bootstrap.servers",
        "sandbox-hdp.hortonworks.com:9092").option("subscribe",
                                                   "ratp-api").load()

    # Fetch the sample request to recover the global schema
    json_example = spark.read.json("hdfs:///user/root/test.json",
                                   multiLine=True)
    schema_json = json_example.schema

    # Keep the desired columns from the producer messages: value holds the response, timestamp the time the request was made
    df = kafkaStream.selectExpr("CAST(value as STRING)",
                                "CAST(timestamp as TIMESTAMP)")

    # Select the part of the JSON we are interested in
    df = df.select(
        F.from_json(F.col("value"), schema_json).alias("test"), "timestamp"
    ).select(
        "test.Siri.ServiceDelivery.StopMonitoringDelivery.MonitoredStopVisit",
        "timestamp").select(F.explode("MonitoredStopVisit"),
                            "timestamp").select(
                                F.explode("col"), "timestamp").select(
                                    "col.MonitoredVehicleJourney", "timestamp",
                                    "col.MonitoringRef")

    # Select the columns of interest
    df = df.select(
        "MonitoredVehicleJourney.DestinationName",
        "MonitoredVehicleJourney.LineRef",
        "MonitoredVehicleJourney.TrainNumbers.TrainNumberRef",
        "MonitoredVehicleJourney.MonitoredCall.ExpectedArrivalTime",
        "MonitoredVehicleJourney.MonitoredCall.StopPointName", "timestamp",
Esempio n. 47
0
def process_log_data(spark, input_data, output_data):
    """
    Extract log data from JSON files and write
    data to parquet files on S3
    :param spark: The Spark session object
    :param input_data: Source JSON files on S3
    :param output_data: Target S3 bucket where to write Parquet files
    :return: None
    """

    # get filepath to log data file
    log_data = os.path.join(input_data, 'log_data/*/*/*.json')
    # # song data needed for join
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    # users - users in the app
    # user_id, first_name, last_name, gender, level
    users_table = df.select(
        'userId', 'firstName', 'lastName', 'gender',
        'level').where(col("userId").isNotNull()).withColumnRenamed(
            "userId",
            "user_id").withColumnRenamed("firstName",
                                         "first_name").withColumnRenamed(
                                             "lastName",
                                             "last_name").distinct()

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users.parquet'),
                              'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: int(x) / 1000)
    df = df.withColumn('start_time', get_timestamp('ts'))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(int(x) / 1000))
    df = df.withColumn('datetime', get_datetime('ts'))

    # extract columns to create time table
    # time - timestamps of records in songplays broken down into specific units
    # start_time, hour, day, week, month, year, weekday
    time_table = df.select('start_time', 'datetime').withColumn(
        'hour',
        hour('datetime')).withColumn('day', dayofmonth('datetime')).withColumn(
            'week', weekofyear('datetime')).withColumn(
                'month', month('datetime')).withColumn(
                    'year', year('datetime')).withColumn(
                        'weekday', dayofweek('datetime')).distinct()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time.parquet'), 'overwrite')

    # read in song data to use for songplays table
    song_df = spark.read.json(song_data).alias('song_df')

    # extract columns from joined song and log datasets to create songplays table
    # songplays - records in log data associated with song plays i.e. records with page NextSong
    # songplay_id, start_time, user_id, level, song_id, artist_id,
    # session_id, location, user_agent
    songplays_table = df.join(song_df,
                              col("artist") == col("song_df.artist_name"),
                              'inner').select(
                                  col('start_time'),
                                  col('userId').alias('user_id'), col('level'),
                                  col('song_df.song_id').alias('song_id'),
                                  col('song_df.artist_id').alias('artist_id'),
                                  col('sessionId').alias('session_id'),
                                  col('location'),
                                  col('userAgent').alias('user_agent'),
                                  year('datetime').alias('year'),
                                  month('datetime').alias('month')).withColumn(
                                      'songplay_id',
                                      monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'songplays.parquet'), 'overwrite')
Esempio n. 48
0
def run_spark_job(spark):

    # TODO Create Spark Configuration
    # Create Spark configurations with max offset of 200 per trigger
    # set up correct bootstrap server and port
    df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers","localhost:9092") \
        .option("subscribe","com.udacity.police.calls") \
        .option("startingOffsets","earliest") \
        .option("maxRatePerPartition",100) \
        .option("maxOffsetsPerTrigger",200) \
        .option("stopGracefullyOnShutdown","true") \
        .load()

    # Show schema for the incoming resources for checks
    df.printSchema()

    # TODO extract the correct column from the kafka input resources
    # Take only value and convert it to String
    kafka_df = df.selectExpr("CAST(value AS STRING)")

    service_table = kafka_df\
        .select(psf.from_json(psf.col('value'), schema).alias("DF"))\
        .select("DF.*")

    # TODO select original_crime_type_name and disposition
    #     distinct_table = service_table \
    #                 .select(psf.to_timestamp(psf.col("call_date_time")).alias("call_date_time"),
    #                         psf.col('original_crime_type_name'),
    #                         psf.col('disposition'))

    distinct_table = service_table \
        .select(psf.to_timestamp(psf.col("call_date_time")).alias("call_date_time"),
                psf.col('original_crime_type_name'),
                psf.col('disposition'))

    distinct_table.printSchema()
    # count the number of original crime type
    agg_df = distinct_table \
            .withWatermark("call_date_time", "60 minutes") \
            .groupBy(
                psf.window(distinct_table.call_date_time, "10 minutes", "5 minutes"),
                psf.col('original_crime_type_name')
                 ) \
            .count()

    # TODO Q1. Submit a screen shot of a batch ingestion of the aggregation
    # TODO write output stream
    query = agg_df \
        .writeStream \
        .outputMode("complete") \
        .format("console") \
        .start()

    # TODO attach a ProgressReporter
    # Note: awaitTermination() blocks here, so the radio-code join below only runs
    # after this query stops
    query.awaitTermination()

    # TODO get the right radio code json path
    radio_code_json_filepath = "./radio_code.json"
    radio_code_df = spark.read.json(radio_code_json_filepath)

    # clean up your data so that the column names match on radio_code_df and agg_df
    # we will want to join on the disposition code

    # TODO rename disposition_code column to disposition
    radio_code_df = radio_code_df.withColumnRenamed("disposition_code",
                                                    "disposition")

    # TODO join on disposition column
    join_query = agg_df.join(radio_code_df, "disposition")

    join_query.awaitTermination()
Esempio n. 49
0
# 1. model_path = path to the pre-trained models. (E.g. path/to/model/bigdl_inception-v1_imagenet_0.4.0.model)
#
# 2. image_path = path to the folder of the training images. (E.g. path/to/data/dogs-vs-cats/demo/\*/\*)

model_path = "hdfs:///user/example/dogscats/bigdl_inception-v1_imagenet_0.4.0.model"
image_path = "hdfs:///user/example/dogscats/demo/*/*"
imageDF = NNImageReader.readImages(image_path, sc)
imageDF.printSchema()

getName = udf(
    lambda row: re.search(r'(cat|dog)\.([\d]*)\.jpg', row[0], re.IGNORECASE).
    group(0), StringType())
getLabel = udf(lambda name: 1.0
               if name.startswith('cat') else 2.0, DoubleType())

labelDF = imageDF.withColumn("name", getName(col("image"))).withColumn(
    "label", getLabel(col('name')))
(trainingDF, validationDF) = labelDF.randomSplit([0.9, 0.1])
labelDF.select("name", "label").show(10)

# ## Fine-tune a pre-trained model
# We fine-tune a pre-trained model by removing the last few layers, freezing the first few layers, and adding some new layers.

transformer = ChainedPreprocessing([
    RowToImageFeature(),
    ImageResize(256, 256),
    ImageCenterCrop(224, 224),
    ImageChannelNormalize(123.0, 117.0, 104.0),
    ImageMatToTensor(),
    ImageFeatureToTensor()
])
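
# A sketch of the fine-tuning step described above (layer names and API calls follow a
# typical Analytics Zoo transfer-learning example and are assumptions, not verified here):
# full_model = Net.load_bigdl(model_path)
# model = full_model.new_graph(["pool5/drop_7x7_s1"])  # drop the final classifier layers
# model.freeze_up_to(["pool4/3x3_s2"])                 # freeze the early layers
# inputNode = Input(name="input", shape=(3, 224, 224))
# inception = model.to_keras()(inputNode)
# logits = Dense(2)(Flatten()(inception))
# lrModel = Model(inputNode, logits)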
Esempio n. 50
0
def join_logs(hive_context, batch_config, interval_time_in_seconds,
              log_table_names):
    def union_logs(df_clicklog, df_showlog):
        # union click log and show log.
        columns = [
            'did', 'is_click', 'action_time', 'keyword', 'keyword_index',
            'media', 'media_category', 'net_type', 'gender', 'age', 'adv_id'
        ]

        df_clicklog = df_clicklog.withColumn('is_click', lit(1))
        df_clicklog = df_clicklog.select(columns)

        df_showlog = df_showlog.withColumn('is_click', lit(0))
        df_showlog = df_showlog.select(columns)

        df_unionlog = df_showlog.union(df_clicklog)
        return df_unionlog

    def transform_action_time(df_logs, interval_time_in_seconds):
        _udf_time = udf(
            lambda x: int(
                datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f').strftime("%s")),
            IntegerType())
        df_logs = df_logs.withColumn('action_time_seconds',
                                     _udf_time(col('action_time')))

        _udf_interval_time = udf(lambda x: x - x % interval_time_in_seconds,
                                 IntegerType())
        df_logs = df_logs.withColumn(
            'interval_starting_time',
            _udf_interval_time(col('action_time_seconds')))

        return df_logs
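
    # Note: the same time bucketing could be done with Spark built-ins instead of Python
    # UDFs (a sketch, assuming the action_time strings cast cleanly to timestamps):
    # df_logs = df_logs.withColumn('action_time_seconds',
    #                              col('action_time').cast('timestamp').cast('long'))
    # df_logs = df_logs.withColumn(
    #     'interval_starting_time',
    #     col('action_time_seconds') - col('action_time_seconds') % interval_time_in_seconds)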

    timer_start = timeit.default_timer()
    start_date, end_date, load_minutes = batch_config
    starting_time = datetime.strptime(start_date, "%Y-%m-%d")
    ending_time = datetime.strptime(end_date, "%Y-%m-%d")
    showlog_table_name, clicklog_table_name, logs_table_name = log_table_names

    batched_round = 1
    while starting_time < ending_time:
        batched_time_start_str = starting_time.strftime("%Y-%m-%d %H:%M:%S")
        batched_time_end = starting_time + \
            timedelta(minutes=load_minutes)
        batched_time_end_str = batched_time_end.strftime("%Y-%m-%d %H:%M:%S")
        print_batching_info("Main logs", batched_round, batched_time_start_str,
                            batched_time_end_str)
        command = """select did, action_time, keyword, keyword_index, 
                     media, media_category, net_type, gender, 
                     age, adv_id from {} where action_time >= '{}' 
                     and action_time < '{}'"""
        df_clicklog_batched = hive_context.sql(
            command.format(clicklog_table_name, batched_time_start_str,
                           batched_time_end_str))
        df_showlog_batched = hive_context.sql(
            command.format(showlog_table_name, batched_time_start_str,
                           batched_time_end_str))
        df_logs_batched = union_logs(df_clicklog_batched, df_showlog_batched)
        df_logs_batched = transform_action_time(df_logs_batched,
                                                interval_time_in_seconds)
        df_logs_batched = df_logs_batched.withColumn(
            'uckey',
            concat_ws(",", col('media'), col('media_category'),
                      col('net_type'), col('gender'), col('age')))
        mode = 'overwrite' if batched_round == 1 else 'append'
        write_to_table(df_logs_batched, logs_table_name, mode=mode)
        batched_round += 1
        starting_time = batched_time_end

    timer_end = timeit.default_timer()
    print('Total batching seconds: ' + str(timer_end - timer_start))
Esempio n. 51
0
def is_null(df, column):
    return df.filter(F.col(column).isNull() | F.isnan(column)).count() > 0
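
# Usage sketch (assumed demo data; `F` is the pyspark.sql.functions alias used above):
# spark = SparkSession.builder.getOrCreate()
# demo = spark.createDataFrame([(1.0,), (float("nan"),), (None,)], ["v"])
# is_null(demo, "v")   # -> True, since the column contains NaN/null values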
Esempio n. 52
0
def create_column_with_power(df, column_name: str):
    return df.withColumn("power_2", power_two_UDF(col(column_name)))
Esempio n. 53
0
df_mongo = sesh.read.json(
    'hdfs://ip-{}.ec2.internal:9000/user/ubuntu/metadata/metadata.json'.format(
        private_ip))

# drop these columns from metadata
df_mongo = df_mongo.drop('id')\
                   .drop('_id')\
                   .drop('brand')\
                   .drop('categories')\
                   .drop('description')\
                   .drop('related')\
                   .drop('salesRank')\
                   .drop('title')\
                   .drop('imUrl')\
                   .dropna()\
                   .withColumn('price', col('price').cast('float'))

# make sure prices are positive
df_mongo = df_mongo.where(df_mongo.price > 0)

# structure of kindle reviews
schema = StructType().add('id', IntegerType(), True)\
                     .add('asin', StringType(), True)\
                     .add('helpful', StringType(), True)\
                     .add('overall', IntegerType(), True)\
                     .add('reviewText', StringType(), True)\
                     .add('reviewTime', StringType(), True)\
                     .add('reviewerID', StringType(), True)\
                     .add('reviewerName', StringType(), True)\
                     .add('summary', StringType(), True)\
                     .add('unixReviewTime', IntegerType(), True)
    
    # 1. Calculate L(orig, priv) and H(orig, priv) for
    #    detailed cells, marginals, total (for the queries listed in "queries")
    df_L1 = queryLp(querydf, 1)
    df_L2 = queryLp(querydf, 2)
    df_Linf = queryLp(querydf, "inf")
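    # `queryLp` is defined elsewhere; conceptually, for original counts x and privatized
    # counts y of each query it reduces the per-cell differences to
    #   L^1 = sum(|x - y|),  L^2 = sqrt(sum((x - y)^2)),  L^inf = max(|x - y|)
    # (an interpretation for readability, not the original implementation).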
    
    sdftools.show(df_L1, "L^1 norm for the queries")
    sdftools.show(df_L2, "L^2 norm for the queries")
    sdftools.show(df_Linf, "L^inf norm for the queries")
    
    df_H = queryHellinger(querydf)
    
    sdftools.show(df_H, "Hellinger metric for the queries")
    
    # 2. Average L^p and H across geounits in the geolevel
    # removed AC.GEOCODE from the groupby to aggregate across all geounits
    groupby = [AC.GEOLEVEL, AC.QUERY, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP]
    df_L1_avg = df_L1.groupBy(groupby).agg(sf.avg(sf.col("L^1_norm"))).persist()
    df_L2_avg = df_L2.groupBy(groupby).agg(sf.avg(sf.col("L^2_norm"))).persist()
    df_Linf_avg = df_Linf.groupBy(groupby).agg(sf.avg(sf.col("L^inf_norm"))).persist()
    
    sdftools.show(df_L1_avg, "Average L^1 norm for the queries")
    sdftools.show(df_L2_avg, "Average L^2 norm for the queries")
    sdftools.show(df_Linf_avg, "Average L^inf norm for the queries")
    
    df_H_avg = df_H.groupBy(groupby).agg(sf.avg(sf.col("H"))).persist()
    
    sdftools.show(df_H_avg, "Average Hellinger metric for the queries")
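
    # For reference, the metrics above reduce to simple aggregations. A minimal sketch of
    # the underlying math (assuming querydf carries one row per cell with numeric `orig`
    # and `priv` columns; queryLp and queryHellinger are library helpers not shown here):
    #   L^1  = sum|priv - orig|,  L^2 = sqrt(sum (priv - orig)^2),  L^inf = max|priv - orig|
    #   H    = sqrt(0.5 * sum (sqrt(priv) - sqrt(orig))^2)
    d = sf.col("priv") - sf.col("orig")
    h = sf.sqrt(sf.col("priv")) - sf.sqrt(sf.col("orig"))
    lp_sketch = querydf.groupBy(groupby + [AC.GEOCODE]).agg(
        sf.sum(sf.abs(d)).alias("L^1_norm"),
        sf.sqrt(sf.sum(d * d)).alias("L^2_norm"),
        sf.max(sf.abs(d)).alias("L^inf_norm"),
        sf.sqrt(0.5 * sf.sum(h * h)).alias("H"))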
    
Esempio n. 55
0
def read_and_agg_gdelt_data(spark):
    """
    Reads the preprocessed gdelt data of stage 1.
    
    Creates aggregations by country and date, along with several metrics
    (see Readme.md for details).

    Merging the subregions is very costly and is deferred to a later step
    of the project.

    spark: spark session

    returns a spark dataframe with aggregated gdelt data
    """

    #read in gdelt data of stage 1 and create temp view
    df_gdelt = spark.read.parquet(folder_s1 + "gdelt/gdelt.parquet")
    df_gdelt.createOrReplaceTempView("gdelt")

    #create df with covid related metrics, grouped by country and date
    df_gdelt_facts_covid = spark.sql("""SELECT country_code, `date`, 
                        COUNT(DISTINCT GLOBALEVENTID) as gd_events_covid,
                        SUM(NumMentions) as gd_nummentions_covid,
                        SUM(NumSources) as gd_numsources_covid,
                        SUM(NumArticles) as gd_numarticles_covid,
                        AVG(AvgTone) as gd_avgtone_covid,
                        AVG(GoldsteinScale) as gd_gtscale_covid
                     FROM gdelt 
                     WHERE covid = true 
                     GROUP BY country_code, `date`
                     """)

    #create df with general metrics, grouped by country and date
    df_gdelt_facts_general = spark.sql("""SELECT country_code, `date`, 
                        COUNT(DISTINCT GLOBALEVENTID) as gd_events_general,
                        SUM(NumMentions) as gd_nummentions_general,
                        SUM(NumSources) as gd_numsources_general,
                        SUM(NumArticles) as gd_numarticles_general,
                        AVG(AvgTone) as gd_avgtone_general,
                        AVG(GoldsteinScale) as gd_gtscale_general
                     FROM gdelt
                     GROUP BY country_code, `date`
                     """)

    #join the two metric frames together
    df_gdelt_facts = df_gdelt_facts_general \
                        .join(df_gdelt_facts_covid, \
                            on=['country_code','date'],how="outer")

    #calculate proportions of covid metric on general metric
    df_gdelt_facts = df_gdelt_facts \
                        .withColumn('gd_events_covid_perc',\
                         col('gd_events_covid') / col('gd_events_general'))
    df_gdelt_facts = df_gdelt_facts \
                        .withColumn('gd_nummentions_covid_perc',\
                         col('gd_nummentions_covid') / col('gd_nummentions_general'))
    df_gdelt_facts = df_gdelt_facts \
                        .withColumn('gd_numsources_covid_perc',\
                         col('gd_numsources_covid') / col('gd_numsources_general'))
    df_gdelt_facts = df_gdelt_facts \
                        .withColumn('gd_numarticles_covid_perc',\
                         col('gd_numarticles_covid') / col('gd_numarticles_general'))

    df_gdelt_facts = df_gdelt_facts.withColumnRenamed('country_code',
                                                      'regionId')

    return df_gdelt_facts
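
# A hypothetical call site for the function above (the output folder name `folder_s2`
# is an assumption for illustration; `spark` is an active SparkSession):
df_gdelt_facts = read_and_agg_gdelt_data(spark)
df_gdelt_facts.write.mode("overwrite").parquet(folder_s2 + "gdelt/gdelt_facts.parquet")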
Esempio n. 56
0
def dedupe_splink_scores(
    df_e_with_dupes: DataFrame,
    unique_id_colname: str,
    score_colname: str = None,
    selection_fn: str = "abs_val",
):
    """Sometimes, multiple Splink jobs with different blocking rules are combined
    into a single dataset of edges.  Sometimes, the same pair of nodes will be
    scored multiple times, once by each job.  We need to deduplicate this dataset
    so that each pair of nodes appears only once.

    Args:
        df_e_with_dupes (DataFrame): Dataframe with dupes
        unique_id_colname (str): Unique id column name e.g. unique_id
        score_colname (str, optional): Which column contains scores? If None, it is
            inferred from df_e_with_dupes.columns. Defaults to None.
        selection_fn (str, optional): Where we have several different scores for a given
            pair of records, how do we decide the final score?
            Options are 'abs_val' and 'mean'.
            abs_val:  Take the value furthest from 0.5 i.e. the value that expresses most certainty
            mean: Take the mean of all values
            Defaults to 'abs_val'.
    """

    # Looking in blocking.py, the position of unique ids
    # (whether they appear in _l or _r) is guaranteed
    # in blocking outputs so we don't need to worry about
    # inversions

    # This is not the case for labelled data - hence the need for
    # _sql_gen_unique_id_keygen to join labels to df_e

    possible_vals = ["abs_val", "mean"]
    if selection_fn not in possible_vals:
        raise ValueError(
            f"selection function should be in {possible_vals}, you passed {selection_fn}"
        )

    score_colname = _get_score_colname(df_e_with_dupes, score_colname)

    if selection_fn == "abs_val":
        df_e_with_dupes = df_e_with_dupes.withColumn(
            "absval", f.expr(f"0.5 - abs({score_colname})"))

        win_spec = Window.partitionBy(
            [f"{unique_id_colname}_l",
             f"{unique_id_colname}_r"]).orderBy(f.col("absval").desc())
        df_e_with_dupes = df_e_with_dupes.withColumn(
            "ranking",
            f.row_number().over(win_spec))
        df_e = df_e_with_dupes.filter(f.col("ranking") == 1)
        df_e = df_e.drop("absval")
        df_e = df_e.drop("ranking")

    if selection_fn == "mean":

        win_spec = Window.partitionBy(
            [f"{unique_id_colname}_l",
             f"{unique_id_colname}_r"]).orderBy(f.col(score_colname).desc())

        df_e_with_dupes = df_e_with_dupes.withColumn(
            "ranking",
            f.row_number().over(win_spec))

        df_e_with_dupes = df_e_with_dupes.withColumn(
            score_colname,
            f.avg(score_colname).over(
                win_spec.rowsBetween(Window.unboundedPreceding,
                                     Window.unboundedFollowing)),
        )
        df_e = df_e_with_dupes.filter(f.col("ranking") == 1)

        df_e = df_e.drop("ranking")

    return df_e
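
# Usage sketch (column names are assumptions for illustration): given an edges frame
# with unique_id_l, unique_id_r and a score column such as match_probability,
#     df_e = dedupe_splink_scores(df_e_with_dupes, "unique_id",
#                                 score_colname="match_probability",
#                                 selection_fn="abs_val")
# keeps, for each pair of nodes, the single score that expresses the most certainty.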
Esempio n. 57
0
import sys
import pyspark.sql.functions as f
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("task4c-sql").getOrCreate()

df = spark.read.format('csv').options(header='false', inferschema='false') \
    .load(sys.argv[1]).na.fill('')

data = df.select(
    df._c20.cast('string').alias('name'),
    df._c5.cast('DECIMAL(10, 2)').alias('fare'))

result = data.groupBy('name').agg(f.sum('fare')) \
    .select('name', f.col('sum(fare)').alias('revenue')) \
    .sort(f.col('revenue').desc()).limit(10) \
    .write.csv('task4c-sql.out', quoteAll=False, header=False,
               quote='', ignoreTrailingWhiteSpace=False)
'''
module load python/gnu/3.6.5
module load spark/2.4.0
rm -rf task4c-sql.out
hfs -rm -R task4c-sql.out
spark-submit --conf \
spark.pyspark.python=/share/apps/python/3.6.5/bin/python \
task4c-sql.py task1b-sql.out
hfs -getmerge task4c-sql.out task4c-sql.out
hfs -rm -R task4c-sql.out
cat task4c-sql.out
'''
Esempio n. 58
0
from pyspark.sql import functions as F


@udf("string")
def lat_lng_2_h3(lat, lng, res):
    import h3
    try:
        result = h3.geo_to_h3(lat, lng, res)
        return result
    except Exception:
        return None  # invalid coordinates will result in a null index value


taxi_trips = taxi_trips.withColumn(
    "h3_pickup",
    lat_lng_2_h3(F.col("pickup_latitude"), F.col("pickup_longitude"),
                 F.lit(10))).withColumn(
                     "h3_dropoff",
                     lat_lng_2_h3(F.col("dropoff_latitude"),
                                  F.col("dropoff_longitude"), F.lit(10)))
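
# A quick sanity check one might add here (not in the original notebook): rows whose
# index is null correspond to invalid pickup coordinates.
invalid_pickups = taxi_trips.filter(F.col("h3_pickup").isNull()).count()
print("Trips with invalid pickup coordinates: {}".format(invalid_pickups))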

# COMMAND ----------

# DBTITLE 1,View that H3 columns
display(taxi_trips)

# COMMAND ----------

# DBTITLE 1,Define location for data storage (silver layer)
username = "******"  #please update with a correct user
silver_data_location = f"Users/{username}/geospatial/workshop/data/silver"
Esempio n. 59
0
# "score":0.0
# }]
# }
#
# (Note: The Redis Source for Kafka has redundant fields zSetEntries and zsetentries, only one should be parsed)
#
# and create separated fields like this:
# +------------+-----+-----------+------------+---------+-----+-----+-----------------+
# |         key|value|expiredType|expiredValue|existType|   ch| incr|      zSetEntries|
# +------------+-----+-----------+------------+---------+-----+-----+-----------------+
# |U29ydGVkU2V0| null|       null|        null|     NONE|false|false|[[dGVzdDI=, 0.0]]|
# +------------+-----+-----------+------------+---------+-----+-----+-----------------+
#
# storing them in a temporary view called RedisSortedSet
kafkaRedisDF.withColumn("value", from_json("value", kafkaRedisSchema))\
            .select(col('value.existType'), col('value.Ch'),\
                    col('value.Incr'), col('value.zSetEntries'))\
            .createOrReplaceTempView("RedisSortedSet")

# TO-DO: execute a sql statement against a temporary view; the statement takes the element field from the 0th element in the array of structs and creates a column called encodedCustomer
# the reason we do it this way is that the syntax available for selecting against a view differs from that for a dataframe, and it makes it easy to select the nth element of an array in a sql column
zSetEntriesEncodedStreamingDF = spark.sql(
    "select zSetEntries[0].element as encodedCustomer from RedisSortedSet")

# TO-DO: take the encodedCustomer column which is base64 encoded at first like this:
# +--------------------+
# |            customer|
# +--------------------+
# |[7B 22 73 74 61 7...|
# +--------------------+
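
# The decode step described in the TO-DO above is not included in this snippet. One
# plausible continuation (a sketch, not the original solution) unbase64s the
# encodedCustomer column and casts it to a string so it can later be parsed as JSON:
from pyspark.sql.functions import unbase64

customerDecodedStreamingDF = zSetEntriesEncodedStreamingDF.withColumn(
    "customer", unbase64(col("encodedCustomer")).cast("string"))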
Esempio n. 60
0
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

spark = SparkSession.builder.master("local").appName(
    "houseprice").getOrCreate()
sc = spark.sparkContext

train_df = spark.read.csv("/home/luminar/Downloads/train(1).csv",
                          header=True,
                          inferSchema=True)
train_df.show()

train_df.printSchema()
print(train_df.count())

#to find missing values
for c in train_df.columns:
    print(c, train_df.filter(col(c).isNull()).count())

for c in train_df.columns:
    print(c, train_df.filter(col(c) == "NA").count())

#to drop the columns with missing value count greater than 1000
for c in train_df.columns:
    if train_df.filter(col(c) == "NA").count() > 1000:
        train_df = train_df.drop(c)
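
# A possible single-pass alternative (a sketch, assuming the same "NA" sentinel for
# missing values): count the "NA" occurrences per column in one job, then drop every
# column whose count exceeds the threshold.
from pyspark.sql.functions import col, count, when

na_counts = train_df.select(
    [count(when(col(c) == "NA", c)).alias(c) for c in train_df.columns]
).collect()[0].asDict()
train_df = train_df.drop(*[c for c, n in na_counts.items() if n > 1000])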

#MAY 4TH
#fill the null values of columns by using when and otherwise

train_df.groupBy("LotFrontage").count().show()
train_df = train_df.withColumn(
    "LotFrontage",