def import_twitter_data(spark_session, tweets_file_path):
    """Imports the twitter data and returns resulting DataFrame.
    
    Args:
        spark_session    --    An active SparkSession.
        tweets_file_path    --    A file path.
    """

    tweets_schema = types.StructType([
        types.StructField('id', types.LongType()),
        types.StructField('timestamp', types.LongType(), nullable=False),
        types.StructField('postalCode', types.StringType()),
        types.StructField('lon', types.DoubleType(), nullable=False),
        types.StructField('lat', types.DoubleType(), nullable=False),
        types.StructField('tweet', types.StringType(), nullable=False),
        types.StructField('user_id', types.LongType()),
        types.StructField('application', types.StringType()),
        types.StructField('source', types.StringType())
    ])

    tweets_df = spark_session.read.csv(tweets_file_path,
                                       escape='"',
                                       header='true',
                                       schema=tweets_schema,
                                       mode='DROPMALFORMED')

    tweets_df = tweets_df.select(['timestamp', 'lon', 'lat', 'tweet'])
    return tweets_df
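A minimal usage sketch (assuming "from pyspark.sql import SparkSession, types" is in scope; the CSV path below is hypothetical):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('twitter_import').getOrCreate()
tweets_df = import_twitter_data(spark, 'data/tweets.csv')  # hypothetical path
tweets_df.printSchema()  # timestamp, lon, lat, tweet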
def test_df() -> DataFrame:
    """Return some data."""
    spark = package_spark

    schema = st.StructType([
        st.StructField('id', st.LongType()),
        st.StructField('money', st.StringType()),
        st.StructField('timestamp', st.LongType()),
        st.StructField('structtype', st.StructType([
            st.StructField('number1', st.StringType()),
            st.StructField('number2', st.StringType()),
            st.StructField('number3', st.StringType()),
        ])),
        st.StructField('rootstructtype', st.StructType([
            st.StructField('nestedstructtype', st.StructType([
                st.StructField('fieldtype', st.StringType()),
            ])),
        ])),
        st.StructField('arraytype', st.ArrayType(st.StringType())),
    ])

    test_df = spark.createDataFrame([
        [1, '$100.000', 17, (1, 2, 3), ((2,),), ['meta', 'data']],
        [1, '$200.000', 17, (3, 2, 1), ((2,),), ['meta', 'data']],
        [1, '$10.000', 16, (1, 3, 2), ((2,),), ['meta', 'data']],
        [2, '-$100', 17, (3, 1, 2), ((2,),), ['meta', 'data']],
        [2, '$100', 14, (2, 1, 3), ((2,),), ['meta', 'data']],
    ], [
        'id', 'money', 'timestamp', 'structtype', 'rootstructtype', 'arraytype'
    ])

    return spark.createDataFrame(test_df.rdd, schema)
def benchmarkCalculatePiUsingDF(spark, samples, parallelism, jobLogger):
    def inside(p):
        x, y = random.random(), random.random()
        return x * x + y * y < 1

    jobLogger.info(
        '****************************************************************')
    jobLogger.info(
        'Starting benchmark test calculating Pi via dataframe manipulations '
        'with {0:,} samples'.format(samples))

    start_time = timer()

    # Note that the random seed for each of the columns must be different,
    # otherwise both columns will have identical values on each row
    pi_df = (spark.range(0, samples, numPartitions=parallelism).withColumn(
        'x', F.rand(seed=8675309)
    ).withColumn('y', F.rand(seed=17760704)).withColumn(
        'within_circle',
        F.when(
            (F.pow(F.col('x'), F.lit(2)) + F.pow(F.col('y'), F.lit(2)) <= 1.0),
            F.lit(1).cast(T.LongType())).otherwise(
                F.lit(0).cast(T.LongType()))).agg(
                    F.sum('within_circle').alias('count_within_circle'),
                    F.count('*').alias('count_samples')))
    res = pi_df.collect()
    pi_val = 4.0 * (res[0].count_within_circle) / (res[0].count_samples)
    end_time = timer()
    return (end_time - start_time), pi_val
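A minimal call sketch, assuming an active SparkSession named spark, the usual "from pyspark.sql import functions as F, types as T" and "from timeit import default_timer as timer" imports, and any logger object exposing .info() (here the standard library logging module):

import logging

elapsed_sec, pi_estimate = benchmarkCalculatePiUsingDF(
    spark, samples=10_000_000, parallelism=8,
    jobLogger=logging.getLogger(__name__))
# pi_estimate approaches 3.14159... as the sample count grows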
Example 4
    def test_datatype(self):
        first = T.StructType([
            T.StructField('f1', T.BooleanType()),
            T.StructField('f2', T.ByteType()),
            T.StructField('f3', T.IntegerType()),
            T.StructField('f4', T.LongType()),
        ])
        second = T.StructType([
            T.StructField('f3', T.IntegerType()),
            T.StructField('f2', T.ByteType()),
            T.StructField('f4', T.LongType()),
            T.StructField('f1', T.BooleanType()),
        ])

        SparklyTest().assertRowsEqual(first, second, ignore_order=True)
        with self.assertRaises(AssertionError):
            self.assertEqual(first, second)

        # change entry (f4, T.LongType)
        second = T.StructType([
            T.StructField('f3', T.IntegerType()),
            T.StructField('f2', T.ByteType()),
            T.StructField('f4', T.StringType()),
            T.StructField('f1', T.BooleanType()),
        ])

        with self.assertRaises(AssertionError):
            SparklyTest().assertRowsEqual(first, second, ignore_order=True)
Example 5
class Ensembler:
    _PRIMITIVE_TYPE_MAP = {
        pb2.Ensembler.ResultType.DOUBLE: types.DoubleType(),
        pb2.Ensembler.ResultType.FLOAT: types.FloatType(),
        pb2.Ensembler.ResultType.INTEGER: types.LongType(),
        pb2.Ensembler.ResultType.LONG: types.LongType(),
        pb2.Ensembler.ResultType.STRING: types.StringType(),
    }

    _CAST_TYPE_MAP = {
        pb2.Ensembler.ResultType.INTEGER: types.IntegerType(),
    }

    def __init__(self,
                 ensembler_uri: str,
                 result_column_name: str,
                 result_type: types.DataType,
                 cast_type: types.DataType):
        self._ensembler_uri = ensembler_uri
        self._result_column_name = result_column_name
        self._result_type = result_type
        self._cast_type = cast_type

    def ensemble(self, combined_df: DataFrame, spark: SparkSession) -> DataFrame:
        udf = mlflow.pyfunc.spark_udf(
            spark,
            self._ensembler_uri,
            self._result_type
        )

        return combined_df.withColumn(
            self._result_column_name,
            udf(struct(combined_df.columns))
            if self._cast_type is None
            else udf(struct(combined_df.columns)).cast(self._cast_type)
        )

    @classmethod
    def from_config(cls, config: pb2.Ensembler) -> 'Ensembler':
        result_type = None
        cast_type = None
        if config.result.type == pb2.Ensembler.ResultType.ARRAY:
            if config.result.item_type in cls._PRIMITIVE_TYPE_MAP:
                result_type = types.ArrayType(
                    cls._PRIMITIVE_TYPE_MAP.get(config.result.item_type)
                )
                if config.result.item_type in cls._CAST_TYPE_MAP:
                    cast_type = types.ArrayType(
                        cls._CAST_TYPE_MAP.get(config.result.item_type)
                    )
            else:
                raise ValueError(f'unknown item type for array: {config.result.item_type}')
        else:
            result_type = cls._PRIMITIVE_TYPE_MAP.get(config.result.type)
            cast_type = cls._CAST_TYPE_MAP.get(config.result.type)

        if result_type is None:
            raise ValueError(f'unknown result type: {config.result.type}')

        return Ensembler(config.uri, config.result.column_name, result_type, cast_type)
Example 6
def main():
    # do things...
    schema = types.StructType([
        types.StructField('ID', types.StringType(), False),
        types.StructField('DATE', types.LongType(), False),
        types.StructField('TYPE', types.StringType(), False),
        types.StructField('VALUE1', types.LongType(), False),
        types.StructField('MFlag', types.StringType(), True),
        types.StructField('QFlag', types.StringType(), True)
    ])

    t = spark.read.csv(inputs, schema=schema)
    t = t.where(col("QFlag").isNull())
    # t.show()

    p = t.filter((col("TYPE") == "TMAX") | (col("TYPE") == "TMIN")) \
        .groupby('DATE', 'ID').agg( (2 * max("VALUE1") - sum("VALUE1")).alias("Range"))

    max_table = p.groupby('DATE').agg(max("Range").alias('MaxRange'))

    cond = [
        p['DATE'] == max_table['DATE'], p['Range'] == max_table['MaxRange']
    ]
    df_result = p.join(max_table, cond,
                       'inner').select(p['DATE'], p['ID'],
                                       p['Range']).sort(col("DATE"))

    df_result.show()
Example 7
    def test_futureLeftJoin(self):
        import pyspark.sql.types as pyspark_types
        price = self.price()
        vol = self.vol()
        expected_pdf = test_utils.make_pdf([
            (1000, 7, 0.5, 400, 1050),
            (1000, 3, 1.0, 300, 1050),
            (1050, 3, 1.5, 500, 1100),
            (1050, 7, 2.0, 600, 1100),
            (1100, 3, 2.5, 700, 1150),
            (1100, 7, 3.0, 800, 1150),
            (1150, 3, 3.5, 900, 1200),
            (1150, 7, 4.0, 1000, 1200),
            (1200, 3, 4.5, 1100, 1250),
            (1200, 7, 5.0, 1200, 1250),
            (1250, 3, 5.5, None, None),
            (1250, 7, 6.0, None, None),
        ], ["time", "id", "price", "volume", "time2"])

        new_pdf = price.futureLeftJoin(vol.withColumn(
            "time2", vol.time.cast(pyspark_types.LongType())),
                                       tolerance=pd.Timedelta("100ns"),
                                       key=["id"],
                                       strict_lookahead=True).toPandas()
        new_pdf1 = price.futureLeftJoin(vol.withColumn(
            "time2", vol.time.cast(pyspark_types.LongType())),
                                        tolerance=pd.Timedelta("100ns"),
                                        key="id",
                                        strict_lookahead=True).toPandas()
        test_utils.assert_same(new_pdf, new_pdf1)
        test_utils.assert_same(new_pdf, expected_pdf)
Example 8
def main(inputs, output):

    pathfunction = functions.udf(path_to_hour, returnType=types.StringType())

    comments_schema = types.StructType([
        types.StructField('language', types.StringType()),
        types.StructField('title', types.StringType()),
        types.StructField('views', types.LongType()),
        types.StructField('bytes', types.LongType()),
    ])

    wikipage = spark.read.csv(inputs, schema=comments_schema,
                              sep=' ').withColumn(
                                  'hour',
                                  pathfunction(functions.input_file_name()))
    filtered_page = wikipage.filter(
        (wikipage.language == 'en') & (wikipage.title != 'Main Page')
        & (~wikipage.title.startswith('Special:'))).cache()

    max_view = filtered_page.groupBy(wikipage.hour).agg(
        functions.max(wikipage.views).alias('total_views'))
    conditions = [
        filtered_page.views == max_view.total_views,
        filtered_page.hour == max_view.hour
    ]

    # regular join would be: join_page = filtered_page.join(max_view, conditions).select(filtered_page.hour, 'title', 'views')
    # broadcast join as follows:
    join_page = filtered_page.join(functions.broadcast(max_view),
                                   conditions).select(filtered_page.hour,
                                                      'title', 'views')
    join_page.sort('hour', 'title').write.json(output, mode='overwrite')
    join_page.explain()
def main(input_stream, sentiment_model_file):
    # main logic starts here
    headline_schema = types.StructType([
        types.StructField('title', types.StringType()),
        types.StructField('score', types.LongType()),
        types.StructField('num_comments', types.LongType()),
    ])

    #load the sentiment model
    sentiment_model = PipelineModel.load(sentiment_model_file)

    #Load the headline stream
    headlines_stream = spark.readStream.format('json').schema(
        headline_schema).load(input_stream)

    #match the schema our sentiment model needs
    headlines_stream = prepare_for_process(headlines_stream)

    # make the prediction: 0 = lowest, 1 = neutral, 2 = highest sentiment
    predictions_df = sentiment_model.transform(headlines_stream)
    predictions_df = predictions_df.select(predictions_df['title'],
                                           predictions_df['prediction'])

    predictions_df.writeStream.format('console').outputMode('append').option(
        'truncate', False).start().awaitTermination(600)
Example 10
def my_compute_function(ctx, site_counts, **domains):

    data = []
    for domain_name, domain_df in domains.items():
        row_count = domain_df.count()
        data.append((domain_name.lower(), row_count))

    # Create dataframe with row counts for each domain
    df = ctx.spark_session.createDataFrame(data,
                                           ['domain', 'parsed_row_count'])

    try:
        # Join in row counts from DATA_COUNT csv
        for col_name in site_counts.columns:
            site_counts = site_counts.withColumnRenamed(
                col_name, col_name.upper())
        df = df.join(site_counts, df.domain == F.lower(site_counts.TABLE_NAME),
                     'left')
        df = df.withColumn("delta_row_count",
                           df.ROW_COUNT - df.parsed_row_count)
        df = df.selectExpr("domain",
                           "cast(ROW_COUNT as long) as loaded_row_count",
                           "parsed_row_count", "delta_row_count")

    except Exception:
        schema = T.StructType([
            T.StructField("domain", T.StringType(), True),
            T.StructField("loaded_row_count", T.LongType(), True),
            T.StructField("parsed_row_count", T.LongType(), True),
            T.StructField("delta_row_count", T.DoubleType(), True),
        ])
        df = ctx.spark_session.createDataFrame([], schema)

    return df
Example 11
def createDeltaBackedState(tableName, overwrite=False):

    from delta.tables import DeltaTable
    import pyspark.sql.types as T

    db_location = "dbfs:/home/[email protected]/streamingWorkshop/db"
    db_table_name = "sw_db." + tableName
    checkpoint_location = db_location + "/checkpointTables/" + db_table_name

    delta_schema = (T.StructType([
        T.StructField("item_id", T.LongType()),
        T.StructField("timestamp", T.TimestampType()),
        T.StructField("sales", T.LongType())
    ]))

    # Create an empty Delta table if it does not exist. This is required for the MERGE to work in the first mini batch.
    if overwrite or not DeltaTable.isDeltaTable(
            spark, db_location + "/" + db_table_name):
        (spark.createDataFrame(
            [], delta_schema).write.mode("overwrite").option(
                "overwriteSchema",
                "true").format("delta").saveAsTable(db_table_name))
        spark.sql(
            f"ALTER TABLE {db_table_name} SET TBLPROPERTIES (delta.autoOptimize.optimizeWrite = true, delta.autoOptimize.autoCompact = false)"
        )
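A hedged sketch of the MERGE that the comment above refers to, wired up through foreachBatch; the merge condition, update logic, and table name are illustrative assumptions, not the workshop's actual streaming query:

from delta.tables import DeltaTable

def upsert_sales(batch_df, batch_id, table_name="sw_db.deltaBackedSales"):  # hypothetical table name
    # Merge each micro-batch into the Delta-backed state table on item_id
    target = DeltaTable.forName(spark, table_name)
    (target.alias("t")
        .merge(batch_df.alias("s"), "t.item_id = s.item_id")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute())

# sales_stream.writeStream.foreachBatch(upsert_sales) \
#     .option("checkpointLocation", checkpoint_location).start()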
Example 12
    def test_undefined_field(self):
        with six.assertRaisesRegex(self, KeyError, 'f2'):
            schema_has(
                T.StructType([T.StructField('f1', T.IntegerType())]),
                T.StructType([T.StructField('f2', T.LongType())]),
            )

        with six.assertRaisesRegex(self, KeyError, r'f1\.element\.s2'):
            schema_has(
                T.StructType([
                    T.StructField(
                        'f1',
                        T.ArrayType(T.StructType([T.StructField('s1', T.IntegerType())])),
                    ),
                ]),
                T.StructType([
                    T.StructField(
                        'f1',
                        T.ArrayType(T.StructType([T.StructField('s2', T.LongType())])),
                    ),
                ]),
            )

        with six.assertRaisesRegex(self, TypeError, 'element is IntegerType, expected LongType'):
            schema_has(
                T.ArrayType(T.IntegerType()),
                T.ArrayType(T.LongType()),
            )
Example 13
def df_regex_make(wikiqtsv):
    # make wikiq tsv into a dataframe
    tsv2df = reader.csv(wikiqtsv,
                        sep="\t",
                        inferSchema=False,
                        header=True,
                        mode="PERMISSIVE",
                        quote="")
    #tsv2df = tsv2df.repartition(args.num_partitions)

    # basic structure
    struct = types.StructType().add("anon",types.StringType(),True)
    struct = struct.add("articleid",types.LongType(),True)
    struct = struct.add("date_time",types.TimestampType(), True)
    struct = struct.add("deleted",types.BooleanType(), True)
    struct = struct.add("editor",types.StringType(),True)
    struct = struct.add("editor_id",types.LongType(), True)
    struct = struct.add("minor", types.BooleanType(), True)
    struct = struct.add("namespace", types.LongType(), True)
    struct = struct.add("revert", types.BooleanType(), True)
    struct = struct.add("reverteds", types.StringType(), True)
    struct = struct.add("revid", types.LongType(), True)
    struct = struct.add("sha1", types.StringType(), True)
    struct = struct.add("text_chars", types.LongType(), True)
    struct = struct.add("title",types.StringType(), True)

    # structure the df to get the def with columns of metadata and regexes
    regex_one_df = df_structurize(tsv2df,struct)

    return regex_one_df
Example 14
def specifySchema():
    wiki_schema = types.StructType([  # commented-out fields won't be read
        types.StructField('lang', types.StringType(), True),
        types.StructField('page_name', types.StringType(), True),
        types.StructField('viewcount', types.LongType(), True),
        types.StructField('bytes', types.LongType(), True),
    ])
    return wiki_schema
Example 15
def main(inputs, keyspace, table):
    if table == "yelp_business":
        business_schema = types.StructType([
            types.StructField('business_id', types.StringType(), True),
            types.StructField('name', types.StringType(), True),
            types.StructField('neighborhood', types.StringType(), True),
            types.StructField('address', types.StringType(), True),
            types.StructField('city', types.StringType(), True),
            types.StructField('state', types.StringType(), True),
            types.StructField('postal_code', types.StringType(), True),
            types.StructField('latitude', types.FloatType(), True),
            types.StructField('longitude', types.FloatType(), True),
            types.StructField('stars', types.FloatType(), True),
            types.StructField('review_count', types.LongType(), True),
            types.StructField('is_open', types.IntegerType(), True)
        ])
        business = spark.read.json(inputs, schema=business_schema)
        df = business.drop('neighborhood').filter(business.is_open == 1)
        df.cache()
        business_data = sc.textFile(inputs).map(json_key_value_1).map(
            lambda x: Row(x[0], x[1], x[2], x[3]))
        df_1 = business_data.toDF()
        df_2 = df_1.withColumnRenamed("_1", "bus_id").withColumnRenamed(
            "_2", "attributes").withColumnRenamed(
                "_3", "categories").withColumnRenamed("_4", "hours")
        df_2.cache()
        result = df.join(df_2, df.business_id == df_2.bus_id,
                         how='inner').drop(df_2.bus_id)

    elif table == "yelp_checkin":

        checkin_data = sc.textFile(inputs).map(json_key_value_2).map(
            lambda x: Row(str(uuid.uuid1()), x[0], x[1]))
        df = checkin_data.toDF().cache()
        df_1 = df.withColumnRenamed("_1", "id").withColumnRenamed(
            "_2", "time").withColumnRenamed("_3", "business_id")
        result = df_1

    if table == "yelp_review":
        reviews_schema = types.StructType([
            types.StructField('business_id', types.StringType(), True),
            types.StructField('cool', types.LongType(), True),
            types.StructField('date', types.DateType(), True),
            types.StructField('funny', types.LongType(), True),
            types.StructField('review_id', types.StringType(), True),
            types.StructField('stars', types.LongType(), True),
            types.StructField('text', types.StringType(), True),
            types.StructField('useful', types.LongType(), True),
            types.StructField('user_id', types.StringType(), True)
        ])

        reviews = spark.read.json(inputs, schema=reviews_schema)
        uuidUdf = udf(lambda: str(uuid.uuid1()), types.StringType())
        result = reviews.withColumn("id", uuidUdf())
    result.repartition(300).write.format(
        "org.apache.spark.sql.cassandra").options(table=table,
                                                  keyspace=keyspace).save()
Example 16
    def test_type_mismatch(self):
        with six.assertRaisesRegex(self, AssertionError, 'Cannot compare heterogeneous types'):
            schema_has(
                T.StructType([T.StructField('f1', T.IntegerType())]),
                T.ArrayType(T.IntegerType()),
            )

        with six.assertRaisesRegex(self, AssertionError, 'Cannot compare heterogeneous types'):
            schema_has(
                T.ArrayType(T.IntegerType()),
                {'f1': T.IntegerType()},
            )

        with six.assertRaisesRegex(self, TypeError, 'f1 is IntegerType, expected LongType'):
            schema_has(
                T.StructType([T.StructField('f1', T.IntegerType())]),
                T.StructType([T.StructField('f1', T.LongType())]),
            )

        with six.assertRaisesRegex(
                self,
                TypeError,
                r'f1\.element\.s1 is IntegerType, expected LongType',
        ):
            schema_has(
                T.StructType([
                    T.StructField(
                        'f1',
                        T.ArrayType(T.StructType([T.StructField('s1', T.IntegerType())])),
                    ),
                ]),
                T.StructType([
                    T.StructField(
                        'f1',
                        T.ArrayType(T.StructType([T.StructField('s1', T.LongType())])),
                    ),
                ]),
            )

        with six.assertRaisesRegex(self, TypeError, 'element is IntegerType, expected LongType'):
            schema_has(
                T.ArrayType(T.IntegerType()),
                T.ArrayType(T.LongType()),
            )

        with six.assertRaisesRegex(self, TypeError, 'key is StringType, expected LongType'):
            schema_has(
                T.MapType(T.StringType(), T.IntegerType()),
                T.MapType(T.LongType(), T.IntegerType()),
            )

        with six.assertRaisesRegex(self, TypeError, 'value is IntegerType, expected LongType'):
            schema_has(
                T.MapType(T.StringType(), T.IntegerType()),
                T.MapType(T.StringType(), T.LongType()),
            )
Example 17
    def test_arrays_nested_subset(self):
        schema_has(
            T.ArrayType(T.ArrayType(T.StructType([
                T.StructField('f1', T.ArrayType(T.LongType())),
                T.StructField('f2', T.ArrayType(T.StringType())),
            ]))),
            T.ArrayType(T.ArrayType(T.StructType([
                T.StructField('f1', T.ArrayType(T.LongType()))
            ]))),
        )
Example 18
    def test_validate_work_success(self):
        validator = TransformationsValidator(
            TransformationOperations(CONFIG_PATH), self.data_structure_pyspark)
        fields = validator.validate(
            ["src_ip", "dst_ip", "packet_size", "sampling_rate"])
        self.assertEqual(
            fields,
            types.StructType([
                types.StructField('src_ip', types.StringType()),
                types.StructField('dst_ip', types.StringType()),
                types.StructField('packet_size', types.LongType()),
                types.StructField('sampling_rate', types.LongType())
            ]), 'StructType should be equal')
Example 19
def load_prices(spark):
    data = [
        (10, 1546300799000, 37.50, 37.51),
        (10, 1546300802000, 37.51, 37.52),
        (10, 1546300806000, 37.50, 37.51),
    ]
    schema = T.StructType([
        T.StructField("id", T.LongType()),
        T.StructField("timestamp", T.LongType()),
        T.StructField("bid", T.DoubleType()),
        T.StructField("ask", T.DoubleType()),
    ])

    return spark.createDataFrame(data, schema)
Example 20
def test_data(spark_session):

    sa = (
        T.StructType()
        .add('id', T.IntegerType(), False, None)
        .add('tag', T.StringType(), False, None)
        .add('a1', T.IntegerType(), False, None)
        .add('a2', T.IntegerType(), False, None)
    )

    sb = (
        T.StructType()
        .add('id', T.IntegerType(), False, None)
        .add('tag', T.StringType(), False, None)
        .add('b1', T.LongType(), False, None)
        .add('b2', T.IntegerType(), False, None)
    )

    sc = (
        T.StructType()
        .add('id', T.IntegerType(), False, None)
        .add('tag', T.StringType(), False, None)
        .add('c1', T.LongType(), False, None)
        .add('c2', T.FloatType(), False, None)
        .add('c3', T.IntegerType(), False, None)
    )

    da = [
        (1, 'a', 1, 1),
        (2, 'a', 1, 1),
        (3, 'a', 1, 1)]

    db = [
        (1, 'b', 2, 2),
        (2, 'b', 2, 2),
        (3, 'b', 2, 2)]

    dc = [
        (1, 'c', 3, 0.010, 1),
        (2, 'c', 3, 3.0, 0),
        (3, 'c', 3, 3.0, 1)]

    return {
        "dfa": spark_session.createDataFrame(da, sa),
        "dfb": spark_session.createDataFrame(db, sb),
        "dfc": spark_session.createDataFrame(dc, sc)
    }
Example 21
def load_trades(spark):
    data = [
        (10, 1546300800000, 37.50, 100.000),
        (10, 1546300801000, 37.51, 100.000),
        (20, 1546300804000, 12.67, 300.000),
        (10, 1546300807000, 37.50, 200.000),
    ]
    schema = T.StructType([
        T.StructField("id", T.LongType()),
        T.StructField("timestamp", T.LongType()),
        T.StructField("price", T.DoubleType()),
        T.StructField("quantity", T.DoubleType()),
    ])

    return spark.createDataFrame(data, schema)
Example 22
File: vep.py Project: Hoeze/firefly
    def add_colocated_variants(self):
        empty_field = t.StructField('colocated_variants',
                                    t.ArrayType(t.StructType()))

        f = self.get_output_struct_field(
            "colocated_variants",
            create_if_nonexistent=empty_field).elementType
        f.add(t.StructField("seq_region_name", t.StringType()))
        f.add(t.StructField("strand", t.IntegerType()))
        f.add(t.StructField("start", t.LongType()))
        f.add(t.StructField("end", t.LongType()))
        f.add(t.StructField("id", t.StringType()))
        f.add(t.StructField("allele_string", t.StringType()))

        return self
    def test_validate_with_correct_two_level_subtree(self):
        validator = TransformationsValidator(
            TransformationOperations({
                "country": "./GeoLite2-Country.mmdb",
                "city": "./GeoLite2-City.mmdb",
                "asn": "./GeoLite2-ASN.mmdb"
            }), self.data_structure_pyspark)

        syntaxtree = SyntaxTree()
        syntaxtree.operation = "sum"
        syntaxtree.children = ["sampling_rate", "packet_size"]

        main_syntax_tree = SyntaxTree()
        main_syntax_tree.operation = "mult"
        main_syntax_tree.children = [syntaxtree, "sampling_rate"]

        fields = validator.validate(
            [FieldTransformation("result", main_syntax_tree), "dst_ip"])

        self.assertEqual(
            fields,
            types.StructType([
                types.StructField('result', types.LongType()),
                types.StructField('dst_ip', types.StringType())
            ]))
Example 24
def run(rucio_path, dbs_path, output, verbose):
    start = time.time()
    spark = SparkSession.builder.appName("rucio_dumps_test").getOrCreate()
    csvreader = spark.read.format("csv") \
        .option("nullValue", "null") \
        .option("mode", "FAILFAST")
    avroreader = spark.read.format("avro")
    rucio_info = avroreader.load(rucio_path) \
        .withColumn("filename", fn.input_file_name())
    logger.debug("Rucio data types")
    logger.debug(rucio_info.dtypes)
    # rucio_info.show(5, False)
    dbs_files = csvreader.schema(schemas.schema_files()) \
        .load(dbs_path) \
        .select("f_logical_file_name", "f_dataset_id")    
    # dbs_files.show(5, False)
    rucio_df = (rucio_info.withColumn("tmp1", fn.substring_index("filename", "/rucio/", -1))
                .withColumn("tally_date", fn.substring_index("tmp1", "/", 1))
                .withColumn('create_day', fn.date_format(fn.to_date((rucio_info.CREATED_AT / fn.lit(1000))
                                                                    .cast(types.LongType())
                                                                    .cast(types.TimestampType())),
                                                         'yyyyMMdd')
                            )
                .withColumn('tally_day', fn.date_format(fn.to_date("tally_date", "yyyy-MM-dd"), 'yyyyMMdd'))
                .select("RSE_ID", "BYTES", "NAME", "SCOPE", "tally_day", "create_day")
                )
    # rucio_df.show(5, False)
    rucio_df = rucio_df \
        .join(dbs_files, dbs_files.f_logical_file_name == rucio_df.NAME) \
        .groupBy("RSE_ID", "f_dataset_id", "SCOPE", "tally_day", "create_day") \
        .agg(fn.sum("BYTES").alias("rep_size"))
    # rucio_df.show(5, False)
    rucio_df.write.option("compression", "snappy").parquet(output, mode="overwrite")
    end = time.time()
    logger.info("Elapsed Time: {min} min, {sec} sec.".format(min=(end - start) // 60, sec=(end - start) % 60))
Example 25
def recommend(num, user_id, spark, ratings_model):
    user_df = spark.createDataFrame([user_id], types.LongType())
    user_df = user_df.select(user_df['value'].alias('user_id'))
    rec_df_raw = ratings_model.recommendForUserSubset(
        user_df, num).select('recommendations')
    rec_rdd = rec_df_raw.rdd\
        .flatMap(lambda x: x['recommendations'])\
        .map(lambda x: (x['business_id'], x['rating']))\
        .map(lambda x: Row(business_id=x[0], rating=x[1]))
    if rec_rdd.isEmpty():
        return []
    rec_df = spark.createDataFrame(rec_rdd)\
        .withColumn('user_id', functions.lit(user_id))\
        .withColumn('timestamp', functions.current_timestamp())
    try:
        rec_df.write.format('jdbc').options(
            url='jdbc:mysql://localhost/YelpRecommender',
            driver='com.mysql.jdbc.Driver',
            dbtable='Recommend',
            user='******',
            password='******').mode('append').save()
    except Exception as e:
        print('recommend() function in use_model.py\n', str(e))
    # rec_df.show()
    business_ids = list(
        rec_df.select('business_id').rdd.map(
            lambda x: x['business_id']).collect())
    return business_ids
def main(inputs, output):
    # main logic starts here
    wiki_schema = types.StructType([
        types.StructField('language', types.StringType()),
        types.StructField('title', types.StringType()),
        types.StructField('views', types.IntegerType()),
        types.StructField('size', types.LongType()),
    ])
    #reading data
    wikiData = spark.read.csv(inputs, schema=wiki_schema, sep=" ").withColumn(
        'hour', path_to_hour(functions.input_file_name()))
    #filtering data
    filteredWikiData = wikiData[(wikiData['language'] == 'en')
                                & (wikiData['title'] != 'Main_Page') &
                                (wikiData['title'] != 'Special:Page')].cache()
    #finding max views per hour.
    maxCount = filteredWikiData.groupBy('hour').agg(
        functions.max(filteredWikiData['views']).alias('max'))
    #joining data to obtain hour and title.
    joinData = filteredWikiData.join(
        maxCount, filteredWikiData.views == maxCount.max).select(
            filteredWikiData["hour"], filteredWikiData["title"],
            filteredWikiData["views"])
    #sorting data based on hour and storing it in json file.
    joinData.sort(functions.asc('hour')).write.json(output, mode='overwrite')
Example 27
def _generate_select_expression_for_extended_string_to_long(
        source_column, name):
    """
    More robust conversion from StringType to LongType.
    Is able to additionally handle (compared to implicit Spark conversion):

    * Preceding whitespace
    * Trailing whitespace
    * Preceding and trailing whitespace
    * Underscores used as thousands separators

    Hint
    ----
    Please have a look at the tests to get a better feeling for how it behaves under
    tests/unit/transformer/test_mapper_custom_data_types.py::TestExtendedStringConversions and
    tests/data/test_fixtures/mapper_custom_data_types_fixtures.py

    Example
    -------
    >>> from spooq.transformer import Mapper
    >>>
    >>> input_df.head(3)
    [Row(input_string="  21474836470 "),
     Row(input_string="Hello"),
     Row(input_string="21_474_836_470")]
    >>> mapping = [("output_value", "input_string", "extended_string_to_long")]
    >>> output_df = Mapper(mapping).transform(input_df)
    >>> output_df.head(3)
    [Row(input_string=21474836470),
     Row(input_string=None),
     Row(input_string=21474836470)]
    """
    return F.regexp_replace(F.trim(source_column), "_",
                            "").cast(T.LongType()).alias(name)
Example 28
def _generate_select_expression_for_timestamp_s_to_s(source_column, name):
    """
    This constructor is used for unix timestamps. The values are cleaned
    in addition to being cast and renamed.
    If the values are not between `01.01.1970` and `31.12.2099`,
    NULL will be returned.
    Cast to :any:`pyspark.sql.types.LongType`

    Example
    -------
    >>> from pyspark.sql import Row
    >>> from spooq.transformer import Mapper
    >>>
    >>> input_df = spark.createDataFrame([
    >>>     Row(time_sec=1581540839),  # 02/12/2020 @ 8:53pm (UTC)
    >>>     Row(time_sec=-4887839),    # Invalid!
    >>>     Row(time_sec=4737139200)   # 02/12/2120 @ 12:00am (UTC)
    >>> ])
    >>>
    >>> mapping = [("unix_ts", "time_sec", "timestamp_s_to_ms")]
    >>> output_df = Mapper(mapping).transform(input_df)
    >>> output_df.head(3)
    [Row(unix_ts=1581540839), Row(unix_ts=None), Row(unix_ts=None)]

    Note
    ----
    *input*  in **seconds**
    *output* in **seconds**
    """
    return (F.when(source_column.between(MIN_TIMESTAMP_SEC, MAX_TIMESTAMP_SEC),
                   source_column).otherwise(F.lit(None)).cast(
                       T.LongType()).alias(name))
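A minimal usage sketch (assumes the module-level MIN_TIMESTAMP_SEC / MAX_TIMESTAMP_SEC bounds referenced above, the usual F and T aliases, and an active SparkSession named spark):

from pyspark.sql import Row

input_df = spark.createDataFrame([
    Row(time_sec=1581540839),  # valid
    Row(time_sec=-4887839),    # out of range, becomes NULL
])
cleaned_df = input_df.select(
    _generate_select_expression_for_timestamp_s_to_s(
        F.col("time_sec"), "unix_ts"))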
Example 29
def _numpy_to_spark_mapping():
    """Returns a mapping from numpy to pyspark.sql type. Caches the mapping dictionary inorder to avoid instantiation
    of multiple objects in each call."""

    # Refer to the attribute of the function we use to cache the map using a name in the variable instead of a 'dot'
    # notation to avoid copy/paste/typo mistakes
    cache_attr_name = 'cached_numpy_to_pyspark_types_map'
    if not hasattr(_numpy_to_spark_mapping, cache_attr_name):
        import pyspark.sql.types as T

        setattr(_numpy_to_spark_mapping, cache_attr_name,
                {
                    np.int8: T.ByteType(),
                    np.uint8: T.ShortType(),
                    np.int16: T.ShortType(),
                    np.uint16: T.IntegerType(),
                    np.int32: T.IntegerType(),
                    np.int64: T.LongType(),
                    np.float32: T.FloatType(),
                    np.float64: T.DoubleType(),
                    np.string_: T.StringType(),
                    np.str_: T.StringType(),
                    np.unicode_: T.StringType(),
                    np.bool_: T.BooleanType(),
                })

    return getattr(_numpy_to_spark_mapping, cache_attr_name)
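A minimal usage sketch of the cached mapping (the keys are NumPy scalar types; np.string_ and np.unicode_ assume NumPy < 2.0):

import numpy as np

type_map = _numpy_to_spark_mapping()
spark_type = type_map[np.dtype('int64').type]  # LongType()
assert _numpy_to_spark_mapping() is type_map   # repeated calls reuse the cached dict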
def main():
    args = parseArguments()

    spark = SparkSession.builder.getOrCreate()

    Logger = spark._jvm.org.apache.log4j.Logger
    joblogger = Logger.getLogger(__name__)
    joblogger.info(
        '****************************************************************')
    joblogger.info('')
    joblogger.info('Starting creation of test data file with {0} rows and {1} '
                   'partitions at {2}'.format(args.rows, args.partitions,
                                              args.outfile))
    joblogger.info('')
    joblogger.info(
        '****************************************************************')

    udfGetUUID = F.udf(getUUID, T.StringType())

    df = (spark.range(0, args.rows, numPartitions=args.partitions).withColumn(
        'value', udfGetUUID()).withColumn(
            'prefix2', F.substring(F.col('value'), 1, 2)).withColumn(
                'prefix4', F.substring(F.col('value'), 1, 4)).withColumn(
                    'prefix8', F.substring(F.col('value'), 1, 8)).withColumn(
                        'float_val',
                        F.rand(seed=8675309) * 1000000).withColumn(
                            'integer_val',
                            F.col('float_val').cast(T.LongType())).drop('id'))

    df.write.csv(args.outfile, mode='overwrite', header=True)
    joblogger.info('Done writing to {0}'.format(args.outfile))