def spark_data_flow():
    """Build the final output DataFrame."""
    tid_new_df = tid_spark_data_flow()
    prd_new_df = tid_new_df.where(
        tid_new_df.bbd_qyxx_id.isNotNull()
    ).select(
        tid_new_df.bbd_qyxx_id.alias('id'),
        'province',
        'city',
        tid_new_df.county.alias('area'),
        tid_new_df.company_name.alias('company'),
        fun.round('risk_index', 1).alias('risk_index'),
        tid_new_df.risk_rank.alias('risk_level'),
        tid_new_df.risk_change.alias('risk_rise'),
        tid_new_df.is_rise.alias('rise'),
        tid_new_df.company_type.alias('industry'),
        tid_new_df.risk_composition.alias('index_radar'),
        tid_new_df.risk_tags.alias('risk_scan'),
        tid_new_df.risk_sequence_version.alias('index_sort'),
        tid_new_df.xgxx_info_with_change.alias('company_detail'),
        fun.current_timestamp().alias('gmt_create'),
        fun.current_timestamp().alias('gmt_update')
    ).fillna(
        u'无'
    ).fillna(
        {'city': u'无', 'area': u'无', 'province': u'无'}
    ).dropDuplicates(
        ['id']
    )
    return prd_new_df
def transform_raw(spark: SparkSession, raw: DataFrame) -> DataFrame:
    return raw.select(
        lit("files.training.databricks.com").alias("datasource"),
        current_timestamp().alias("ingesttime"),
        "value",
        current_timestamp().cast("date").alias("p_ingestdate"),
    )
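# A minimal usage sketch for transform_raw, assuming a SparkSession named
# `spark` and a single-column text DataFrame; the column name "value" matches
# what spark.read.text and the Kafka source produce. The sample data is
# illustrative, not from the original pipeline.
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, current_timestamp

spark = SparkSession.builder.getOrCreate()
raw = spark.createDataFrame([("record-1",), ("record-2",)], ["value"])
bronze = transform_raw(spark, raw)
bronze.printSchema()  # datasource, ingesttime, value, p_ingestdate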
def get_listens_for_rec_generation_window(mapped_df):
    """
    Get listens to fetch top artists.

    Args:
        mapped_df (dataframe): Dataframe with all the columns/fields that a typical listen has.
    """
    df = mapped_df.select('*') \
        .where((col('listened_at') >= to_timestamp(date_sub(current_timestamp(), config.RECOMMENDATION_GENERATION_WINDOW)))
               & (col('listened_at') <= current_timestamp()))
    return df
def process(dfs: List[DataFrame]) -> DataFrame:
    [df1, df2] = dfs
    df1 = df1.withColumn("current_timestamp", F.current_timestamp()).withWatermark(
        "current_timestamp", "2 hours")
    df2 = df2.withColumn("current_timestamp", F.current_timestamp()).withWatermark(
        "current_timestamp", "2 hours")
    return df1.join(
        df2,
        (df1.field1 == df2.field1id)
        & (df1.current_timestamp >= df2.current_timestamp)
        & (df1.current_timestamp <= (df2.current_timestamp + F.expr("INTERVAL 1 HOURS"))),
    ).select("field1", "value1", "value2")
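# A minimal sketch of how process() might be wired up for a stream-stream
# join test, assuming two rate streams stand in for the real inputs; the
# renames to field1/field1id and value1/value2 are assumptions chosen only
# to satisfy the join condition and final select above.
from typing import List
from pyspark.sql import SparkSession, DataFrame, functions as F

spark = SparkSession.builder.getOrCreate()
left = (spark.readStream.format("rate").load()
        .select(F.col("value").alias("field1"), F.col("value").alias("value1")))
right = (spark.readStream.format("rate").load()
         .select(F.col("value").alias("field1id"), F.col("value").alias("value2")))
joined = process([left, right])
query = joined.writeStream.format("console").start()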
def execute_process(options):
    spark = pyspark.sql.session.SparkSession \
        .builder \
        .appName("criar_tabela_acervo") \
        .config("spark.sql.sources.partitionOverwriteMode", "dynamic") \
        .enableHiveSupport() \
        .getOrCreate()

    schema_exadata = options['schema_exadata']
    schema_exadata_aux = options['schema_exadata_aux']
    table_name = options['table_name']

    table = spark.sql("""
        SELECT D.docu_orgi_orga_dk_responsavel AS cod_orgao,
               cod_pct AS cod_atribuicao,
               COUNT(D.docu_dk) AS acervo,
               docu_cldc_dk AS tipo_acervo
        FROM {0}.mcpr_documento D
        LEFT JOIN {1}.atualizacao_pj_pacote
            ON D.docu_orgi_orga_dk_responsavel = id_orgao
        LEFT JOIN {1}.tb_documentos_arquivados A
            ON D.docu_dk = A.docu_dk
        WHERE docu_fsdc_dk = 1
            AND docu_tpst_dk != 11
            AND A.docu_dk IS NULL
        GROUP BY D.docu_orgi_orga_dk_responsavel, cod_pct, docu_cldc_dk
    """.format(schema_exadata, schema_exadata_aux))

    table = table.withColumn(
        "dt_inclusao",
        from_unixtime(
            unix_timestamp(current_timestamp(), 'yyyy-MM-dd'),
            'yyyy-MM-dd').cast('timestamp')) \
        .withColumn("dt_partition", date_format(current_timestamp(), "ddMMyyyy"))

    is_exists_table_acervo = check_table_exists(spark, schema_exadata_aux, table_name)
    table_name = "{}.{}".format(schema_exadata_aux, table_name)

    if is_exists_table_acervo:
        table.coalesce(1).write.mode("overwrite").insertInto(table_name, overwrite=True)
    else:
        table.write.partitionBy("dt_partition").mode("overwrite").saveAsTable(table_name)

    execute_compute_stats(table_name)
def main(args, spark):
    arguments = parse_arguments(args)

    # Load metadata to process
    batch_metadata = get_batch_file_metadata(
        table_name=arguments.batch_metadata_table_name,
        batch_id=arguments.batch_id,
        region=arguments.region)

    input_bucket = arguments.input_bucket
    input_data = load_and_union_data(spark, batch_metadata, input_bucket)

    input_dfs = []
    for dataset, df in input_data.items():
        input_dfs.append(df)

    # Get input dataframe
    input_df = union_all(input_dfs)

    # Add extra column to input dataframe
    input_df = input_df.withColumn("current_ts", F.current_timestamp())
    input_df.printSchema()
    input_df.show()
def create_delta_target(source_data_frame: DataFrame, path: str, hash_exclude_columns: list = []):
    """
    Creates a target table using Delta from the DataFrame provided.

    This is required if no data has yet been ingested. It generates a
    temporary table named with a uuid, then drops it. It will overwrite any
    existing table, so use with caution.

    Parameters:
        source_data_frame (DataFrame): A source DataFrame
        path (str): The path to write the DataFrame to
        hash_exclude_columns (list): A list of columns to ignore changes on.
            These are not included in the hash.
    """
    _df = source_data_frame
    _hash_columns = __subtract_list(_df.columns, hash_exclude_columns)  # get a list of columns to hash
    _uuid_value = uuid.uuid4().hex  # unique value used to name the temp table

    # Add SCD attribute columns and write Delta
    _df = _df.withColumn(SCD_FLAG_NAME, lit(SCD_FLAG_ACTIVE))
    _df = _df.withColumn(SCD_START_NAME, current_timestamp())
    _df = _df.withColumn(SCD_END_NAME, to_timestamp(lit(SCD_DEFAULT_END_VALUE)))
    _df = _df.withColumn(SCD_HASHKEY_NAME, __sha1_concat(_hash_columns))
    _df.write.format("delta").option("overwriteSchema", "true").partitionBy(
        SCD_FLAG_NAME).saveAsTable(_uuid_value, mode="overwrite", path=path)

    # Drop the table from the metastore - the files remain, since path= makes the table EXTERNAL
    sql("OPTIMIZE {}".format(_uuid_value))
    sql("VACUUM {}".format(_uuid_value))
    sql("DROP TABLE IF EXISTS {}".format(_uuid_value))
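# __sha1_concat is not shown above; a plausible sketch, assuming it hashes the
# pipe-delimited string values of the given columns (the '|' separator and
# the null handling here are assumptions, not the original implementation).
from pyspark.sql.functions import sha1, concat_ws, coalesce, col, lit

def __sha1_concat(columns):
    # Null-safe: coalesce each column to '' so a NULL doesn't null out the hash
    return sha1(concat_ws("|", *[coalesce(col(c).cast("string"), lit("")) for c in columns]))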
def diff_forex(rdd):
    if rdd.isEmpty():
        print("Forex RDD is empty")
    else:
        df = rdd.toDF()
        df = df.na.drop()
        # Note the missing "as" in the original fourth expression ("_4 ask_price"),
        # which would fail to parse; fixed here.
        df = df.selectExpr("_1 as time", "_2 as code", "_3 as bid_price", "_4 as ask_price")
        df = df.withColumn("mid_price", (df["bid_price"] + df["ask_price"]) / 2)
        df = df.withColumn("bid_ask_spread", (df["ask_price"] - df["bid_price"]))
        df = df.withColumn(
            "lagged_mid_price",
            func.lag(df["mid_price"]).over(
                Window.partitionBy("code").orderBy("time")))
        df = df.withColumn("percent_change",
                           ((df["mid_price"] - df["lagged_mid_price"]) /
                            df["lagged_mid_price"]) * 100)
        df = df.withColumn("processing_time", func.current_timestamp())
        df = df.na.drop()
        df = df.select([
            "processing_time", "code", "bid_price", "ask_price", "mid_price",
            "bid_ask_spread", "lagged_mid_price", "percent_change"
        ])
        addToDB(df, "all_differenced")
        detect_anomaly(df)
def kafka_wrapper(
    kafka: KafkaStruct,
    process: Callable[[List[DataFrame]], DataFrame],
    inputs: List[InputStruct],
    spark: SparkSession,
) -> DataFrame:
    """
    Read data from kafka ...

    Attributes
    ----------
    kafka: kafka parameters
    process: function to apply to dataframes
    inputs: for each topic, input parameters
    spark: the instantiated sparksession
    """
    confluent_config = get_confluent_config(kafka.brokers, prefix="kafka.")
    dfs = [
        spark.readStream.format("kafka")
        .option("startingOffsets", "earliest")
        .option("failOnDataLoss", "false")
        .option("subscribe", input.topic)
        .options(**confluent_config)
        .option(
            "kafka.sasl.jaas.config",
            "org.apache.kafka.common.security.plain.PlainLoginModule required username='******' password='******';"
            .format(kafka.confluent_api_key, kafka.confluent_secret),
        )
        .load()
        .selectExpr("CAST(value AS STRING) as json")
        .select(
            F.from_json(
                F.col("json"),
                schema=map_avro_to_spark_schema(input.topic_schema)).alias("data"))
        .select("data.*")
        for input in inputs
    ]
    return process(dfs).withColumn("topic_timestamp", F.current_timestamp())
def usuarios_features(df, categoria=-1.0):
    logger.info("Computing user features...")
    resultado = (df.select(
        df["user.id"].alias("user_id"),
        nullToInt("user.profile_use_background_image").alias("con_imagen_fondo"),
        u_parse_time("user.created_at").cast('timestamp').alias("cuenta_creada"),
        df["user.favourites_count"].alias("n_favoritos"),
        nullToInt("user.description").alias("con_descripcion"),
        F.length("user.description").alias("longitud_descripcion"),
        nullToInt("user.verified").alias("con_perfil_verificado"),
        nullToInt("user.default_profile_image").alias("con_imagen_default"),
        df["user.listed_count"].alias("n_listas"),
        nullToInt("user.geo_enabled").alias("con_geo_activo"),
        reputacion("user.followers_count", "user.friends_count").alias("reputacion"),
        df["user.statuses_count"].alias("n_tweets"),
        followersRatio("user.followers_count", "user.friends_count").alias("followers_ratio"),
        df["user.screen_name"].alias("nombre_usuario"),
        entropia("lista_intertweet").alias("entropia"))
        .withColumn("ano_registro", F.year("cuenta_creada"))
        .withColumn("categoria", F.lit(categoria))
        .withColumn("createdAt", F.current_timestamp()))
    return resultado
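# nullToInt, u_parse_time, reputacion, followersRatio and entropia are UDFs
# defined elsewhere; a plausible sketch of nullToInt, assuming from the
# "con_*" flag aliases that it maps a nullable field to a 0/1 indicator
# (the semantics are inferred, not taken from the original source).
from pyspark.sql import functions as F, types as T

nullToInt = F.udf(lambda v: 0 if v is None else 1, T.IntegerType())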
def save_to_stage(rdd):
    """
    This method handles the kafka messages - we simply want to save them
    to the staging index for processing.
    """
    # If we get an empty message, do nothing (should not happen!)
    if rdd.isEmpty():
        return

    esconf = {}
    esconf["es.mapping.id"] = 'message_id'
    esconf["es.index.auto.create"] = "true"
    esconf["es.nodes"] = ip
    esconf["es.port"] = port
    esconf["es.nodes.wan.only"] = "true"
    esconf["es.write.operation"] = "index"

    sqlContext = SQLContext.getOrCreate(SparkContext.getOrCreate())
    df = sqlContext.createDataFrame(rdd, samplingRatio=1).toDF("topic", "key", "value")

    # Add identifier, received timestamp, and boolean flag to indicate not processed.
    # (The column name "message_recieved_ts" is kept as-is so the ES mapping is unchanged.)
    df = df.drop(df.key)
    df = df.withColumn('message_id', f.md5(df.value))
    df = df.withColumn("message_recieved_ts", f.current_timestamp())
    df = df.withColumn("message_processed", f.lit('false'))
    df.write.format("org.elasticsearch.spark.sql").options(**esconf).mode("append").save(resource)
def recommend(num, user_id, spark, ratings_model):
    user_df = spark.createDataFrame([user_id], types.LongType())
    user_df = user_df.select(user_df['value'].alias('user_id'))
    rec_df_raw = ratings_model.recommendForUserSubset(user_df, num).select('recommendations')
    rec_rdd = rec_df_raw.rdd \
        .flatMap(lambda x: x['recommendations']) \
        .map(lambda x: (x['business_id'], x['rating'])) \
        .map(lambda x: Row(business_id=x[0], rating=x[1]))
    if rec_rdd.isEmpty():
        return []
    rec_df = spark.createDataFrame(rec_rdd) \
        .withColumn('user_id', functions.lit(user_id)) \
        .withColumn('timestamp', functions.current_timestamp())
    try:
        rec_df.write.format('jdbc').options(
            url='jdbc:mysql://localhost/YelpRecommender',
            driver='com.mysql.jdbc.Driver',
            dbtable='Recommend',
            user='******',
            password='******').mode('append').save()
    except Exception as e:
        print('recommend() function in use_model.py\n', str(e))
    l = list(
        rec_df.select('business_id').rdd.map(lambda x: x['business_id']).collect())
    return l
def add_meta_data_and_primary_key(self, data_frame: DataFrame) -> DataFrame:
    """Add standardized metadata and primary key columns."""
    config = self.get_config()
    key_columns = config['key_columns']
    primary_key = config['target_name'] + '_id'
    app_name = config['app_name']
    non_key_columns = [i for i in data_frame.columns if i not in key_columns]

    df = (data_frame
          .withColumn(primary_key, F.expr(hash_columns(key_columns)))
          .withColumn("iptmeta_process_name", F.lit(app_name))
          .withColumn("iptmeta_mod_dttm", F.current_timestamp())
          .withColumn("iptmeta_diff_md5", F.expr(hash_columns(non_key_columns))))

    # Prepare column order for output
    column_list = [primary_key] + key_columns
    column_list.append('iptmeta_process_name')
    column_list.append('iptmeta_mod_dttm')
    column_list.append('iptmeta_diff_md5')
    column_list = column_list + non_key_columns
    df = df.select(*column_list)
    return df
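# hash_columns is not shown above; a plausible sketch, assuming from its use
# inside F.expr() that it returns a SQL expression string which md5-hashes
# the concatenated column values (the separator and null handling are
# assumptions, not the original implementation).
def hash_columns(columns):
    cols = ", ".join("coalesce(cast({} as string), '')".format(c) for c in columns)
    return "md5(concat_ws('|', {}))".format(cols)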
def on_data(self, data):
    """
    Receives the data from the API and, once the time given by the
    'persist_time' parameter has elapsed, saves it to a temp view.
    """
    try:
        tweet = Tweet()
        json_data = json.loads(data)
        if "limit" not in json_data:
            tweet.insert(json_data)
            self.listTweets.append(tweet.get_list())
            print(f"Tweets:{len(self.listTweets)}, Time:{(time.time() - self.start)}")
            if (time.time() - self.start) > self.persist_time:
                try:
                    df = sqlContext.createDataFrame(data=self.listTweets, schema=self.schema)
                    df = df.withColumn("etl_load", F.current_timestamp())
                    df = df.withColumn("etl_load_partition_year", F.date_format("etl_load", "yyyy"))
                    df = df.withColumn("etl_load_partition_month", F.date_format("etl_load", "MM"))
                    df = df.withColumn("etl_load_partition_day", F.date_format("etl_load", "dd"))
                    df = df.withColumn("etl_load_partition_hour", F.date_format("etl_load", "HH"))
                    df.createOrReplaceTempView("tweets")
                except BaseException as e:
                    print("Error building 'df': " + str(e))
                    return False
    except BaseException as e:
        print("Error: " + str(e), "Unexpected JSON:", json_data)
    return True
def main():
    spark = SparkSession.builder \
        .master('local') \
        .appName('nyc-taxi') \
        .config('spark.executor.memory', '1gb') \
        .getOrCreate()
    sc: SparkContext = spark.sparkContext
    sc.setLogLevel('WARN')
    logger.info("app_id = {}".format(sc.applicationId))

    df_line = spark.readStream.format('kafka') \
        .option('kafka.bootstrap.servers', 'localhost:9092') \
        .option('subscribe', 'word-count') \
        .option('startingOffsets', 'latest') \
        .load() \
        .selectExpr('CAST(value AS STRING)')

    df_word = df_line \
        .select(funs.explode(funs.split(df_line.value, " ")).alias("word"))
    df_word = df_word \
        .withColumn('word', funs.regexp_replace('word', '[^a-zA-Z0-9]', '')) \
        .filter(df_word['word'] != '') \
        .selectExpr('LOWER(word) AS word') \
        .withColumn('process_time', funs.current_timestamp())

    df_grouped = df_word.groupBy(
        funs.window('process_time', '20 seconds', '10 seconds'),
        'word').count()
    write_stream(df_grouped)
def gen_test_data(spark: SparkSession, verbose: int = 1) -> DataFrame:
    """Generate a small test DataFrame with a UDF-derived 'rate' column."""
    # Create a Spark data frame
    schema = T.StructType([
        T.StructField("date", T.StringType(), True),
        T.StructField("user_id", T.IntegerType(), True),
        T.StructField("user_name", T.StringType(), True),
        T.StructField("total_orders", T.IntegerType(), True),
        T.StructField("total_amount", T.FloatType(), True),
    ])
    data = [
        ("2020-01-01", 1, "AA", 111, 111.11),
        ("2020-01-01", 2, "BB", 222, 222.22),
        ("2020-04-04", 1, "AA", 444, 444.44),
        ("2020-04-01", 3, "CC", 333, 333.33),
    ]
    data = spark.createDataFrame(data, schema=schema)

    proc = RateProcessor()
    proc_udf = F.udf(proc.run, T.FloatType())  # Convert a normal Python method into a Spark UDF
    data = data.withColumn("rate", proc_udf("total_orders", "total_amount"))
    logger.info("Successfully added 'rate' column")

    data = data.withColumn("updated_at", F.current_timestamp().cast("string"))  # Add updated_at column
    logger.info("Successfully created the test data in Spark\n%s\n"
                % (data.toPandas().to_string(line_width=120)))
    return data
def run(self):
    print('----> Started Metadata Job')
    location = super().getDataFromMySQL('amrs', 'location', {
            'partitionColumn': 'location_id', 'fetchsize': 100,
            'lowerBound': 1, 'upperBound': 500, 'numPartitions': 1}) \
        .select('name', 'uuid') \
        .withColumn('_id', f.col('uuid')) \
        .withColumn('build_date', f.current_timestamp()) \
        .withColumn('type', f.lit('location'))

    program = super().getDataFromMySQL('amrs', 'program', {
            'partitionColumn': 'program_id', 'fetchsize': 100,
            'lowerBound': 1, 'upperBound': 500, 'numPartitions': 1}) \
        .select('name', 'uuid', 'concept_id') \
        .withColumn('_id', f.col('uuid')) \
        .withColumn('build_date', f.current_timestamp()) \
        .withColumn('type', f.lit('program'))

    concept = super().getDataFromMySQL('amrs', 'concept', {
            'partitionColumn': 'concept_id', 'fetchsize': 100,
            'lowerBound': 1, 'upperBound': 500, 'numPartitions': 1}) \
        .select('concept_id', 'uuid') \
        .withColumnRenamed('uuid', 'concept_uuid')

    program_object = program.join(concept, on="concept_id") \
        .withColumn('concept', f.struct(f.col('concept_uuid').alias('uuid'))) \
        .drop('concept_uuid', 'concept_id')

    metadata_couch = Job.getSpark().read.format("org.apache.bahir.cloudant") \
        .load('openmrs-metadata').select('_id', '_rev')

    replace_program = program_object.join(metadata_couch, on='_id', how='left')
    replace_location = location.join(metadata_couch, on='_id', how='left')

    super().saveToCouchDB(replace_program, 'openmrs-metadata')
    super().saveToCouchDB(replace_location, 'openmrs-metadata')
def process_batch(batch: DataFrame, epoch_id: int):
    # Raw string for the regex so \S is not treated as a string escape
    message: DataFrame = batch \
        .agg(concat_ws(", ", collect_list("value")).alias("tweets")) \
        .withColumn("content", regexp_replace("tweets", r"http\S+|www.\S+|#|RT:", "")) \
        .select("content") \
        .withColumn("timestamp", current_timestamp()) \
        .withColumn("total_case_count", total_cases())
    save(message)
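# A minimal sketch of how process_batch would be attached to a streaming
# query via foreachBatch; `save` and `total_cases` are defined elsewhere,
# and the socket source here is an illustrative assumption.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
stream = (spark.readStream.format("socket")
          .option("host", "localhost").option("port", 9999).load())
query = stream.writeStream.foreachBatch(process_batch).start()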
def test_timeformat_nodate_dateincolumns(self):
    format = "HH:mm:ss"
    now = str(datetime.datetime.now())[11:19]

    # test wrong type of column
    data = pd.DataFrame()
    times = [i for i in range(100)]
    data["c1"] = times
    df = self.spark.createDataFrame(data)
    with self.assertRaises(SystemExit) as cm:
        r1 = freshness(["c1"], timeFormat=format, df=df)

    # test correct type
    data = pd.DataFrame()
    times = [now for _ in range(100)]
    data["c1"] = times
    df = self.spark.createDataFrame(data)
    df = df.withColumn("c2", current_timestamp())
    df = df.withColumn("c3", current_timestamp())
    r1, r2, r3 = freshness(["c1", "c2", "c3"], timeFormat=format, df=df)
    r1 = float(r1.split(" ")[0])
    r2 = float(r2.split(" ")[0])
    r3 = float(r3.split(" ")[0])
    self.assertLessEqual(r1, 10.0)
    self.assertLessEqual(r2, 10.0)
    self.assertLessEqual(r3, 10.0)

    # test with some empty values replaced by nulls
    data = pd.DataFrame()
    times = [now for _ in range(100)]
    for i in range(20):
        times[i] = ""
    data["c1"] = times
    df = self.spark.createDataFrame(data)
    df = df.withColumn("c1", replace_empty_with_null(df["c1"]))
    df = df.withColumn("c2", current_timestamp())
    df = df.withColumn("c3", current_timestamp())
    r1, r2, r3 = freshness(["c1", "c2", "c3"], timeFormat=format, df=df)
    r1 = float(r1.split(" ")[0])
    r2 = float(r2.split(" ")[0])
    r3 = float(r3.split(" ")[0])
    self.assertLessEqual(r1, 10.0)
    self.assertLessEqual(r2, 10.0)
    self.assertLessEqual(r3, 10.0)
def execute_process(options):
    spark = pyspark.sql.session.SparkSession \
        .builder \
        .appName("criar_tabela_distribuicao") \
        .enableHiveSupport() \
        .getOrCreate()

    schema_exadata_aux = options['schema_exadata_aux']
    table_name = options['table_name']

    date_now = datetime.now()
    data_atual = date_now.strftime("%Y-%m-%d")

    qtd_acervo = spark.sql("""
        SELECT A.cod_orgao,
               A.cod_atribuicao AS cod_atribuicao,
               SUM(A.acervo) AS acervo
        FROM {0}.tb_acervo A
        INNER JOIN {0}.tb_regra_negocio_investigacao B
            ON A.cod_atribuicao = B.cod_atribuicao
            AND A.tipo_acervo = B.classe_documento
        WHERE A.dt_inclusao = '{1}'
        GROUP BY A.cod_orgao, A.cod_atribuicao
    """.format(schema_exadata_aux, data_atual))
    qtd_acervo.registerTempTable('qtd_acervo_table')

    estatisticas = spark.sql("""
        SELECT cod_orgao, acervo, dist.*
        FROM qtd_acervo_table
        INNER JOIN (
            SELECT cod_atribuicao,
                   MIN(acervo) AS minimo,
                   MAX(acervo) AS maximo,
                   AVG(acervo) AS media,
                   PERCENTILE(acervo, 0.25) AS primeiro_quartil,
                   PERCENTILE(acervo, 0.5) AS mediana,
                   PERCENTILE(acervo, 0.75) AS terceiro_quartil,
                   PERCENTILE(acervo, 0.75) - PERCENTILE(acervo, 0.25) AS IQR,
                   PERCENTILE(acervo, 0.25) - 1.5*(PERCENTILE(acervo, 0.75) - PERCENTILE(acervo, 0.25)) AS Lout,
                   PERCENTILE(acervo, 0.75) + 1.5*(PERCENTILE(acervo, 0.75) - PERCENTILE(acervo, 0.25)) AS Hout
            FROM qtd_acervo_table t
            GROUP BY cod_atribuicao) dist
        ON dist.cod_atribuicao = qtd_acervo_table.cod_atribuicao
    """).withColumn(
        "dt_inclusao",
        from_unixtime(
            unix_timestamp(current_timestamp(), 'yyyy-MM-dd HH:mm:ss'),
            'yyyy-MM-dd HH:mm:ss').cast('timestamp'))

    table_name = "{}.{}".format(schema_exadata_aux, table_name)
    estatisticas.write.mode("overwrite").saveAsTable("temp_table_distribuicao")
    temp_table = spark.table("temp_table_distribuicao")
    temp_table.write.mode("overwrite").saveAsTable(table_name)
    spark.sql("drop table temp_table_distribuicao")
    execute_compute_stats(table_name)
def add_audit_cols(df, changedt):
    """Adds audit columns to the dataframe."""
    df = df.withColumn("operation", f.lit("I")) \
        .withColumn("processeddate", f.current_timestamp().cast("String")) \
        .withColumn("changedate", f.lit(changedt)) \
        .withColumn('changedate_year', f.year('changedate').cast("String")) \
        .withColumn('changedate_month', f.month('changedate').cast("String")) \
        .withColumn('changedate_day', f.dayofmonth('changedate').cast("String"))
    return df
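# A minimal usage sketch for add_audit_cols; the sample frame and the change
# date are illustrative assumptions.
from pyspark.sql import SparkSession, functions as f

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame([(1, "a")], ["id", "val"])
audited = add_audit_cols(sample, "2021-06-01")
audited.show()  # adds operation, processeddate, changedate and its year/month/day parts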
def transform_data(data_stg):
    # dropna() returns a new DataFrame; the result must be assigned back
    data_stg = data_stg.dropna()
    # data_stg.fillna
    # check for zip code
    # check for email id
    data_stg_trans = data_stg.withColumn("New Date", current_timestamp()) \
        .withColumn("Customer UUID", monotonically_increasing_id())
    return data_stg_trans
def extract_stats_step(self, x):
    return x.select(
        *(F.lit(str(v)).alias(k) for k, v in self.training_info().items()),
        F.current_timestamp().alias('trained_at'),
        functions.stats(
            F.to_timestamp('date_received')).alias('date_received_stats'),
        functions.stats('committed_at').alias('committed_at_stats'),
        F.collect_list(F.struct(
            'complaint_id',
            'committed_at',
        )).alias('records'))
def save_function(df, batch_id):
    df.persist()
    df.write.mode("append").format("parquet").save(TWEETS_OUTPUT)
    agg_df = df.agg(F.min("created_at_unix").alias("created_at_unix"),
                    F.count("text").alias("count"))
    agg_df.persist()
    agg_df = agg_df.withColumn("timestamp", F.current_timestamp())
    # The original aliased only the subtrahend inside the subtraction;
    # withColumn already names the result "latency", so the alias is dropped.
    agg_df = agg_df.withColumn(
        "latency", F.unix_timestamp("timestamp") - F.col("created_at_unix"))
    agg_df = agg_df.withColumn("batch_id", F.lit(batch_id))
    agg_df.write.mode("append").format("parquet").save(BATCH_DIAGNOSTIC_OUTPUT)
    agg_df.unpersist()
    df.unpersist()
def save_vector(self, model, index):
    def convert_vector(x):
        '''Convert a list or numpy array to delimited token filter format'''
        return " ".join(["%s|%s" % (i, v) for i, v in enumerate(x)])

    def vector_to_struct(x, version, ts):
        '''Convert a vector to a SparkSQL Struct with string-format vector and version fields'''
        return (convert_vector(x), version, ts)

    vector_struct = udf(
        vector_to_struct,
        StructType([
            StructField("factor", StringType(), True),
            StructField("version", StringType(), True),
            StructField("timestamp", LongType(), True)
        ]))

    start = time()
    ver = model.uid
    ts = unix_timestamp(current_timestamp())
    item_vectors = model.itemFactors \
        .select("id", vector_struct("features", lit(ver), ts).alias("@model"))
    user_vectors = model.userFactors \
        .select("id", vector_struct("features", lit(ver), ts).alias("@model"))

    # write data to ES, use:
    # - "id" as the column to map to ES movie id
    # - "update" write mode for ES, since you want to update new fields only
    # - "append" write mode for Spark
    item_vectors.write.format('es') \
        .option('es.mapping.id', 'id') \
        .option('es.write.operation', 'update') \
        .save(index + '/' + 'movies', mode='append')

    # write data to ES, use:
    # - "id" as the column to map to ES user id
    # - "index" write mode for ES, since you have not written to the user index previously
    # - "append" write mode for Spark
    user_vectors.write.format('es') \
        .option('es.mapping.id', 'id') \
        .option('es.write.operation', 'index') \
        .save(index + '/' + 'users', mode='append')

    dur = time() - start
    print('Save trained feature vectors into Elasticsearch in %.3f seconds.' % dur)
    return dur
def test_timeformat_withdate(self):
    format = "yyyy-MM-dd HH:mm:ss"
    time = str(datetime.datetime.now())[11:19]
    time = "1970-01-01 " + time

    # test wrong type of column
    data = pd.DataFrame()
    times = [i for i in range(100)]
    data["c1"] = times
    df = self.spark.createDataFrame(data)
    with self.assertRaises(SystemExit) as cm:
        r1 = freshness(["c1"], timeFormat=format, df=df)

    # test correct type
    data = pd.DataFrame()
    times = [time for _ in range(100)]
    data["c1"] = times
    df = self.spark.createDataFrame(data)
    df = df.withColumn("c2", to_timestamp(df["c1"], format))
    df = df.withColumn("c3", to_timestamp(df["c1"], format))
    df = df.withColumn(
        "c4",
        current_timestamp().cast("long") - to_timestamp(lit(time), format).cast("long"))
    # seconds from 1970, plus 10 seconds for computation time
    seconds = df.collect()[0][3] + 10
    r1, r2, r3 = freshness(["c1", "c2", "c3"], timeFormat=format, df=df)
    r1 = float(r1.split(" ")[0])
    r2 = float(r2.split(" ")[0])
    r3 = float(r3.split(" ")[0])
    self.assertLessEqual(r1, seconds)
    self.assertLessEqual(r2, seconds)
    self.assertLessEqual(r3, seconds)

    # test with some empty values replaced by nulls
    data = pd.DataFrame()
    times = [time for _ in range(100)]
    for i in range(20):
        times[i] = ""
    data["c1"] = times
    df = self.spark.createDataFrame(data)
    df = df.withColumn("c1", replace_empty_with_null(df["c1"]))
    df = df.withColumn("c2", to_timestamp(df["c1"], format))
    df = df.withColumn("c3", to_timestamp(df["c1"], format))
    r1, r2, r3 = freshness(["c1", "c2", "c3"], timeFormat=format, df=df)
    r1 = float(r1.split(" ")[0])
    r2 = float(r2.split(" ")[0])
    r3 = float(r3.split(" ")[0])
    self.assertLessEqual(r1, seconds)
    self.assertLessEqual(r2, seconds)
    self.assertLessEqual(r3, seconds)
def add_audit_columns(self, df, processing_dt):
    try:
        df = (
            df.withColumn("date_uploaded", F.lit(datetime.strftime(processing_dt, "%Y-%m-%d")))
              .withColumn("error_desc", F.lit(None).cast(ArrayType(StringType())))
              .withColumn("ingestion_dttm", F.current_timestamp())
        )
    except BaseException as ex:
        self.logger.error("Failed to add audit columns because of error: %s", str(ex))
        sys.exit(-1)
    return df
def main(spark: SparkSession):
    df_acct_trsn_itm_etry_typ = spark.sparkContext.parallelize([
        Row(P_SRC_SYS_CD='1051041', ACCT_TRSN_ITM_ETRY_TYP_CD='S',
            ACCT_TRSN_ITM_ETRY_TYP_NM='Debit', ACCT_TRSN_ITM_ETRY_TYP_DN='',
            RADAR_UPD_BY_PRS_ID='ic_fi1_1051041_acct_trsn_itm_etry_typ',
            RADAR_DLT_IND='N'),
        Row(P_SRC_SYS_CD='1051041', ACCT_TRSN_ITM_ETRY_TYP_CD='H',
            ACCT_TRSN_ITM_ETRY_TYP_NM='Credit', ACCT_TRSN_ITM_ETRY_TYP_DN='',
            RADAR_UPD_BY_PRS_ID='ic_fi1_1051041_acct_trsn_itm_etry_typ',
            RADAR_DLT_IND='N'),
        Row(P_SRC_SYS_CD='1051041', ACCT_TRSN_ITM_ETRY_TYP_CD='DR',
            ACCT_TRSN_ITM_ETRY_TYP_NM='Debit', ACCT_TRSN_ITM_ETRY_TYP_DN='',
            RADAR_UPD_BY_PRS_ID='ic_fi1_1051041_acct_trsn_itm_etry_typ',
            RADAR_DLT_IND='N'),
        Row(P_SRC_SYS_CD='1051041', ACCT_TRSN_ITM_ETRY_TYP_CD='CR',
            ACCT_TRSN_ITM_ETRY_TYP_NM='Credit', ACCT_TRSN_ITM_ETRY_TYP_DN='',
            RADAR_UPD_BY_PRS_ID='ic_fi1_1051041_acct_trsn_itm_etry_typ',
            RADAR_DLT_IND='N')]).toDF()

    # 'owerwrite' and 'insertinto' in the original would both raise at runtime
    df_acct_trsn_itm_etry_typ.select(
        'P_SRC_SYS_CD', 'ACCT_TRSN_ITM_ETRY_TYP_CD', 'ACCT_TRSN_ITM_ETRY_TYP_NM',
        'ACCT_TRSN_ITM_ETRY_TYP_DN', 'RADAR_UPD_BY_PRS_ID', 'RADAR_DLT_IND',
        func.current_timestamp()).write.mode('overwrite').insertInto(
            'radar.acct_trsn_itm_etry_typ', overwrite=True)
def transform(self):
    self.df_order_table = self.df_order_table.withColumn(
        'order_date', to_date(col('order_datetime'), 'yyyy-MM-dd'))
    self.df_order_table = self.df_order_table.withColumn(
        'order_month', func.month(col('order_datetime')))
    df_filter_cust = self.df_customer_table.where(col('age') > 18)

    # inner join
    df_order_customer = self.df_order_table.join(
        df_filter_cust,
        on=(self.df_order_table['customer_id'] == df_filter_cust['customer_id']),
        how='inner').select(df_filter_cust['customer_id'],
                            self.df_order_table['order_id'],
                            self.df_order_table['order_month'],
                            self.df_order_table['amount'])

    # total sales amount for each month of each customer older than 18
    wind = Window.partitionBy('customer_id', 'order_month')
    df_order_customer = df_order_customer.withColumn(
        'total_sale', func.sum(col('amount')).over(wind))
    df_order_customer = df_order_customer.distinct()  # distinct() returns a new frame; assign it back
    df_order_customer.show()

    # list the customer_id and second order_id of customers who placed more than 2 orders in the last 20 days
    wind = Window.partitionBy('customer_id', 'order_date').orderBy(
        func.col('order_id').asc())
    df_temp = self.df_order_table.withColumn('row', func.row_number().over(wind))
    df_temp = df_temp.withColumn(
        'current_date', to_date(func.current_timestamp(), 'yyyy-MM-dd'))
    df_temp = df_temp.withColumn(
        'diff_days', func.datediff('current_date', 'order_date'))
    df_temp = df_temp.withColumn(
        "diff", when((col('diff_days') <= lit(20)), lit(1)).otherwise(0))
    df_temp = df_temp.where(col('diff') == 1)
    wind = Window.partitionBy('customer_id')
    df_temp = df_temp.withColumn('count', func.count('order_id').over(wind))
    df_temp = df_temp.where((col('count') > 2) & (col('row') == 2))
    df_temp.show()
def verify_checks_on_datasets(self, tables, sections):
    checker = Parser.get(sections, 'checks').split('|')
    check = Check(spark, CheckLevel.Error, Parser.get(sections, 'check_name'))
    checker.insert(0, "check")
    checks = ".".join(checker)
    verification_result = (VerificationSuite(spark)
                           .onData(tables)
                           .addCheck(eval(checks))
                           .run())
    check_reports = VerificationResult.checkResultsAsDataFrame(spark, verification_result)
    # current_timestamp() is already a Column, so it must not be wrapped in lit()
    check_reports_df = (check_reports
                        .withColumn('dataset_name', lit(sections))
                        .withColumn('check_run_tsp', current_timestamp()))
    return check_reports_df
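# A sketch of the config convention that the eval(checks) line above assumes:
# the 'checks' entry is a '|'-delimited chain of Deequ check calls, which is
# prefixed with the `check` object and joined with '.'. The section and the
# example check calls below are illustrative assumptions, not the original config.
#
# [my_dataset]
# check_name = my_dataset_checks
# checks = hasSize(lambda n: n > 0)|isComplete("id")|isUnique("id")
#
# which eval() turns into:
# check.hasSize(lambda n: n > 0).isComplete("id").isUnique("id")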
from pyspark.sql.functions import col, expr, locate

simpleColors = ["black", "white", "red", "green", "blue"]

def color_locator(column, color_string):
    return locate(color_string.upper(), column)\
        .cast("boolean")\
        .alias("is_" + color_string)

selectedColumns = [color_locator(df.Description, c) for c in simpleColors]
selectedColumns.append(expr("*"))  # has to be a Column type

df.select(*selectedColumns).where(expr("is_white OR is_red"))\
    .select("Description").show(3, False)


# COMMAND ----------

from pyspark.sql.functions import current_date, current_timestamp

dateDF = spark.range(10)\
    .withColumn("today", current_date())\
    .withColumn("now", current_timestamp())
dateDF.createOrReplaceTempView("dateTable")


# COMMAND ----------

from pyspark.sql.functions import date_add, date_sub

dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1)


# COMMAND ----------

from pyspark.sql.functions import datediff, months_between, to_date

dateDF.withColumn("week_ago", date_sub(col("today"), 7))\
    .select(datediff(col("week_ago"), col("today"))).show(1)