Code example #1
def spark_data_flow():
    '''
    Build the final output DataFrame
    '''
    tid_new_df = tid_spark_data_flow()

    
    prd_new_df = tid_new_df.where(
        tid_new_df.bbd_qyxx_id.isNotNull()
    ).select(
        tid_new_df.bbd_qyxx_id.alias('id'),
        'province',
        'city',
        tid_new_df.county.alias('area'),
        tid_new_df.company_name.alias('company'),
        fun.round('risk_index', 1).alias('risk_index'),
        tid_new_df.risk_rank.alias('risk_level'),
        tid_new_df.risk_change.alias('risk_rise'),
        tid_new_df.is_rise.alias('rise'),
        tid_new_df.company_type.alias('industry'),
        tid_new_df.risk_composition.alias('index_radar'),
        tid_new_df.risk_tags.alias('risk_scan'),
        tid_new_df.risk_sequence_version.alias('index_sort'),
        tid_new_df.xgxx_info_with_change.alias('company_detail'),
        fun.current_timestamp().alias('gmt_create'),
        fun.current_timestamp().alias('gmt_update')
    ).fillna(
        u'无'
    ).fillna(
        {'city': u'无', 'area': u'无', 'province': u'无'}
    ).dropDuplicates(
        ['id']
    )    
    return prd_new_df
Code example #2
def transform_raw(spark: SparkSession, raw: DataFrame) -> DataFrame:
    return raw.select(
        lit("files.training.databricks.com").alias("datasource"),
        current_timestamp().alias("ingesttime"),
        "value",
        current_timestamp().cast("date").alias("p_ingestdate"),
    )
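A possible call site for transform_raw (a sketch only; the input path is an assumption, though spark.read.text does expose raw lines as a single "value" column, matching the select above):

# Hypothetical usage of transform_raw on a text source.
raw_df = spark.read.text("/mnt/raw/events")   # assumed path; text source yields a "value" column
bronze_df = transform_raw(spark, raw_df)
bronze_df.printSchema()                       # datasource, ingesttime, value, p_ingestdate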
Code example #3
def get_listens_for_rec_generation_window(mapped_df):
    """ Get listens to fetch top artists.

        Args:
            mapped_df (dataframe): Dataframe with all the columns/fields that a typical listen has.
    """
    df = mapped_df.select('*') \
        .where((col('listened_at') >= to_timestamp(date_sub(current_timestamp(),
        config.RECOMMENDATION_GENERATION_WINDOW))) & (col('listened_at') <= current_timestamp()))
    return df
Code example #4
 def process(dfs: List[DataFrame]) -> DataFrame:
     [df1, df2] = dfs
     df1 = df1.withColumn("current_timestamp",
                          F.current_timestamp()).withWatermark(
                              "current_timestamp", "2 hours")
     df2 = df2.withColumn("current_timestamp",
                          F.current_timestamp()).withWatermark(
                              "current_timestamp", "2 hours")
     return df1.join(
         df2,
         (df1.field1 == df2.field1id)
         & (df1.current_timestamp >= df2.current_timestamp)
         & (df1.current_timestamp <=
            (df2.current_timestamp + F.expr("INTERVAL 1 HOURS"))),
     ).select("field1", "value1", "value2")
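One way such a watermarked stream-stream join might be consumed (a sketch only; the source DataFrames, sink format, and paths are assumptions, and stream-stream joins must be written in append mode):

# Hypothetical wiring for process(): both inputs are assumed to be streaming DataFrames.
joined = process([stream_df1, stream_df2])
query = (joined.writeStream
         .outputMode("append")                              # required for stream-stream joins
         .format("parquet")
         .option("path", "/tmp/joined_output")              # assumed output path
         .option("checkpointLocation", "/tmp/joined_ckpt")  # assumed checkpoint path
         .start())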
Code example #5
def execute_process(options):

    spark = pyspark.sql.session.SparkSession \
            .builder \
            .appName("criar_tabela_acervo") \
            .config("spark.sql.sources.partitionOverwriteMode", "dynamic") \
            .enableHiveSupport() \
            .getOrCreate()

    schema_exadata = options['schema_exadata']
    schema_exadata_aux = options['schema_exadata_aux']
    table_name = options['table_name']

    table = spark.sql("""
            SELECT 
                D.docu_orgi_orga_dk_responsavel AS cod_orgao, 
                cod_pct as cod_atribuicao,
                count(D.docu_dk) as acervo,
                docu_cldc_dk as tipo_acervo
            FROM {0}.mcpr_documento D
            LEFT JOIN {1}.atualizacao_pj_pacote ON D.docu_orgi_orga_dk_responsavel = id_orgao
            LEFT JOIN {1}.tb_documentos_arquivados A ON D.docu_dk = A.docu_dk 
            WHERE docu_fsdc_dk = 1
            AND docu_tpst_dk != 11
            AND A.docu_dk IS NULL
            GROUP BY D.docu_orgi_orga_dk_responsavel, cod_pct, docu_cldc_dk
    """.format(schema_exadata, schema_exadata_aux))

    table = table.withColumn(
            "dt_inclusao",
            from_unixtime(
                unix_timestamp(current_timestamp(), 'yyyy-MM-dd'), 'yyyy-MM-dd') \
            .cast('timestamp')) \
            .withColumn("dt_partition", date_format(current_timestamp(), "ddMMyyyy"))

    is_exists_table_acervo = check_table_exists(spark, schema_exadata_aux,
                                                table_name)

    table_name = "{}.{}".format(schema_exadata_aux, table_name)

    if is_exists_table_acervo:
        table.coalesce(1).write.mode("overwrite").insertInto(table_name,
                                                             overwrite=True)
    else:
        table.write.partitionBy("dt_partition").mode("overwrite").saveAsTable(
            table_name)

    execute_compute_stats(table_name)
Code example #6
def main(args, spark):
    arguments = parse_arguments(args)

    # Load metadata to process
    batch_metadata = get_batch_file_metadata(
        table_name=arguments.batch_metadata_table_name,
        batch_id=arguments.batch_id,
        region=arguments.region)

    input_bucket = arguments.input_bucket
    input_data = load_and_union_data(spark, batch_metadata, input_bucket)

    input_dfs = []
    for dataset, df in input_data.items():
        input_dfs.append(df)

    # get input dataframe
    input_df = union_all(input_dfs)

    # add extra column to input dataframe
    input_df = input_df.withColumn("current_ts", F.current_timestamp())

    input_df.printSchema()

    input_df.show()
Code example #7
def create_delta_target(source_data_frame: DataFrame, path: str, hash_exclude_columns: list = []):
    """
    Creates a target table using delta from DataFrame provided

    This is required if no data has yet been ingested.
    It will generate a temporary table with uuid then drop it.
    It will overwrite any existing table so use with caution

    Parameters:
    source_data_frame (DataFrame): A source DataFrame
    path (str): The path to write Dataframe to
    hash_exclude_columns (list): A list of columns to ignore changes on. Does not include in hash

    """
    _df = source_data_frame
    _hash_columns =  __subtract_list(_df.columns, hash_exclude_columns) # get a list of columns to hash
    _uuid_value = uuid.uuid4().hex  # unique value used to name temp table.

    # Add SCD Attribute columns and write Delta
    _df = _df.withColumn(SCD_FLAG_NAME, lit(SCD_FLAG_ACTIVE))
    _df = _df.withColumn(SCD_START_NAME, current_timestamp())
    _df = _df.withColumn(SCD_END_NAME, to_timestamp(lit(SCD_DEFAULT_END_VALUE)))
    _df = _df.withColumn(SCD_HASHKEY_NAME, __sha1_concat(_hash_columns))
    _df.write.format("delta").option("overwriteSchema", "true").partitionBy(SCD_FLAG_NAME).saveAsTable(_uuid_value,
                                                                                                       mode="overwrite",
                                                                                                       path=path)
    # Drop the table from the metastore - the data files remain, since passing path= makes the table EXTERNAL
    sql("OPTIMIZE {}".format(_uuid_value))
    sql("VACUUM {}".format(_uuid_value))
    sql("DROP TABLE IF EXISTS {}".format(_uuid_value))
Code example #8
def diff_forex(rdd):
    if rdd.isEmpty():
        print("Forex RDD is empty")
    else:
        df = rdd.toDF()
        df = df.na.drop()
        df = df.selectExpr("_1 as time", "_2 as code", "_3 as bid_price",
                           "_4 ask_price")
        df = df.withColumn("mid_price",
                           (df["bid_price"] + df["ask_price"]) / 2)
        df = df.withColumn("bid_ask_spread",
                           (df["ask_price"] - df["bid_price"]))
        df = df.withColumn(
            "lagged_mid_price",
            func.lag(df["mid_price"]).over(
                Window.partitionBy("code").orderBy("time")))
        df = df.withColumn("percent_change",
                           ((df["mid_price"] - df["lagged_mid_price"]) /
                            df["lagged_mid_price"]) * 100)
        df = df.withColumn("processing_time", func.current_timestamp())

        df = df.na.drop()
        df = df.select([
            "processing_time", "code", "bid_price", "ask_price", "mid_price",
            "bid_ask_spread", "lagged_mid_price", "percent_change"
        ])

        addToDB(df, "all_differenced")

        detect_anomaly(df)
Code example #9
def kafka_wrapper(
    kafka: KafkaStruct,
    process: Callable[[List[DataFrame]], DataFrame],
    inputs: List[InputStruct],
    spark: SparkSession,
) -> DataFrame:
    """
    Read data from kafka
    ...

    Attributes
    ----------
    kafka: kafka parameters
    process: function to apply to dataframes
    inputs: for each topic, input parameters
    spark: the instantiated sparksession
    """
    confluent_config = get_confluent_config(kafka.brokers, prefix="kafka.")

    dfs = [
        spark.readStream.format("kafka").option("startingOffsets",
                                                "earliest").option(
                                                    "failOnDataLoss", "false").
        option("subscribe", input.topic).options(**confluent_config).option(
            "kafka.sasl.jaas.config",
            "org.apache.kafka.common.security.plain.PlainLoginModule required username='******' password='******';"
            .format(kafka.confluent_api_key, kafka.confluent_secret),
        ).load().selectExpr("CAST(value AS STRING) as json").select(
            F.from_json(
                F.col("json"),
                schema=map_avro_to_spark_schema(
                    input.topic_schema)).alias("data")).select("data.*")
        for input in inputs
    ]
    return process(dfs).withColumn("topic_timestamp", F.current_timestamp())
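kafka_wrapper returns an unstarted streaming DataFrame, so a sink is still needed; a minimal sketch (the kafka/process/inputs objects come from the surrounding project, and the sink format and paths are assumptions):

# Hypothetical: start the stream produced by kafka_wrapper and persist it.
result = kafka_wrapper(kafka, process, inputs, spark)
query = (result.writeStream
         .outputMode("append")
         .format("parquet")                                 # assumed sink format
         .option("path", "/tmp/kafka_output")               # assumed output path
         .option("checkpointLocation", "/tmp/kafka_ckpt")   # assumed checkpoint path
         .start())
query.awaitTermination()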
Code example #10
File: tools.py Project: JosemyDuarte/twitterJudge
def usuarios_features(df, categoria=-1.0):
    logger.info("Calculando features para usuarios...")

    resultado = (df.select(
        df["user.id"].alias("user_id"),
        nullToInt("user.profile_use_background_image").alias(
            "con_imagen_fondo"),
        u_parse_time("user.created_at").cast('timestamp').alias(
            "cuenta_creada"), df["user.favourites_count"].alias("n_favoritos"),
        nullToInt("user.description").alias("con_descripcion"),
        F.length("user.description").alias("longitud_descripcion"),
        nullToInt("user.verified").alias("con_perfil_verificado"),
        nullToInt("user.default_profile_image").alias("con_imagen_default"),
        df["user.listed_count"].alias("n_listas"),
        nullToInt("user.geo_enabled").alias("con_geo_activo"),
        reputacion("user.followers_count",
                   "user.friends_count").alias("reputacion"),
        df["user.statuses_count"].alias("n_tweets"),
        followersRatio("user.followers_count",
                       "user.friends_count").alias("followers_ratio"),
        df["user.screen_name"].alias("nombre_usuario"),
        entropia("lista_intertweet").alias("entropia")).withColumn(
            "ano_registro", F.year("cuenta_creada")).withColumn(
                "categoria",
                F.lit(categoria)).withColumn("createdAt",
                                             F.current_timestamp()))

    return resultado
Code example #11
def save_to_stage(rdd):
    """
    This method handles the kafka messages - we simply want to save them to the staging index for processing.
    """
    # If we get an empty message, do nothing (should not happen!)
    if rdd.isEmpty():
        return

    esconf = {}
    esconf["es.mapping.id"] = 'message_id'
    esconf["es.index.auto.create"] = "true"
    esconf["es.nodes"] = ip
    esconf["es.port"] = port
    esconf["es.nodes.wan.only"] = "true"
    esconf["es.write.operation"] = "index"
    sqlContext = SQLContext.getOrCreate(SparkContext.getOrCreate())
    df = sqlContext.createDataFrame(rdd, samplingRatio=1).toDF(
        "topic", "key", "value")
    # Add Identifier, Received Timestamp, and Boolean Flag to indicate not processed
    df = df.drop(df.key)
    df = df.withColumn('message_id', f.md5(df.value))
    df = df.withColumn("message_recieved_ts", f.lit(f.current_timestamp()))
    df = df.withColumn("message_processed", f.lit('false'))
    df.write.format("org.elasticsearch.spark.sql").options(
        **esconf).mode("append").save(resource)
Code example #12
def recommend(num, user_id, spark, ratings_model):
    user_df = spark.createDataFrame([user_id], types.LongType())
    user_df = user_df.select(user_df['value'].alias('user_id'))
    rec_df_raw = ratings_model.recommendForUserSubset(
        user_df, num).select('recommendations')
    rec_rdd = rec_df_raw.rdd\
        .flatMap(lambda x: x['recommendations'])\
        .map(lambda x: (x['business_id'], x['rating']))\
        .map(lambda x: Row(business_id=x[0], rating=x[1]))
    if rec_rdd.isEmpty():
        return []
    rec_df = spark.createDataFrame(rec_rdd)\
        .withColumn('user_id', functions.lit(user_id))\
        .withColumn('timestamp', functions.current_timestamp())
    try:
        rec_df.write.format('jdbc').options(
            url='jdbc:mysql://localhost/YelpRecommender',
            driver='com.mysql.jdbc.Driver',
            dbtable='Recommend',
            user='******',
            password='******').mode('append').save()
    except Exception as e:
        print('recommend() function in use_model.py\n', str(e))
    # rec_df.show()
    l = list(
        rec_df.select('business_id').rdd.map(lambda x:
                                             (x['business_id'])).collect())
    return l
Code example #13
    def add_meta_data_and_primary_key(self,
                                      data_frame: DataFrame) -> DataFrame:
        """
        Add standardized meta data and primary key columns
        """
        config = self.get_config()
        key_columns = config['key_columns']
        primary_key = config['target_name'] + '_id'
        app_name = config['app_name']

        non_key_columns = [
            i for i in data_frame.columns if i not in key_columns
        ]

        df = (data_frame.withColumn(
            primary_key, F.expr(hash_columns(key_columns))).withColumn(
                "iptmeta_process_name", F.lit(app_name)).withColumn(
                    "iptmeta_mod_dttm", F.current_timestamp()).withColumn(
                        "iptmeta_diff_md5",
                        F.expr(hash_columns(non_key_columns))))

        # Prepare column order for output
        column_list = [primary_key] + key_columns
        column_list.append('iptmeta_process_name')
        column_list.append('iptmeta_mod_dttm')
        column_list.append('iptmeta_diff_md5')
        column_list = column_list + non_key_columns

        df = df.select(*column_list)

        return df
Code example #14
 def on_data(self, data):
     """"
     Método que irá receber o dados da API e, após decorrido o tempo
     passado no parâmetro 'persist_time', irá salvá-los em uma temp view.
     """
     try:
         tweet = Tweet()
         json_data = json.loads(data)
         if "limit" not in json_data:
             tweet.insert(json_data)
             self.listTweets.append(tweet.get_list())
             
             print(f"Tweets:{len(self.listTweets)}, Time:{(time.time() - self.start)}")
             if (time.time() - self.start) > self.persist_time:
                 try:
                     df = sqlContext.createDataFrame(data=self.listTweets, schema=self.schema)
                     df = df.withColumn("etl_load", F.current_timestamp())
                     df = df.withColumn("etl_load_partition_year", F.date_format("etl_load", "yyyy"))
                     df = df.withColumn("etl_load_partition_month", F.date_format("etl_load", "MM"))
                     df = df.withColumn("etl_load_partition_day", F.date_format("etl_load", "dd"))
                     df = df.withColumn("etl_load_partition_hour", F.date_format("etl_load", "HH"))
                     df.createOrReplaceTempView("tweets")
                 except BaseException as e:
                     print("Erro ao contruir 'df': " + str(e))
                 return False
     except BaseException as e:
         print("Error: " + str(e), "JSON fora do esperado:", json_data)
     return True
Code example #15
File: main.py Project: nex3z/spark-exercise
def main():
    spark = SparkSession.builder \
        .master('local') \
        .appName('nyc-taxi') \
        .config('spark.executor.memory', '1gb') \
        .getOrCreate()
    sc: SparkContext = spark.sparkContext
    sc.setLogLevel('WARN')
    logger.info("app_id = {}".format(sc.applicationId))

    df_line = spark.readStream.format('kafka') \
        .option('kafka.bootstrap.servers', 'localhost:9092') \
        .option('subscribe', 'word-count') \
        .option('startingOffsets', 'latest') \
        .load() \
        .selectExpr('CAST(value AS STRING)')

    df_word = df_line \
        .select(funs.explode(funs.split(df_line.value, " ")).alias("word"))

    df_word = df_word \
        .withColumn('word', funs.regexp_replace('word', '[^a-zA-Z0-9]', '')) \
        .filter(df_word['word'] != '') \
        .selectExpr('LOWER(word) AS word') \
        .withColumn('process_time', funs.current_timestamp())

    df_grouped = df_word.groupBy(
        funs.window('process_time', '20 seconds', '10 seconds'),
        'word').count()

    write_stream(df_grouped)
Code example #16
def gen_test_data(spark: SparkSession, verbose: int = 1) -> DataFrame:
    """
    """
    # Create a Spark data frame
    schema = T.StructType([
        T.StructField("date", T.StringType(), True),
        T.StructField("user_id", T.IntegerType(), True),
        T.StructField("user_name", T.StringType(), True),
        T.StructField("total_orders", T.IntegerType(), True),
        T.StructField("total_amount", T.FloatType(), True),
    ])
    data = [
        ("2020-01-01", 1, "AA", 111, 111.11),
        ("2020-01-01", 2, "BB", 222, 222.22),
        ("2020-04-04", 1, "AA", 444, 444.44),
        ("2020-04-01", 3, "CC", 333, 333.33),
    ]
    data = spark.createDataFrame(data, schema=schema)

    proc = RateProcessor()
    proc_udf = F.udf(proc.run,
                     T.FloatType())  # Convert a normal Python function into a Spark UDF
    data = data.withColumn("rate", proc_udf("total_orders", "total_amount"))
    logger.info("Successfully added 'rate' column")

    data = data.withColumn(
        "updated_at",
        F.current_timestamp().cast("string"))  # Added updated_at column
    logger.info("Successfully created the test data in Spark\n%s\n" %
                (data.toPandas().to_string(line_width=120)))
    return data
Code example #17
    def run(self):
        print('----> Started Metadata Job')
        
        location = super().getDataFromMySQL('amrs', 'location', {
            'partitionColumn': 'location_id', 
            'fetchsize': 100,
            'lowerBound': 1,
            'upperBound': 500,
            'numPartitions': 1}).\
            select('name', 'uuid').\
            withColumn('_id', f.col('uuid')).\
            withColumn('build_date', f.current_timestamp()).\
            withColumn('type', f.lit('location'))
        
        program = super().getDataFromMySQL('amrs', 'program', {
            'partitionColumn': 'program_id', 
            'fetchsize': 100,
            'lowerBound': 1,
            'upperBound': 500,
            'numPartitions': 1}).\
            select('name', 'uuid', 'concept_id').\
            withColumn('_id', f.col('uuid')).\
            withColumn('build_date', f.current_timestamp()).\
            withColumn('type', f.lit('program'))
        
        concept = super().getDataFromMySQL('amrs', 'concept', {
            'partitionColumn': 'concept_id', 
            'fetchsize': 100,
            'lowerBound': 1,
            'upperBound': 500,
            'numPartitions': 1}).\
            select('concept_id', 'uuid').\
            withColumnRenamed('uuid', 'concept_uuid')
        
        program_object = program.join(concept, on="concept_id")\
                                .withColumn('concept', f.struct(f.col('concept_uuid').alias('uuid')))\
                                .drop('concept_uuid', 'concept_id')

        metadata_couch = Job.getSpark().read.format("org.apache.bahir.cloudant").load('openmrs-metadata').select('_id', '_rev')
        
        
        replace_program = program_object.join(metadata_couch, on='_id', how='left')
        replace_location = location.join(metadata_couch, on='_id', how='left')
        
        super().saveToCouchDB(replace_program, 'openmrs-metadata')
        super().saveToCouchDB(replace_location, 'openmrs-metadata')
Code example #18
def process_batch(batch: DataFrame, epoch_id: int):
    message: DataFrame = batch \
        .agg(concat_ws(", ", collect_list("value")).alias("tweets")) \
        .withColumn("content", regexp_replace("tweets", "http\S+|www.\S+|#|RT:", "")) \
        .select("content") \
        .withColumn("timestamp", current_timestamp()) \
        .withColumn("total_case_count", total_cases())
    save(message)
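process_batch has the (DataFrame, epoch_id) signature that foreachBatch expects; a minimal sketch of that wiring (the socket source and checkpoint path are assumptions, and total_cases/save come from the surrounding project):

# Hypothetical: attach process_batch to a streaming source via foreachBatch.
tweets_stream = (spark.readStream
                 .format("socket")                           # assumed source for illustration
                 .option("host", "localhost")
                 .option("port", 9999)
                 .load())
query = (tweets_stream.writeStream
         .foreachBatch(process_batch)
         .option("checkpointLocation", "/tmp/tweets_ckpt")   # assumed checkpoint path
         .start())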
Code example #19
    def test_timeformat_nodate_dateincolumns(self):
        format = "HH:mm:ss"
        now = str(datetime.datetime.now())[11:19]

        # test wrong type of column
        data = pd.DataFrame()
        times = [i for i in range(100)]
        data["c1"] = times
        df = self.spark.createDataFrame(data)
        with self.assertRaises(SystemExit) as cm:
            r1 = freshness(["c1"], timeFormat=format, df=df)

        # test correct type
        data = pd.DataFrame()
        times = [now for _ in range(100)]
        data["c1"] = times
        df = self.spark.createDataFrame(data)
        df = df.withColumn("c2", current_timestamp())
        df = df.withColumn("c3", current_timestamp())

        r1, r2, r3 = freshness(["c1", "c2", "c3"], timeFormat=format, df=df)
        r1 = float(r1.split(" ")[0])
        r2 = float(r2.split(" ")[0])
        r3 = float(r3.split(" ")[0])
        self.assertLessEqual(r1, 10.0)
        self.assertLessEqual(r2, 10.0)
        self.assertLessEqual(r3, 10.0)

        data = pd.DataFrame()
        times = [now for _ in range(100)]
        for i in range(20):
            times[i] = ""
        data["c1"] = times
        df = self.spark.createDataFrame(data)
        df = df.withColumn("c1", replace_empty_with_null(df["c1"]))
        df = df.withColumn("c2", current_timestamp())
        df = df.withColumn("c3", current_timestamp())

        r1, r2, r3 = freshness(["c1", "c2", "c3"], timeFormat=format, df=df)
        r1 = float(r1.split(" ")[0])
        r2 = float(r2.split(" ")[0])
        r3 = float(r3.split(" ")[0])
        self.assertLessEqual(r1, 10.0)
        self.assertLessEqual(r2, 10.0)
        self.assertLessEqual(r3, 10.0)
Code example #20
def execute_process(options):

    spark = pyspark.sql.session.SparkSession \
            .builder \
            .appName("criar_tabela_distribuicao") \
            .enableHiveSupport() \
            .getOrCreate()

    schema_exadata_aux = options['schema_exadata_aux']
    table_name = options['table_name']

    date_now = datetime.now()
    data_atual = date_now.strftime("%Y-%m-%d")

    qtd_acervo = spark.sql("""
        select A.cod_orgao, A.cod_atribuicao as cod_atribuicao, SUM(A.acervo) as acervo
        from {0}.tb_acervo A
        inner join {0}.tb_regra_negocio_investigacao B
        on A.cod_atribuicao = B.cod_atribuicao AND A.tipo_acervo = B.classe_documento
        where A.dt_inclusao = '{1}'
        group by A.cod_orgao, A.cod_atribuicao
        """.format(schema_exadata_aux, data_atual))
    qtd_acervo.registerTempTable('qtd_acervo_table')

    estatisticas = spark.sql("""
        select cod_orgao, acervo, dist.*
        from qtd_acervo_table
        inner join (
            select cod_atribuicao,
            min(acervo) as minimo,
            max(acervo) as maximo,
            avg(acervo) as media,
            percentile(acervo, 0.25) as primeiro_quartil,
            percentile(acervo, 0.5) as mediana,
            percentile(acervo, 0.75) as terceiro_quartil,
            percentile(acervo, 0.75) - percentile(acervo, 0.25) as IQR,
            percentile(acervo, 0.25)
                - 1.5*(percentile(acervo, 0.75) - percentile(acervo, 0.25)) as Lout,
            percentile(acervo, 0.75)
                + 1.5*(percentile(acervo, 0.75) - percentile(acervo, 0.25)) as Hout
            from qtd_acervo_table t 
            group by cod_atribuicao) dist ON dist.cod_atribuicao = qtd_acervo_table.cod_atribuicao
        """).withColumn(
        "dt_inclusao",
        from_unixtime(
            unix_timestamp(current_timestamp(), 'yyyy-MM-dd HH:mm:ss'),
            'yyyy-MM-dd HH:mm:ss').cast('timestamp'))

    table_name = "{}.{}".format(schema_exadata_aux, table_name)

    estatisticas.write.mode("overwrite").saveAsTable("temp_table_distribuicao")
    temp_table = spark.table("temp_table_distribuicao")

    temp_table.write.mode("overwrite").saveAsTable(table_name)
    spark.sql("drop table temp_table_distribuicao")

    execute_compute_stats(table_name)
Code example #21
def add_audit_cols(df, changedt):
    """ Adds audit columns to the dataframe
    """
    df = df.withColumn("operation", f.lit("I")) \
           .withColumn("processeddate", f.current_timestamp().cast("String")) \
           .withColumn("changedate", f.lit(changedt)) \
           .withColumn('changedate_year', f.year('changedate').cast("String")) \
           .withColumn('changedate_month', f.month('changedate').cast("String")) \
           .withColumn('changedate_day', f.dayofmonth('changedate').cast("String"))
    return df
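A possible call site for add_audit_cols (a sketch; the input path and change date are assumptions):

# Hypothetical usage: stamp a freshly loaded DataFrame with audit columns.
src_df = spark.read.parquet("/data/raw/customers")   # assumed input path
audited = add_audit_cols(src_df, changedt="2021-06-01")
audited.select("operation", "processeddate", "changedate_year", "changedate_month").show(3)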
Code example #22
File: Load_CRM_Data.py Project: pachiyas1357/CRMLoad
def transform_data(data_stg):
    data_stg = data_stg.dropna()
    #data_stg.fillna
    #check for zip code
    #check for email id
    #

    data_stg_trans = data_stg.withColumn("New Date", current_timestamp()).\
        withColumn("Customer UUID", monotonically_increasing_id())

    return data_stg_trans
Code example #23
 def extract_stats_step(self, x):
     return x.select(
         *(F.lit(str(v)).alias(k) for k, v in self.training_info().items()),
         F.current_timestamp().alias('trained_at'),
         functions.stats(
             F.to_timestamp('date_received')).alias('date_received_stats'),
         functions.stats('committed_at').alias('committed_at_stats'),
         F.collect_list(F.struct(
             'complaint_id',
             'committed_at',
         )).alias('records'))
Code example #24
def save_function(df, batch_id):
    df.persist()
    df.write.mode("append").format("parquet").save(TWEETS_OUTPUT)
    agg_df = df.agg(F.min("created_at_unix").alias("created_at_unix"), F.count("text").alias("count"))
    agg_df.persist()
    agg_df = agg_df.withColumn("timestamp", F.current_timestamp())
    agg_df = agg_df.withColumn("latency", (F.unix_timestamp("timestamp") - F.col("created_at_unix").alias("latency")))
    agg_df = agg_df.withColumn("batch_id", F.lit(batch_id))
    agg_df.write.mode("append").format("parquet").save(BATCH_DIAGNOSTIC_OUTPUT)
    agg_df.unpersist()
    df.unpersist()
Code example #25
    def save_vector(self, model, index):
        def convert_vector(x):
            '''Convert a list or numpy array to delimited token filter format'''
            return " ".join(["%s|%s" % (i, v) for i, v in enumerate(x)])

        def vector_to_struct(x, version, ts):
            '''
            Convert a vector to a SparkSQL Struct with string-format vector
            and version fields
            '''
            return (convert_vector(x), version, ts)

        vector_struct = udf(
            vector_to_struct,
            StructType([
                StructField("factor", StringType(), True),
                StructField("version", StringType(), True),
                StructField("timestamp", LongType(), True)
            ]))

        start = time()
        ver = model.uid
        ts = unix_timestamp(current_timestamp())

        item_vectors = model.itemFactors.\
            select("id", vector_struct("features", lit(ver), ts).alias("@model"))
        user_vectors = model.userFactors.\
            select("id", vector_struct("features", lit(ver), ts).alias("@model"))

        # write data to ES, use:
        # - "id" as the column to map to ES movie id
        # - "update" write mode for ES, since you want to update new fields only
        # - "append" write mode for Spark
        item_vectors.write.format('es').\
            option('es.mapping.id', 'id').\
            option('es.write.operation', 'update').\
            save(index + '/' + 'movies', mode='append')

        # write data to ES, use:
        # - "id" as the column to map to ES movie id
        # - "index" write mode for ES, since you have not written to the user index previously
        # - "append" write mode for Spark
        user_vectors.write.format('es').\
            option('es.mapping.id', 'id').\
            option('es.write.operation', 'index').\
            save(index + '/' + 'users', mode='append')

        dur = time() - start
        print(
            'Save trained feature vectors into Elasticsearch in %.3f seconds.'
            % dur)

        return dur
Code example #26
    def test_timeformat_withdate(self):
        format = "yyyy-MM-dd HH:mm:ss"
        time = str(datetime.datetime.now())[11:19]
        time = "1970-01-01 " + time

        # test wrong type of column
        data = pd.DataFrame()
        times = [i for i in range(100)]
        data["c1"] = times
        df = self.spark.createDataFrame(data)
        with self.assertRaises(SystemExit) as cm:
            r1 = freshness(["c1"], timeFormat=format, df=df)

        # test correct type
        data = pd.DataFrame()
        times = [time for _ in range(100)]
        data["c1"] = times
        df = self.spark.createDataFrame(data)
        df = df.withColumn("c2", to_timestamp(df["c1"], format))
        df = df.withColumn("c3", to_timestamp(df["c1"], format))
        df = df.withColumn(
            "c4",
            current_timestamp().cast("long") -
            to_timestamp(lit(time), format).cast("long"))
        # seconds from 1970 plus 10 seconds for computation time
        seconds = df.collect()[0][3] + 10

        r1, r2, r3 = freshness(["c1", "c2", "c3"], timeFormat=format, df=df)
        r1 = float(r1.split(" ")[0])
        r2 = float(r2.split(" ")[0])
        r3 = float(r3.split(" ")[0])
        self.assertLessEqual(r1, seconds)
        self.assertLessEqual(r2, seconds)
        self.assertLessEqual(r3, seconds)

        data = pd.DataFrame()
        times = [time for _ in range(100)]
        for i in range(20):
            times[i] = ""
        data["c1"] = times
        df = self.spark.createDataFrame(data)
        df = df.withColumn("c1", replace_empty_with_null(df["c1"]))
        df = df.withColumn("c2", to_timestamp(df["c1"], format))
        df = df.withColumn("c3", to_timestamp(df["c1"], format))

        r1, r2, r3 = freshness(["c1", "c2", "c3"], timeFormat=format, df=df)
        r1 = float(r1.split(" ")[0])
        r2 = float(r2.split(" ")[0])
        r3 = float(r3.split(" ")[0])
        self.assertLessEqual(r1, seconds)
        self.assertLessEqual(r2, seconds)
        self.assertLessEqual(r3, seconds)
Code example #27
    def add_audit_columns(self, df, processing_dt):
        try:
            df = (
                df.withColumn("date_uploaded", F.lit(datetime.strftime(processing_dt, "%Y-%m-%d"))) \
                    .withColumn("error_desc", F.lit(None).cast(ArrayType(StringType()))) \
                    .withColumn("ingestion_dttm", F.current_timestamp())
            )
        except BaseException as ex:
            self.logger.error("Failed to add audit table because of error: %s",
                              str(ex))
            sys.exit(-1)

        return df
Code example #28
def main(spark: SparkSession):

    df_acct_trsn_itm_etry_typ = spark.sparkContext.parallelize([ \
     Row(P_SRC_SYS_CD='1051041', ACCT_TRSN_ITM_ETRY_TYP_CD='S', ACCT_TRSN_ITM_ETRY_TYP_NM='Debit', ACCT_TRSN_ITM_ETRY_TYP_DN='', RADAR_UPD_BY_PRS_ID='ic_fi1_1051041_acct_trsn_itm_etry_typ', RADAR_DLT_IND='N'), \
     Row(P_SRC_SYS_CD='1051041', ACCT_TRSN_ITM_ETRY_TYP_CD='H', ACCT_TRSN_ITM_ETRY_TYP_NM='Credit', ACCT_TRSN_ITM_ETRY_TYP_DN='', RADAR_UPD_BY_PRS_ID='ic_fi1_1051041_acct_trsn_itm_etry_typ', RADAR_DLT_IND='N'), \
     Row(P_SRC_SYS_CD='1051041', ACCT_TRSN_ITM_ETRY_TYP_CD='DR', ACCT_TRSN_ITM_ETRY_TYP_NM='Debit', ACCT_TRSN_ITM_ETRY_TYP_DN='', RADAR_UPD_BY_PRS_ID='ic_fi1_1051041_acct_trsn_itm_etry_typ', RADAR_DLT_IND='N'), \
     Row(P_SRC_SYS_CD='1051041', ACCT_TRSN_ITM_ETRY_TYP_CD='CR', ACCT_TRSN_ITM_ETRY_TYP_NM='Credit', ACCT_TRSN_ITM_ETRY_TYP_DN='', RADAR_UPD_BY_PRS_ID='ic_fi1_1051041_acct_trsn_itm_etry_typ', RADAR_DLT_IND='N')]).toDF()

    df_acct_trsn_itm_etry_typ.select(
        'P_SRC_SYS_CD', 'ACCT_TRSN_ITM_ETRY_TYP_CD',
        'ACCT_TRSN_ITM_ETRY_TYP_NM',
        'ACCT_TRSN_ITM_ETRY_TYP_DN', 'RADAR_UPD_BY_PRS_ID', 'RADAR_DLT_IND',
        func.current_timestamp()).write.mode('overwrite').insertInto(
            'radar.acct_trsn_itm_etry_typ', overwrite=True)
Code example #29
File: updated_Code.py Project: mpooja123/scala-
    def transform(self):
        self.df_order_table = self.df_order_table.withColumn(
            'order_date', to_date(col('order_datetime'), 'yyyy-MM-dd'))
        self.df_order_table = self.df_order_table.withColumn(
            'order_month', func.month(col('order_datetime')))

        df_filter_cust = self.df_customer_table.where(col('age') > 18)
        ###inner join
        df_order_customer = self.df_order_table.join(
            df_filter_cust,
            on=(self.df_order_table['customer_id'] ==
                df_filter_cust['customer_id']),
            how='inner').select(df_filter_cust['customer_id'],
                                self.df_order_table['order_id'],
                                self.df_order_table['order_month'],
                                self.df_order_table['amount'])

        # total sales amount per month for each customer older than 18
        wind = Window.partitionBy('customer_id', 'order_month')

        df_order_customer = df_order_customer.withColumn(
            'total_sale',
            func.sum(col('amount')).over(wind))

        df_order_customer = df_order_customer.distinct()
        df_order_customer.show()

        ### list the customer_id and second order_id of customers who placed more than 2 orders in the last 20 days
        ########################
        wind = Window.partitionBy('customer_id', 'order_date').orderBy(
            func.col('order_id').asc())
        df_temp = self.df_order_table.withColumn('row', func.row_number().over(wind))

        df_temp = df_temp.withColumn(
            'current_date', to_date(func.current_timestamp(), 'yyyy-MM-dd'))

        df_temp = df_temp.withColumn(
            'diff_days', func.datediff('current_date', 'order_date'))

        df_temp = df_temp.withColumn(
            "diff",
            when((col('diff_days') <= lit(20)), lit(1)).otherwise(0))
        df_temp = df_temp.where(col('diff') == 1)
        wind = Window.partitionBy('customer_id')
        df_temp = df_temp.withColumn('count',
                                     func.count('order_id').over(wind))
        df_temp = df_temp.where((col('count') > 2) & (col('row') == 2))

        df_temp.show()
    def verify_checks_on_datasets(self, tables, sections):

        checker = Parser.get(sections, 'checks').split('|')
        check = Check(spark, CheckLevel.Error, Parser.get(sections,'check_name'))
        checker.insert(0, "check")
        checks = ".".join(checker)

        Verifying_Checks = (VerificationSuite(spark)
                            .onData(tables)
                            .addCheck(eval(checks))
                            .run())
        Check_Reports = VerificationResult.checkResultsAsDataFrame(spark, Verifying_Checks)
        Check_Reports_Dataframe = (Check_Reports.withColumn('dataset_name', lit(sections))
                                   .withColumn('check_run_tsp', current_timestamp()))
        return Check_Reports_Dataframe
from pyspark.sql.functions import expr, locate

simpleColors = ["black", "white", "red", "green", "blue"]  # colors to flag in the Description column

def color_locator(column, color_string):
  return locate(color_string.upper(), column)\
          .cast("boolean")\
          .alias("is_" + color_string)
selectedColumns = [color_locator(df.Description, c) for c in simpleColors]
selectedColumns.append(expr("*")) # has to be a Column type

df.select(*selectedColumns).where(expr("is_white OR is_red"))\
  .select("Description").show(3, False)


# COMMAND ----------

from pyspark.sql.functions import current_date, current_timestamp
dateDF = spark.range(10)\
  .withColumn("today", current_date())\
  .withColumn("now", current_timestamp())
dateDF.createOrReplaceTempView("dateTable")


# COMMAND ----------

from pyspark.sql.functions import date_add, date_sub
dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1)


# COMMAND ----------

from pyspark.sql.functions import datediff, months_between, to_date
dateDF.withColumn("week_ago", date_sub(col("today"), 7))\
  .select(datediff(col("week_ago"), col("today"))).show(1)
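
# COMMAND ----------

months_between and to_date are imported above but not exercised; a small complementary example (the two dates are arbitrary literals):

from pyspark.sql.functions import lit
dateDF.select(
    to_date(lit("2016-01-01")).alias("start"),
    to_date(lit("2017-05-22")).alias("end"))\
  .select(months_between(col("start"), col("end"))).show(1)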