def process_country_codes():
    input_data_file = os.path.join(s3, I94_CODES_DATA_PATH + COUNTRY_FILE)
    df_country = spark.read.format("csv").option("delimiter", "=").option(
        "header", "False").load(input_data_file)
    df_country = df_country.withColumnRenamed(
        "_c0", "country_code").withColumnRenamed("_c1", "country_name")
    df_country = df_country.withColumn(
        "country_name", F.regexp_replace(df_country.country_name, "'", ""))
    df_country = df_country.withColumn(
        "country_name", F.ltrim(F.rtrim(df_country.country_name)))
    df_country = df_country.withColumn(
        "country_code", F.ltrim(F.rtrim(df_country.country_code)))
    df_country = df_country.withColumn(
        "country_name",
        F.regexp_replace(df_country.country_name,
                         r"^INVALID.*|Collapsed.*|No Country.*", "INVALID"))
    df_country.write.mode("overwrite").parquet(s3 + 'data/processed/codes/country')
    return df_country
def remove_space(df, col_name, position):
    # remove left side space
    if position == "l":
        return df.withColumn("tmp", ltrim(f.col(col_name))).drop(col_name).withColumnRenamed("tmp", col_name)
    # remove right side space
    elif position == "r":
        return df.withColumn("tmp", rtrim(f.col(col_name))).drop(col_name).withColumnRenamed("tmp", col_name)
    # remove space on both sides
    elif position == "a":
        return df.withColumn("tmp", trim(f.col(col_name))).drop(col_name).withColumnRenamed("tmp", col_name)
def preprocess_data_table(
        df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:
    for col_name in df.columns:
        if isinstance(df.schema[col_name].dataType, StringType):
            df = df.withColumn(col_name, rtrim(col(col_name)))
        if isinstance(df.schema[col_name].dataType, TimestampType):
            df = df.withColumn(col_name, col(col_name).cast('string'))
    return df
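# A minimal, hedged sketch of how preprocess_data_table above could be exercised;
# the session, sample rows, and column names are assumptions for illustration only.
from datetime import datetime

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rtrim
from pyspark.sql.types import StringType, TimestampType

spark = SparkSession.builder.appName("preprocess-demo").getOrCreate()
demo_df = spark.createDataFrame(
    [("alice  ", datetime(2020, 1, 1, 12, 0, 0))],
    ["name", "created_at"])
# string columns lose trailing spaces, timestamp columns become strings
preprocess_data_table(demo_df).show(truncate=False)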
def process_state_codes():
    input_data_file = os.path.join(s3, I94_CODES_DATA_PATH + STATE_FILE)
    df_state = spark.read.format("csv").option("delimiter", "=").option(
        "header", "False").load(input_data_file)
    df_state = df_state.withColumnRenamed("_c0", "state_code").withColumnRenamed(
        "_c1", "state_name")
    df_state = df_state.withColumn(
        "state_code", F.regexp_replace(df_state.state_code, "[^A-Z]", ""))
    df_state = df_state.withColumn(
        "state_name", F.regexp_replace(df_state.state_name, "'", ""))
    df_state = df_state.withColumn("state_name",
                                   F.ltrim(F.rtrim(df_state.state_name)))
    df_state.write.mode("overwrite").parquet(s3 + 'data/processed/codes/us_state')
    return df_state
def canonicaltokens(df, inputColumn, outputColumn):
    """
    Turn the input column of strings into canonical form and return the dataframe
    with the result added as an output column of tokens.
    """
    newname = df.withColumn(
        "cleanname",
        f.regexp_replace(
            f.regexp_replace(f.rtrim(f.ltrim(f.col(inputColumn))),
                             r" (\w) (\w) ", "$1$2"),
            r"(\w) (\w) (\w)$", "$1$2$3"))
    newtokenizer = mlf.Tokenizer(inputCol="cleanname", outputCol="words")
    chtokenized = newtokenizer.transform(newname).drop("cleanname")
    stopwordremover = mlf.StopWordsRemover(inputCol="words", outputCol=outputColumn)
    canonicalname = stopwordremover.transform(chtokenized).drop("words")
    return canonicalname
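# Hedged usage sketch for canonicaltokens; the imports, aliases (f, mlf), and the
# sample name are assumptions for illustration, not part of the original code.
import pyspark.sql.functions as f
import pyspark.ml.feature as mlf
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
names = spark.createDataFrame([("  Acme Holdings of America Inc ",)], ["raw_name"])
# adds a "tokens" column with lowercased, stop-word-free tokens of the cleaned name
canonicaltokens(names, "raw_name", "tokens").show(truncate=False)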
def remove_space(df, col_name, position):
    if position not in ["l", "r", "a"]:
        raise ValueError("The position value must be l, r or a")
    # keep the original column order
    columns = df.columns
    # remove left side space
    if position == "l":
        return df.withColumn("tmp", ltrim(sql_fun.col(col_name))).drop(
            col_name).withColumnRenamed("tmp", col_name).select(*columns)
    # remove right side space
    elif position == "r":
        return df.withColumn("tmp", rtrim(sql_fun.col(col_name))).drop(
            col_name).withColumnRenamed("tmp", col_name).select(*columns)
    # remove space on both sides
    elif position == "a":
        return df.withColumn("tmp", trim(sql_fun.col(col_name))).drop(
            col_name).withColumnRenamed("tmp", col_name).select(*columns)
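# Minimal sketch of calling the validating remove_space above; sql_fun is assumed
# to alias pyspark.sql.functions, and the sample row is invented for illustration.
import pyspark.sql.functions as sql_fun
from pyspark.sql import SparkSession
from pyspark.sql.functions import ltrim, rtrim, trim

spark = SparkSession.builder.getOrCreate()
padded = spark.createDataFrame([(1, "  both sides  ")], ["id", "text"])
remove_space(padded, "text", "a").show()  # trims both sides, keeps column order
# remove_space(padded, "text", "x") would raise ValueError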
def parse_message(col, eol="\n"):
    """
    Generate the expressions that parse the email message into From, Subject, Body, etc.
    Args:
        col - sqlf.col() column object
        eol - end of line character to use when parsing the email
    Returns:
        List of pyspark.sql.functions expressions to be passed to select()
    """
    out_dict = [
        "Message-ID", "Date", "From", "To", "Subject", "Mime-Version",
        "Content-Type", "Content-Transfer-Encoding", "X-From", "X-To", "X-cc",
        "X-bcc", "X-Folder", "X-Origin", "X-FileName", eol,
    ]
    expr = []
    for i in range(0, len(out_dict) - 1):
        expr.append(
            sqlf.ltrim(
                sqlf.rtrim(sqlf.split(sqlf.split(col, out_dict[i] + ":")[1], eol)[0])
            ).alias(out_dict[i]))
    expr.append(sqlf.split(sqlf.split(col, "X-FileName:")[1], "nsf")[1].alias("Body"))
    return expr
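# Hedged example of applying parse_message to a raw Enron-style message column;
# the "raw" column name and the sample text are hypothetical stand-ins.
import pyspark.sql.functions as sqlf
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
raw_mail = (
    "Message-ID: <1.JavaMail.evans@thyme>\n"
    "Date: Mon, 14 May 2001 16:39:00 -0700\n"
    "From: a@example.com\nTo: b@example.com\nSubject: hello\n"
    "Mime-Version: 1.0\nContent-Type: text/plain\nContent-Transfer-Encoding: 7bit\n"
    "X-From: A\nX-To: B\nX-cc: \nX-bcc: \nX-Folder: inbox\nX-Origin: A\n"
    "X-FileName: a.nsf\n\nHi there."
)
emails = spark.createDataFrame([(raw_mail,)], ["raw"])
# one output column per header field, plus the Body text after the .nsf marker
emails.select(parse_message(sqlf.col("raw"))).show(truncate=False)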
def process_airport_codes():
    # transform airport codes
    input_data_file = os.path.join(s3, I94_CODES_DATA_PATH + AIRPORT_FILE)
    df_airport = spark.read.format("csv").option("delimiter", "=").option(
        "header", "False").load(input_data_file)
    df_airport = df_airport.withColumn(
        "_c0", F.regexp_replace(df_airport._c0, "'", "")).withColumn(
            "_c1", F.regexp_replace(df_airport._c1, "'", ""))
    split_col = F.split(df_airport._c1, ",")
    df_airport = df_airport.withColumn("city", split_col.getItem(0))
    df_airport = df_airport.withColumn("state_code", split_col.getItem(1))
    df_airport = df_airport.withColumnRenamed("_c0", "port_code")
    df_airport = df_airport.drop("_c1")
    df_airport = df_airport.withColumn(
        "port_code", F.regexp_replace(df_airport.port_code, "[^A-Z]", "")).withColumn(
            "city", F.ltrim(F.rtrim(df_airport.city))).withColumn(
                "state_code", F.regexp_replace(df_airport.state_code, "[^A-Z]", ""))
    df_state = process_state_codes()
    df_airport = df_airport.join(df_state, "state_code")
    df_airport.write.mode("overwrite").parquet(s3 + 'data/processed/codes/us_ports')
from pyspark.sql.functions import col, lit, monotonically_increasing_id

df.select(monotonically_increasing_id()).show(10)

# working with strings
# perform case conversions
from pyspark.sql.functions import initcap, lower, upper, ltrim, rtrim, trim, lpad, rpad

df.select(initcap(col("Description"))).show(5)
df.select(col("Description"), initcap(col("Description")),
          lower(col("Description")), upper(col("Description"))).show(5)

string_with_space = " hello "
df.select(ltrim(lit(string_with_space)), rtrim(lit(string_with_space)),
          trim(lit(string_with_space))).show()

# regular expressions

# working with dates, timestamps
from pyspark.sql.functions import current_date, current_timestamp, date_add, date_sub, datediff, months_between, to_date, to_timestamp

dateDF = spark.range(10).withColumn("today", current_date()).withColumn(
    "now", current_timestamp())
dateDF.show()
dateDF.select(
    date_add(col("today"), 5).alias("today+5"),
    date_sub(col("today"), 5).alias("today-5")).show()
YEAR_PATTERN = r".*\(\d{4}\)$"

spark = SparkSession.builder.appName("hw9").getOrCreate()
keyspace = sys.argv[1]

df = (spark.read.format("csv").options(
    header=True, inferSchema=True, sep=",").load("hdfs:///data/movielens/movies.csv"))
df = df.withColumn("title", regexp_replace(col("title"), "\xa0", ""))
df = df.withColumn("title", regexp_replace(col("title"), r"\(\D+\)$", ""))
df = df.withColumn("title", regexp_replace(col("title"), '"+', '"'))
df = df.withColumn("title", regexp_replace(col("title"), r"\)+", ")"))
df = df.withColumn("title", regexp_replace(col("title"), '^"', ""))
df = df.withColumn("title", regexp_replace(col("title"), '"$', ""))
df = df.withColumn("title", rtrim(col("title"))).filter(
    col("title").rlike(YEAR_PATTERN))
df = df.withColumn("year", substring(col("title"), -5, 4).cast(IntegerType()))
df = df.withColumn("title", expr("substring(title, 1, length(title)-6)"))
df = df.filter(rtrim(col("title")).rlike(".+"))
df = df.filter(col("title").isNotNull())
df = df.filter(col("genres") != "(no genres listed)")
df = df.withColumn("genres", split(df["genres"], r"\|"))
df = df.select(
    col("movieId").alias("movieid"), col("title"), col("year"), col("genres"))
(df.write.format("org.apache.spark.sql.cassandra").options(
    table="movies_with_genre_index", keyspace=keyspace).mode("append").save())
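# Small hedged check of the title/year cleanup above on an in-memory sample;
# the sample rows are invented and the Cassandra write is not exercised here.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, substring
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame(
    [(1, "Toy Story (1995)", "Adventure|Comedy"), (2, "Unreleased (TBD)", "Drama")],
    ["movieId", "title", "genres"])
sample = sample.filter(col("title").rlike(YEAR_PATTERN))  # keeps only "... (yyyy)" titles
sample = sample.withColumn("year", substring(col("title"), -5, 4).cast(IntegerType()))
sample.show(truncate=False)  # only the Toy Story row survives, with year = 1995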
def calculate_score(self):
    # transform and filter data
    mentions_df = self.spark.read.parquet(self.mentions_path).select(
        'GLOBALEVENTID', 'MentionTimeDate', 'MentionIdentifier', 'Confidence')
    gkg_df = self.spark.read.parquet(self.gkg_path).select(
        'DocumentIdentifier', 'Date', 'V2Tone')
    # filter rows on mention date in 2019
    mentions_df = mentions_df.filter(mentions_df.MentionTimeDate.like('2019%'))
    gkg_df = gkg_df.filter(gkg_df.Date.like('2019%'))
    gkg_df = gkg_df.drop('Date')
    # type casting for mentions and gkg df
    mentions_df = mentions_df.withColumn('GLOBALEVENTID', mentions_df.GLOBALEVENTID.cast('INT'))
    mentions_df = mentions_df.withColumn('Confidence', mentions_df.Confidence.cast('INT'))
    mentions_df = mentions_df.withColumn(
        'mDate',
        F.to_date(mentions_df.MentionTimeDate, format='yyyyMMddHHmmss')).drop('MentionTimeDate')
    mentions_df.printSchema()
    print(mentions_df.first())
    gkg_df = gkg_df.withColumn(
        'Tone', F.split(gkg_df.V2Tone, ',')[0].cast('FLOAT')).drop('V2Tone')
    gkg_df.printSchema()
    print(gkg_df.first())
    # register the DataFrames as SQL temporary views
    mentions_df.createOrReplaceTempView('mentions_table')
    gkg_df.createOrReplaceTempView('gkg_table')
    # join mentions and gkg tables to calculate per-event sentiment
    temp_df = self.spark.sql(
        'SELECT GLOBALEVENTID, mDate, avg(Confidence*0.01*Tone) as sentiment, count(*) as numOfMentions \
        FROM mentions_table inner join gkg_table on mentions_table.MentionIdentifier = gkg_table.DocumentIdentifier \
        GROUP BY GLOBALEVENTID, mDate')
    temp_df.explain()
    temp_df.printSchema()
    print(temp_df.first())
    temp_df.createOrReplaceTempView('temp_table')
    # clear cache of mentions and gkg df & table, read in event data
    self.spark.catalog.dropTempView('mentions_table')
    self.spark.catalog.dropTempView('gkg_table')
    mentions_df.unpersist()
    gkg_df.unpersist()
    # load event data and perform join and aggregation
    event_df = self.spark.read.parquet(self.event_path).select(
        'GLOBALEVENTID', 'GoldsteinScale', 'ActionGeo_FullName')
    event_df = event_df.withColumn('GLOBALEVENTID', event_df.GLOBALEVENTID.cast('INT'))
    event_df = event_df.withColumn('GoldsteinScale', event_df.GoldsteinScale.cast('FLOAT'))
    event_df = event_df.withColumn(
        'country', F.rtrim(F.ltrim(F.split(event_df.ActionGeo_FullName, ',')[2])))
    event_df = event_df.withColumn(
        'state', F.rtrim(F.ltrim(F.split(event_df.ActionGeo_FullName, ',')[1])))
    event_df = event_df.withColumn(
        'city',
        F.rtrim(F.ltrim(F.split(event_df.ActionGeo_FullName, ',')[0]))).drop('ActionGeo_FullName')
    event_df.printSchema()
    print(event_df.first())
    event_df.createOrReplaceTempView('event_table')
    # compute final safety score
    result_df = self.spark.sql(
        'SELECT event_table.GLOBALEVENTID, mDate, 0.5*(GoldsteinScale*10+temp_table.sentiment) as SafetyScore, numOfMentions, \
        country, state, city \
        FROM event_table inner join temp_table on event_table.GLOBALEVENTID = temp_table.GLOBALEVENTID')
    result_df.explain()
    result_df.printSchema()
    print(result_df.first())
    # free up memory and disk
    self.spark.catalog.dropTempView('temp_table')
    self.spark.catalog.dropTempView('event_table')
    temp_df.unpersist()
    event_df.unpersist()
    return result_df
if(i != " "): reformat_list = [rid, i] final_result.append(reformat_list) return final_result #Preprocess the rdd from stage 3 for both positive and negative music_p_small_preprocess = music_p_small_rdd.mapPartitions(review_encode_preprocess).cache() # music_p_small_preprocess.take(2) music_n_small_preprocess = music_n_small_rdd.mapPartitions(review_encode_preprocess).cache() # music_n_small_preprocess.take(2) #Re-formate the preprocessed rdd to formatted dataframe music_p_preprocess_reformat = spark.createDataFrame(music_p_small_preprocess) music_p_preprocess_reformat = music_p_preprocess_reformat.withColumnRenamed('_1', 'review_id').withColumnRenamed('_2', 'review_body') music_p_preprocess_reformat = music_p_preprocess_reformat.withColumn('review_body', f.ltrim(music_p_preprocess_reformat.review_body)) music_p_preprocess_reformat = music_p_preprocess_reformat.withColumn('review_body', f.rtrim(music_p_preprocess_reformat.review_body)) # music_p_preprocess_reformat.show(5) music_n_preprocess_reformat = spark.createDataFrame(music_n_small_preprocess) music_n_preprocess_reformat = music_n_preprocess_reformat.withColumnRenamed('_1', 'review_id').withColumnRenamed('_2', 'review_body') music_n_preprocess_reformat = music_n_preprocess_reformat.withColumn('review_body', f.ltrim(music_n_preprocess_reformat.review_body)) music_n_preprocess_reformat = music_n_preprocess_reformat.withColumn('review_body', f.rtrim(music_n_preprocess_reformat.review_body)) # music_n_preprocess_reformat.show(5) #Doing tokenizer with regex to separate every word in review body and filter if empty list regexTokenizer = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'review_body', outputCol = 'review_token') music_p_preprocess_reformat_token = regexTokenizer.transform(music_p_preprocess_reformat) music_p_preprocess_reformat_token_filter = music_p_preprocess_reformat_token.filter(f.size('review_token') > 1) # music_p_preprocess_reformat_token_filter.show(5) music_n_preprocess_reformat_token = regexTokenizer.transform(music_n_preprocess_reformat) music_n_preprocess_reformat_token_filter = music_n_preprocess_reformat_token.filter(f.size('review_token') > 1) # music_n_preprocess_reformat_token_filter.show(5)
def loadParquet(self):
    print("-------------------Fetching the dates------------------------------------")
    print(self.AttSalesActualsInput)
    s3keys = self.AttSalesActualsInput.split(':')[1].split('//')[1].split('/')
    key = ""
    filefolderPrefix = s3keys[1] + "/" + s3keys[2]
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(s3keys[0])
    listofObj = list(bucket.objects.filter(Prefix=filefolderPrefix))
    schema = StructType([
        StructField('Location', StringType(), False),
        StructField('Loc Id', StringType(), False),
        StructField('Dlr1 Code', StringType(), False),
        StructField('KPI', StringType(), False),
        StructField('KPI ID', StringType(), False),
        StructField('Dec 2017 Actual', StringType(), True),
        StructField('Dec 2017 Projected', StringType(), True),
        StructField('Dec 2017 Target', StringType(), True)
    ])
    dfATTSalesActuals = self.sparkSession.createDataFrame(
        self.sparkSession.sparkContext.emptyRDD(), schema)
    bucketWithPath = urlparse(self.AttSalesActualsInput)
    bucket = bucketWithPath.netloc
    spark = SparkSession.builder.appName("ATTSalesActuals").getOrCreate()
    for obj in listofObj:
        key = obj.key
        file = "s3://" + bucket + "/" + obj.key
        df1 = self.sparkSession.read.format("com.databricks.spark.csv").\
            option("header", "true").\
            option("treatEmptyValuesAsNulls", "true").\
            option("inferSchema", "false").\
            load(file, schema=schema)
        df1 = df1.filter(~(df1.Location.like('Histori%%')))
        df1 = df1.filter(~(df1.Location.like('AT&T MyRe%%')))
        df1 = df1.filter(~(df1.Location.like('AR : SPRI%%')))
        df1 = df1.filter(~(df1.Location.like('Generated%%')))
        df1 = df1.filter(~(df1.Location.like('Locat%%')))
        dfATTSalesActuals = df1.unionAll(dfATTSalesActuals)
    dfATTSalesActuals = dfATTSalesActuals.withColumnRenamed("Location", "attlocationname").\
        withColumnRenamed("Loc Id", "locid").\
        withColumnRenamed("Dlr1 Code", "dealercode").\
        withColumnRenamed("KPI", "kpiname").\
        withColumnRenamed("KPI ID", "kpiid").\
        withColumnRenamed("Dec 2017 Actual", "actual_value").\
        withColumnRenamed("Dec 2017 Projected", "projected_value").\
        withColumnRenamed("Dec 2017 Target", "target_value")
    filedate = key.split("/")[2].split("_")[3].split(".")[1]
    filedate = filedate[0:8]
    filedateS = datetime.strptime(filedate, '%Y%m%d').strftime('%m/%d/%Y')
    #########################################################################
    # Reading the source data files                                         #
    #########################################################################
    FinalHistDF1 = dfATTSalesActuals
    FinalHistDF1 = FinalHistDF1.withColumn(
        "actualvalue",
        when(FinalHistDF1.actual_value.contains('%'),
             rtrim(regexp_replace("actual_value", '\\%|\\,', '')).cast(DecimalType()) / 100
             ).otherwise(rtrim(regexp_replace("actual_value", '\\$|\\%|\\,', ''))))
    FinalHistDF1 = FinalHistDF1.withColumn(
        "projectedvalue",
        when(FinalHistDF1.actual_value.contains('%'),
             rtrim(regexp_replace("projected_value", '\\%|\\,', '')).cast(DecimalType()) / 100
             ).otherwise(rtrim(regexp_replace("projected_value", '\\$|\\%|\\,', ''))))
    FinalHistDF1 = FinalHistDF1.withColumn(
        "targetvalue",
        when(FinalHistDF1.actual_value.contains('%'),
             rtrim(regexp_replace("target_value", '\\%|\\,', '')).cast(DecimalType()) / 100
             ).otherwise(rtrim(regexp_replace("target_value", '\\$|\\%|\\,', ''))))
    dfATTSalesActualsRPT = spark.read.format("com.databricks.spark.csv").\
        option("header", "true").\
        option("treatEmptyValuesAsNulls", "true").\
        option("inferSchema", "false").\
        load(self.ATTMyResultsInp2, schema=schema)
    dfATTSalesActualsRPT1 = dfATTSalesActualsRPT.filter(
        ~(dfATTSalesActualsRPT.Location.like('Histori%%')))
    dfATTSalesActualsRPT2 = dfATTSalesActualsRPT1.filter(
        ~(dfATTSalesActualsRPT.Location.like('AT&T MyRe%%')))
    dfATTSalesActualsRPT3 = dfATTSalesActualsRPT2.filter(
        ~(dfATTSalesActualsRPT.Location.like('AR : SPRI%%')))
    dfATTSalesActualsRPT4 = dfATTSalesActualsRPT3.filter(
        ~(dfATTSalesActualsRPT.Location.like('Generated%%')))
    dfAttSalesActualsInputRPT = dfATTSalesActualsRPT4.filter(
        ~(dfATTSalesActualsRPT.Location.like('Locat%%')))
    FinalRPTDF1 = dfAttSalesActualsInputRPT.withColumnRenamed("Location", "attlocationname2").\
        withColumnRenamed("Loc Id", "locid2").\
        withColumnRenamed("Dlr1 Code", "dealercode2").\
        withColumnRenamed("KPI", "kpiname2").\
        withColumnRenamed("KPI ID", "kpiid2").\
        withColumnRenamed("Dec 2017 Actual", "actual_value2").\
        withColumnRenamed("Dec 2017 Projected", "projected_value2").\
        withColumnRenamed("Dec 2017 Target", "target_value2")
    FinalRPTDF1 = FinalRPTDF1.withColumn(
        "actualvalue2",
        when(FinalRPTDF1.actual_value2.contains('%'),
             rtrim(regexp_replace("actual_value2", '\\%|\\,', '')).cast(DecimalType()) / 100
             ).otherwise(rtrim(regexp_replace("actual_value2", '\\$|\\%|\\,', ''))))
    FinalRPTDF1 = FinalRPTDF1.withColumn(
        "projectedvalue2",
        when(FinalRPTDF1.actual_value2.contains('%'),
             rtrim(regexp_replace("projected_value2", '\\%|\\,', '')).cast(DecimalType()) / 100
             ).otherwise(rtrim(regexp_replace("projected_value2", '\\$|\\%|\\,', ''))))
    FinalRPTDF1 = FinalRPTDF1.withColumn(
        "targetvalue2",
        when(FinalRPTDF1.actual_value2.contains('%'),
             rtrim(regexp_replace("target_value2", '\\%|\\,', '')).cast(DecimalType()) / 100
             ).otherwise(rtrim(regexp_replace("target_value2", '\\$|\\%|\\,', ''))))
    FinalHistDF1 = FinalHistDF1.withColumn('reportdate', lit(filedateS))
    FinalHistDF1.registerTempTable("HIST")
    FinalRPTDF1 = FinalRPTDF1.withColumn('reportdate2', lit(filedateS))
    FinalRPTDF1.registerTempTable("RPT")
    FinalHistDF = self.sparkSession.sql(
        "select attlocationname, locid, dealercode, kpiname, kpiid, actualvalue, projectedvalue, targetvalue, reportdate, YEAR(FROM_UNIXTIME(UNIX_TIMESTAMP())) as year, SUBSTR(FROM_UNIXTIME(UNIX_TIMESTAMP()),6,2) as month from HIST"
    )
    FinalRPTDF = self.sparkSession.sql(
        "select attlocationname2, locid2, dealercode2, kpiname2, kpiid2, actualvalue2, projectedvalue2, targetvalue2, reportdate2, YEAR(FROM_UNIXTIME(UNIX_TIMESTAMP())) as year, SUBSTR(FROM_UNIXTIME(UNIX_TIMESTAMP()),6,2) as month from RPT"
    )
    '''
    FinalHistDF = spark.sql("select attlocationname,locid,dealercode,kpiname,kpiid,actualvalue ,projectedvalue,"
                            "targetvalue,reportdate,"
                            "SUBSTR(reportdate,7,4) as year,SUBSTR(reportdate,1,2) as month from HIST")
    FinalRPTDF = spark.sql("select attlocationname2,locid2,dealercode2,kpiname2,kpiid2,"
                           "actualvalue2 ,projectedvalue2,targetvalue2,reportdate2,"
                           "SUBSTR(reportdate2,7,4) as year,SUBSTR(reportdate2,1,2) as month from RPT")
    '''
    FinalHistDF.show(100, False)
    FinalRPTDF.show(100, False)
    FinalHistDF.coalesce(1).select("*").write.mode("overwrite").parquet(
        self.AttSalesActualsOutput + '/' + 'Working1')
    FinalRPTDF.coalesce(1).select("*").write.mode("overwrite").parquet(
        self.AttSalesActualsOutput + '/' + 'Working2')
    FinalHistDF.coalesce(1).select("*").write.mode("append").partitionBy(
        'year', 'month').format('parquet').save(self.AttSalesActualsOutput)
    FinalRPTDF.coalesce(1).select("*").write.mode("append").partitionBy(
        'year', 'month').format('parquet').save(self.AttSalesActualsOutput)
    spark.stop()
banner_phone = hive_context.sql(banner_phone_query)

# Clean up Advance data
# The data coming in from Advance has double quotes around each field. We need to
# clean them up. We use Pandas for this part because it has a handy applymap
# method that works elementwise.
advance_phone_pd = advance_phone.toPandas()
advance_phone_pd_clean = advance_phone_pd.applymap(lambda x: clean_quotes(x))
adv_phone_clean = hive_context.createDataFrame(advance_phone_pd_clean)

banner_full_phone = (banner_phone
    .withColumn('Full_Phone', sf.rtrim(
        sf.concat(
            safe_concat(banner_phone, 'Telephone_Area_Code', ''),
            safe_concat(banner_phone, 'Telephone_Number', ''),
        )))
    .withColumn('Data_Source', sf.lit('Banner'))
    .select(
        sf.col('NetID'),
        sf.col('PIDM').alias('Source_ID'),
        sf.col('Telephone_Description').alias('Phone_Type'),
        sf.col('Full_Phone'),
        sf.col('Data_Source')))

hub_full_phone = (hub_phone
    .withColumn('Full_Phone', sf.rtrim(
        sf.concat(
            safe_concat(hub_phone, 'Country_Code', ''),
snewdf = sdf.withColumn(
    "comment_text",
    F.regexp_replace(F.col("comment_text"), "[\$#,?.!@#$%^&*()0123456789:-=\+]", ""))
snewdf = snewdf.withColumn('comment_text', F.regexp_replace(F.col("comment_text"), "\"", ""))
snewdf = snewdf.withColumn('comment_text', F.regexp_replace(F.col("comment_text"), "\n", " "))
snewdf = snewdf.withColumn('comment_text', F.regexp_replace(F.col("comment_text"), "\[", ""))
snewdf = snewdf.withColumn('comment_text', F.regexp_replace(F.col("comment_text"), "\]", ""))
snewdf = snewdf.withColumn('comment_text', F.regexp_replace(F.col("comment_text"), "\"+", ""))
snewdf = snewdf.withColumn('comment_text', F.lower(F.col('comment_text')))
snewdf = snewdf.withColumn('comment_text', F.rtrim(snewdf.comment_text))
snewdf = snewdf.withColumn('comment_text', F.ltrim(snewdf.comment_text))

from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="comment_text", outputCol="tokenized")
tokenized_df = tokenizer.transform(snewdf)
tokenized_df.select("tokenized").show()

stopwordsremoved = StopWordsRemover(inputCol="tokenized", outputCol="comment_txt")
swr_df = stopwordsremoved.transform(tokenized_df)
swr_df.select("comment_txt").show()

from pyspark.ml.feature import HashingTF, IDF
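# The import above suggests a TF-IDF step follows; this is a minimal hedged sketch
# of that continuation, not the original author's code. It reuses "comment_txt"
# from swr_df above, and numFeatures is an assumed value.
hashing_tf = HashingTF(inputCol="comment_txt", outputCol="raw_features", numFeatures=1 << 16)
tf_df = hashing_tf.transform(swr_df)
idf = IDF(inputCol="raw_features", outputCol="features")
tfidf_df = idf.fit(tf_df).transform(tf_df)
tfidf_df.select("features").show(5)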
def compile_rstrip(t, expr, scope, **kwargs):
    op = expr.op()
    src_column = t.translate(op.arg, scope)
    return F.rtrim(src_column)
def loadParquet(self):
    self.log.info('Exception Handling starts')
    validSourceFormat = self.isValidFormatInSource()
    if not validSourceFormat:
        self.log.error(" Source files not in csv format.")
    validSourceSchema = self.isValidSchemaInSource()
    if not validSourceSchema:
        self.log.error(" Source schema does not have all the required columns.")
    if not validSourceFormat or not validSourceSchema:
        self.log.error("Copy the source files to data processing error path and return.")
        self.copyFile(
            self.EmpOprEffMasterListFile,
            self.dataProcessingErrorPath + self.EmpOprEffName +
            datetime.now().strftime('%Y%m%d%H%M') + self.fileFormat)
        return
    self.log.info('Source format and schema validation successful.')
    self.log.info('Reading the input parquet file')
    schema = StructType([
        StructField('Market', StringType(), True),
        StructField('Region', StringType(), True),
        StructField('District', StringType(), True),
        StructField('Location', StringType(), True),
        StructField('Sales Person', StringType(), True),
        StructField('WorkDay ID', StringType(), False),
        StructField('Total Loss', StringType(), False),
        StructField('Total Issues', StringType(), False),
        StructField('Action Taken', StringType(), False),
        StructField('HR Consulted Before Termination', StringType(), False),
        StructField('Transaction Errors', StringType(), False),
        StructField('Total Errors', StringType(), False),
        StructField('NEXT Trades', StringType(), False),
        StructField('Total Devices12', StringType(), False),
        StructField('Hyla Loss', StringType(), False),
        StructField('Total Devices14', StringType(), False),
        StructField('Denied RMA Devices', StringType(), False),
        StructField('Total Devices16', StringType(), False),
        StructField('Cash Deposits', StringType(), False),
        StructField('Total Missing Deposits', StringType(), False),
        StructField('Total Short Deposits', StringType(), False),
        StructField('Shrinkage', StringType(), False),
        StructField('Comments', StringType(), False),
        StructField('Date', StringType(), False)
    ])
    df1 = self.sparkSession.read.format("com.databricks.spark.csv").option(
        "header", "true").option("multiLine", "true").option(
        "delimiter", ",").option("quotechar", '"').option(
        "treatEmptyValuesAsNulls", "true").option("escape", ",").option(
        "escape", '"').option("encoding", "UTF-8").option(
        "inferSchema", "false").load(
        self.OperationalEfficiencyScoreCard, schema=schema)
    df1 = df1.filter(~(df1.Market.like('%%Marke%%')))
    df1 = df1.filter(~(df1.Market.like('%%Operational%%')))
    df1 = df1.filter(~(df1.Market.like(' Regio%%')))
    df1 = df1.where(df1.Market != '')
    df1 = df1.where(df1.Region != '')
    df1 = df1.where(df1.District != '')
    df1 = df1.withColumnRenamed("Market", "market") \
        .withColumnRenamed("Region", "region") \
        .withColumnRenamed("District", "district") \
        .withColumnRenamed("Location", "location") \
        .withColumnRenamed("Sales Person", "salesperson") \
        .withColumnRenamed("WorkDay ID", "workdayid") \
        .withColumnRenamed("Total Loss", "total_loss") \
        .withColumnRenamed("Total Issues", "total_issues") \
        .withColumnRenamed("Action Taken", "actiontaken") \
        .withColumnRenamed("HR Consulted Before Termination", "hrconsultedbefore_termination") \
        .withColumnRenamed("Transaction Errors", "transaction_errors") \
        .withColumnRenamed("Total Errors", "total_errors") \
        .withColumnRenamed("NEXT Trades", "next_trades") \
        .withColumnRenamed("Total Devices12", "total_devices12") \
        .withColumnRenamed("Hyla Loss", "hyla_loss") \
        .withColumnRenamed("Total Devices14", "total_devices14") \
        .withColumnRenamed("Denied RMA Devices", "denied_rmadevices") \
        .withColumnRenamed("Total Devices16", "total_device16") \
        .withColumnRenamed("Cash Deposits", "cash_deposits") \
        .withColumnRenamed("Total Missing Deposits", "totalmissing_deposits") \
        .withColumnRenamed("Total Short Deposits", "totalshort_deposits") \
        .withColumnRenamed("Shrinkage", "Shrinkage") \
        .withColumnRenamed("Comments", "comments") \
        .withColumnRenamed("Date", "reportdate")
    df1 = df1.withColumn(
        "totalloss",
        rtrim(regexp_replace("total_loss", '\\(|\\)|\\$|\\,', '')).cast(DecimalType(15, 2)))
    df1 = df1.withColumn("totalissues", df1["total_issues"].cast(IntegerType()))
    df1 = df1.withColumn(
        "hrconsultedbeforetermination",
        df1["hrconsultedbefore_termination"].cast(BooleanType()))
    df1 = df1.withColumn(
        "transactionerrors",
        rtrim(regexp_replace("transaction_errors", '\\(|\\)|\\$|\\,', '')).cast(DecimalType(15, 2)))
    df1 = df1.withColumn("totalerrors", df1["total_errors"].cast(IntegerType()))
    df1 = df1.withColumn(
        "nexttrades",
        rtrim(regexp_replace("next_trades", '\\(|\\)|\\$', '')).cast(DecimalType(15, 2)))
    df1 = df1.withColumn("totaldevices12", df1["total_devices12"].cast(IntegerType()))
    df1 = df1.withColumn(
        "hylaloss",
        rtrim(regexp_replace("hyla_loss", '\\(|\\)|\\$', '')).cast(DecimalType(15, 2)))
    df1 = df1.withColumn("totaldevices14", df1["total_devices14"].cast(IntegerType()))
    df1 = df1.withColumn(
        "deniedrmadevices",
        rtrim(regexp_replace("denied_rmadevices", '\\(|\\)|\\$', '')).cast(DecimalType(15, 2)))
    df1 = df1.withColumn("totaldevice16", df1["total_device16"].cast(IntegerType()))
    df1 = df1.withColumn(
        "cashdeposits",
        rtrim(regexp_replace("cash_deposits", '\\(|\\)|\\$', '')).cast(DecimalType(15, 2)))
    df1 = df1.withColumn("totalmissingdeposits", df1["totalmissing_deposits"].cast(IntegerType()))
    df1 = df1.withColumn("totalshortdeposits", df1["totalshort_deposits"].cast(IntegerType()))
    df1 = df1.withColumn(
        "shrinkage",
        rtrim(regexp_replace("Shrinkage", '\\(|\\)|\\$|\\,', '')).cast(DecimalType(15, 2)))
    df1.registerTempTable("empopreff")
    df2 = self.sparkSession.sql(
        "select e.market, e.region, e.district, e.location, e.salesperson, e.workdayid, e.totalloss, e.totalissues, e.actiontaken, e.hrconsultedbeforetermination, e.transactionerrors, e.totalerrors, e.nexttrades, e.totaldevices12, e.hylaloss, e.totaldevices14, e.deniedrmadevices, e.totaldevice16, e.cashdeposits, e.totalmissingdeposits, e.totalshortdeposits, e.shrinkage, e.comments, YEAR(FROM_UNIXTIME(UNIX_TIMESTAMP())) as year, SUBSTR(FROM_UNIXTIME(UNIX_TIMESTAMP()),6,2) as month, e.reportdate from empopreff e"
    )
    df2.coalesce(1).select("*").write.mode("append").partitionBy(
        'year', 'month').parquet(self.OperationalEfficiencyScoreCardOutputArg)
    df2.coalesce(1).select("*").write.mode("overwrite").parquet(
        self.OperationalEfficiencyScoreCardOutputArg + '/' + 'Working')
    self.sparkSession.stop()
# COMMAND ----------

from pyspark.sql.functions import lower, upper

df.select(col("Description"), lower(col("Description")),
          upper(lower(col("Description")))).show(2)

# COMMAND ----------

from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim

df.select(
    ltrim(lit(" HELLO ")).alias("ltrim"),
    rtrim(lit(" HELLO ")).alias("rtrim"),
    trim(lit(" HELLO ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)

# COMMAND ----------

from pyspark.sql.functions import regexp_replace

regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
    regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
    col("Description")).show(2)

# COMMAND ----------
"favorite_count", "community_owned_date", "closed_date", "content_license")\ .withColumn("creation_date", f.to_timestamp(f.col("creation_date")))\ .withColumn("last_edit_date", f.to_timestamp(f.col("last_edit_date")))\ .withColumn("last_activity_date", f.to_timestamp(f.col("last_activity_date")))\ .withColumn("community_owned_date", f.to_timestamp(f.col("community_owned_date")))\ .withColumn("closed_date", f.to_timestamp(f.col("closed_date")))\ .withColumn("tags", f.split(f.rtrim(f.regexp_replace(f.regexp_replace(f.col("tags"), "<", ""), ">", " ")), " "))\ .withColumn("creation_year", f.year("creation_date")) save_table(posts, "posts", ["creation_year"]) # tags tags = spark.read.format('xml')\ .options(rowTag='row')\ .load(f's3://{bucket_name}/{input_dir}/tags')\ .withColumnRenamed("_Count", "count")\ .withColumnRenamed("_ExcerptPostId", "excerpt_post_id")\ .withColumnRenamed("_Id", "id")\ .withColumnRenamed("_TagName", "tag_name")\ .withColumnRenamed("_WikiPostId", "wiki_post_id")\ .select("id", "tag_name",
StructField("country", StringType(), True), StructField("salary", StringType(), True)]); df = spark.read.csv(filepath, schema=strct); dfregex1 = df.select(df.workclass, df.finalweight, df.education, df.educationnum, df.maritalstatus, df.occupation, df.relationship , df.race , df.gender, df.capitalgain, df.capitalloss, df.hoursperweek, df.country, regexp_replace(df.salary, '(<=50K)', '50').alias('salary')); dfregex = dfregex1.select(dfregex1.workclass, dfregex1.finalweight, dfregex1.education, dfregex1.educationnum, dfregex1.maritalstatus, dfregex1.occupation , dfregex1.relationship, dfregex1.race , dfregex1.gender, dfregex1.capitalgain, dfregex1.capitalloss, dfregex1.hoursperweek, dfregex1.country , regexp_replace(dfregex1.salary, '(>50K)', '51').alias('salary')); dfbtrim = dfregex1.select(dfregex1.workclass, dfregex1.finalweight, dfregex1.education, dfregex1.educationnum, dfregex1.maritalstatus , dfregex1.occupation, dfregex1.relationship, dfregex1.race , dfregex1.gender, dfregex1.capitalgain, dfregex1.capitalloss , dfregex1.hoursperweek, dfregex1.country, ltrim(rtrim(dfregex1.salary)).alias('salary') ); dfcast = dfbtrim.select(dfbtrim.workclass, dfbtrim.finalweight, dfbtrim.education, dfbtrim.educationnum, dfbtrim.maritalstatus, dfbtrim.occupation , dfbtrim.relationship, dfbtrim.race , dfbtrim.gender, dfbtrim.capitalgain, dfbtrim.capitalloss, dfbtrim.hoursperweek, dfbtrim.country , dfbtrim.salary.cast(IntegerType()).alias('intSal')); dfcast.createOrReplaceTempView("employees"); query = "select workclass, education, maritalstatus, occupation, relationship, hoursperweek, country, avg(intSal) as sal_avg from employees " + "group by workclass, education, maritalstatus, occupation, relationship, hoursperweek, country"; sqldf = spark.sql(query); sqldf.dropna().show(10); spark.stop();
def get_latest_yyyymm_table_name(self, table):
    # TODO fix import here
    import pyspark.sql.functions as F
    df = self.get_all_yyyymm_tables(table)
    return df.agg(F.max(F.rtrim(F.col('TableName'))).alias('max_partition')) \
        .collect()[0].max_partition.lower()
    delta = (start - end).total_seconds()
    return delta

# register as a UDF
f1 = f.udf(time_delta)

def time_sub(y, x):
    z = y - x
    return z

f2 = f.udf(time_sub)

sc = SparkContext('local', 'MySQL_UC1')
sqlContext = SQLContext(sc)
device = sqlContext.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/husain?useSSL=false").option("driver", "com.mysql.jdbc.Driver").option("dbtable", "device").option("user", "root").option("password", "root").load()
emp = sqlContext.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/husain?useSSL=false").option("driver", "com.mysql.jdbc.Driver").option("dbtable", "employee").option("user", "root").option("password", "root").load()

emp = emp.withColumn('Timings', f.rtrim(emp.timings))
emp = emp.join(device, device.device_id == emp.device_id, "inner").select(emp.id, emp.name, emp.device_id, device.status, emp.Timings)
split_col = f.split(emp['Timings'], ' ')
emp = emp.withColumn('Date', split_col.getItem(0).cast('date'))
#emp = emp.orderBy('Date','name')
#emp.show()

emp1 = emp.filter(emp['status'] == 'in')
emp1 = emp1.orderBy('Date', 'name', 'Timings')
emp2 = emp.filter(emp.status == 'out')
emp2 = emp2.orderBy('Date', 'name', 'Timings')
#
emp1 = emp1.withColumnRenamed('Timings', 'IN_TIME')
emp2 = emp2.withColumnRenamed('Timings', 'OUT_TIME')
#
#emp1=emp1.join(emp2,emp1.id == emp2.id,"full_outer").select(emp1.id,emp1.Date,emp1.name,emp1.device_id,emp1.status,emp1.IN_TIME,emp2.OUT_TIME)
#emp1 = emp1.
    lower(col('Description')),
    upper(col('Description'))).show(2)

df.selectExpr(
    'Description',
    'lower(Description)',
    'upper(lower(Description))').show(2)
# select description, lower(Description), upper(lower(Description)) from dfTable

from pyspark.sql.functions import ltrim, rtrim, rpad, lpad, trim

df.select(
    ltrim(lit(' HELLO ')).alias('ltrim'),
    rtrim(lit(' HELLO ')).alias('rtrim'),
    trim(lit(' HELLO ')).alias('trim'),
    lpad(lit('HELLO'), 3, ' ').alias('lp'),
    rpad(lit('HELLO'), 10, ' ').alias('rp')).show(2)

df.selectExpr(
    'ltrim( "HELLO" ) as ltrim',
    'rtrim( "HELLO" ) as rtrim',
    'trim( "HELLO" ) as trim',
    'lpad("HELLO", 3, " ") as lp',
    'rpad("HELLO", 3, " ") as rp').show(2)
# select
#   ltrim(' HELLO '),
#   rtrim(' HELLO '),
#   trim(' HELLO '),
df = spark.read.format("csv")\ .option("header","true")\ .option("inferSchema","true")\ .load('/databricks-datasets/definitive-guide/data/retail-data/by-day/2010-12-01.csv') #data schema 확인 df.printSchema() #initcap : 주어진 문자열에서 공백을 나눠 첫글자를 대문자로 반환 df.select(initcap(col("Description"))).show(2, False) #lower // upper df.select(lower(col("StockCode"))).show(2) #공백 추가 및 제거 (lit,ltrim,rtrim,rpad,lpad,trim) from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim df.select( ltrim(lit(" HELLO ")).alias("ltrim"), rtrim(lit(" HELLO ")).alias("rtrim"), trim(lit(" HELLO ")).alias("trim"), lpad(lit("HELLO"), 3, " ").alias("lpad"), rpad(lit("HELLP"), 10, " ").alias("rpad")).show(2) ##정규 표현식 #description컬럼의 값을 COLOR 값으로 치환 from pyspark.sql.functions import regexp_replace regex_string = "BLACK|WHITE|RED|GREEN|BLUE" df.select( regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"), col("Description")).show(2) #주어진 문자를 다른 문자로 치환 from pyspark.sql.functions import translate df.select(translate(col("Description"), "WHI", "123")).show(2)
def lsg_omni(self):
    start_date, end_date = date_period(self.period, self.start_date)
    table_name = 'datalake_omni.omni_hit_data'
    dt_col_name = 'hit_time_gmt_dt_key'
    _, bound_end_date = date_period(-1, end_date)
    bound_date_check(table_name, dt_col_name, start_date, bound_end_date, self.env, 'YYYYMMDD', 'LSG')

    query = 'SELECT ' \
            'VS.visit_session_key AS session_key, ' \
            'HIT.post_visid_combined AS visit_id, ' \
            'HIT.visit_return_count AS visit_number, ' \
            'UPPER(TRIM(prod_list)) AS prod_list, ' \
            'HIT.hit_time_gmt_ts AS time_stamp, ' \
            "TRIM(SUBSTRING(TRIM(DEMANDBASE), 0, POSITION('|' IN TRIM(DEMANDBASE)))) AS " \
            "account_no " \
            'FROM datalake_omni.omni_hit_data HIT ' \
            'LEFT JOIN CDWDS.D_OMNI_VISIT_SESSION VS ON ' \
            ' VS.VISIT_RETURN_COUNT=HIT.VISIT_RETURN_COUNT AND VS.POST_VISID_COMBINED=HIT.POST_VISID_COMBINED ' \
            f'WHERE HIT.hit_time_gmt_dt_key<{start_date} AND HIT.hit_time_gmt_dt_key>={end_date} ' \
            'AND HIT.post_visid_combined IS NOT NULL ' \
            "AND prod_list IS NOT NULL AND prod_list NOT LIKE '%shipping-handling%' " \
            "AND TRIM(SUBSTRING(TRIM(DEMANDBASE), 0, POSITION('|' IN TRIM(DEMANDBASE)))) <> '' "
    schema = StructType([
        StructField('session_key', IntegerType(), True),
        StructField('visit_id', StringType(), True),
        StructField('visit_number', IntegerType(), True),
        StructField('time_stamp', StringType(), True),
        StructField('prod_list', StringType(), True),
        StructField('account_no', StringType(), True),
    ])
    df = redshift_cdw_read(query, db_type='RS', database='CDWDS', env=self.env, schema=schema). \
        withColumn('prod_id_untrimmed', explode(split('prod_list', ','))). \
        withColumn('prod_id', ltrim(rtrim(col('prod_id_untrimmed')))). \
        drop('prod_id_untrimmed'). \
        drop('prod_list'). \
        filter(col('prod_id').isNotNull()). \
        filter(col('prod_id') != ''). \
        distinct()
    if self.debug:
        print(f'row count for df = {df.count()}')

    # find active products
    query = 'SELECT sku as prod_id, stk_type_cd ' \
            'FROM cdwds.lsg_prod_v ' \
            "WHERE stk_type_cd = 'D'"
    discontinued_prods = redshift_cdw_read(query, db_type='RS', database='CDWDS', env=self.env)
    df = df.join(discontinued_prods, ['prod_id'], how='left'). \
        filter(col('stk_type_cd').isNull()). \
        drop('stk_type_cd')
    if self.debug:
        print(f'After filtering out discontinued SKUs, row count for df = {df.count()}')

    query = 'SELECT UPPER(sku_nbr) AS prod_id, size_grp AS coupon ' \
            'FROM cdwds.f_web_prod_feature ' \
            "WHERE size_grp IS NOT NULL AND size_grp <> 'T' " \
            'GROUP BY sku_nbr, size_grp'
    coupons = redshift_cdw_read(query, db_type='RS', database='CDWDS', env=self.env)
    if coupons.count() == 0:
        raise DataValidityError(
            'No coupon information. Please check the validity of size_grp column '
            'on cdwds.f_web_prod_feature.')
    df = df.join(broadcast(coupons), ['prod_id'], how='left'). \
        withColumn('coupon', coalesce('coupon', 'prod_id'))
    prod_list = df.select('prod_id').distinct()
    coupons = coupons.union(df.select('prod_id', 'coupon')). \
        filter(col('prod_id').isNotNull()). \
        distinct(). \
        withColumn("coupon_key", func.dense_rank().over(Window.orderBy('coupon')))
    df = df.join(coupons, ['prod_id', 'coupon'], how='left')
    if self.debug:
        coupons.show()
        df.show()
        print(f'row count for coupons = {coupons.select(col("coupon_key")).distinct().count()}')
    return df, prod_list, coupons
posts = spark.read.format("xml").option("rowTag", "row").load(xml_file_path) # select only questions questions = posts.filter(posts._PostTypeId == 1) # drop irrelvant columns and clean up strings questions = questions.select( [c for c in questions.columns if c in ['_Id', '_Title', '_Body', '_Tags']]) questions = questions.withColumn( 'full_question', sf.concat(sf.col('_Title'), sf.lit(' '), sf.col('_Body'))) questions = questions.select([ c for c in questions.columns if c in ['_Id', 'full_question', '_Tags'] ]).withColumn("full_question", regexp_replace("full_question", "[\\n,]", " ")) questions = questions.withColumn("_Tags", regexp_replace("_Tags", "><", " ")) questions = questions.withColumn("_Tags", regexp_replace("_Tags", "(>|<)", "")) questions = questions.withColumn('_Tags', rtrim(questions._Tags)) questions = questions.withColumn('_Tags', split(questions._Tags, " ")) # filter out to single tags in following list tags_of_interest = [ 'azure-devops', 'azure-functions', 'azure-web-app-service', 'azure-storage', 'azure-virtual-machine' ] def intersect(xs): xs = set(xs) @udf("array<string>") def _(ys): return list(xs.intersection(ys))
def trimStrings(dataframe, columns):
    for col in columns:
        dataframe = dataframe.withColumn(col, F.ltrim(F.rtrim(dataframe[col])))
    return dataframe
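# Hedged usage sketch for trimStrings; the session, alias F, and sample rows are
# assumptions for illustration only.
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
people = spark.createDataFrame([(" Ada ", " Lovelace ")], ["first_name", "last_name"])
trimStrings(people, ["first_name", "last_name"]).show()  # both columns trimmed on both sides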