Code Example #1
def process_country_codes():
    input_data_file = os.path.join(s3, I94_CODES_DATA_PATH + COUNTRY_FILE)
    df_country = spark.read.format("csv").option("delimiter", "=").option(
        "header", "False").load(input_data_file)
    df_country = df_country.withColumnRenamed(
        "_c0", "country_code").withColumnRenamed("_c1", "country_name")
    df_country = df_country.withColumn(
        "country_name", F.regexp_replace(df_country.country_name, "'", ""))
    df_country = df_country.withColumn(
        "country_name", F.ltrim(F.rtrim(df_country.country_name)))
    df_country = df_country.withColumn(
        "country_code", F.ltrim(F.rtrim(df_country.country_code)))
    df_country = df_country.withColumn(
        "country_name",
        F.regexp_replace(df_country.country_name,
                         "^INVALID.*|Collapsed.*|No\ Country.*", "INVALID"))
    df_country.write.mode("overwrite").parquet(s3 +
                                               'data/processed/codes/country')
    return df_country
Code Example #2
def remove_space(df, col_name, position):
    # remove left side space
    if position == "l":
        return df.withColumn("tmp", ltrim(f.col(col_name))).drop(col_name).withColumnRenamed("tmp", col_name)
    # remove right side space
    elif position == "r":
        return df.withColumn("tmp", rtrim(f.col(col_name))).drop(col_name).withColumnRenamed("tmp", col_name)
    # remove all side space
    elif position == "a":
        return df.withColumn("tmp", trim(f.col(col_name))).drop(col_name).withColumnRenamed("tmp", col_name)
Code Example #3
def preprocess_data_table(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    for col_name in df.columns:
        if isinstance(df.schema[col_name].dataType, StringType):
            df = df \
                .withColumn(col_name, rtrim(col(col_name)))

        if isinstance(df.schema[col_name].dataType, TimestampType):
            df = df \
                .withColumn(col_name, col(col_name).cast('string'))

    return df
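A hedged sketch of how this helper might be exercised; the type annotations imply `import pyspark`, and the body needs `rtrim`, `col`, `StringType` and `TimestampType` in scope (the sample data is invented for illustration):

import datetime

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rtrim
from pyspark.sql.types import StringType, TimestampType

spark = SparkSession.builder.master("local[1]").appName("preprocess_demo").getOrCreate()
sample = spark.createDataFrame(
    [("abc  ", datetime.datetime(2020, 1, 1, 12, 0, 0))],
    ["text_col", "ts_col"])
# string columns lose trailing spaces; timestamp columns become strings
preprocess_data_table(sample).show(truncate=False)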
Code Example #4
def process_state_codes():
    input_data_file = os.path.join(s3, I94_CODES_DATA_PATH + STATE_FILE)
    df_state = spark.read.format("csv").option("delimiter", "=").option(
        "header", "False").load(input_data_file)
    df_state = df_state.withColumnRenamed("_c0",
                                          "state_code").withColumnRenamed(
                                              "_c1", "state_name")
    df_state = df_state.withColumn(
        "state_code", F.regexp_replace(df_state.state_code, "[^A-Z]", ""))
    df_state = df_state.withColumn(
        "state_name", F.regexp_replace(df_state.state_name, "'", ""))
    df_state = df_state.withColumn("state_name",
                                   F.ltrim(F.rtrim(df_state.state_name)))
    df_state.write.mode("overwrite").parquet(s3 +
                                             'data/processed/codes/us_state')
    return df_state
Code Example #5
def canonicaltokens(df, inputColumn, outputColumn):
   """
   turn input column of strings into canonical format as output column of tokens
   return as output column added to the dataframe
   """

   newname = df.withColumn("cleanname", \
       f.regexp_replace(f.regexp_replace(f.rtrim(f.ltrim(f.col(inputColumn))), \
       " (\w) (\w) ", "$1$2"), "(\w) (\w) (\w)$", "$1$2$3"))

   newtokenizer = mlf.Tokenizer(inputCol="cleanname", outputCol="words")
   chtokenized = newtokenizer.transform(newname).drop("cleanname")

   stopwordremover = mlf.StopWordsRemover(inputCol="words", outputCol=outputColumn)
   canonicalname = stopwordremover.transform(chtokenized).drop("words")

   return canonicalname
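The aliases `f` and `mlf` are not shown in this excerpt; a reasonable reading is `pyspark.sql.functions` and `pyspark.ml.feature`. A small usage sketch under that assumption (the sample name is made up):

import pyspark.sql.functions as f
import pyspark.ml.feature as mlf
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("canonical_tokens_demo").getOrCreate()
names = spark.createDataFrame([("  The Acme Trading Co  ",)], ["raw_name"])
# adds a column of lower-cased tokens with English stop words removed
canonicaltokens(names, "raw_name", "name_tokens").show(truncate=False)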
Code Example #6
def remove_space(df, col_name, position):
    if position not in ["l", "r", "a"]:
        raise ValueError("The position value must be l, r or a")
    # get origin column orders
    columns = df.columns
    # remove left side space
    if position == "l":
        return df.withColumn("tmp", ltrim(sql_fun.col(col_name))).drop(col_name).withColumnRenamed("tmp",
                                                                                                   col_name).select(
            *columns)
    # remove right side space
    elif position == "r":
        return df.withColumn("tmp", rtrim(sql_fun.col(col_name))).drop(col_name).withColumnRenamed("tmp",
                                                                                                   col_name).select(
            *columns)
    # remove all side space
    elif position == "a":
        return df.withColumn("tmp", trim(sql_fun.col(col_name))).drop(col_name).withColumnRenamed("tmp",
                                                                                                  col_name).select(
            *columns)
Code Example #7
def parse_message(col, eol="\n"):
    """
    Generate the expression that parses the email message into From, Subject, Body etc.

    Args:
        col - sqlf.col() column object
        eol - end of line character to use when parsing the email
    Returns:
        List of pyspark.sql Column expressions to be passed to select()
    """

    out_dict = [
        "Message-ID",
        "Date",
        "From",
        "To",
        "Subject",
        "Mime-Version",
        "Content-Type",
        "Content-Transfer-Encoding",
        "X-From",
        "X-To",
        "X-cc",
        "X-bcc",
        "X-Folder",
        "X-Origin",
        "X-FileName",
        eol,
    ]
    expr = []
    for i in range(0, len(out_dict) - 1):
        expr.append(
            sqlf.ltrim(
                sqlf.rtrim(sqlf.split(sqlf.split(col, out_dict[i] + ":")[1], eol)[0])
            ).alias(out_dict[i])
        )

    expr.append(sqlf.split(sqlf.split(col, "X-FileName:")[1], "nsf")[1].alias("Body"))

    return expr
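A hedged usage sketch, assuming `sqlf` is `pyspark.sql.functions` and that the raw email text lives in a `message` column (the toy message below is invented; headers missing from a real message simply come back as null):

from pyspark.sql import SparkSession
import pyspark.sql.functions as sqlf

spark = SparkSession.builder.master("local[1]").appName("parse_message_demo").getOrCreate()
raw_message = "\n".join([
    "Message-ID: <42.JavaMail>", "Date: Mon, 1 Jan 2001", "From: alice@example.com",
    "To: bob@example.com", "Subject: forecast", "Mime-Version: 1.0",
    "Content-Type: text/plain", "Content-Transfer-Encoding: 7bit",
    "X-From: Alice", "X-To: Bob", "X-cc: ", "X-bcc: ",
    "X-Folder: inbox", "X-Origin: Alice", "X-FileName: alice.nsf",
    "", "Please see the attached numbers.",
])
emails = spark.createDataFrame([(raw_message,)], ["message"])
# one aliased column per header plus the body
emails.select(*parse_message(sqlf.col("message"))).show(truncate=False)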
Code Example #8
def process_airport_codes():

    #transform airport codes
    input_data_file = os.path.join(s3, I94_CODES_DATA_PATH + AIRPORT_FILE)
    df_airport = spark.read.format("csv").option("delimiter", "=").option(
        "header", "False").load(input_data_file)
    df_airport = df_airport.withColumn(
        "_c0", F.regexp_replace(df_airport._c0, "'", "")).withColumn(
            "_c1", F.regexp_replace(df_airport._c1, "'", ""))
    split_col = F.split(df_airport._c1, ",")
    df_airport = df_airport.withColumn("city", split_col.getItem(0))
    df_airport = df_airport.withColumn("state_code", split_col.getItem(1))
    df_airport = df_airport.withColumnRenamed("_c0", "port_code")
    df_airport = df_airport.drop("_c1")
    df_airport = df_airport.withColumn(
        "port_code",
        F.regexp_replace(df_airport.port_code, "[^A-Z]", "")).withColumn(
            "city", F.ltrim(F.rtrim(df_airport.city))).withColumn(
                "state_code",
                F.regexp_replace(df_airport.state_code, "[^A-Z]", ""))
    df_state = process_state_codes()
    df_airport = df_airport.join(df_state, "state_code")
    df_airport.write.mode("overwrite").parquet(s3 +
                                               'data/processed/codes/us_ports')
Code Example #9
from pyspark.sql.functions import monotonically_increasing_id

df.select(monotonically_increasing_id()).show(10)

#working with strings

#perform case conversions

from pyspark.sql.functions import initcap, lower, upper, ltrim, rtrim, trim, lpad, rpad, col, lit
df.select(initcap(col("Description"))).show(5)
df.select(col("Description"), initcap(col("Description")),
          lower(col("Description")), upper(col("Description"))).show(5)

string_with_space = "     hello     "

df.select(ltrim(lit(string_with_space)), rtrim(lit(string_with_space)),
          trim(lit(string_with_space))).show()

#regular expressions

#working with dates, timestamps
from pyspark.sql.functions import current_date, current_timestamp, date_add, date_sub, datediff, months_between, to_date, to_timestamp

dateDF = spark.range(10).withColumn("today", current_date()).withColumn(
    "now", current_timestamp())
dateDF.show()

dateDF.select(
    date_add(col("today"), 5).alias("today+5"),
    date_sub(col("today"), 5).alias("today-5")).show()
Code Example #10
YEAR_PATTERN = r".*\(\d{4}\)$"

spark = SparkSession.builder.appName("hw9").getOrCreate()

keyspace = sys.argv[1]

df = (spark.read.format("csv").options(
    header=True, inferSchema=True,
    sep=",").load("hdfs:///data/movielens/movies.csv"))

df = df.withColumn("title", regexp_replace(col("title"), "\xa0", ""))

df = df.withColumn("title", regexp_replace(col("title"), r"\(\D+\)$", ""))
df = df.withColumn("title", regexp_replace(col("title"), '"+', '"'))
df = df.withColumn("title", regexp_replace(col("title"), r"\)+", ")"))
df = df.withColumn("title", regexp_replace(col("title"), '^"', ""))
df = df.withColumn("title", regexp_replace(col("title"), '"$', ""))
df = df.withColumn("title", rtrim(col("title"))).filter(
    col("title").rlike(YEAR_PATTERN))
df = df.withColumn("year", substring(col("title"), -5, 4).cast(IntegerType()))
df = df.withColumn("title", expr("substring(title, 1, length(title)-6)"))
df = df.filter(rtrim(col("title")).rlike(".+"))
df = df.filter(col("title").isNotNull())
df = df.filter(col("genres") != "(no genres listed)")
df = df.withColumn("genres", split(df["genres"], r"\|"))
df = df.select(
    col("movieId").alias("movieid"), col("title"), col("year"), col("genres"))

(df.write.format("org.apache.spark.sql.cassandra").options(
    table="movies_with_genre_index", keyspace=keyspace).mode("append").save())
Code Example #11
File: batch_process.py Project: jg4821/travel_safe
    def calculate_score(self):
        # transform and filter data
        mentions_df = self.spark.read.parquet(self.mentions_path).select(
            'GLOBALEVENTID', 'MentionTimeDate', 'MentionIdentifier',
            'Confidence')
        gkg_df = self.spark.read.parquet(self.gkg_path).select(
            'DocumentIdentifier', 'Date', 'V2Tone')

        # filter rows on mention date in 2019
        mentions_df = mentions_df.filter(
            mentions_df.MentionTimeDate.like('2019%'))
        gkg_df = gkg_df.filter(gkg_df.Date.like('2019%'))
        gkg_df = gkg_df.drop('Date')

        # type casting for mentions and gkg df
        mentions_df = mentions_df.withColumn(
            'GLOBALEVENTID', mentions_df.GLOBALEVENTID.cast('INT'))
        mentions_df = mentions_df.withColumn(
            'Confidence', mentions_df.Confidence.cast('INT'))
        mentions_df = mentions_df.withColumn(
            'mDate',
            F.to_date(mentions_df.MentionTimeDate,
                      format='yyyyMMddHHmmss')).drop('MentionTimeDate')
        mentions_df.printSchema()
        print(mentions_df.first())

        gkg_df = gkg_df.withColumn(
            'Tone',
            F.split(gkg_df.V2Tone, ',')[0].cast('FLOAT')).drop('V2Tone')
        gkg_df.printSchema()
        print(gkg_df.first())

        # register the DataFrame as a SQL temporary view
        mentions_df.createOrReplaceTempView('mentions_table')
        gkg_df.createOrReplaceTempView('gkg_table')

        # run sql query joining the mentions and gkg tables to compute sentiment and mention counts per event
        temp_df = self.spark.sql(
            'SELECT GLOBALEVENTID, mDate, avg(Confidence*0.01*Tone) as sentiment, count(*) as numOfMentions \
                                FROM mentions_table inner join gkg_table on mentions_table.MentionIdentifier = gkg_table.DocumentIdentifier \
                                GROUP BY GLOBALEVENTID, mDate')

        temp_df.explain()
        temp_df.printSchema()
        print(temp_df.first())

        temp_df.createOrReplaceTempView('temp_table')

        # clear cache of mentions and gkg df & table, read in event data
        self.spark.catalog.dropTempView('mentions_table')
        self.spark.catalog.dropTempView('gkg_table')
        mentions_df.unpersist()
        gkg_df.unpersist()

        # load event data and perform join and aggregation
        event_df = self.spark.read.parquet(self.event_path).select(
            'GLOBALEVENTID', 'GoldsteinScale', 'ActionGeo_FullName')
        event_df = event_df.withColumn('GLOBALEVENTID',
                                       event_df.GLOBALEVENTID.cast('INT'))
        event_df = event_df.withColumn('GoldsteinScale',
                                       event_df.GoldsteinScale.cast('FLOAT'))
        event_df = event_df.withColumn(
            'country',
            F.rtrim(F.ltrim(F.split(event_df.ActionGeo_FullName, ',')[2])))
        event_df = event_df.withColumn(
            'state',
            F.rtrim(F.ltrim(F.split(event_df.ActionGeo_FullName, ',')[1])))
        event_df = event_df.withColumn(
            'city',
            F.rtrim(F.ltrim(F.split(event_df.ActionGeo_FullName,
                                    ',')[0]))).drop('ActionGeo_FullName')
        event_df.printSchema()
        print(event_df.first())

        event_df.createOrReplaceTempView('event_table')

        # compute final safety score
        result_df = self.spark.sql(
            'SELECT event_table.GLOBALEVENTID, mDate, 0.5*(GoldsteinScale*10+temp_table.sentiment) as SafetyScore, numOfMentions, \
                                    country, state, city \
                            FROM event_table inner join temp_table on event_table.GLOBALEVENTID = temp_table.GLOBALEVENTID'
        )

        result_df.explain()
        result_df.printSchema()
        print(result_df.first())

        # free up memory and disk
        self.spark.catalog.dropTempView('temp_table')
        self.spark.catalog.dropTempView('event_table')
        temp_df.unpersist()
        event_df.unpersist()

        return result_df
Code Example #12
            if(i != " "):
                reformat_list = [rid, i]
                final_result.append(reformat_list)
    return final_result

#Preprocess the rdd from stage 3 for both positive and negative
music_p_small_preprocess = music_p_small_rdd.mapPartitions(review_encode_preprocess).cache()
# music_p_small_preprocess.take(2)
music_n_small_preprocess = music_n_small_rdd.mapPartitions(review_encode_preprocess).cache()
# music_n_small_preprocess.take(2)

#Reformat the preprocessed RDDs into DataFrames with named columns
music_p_preprocess_reformat = spark.createDataFrame(music_p_small_preprocess)
music_p_preprocess_reformat = music_p_preprocess_reformat.withColumnRenamed('_1', 'review_id').withColumnRenamed('_2', 'review_body')
music_p_preprocess_reformat = music_p_preprocess_reformat.withColumn('review_body', f.ltrim(music_p_preprocess_reformat.review_body))
music_p_preprocess_reformat = music_p_preprocess_reformat.withColumn('review_body', f.rtrim(music_p_preprocess_reformat.review_body))
# music_p_preprocess_reformat.show(5)
music_n_preprocess_reformat = spark.createDataFrame(music_n_small_preprocess)
music_n_preprocess_reformat = music_n_preprocess_reformat.withColumnRenamed('_1', 'review_id').withColumnRenamed('_2', 'review_body')
music_n_preprocess_reformat = music_n_preprocess_reformat.withColumn('review_body', f.ltrim(music_n_preprocess_reformat.review_body))
music_n_preprocess_reformat = music_n_preprocess_reformat.withColumn('review_body', f.rtrim(music_n_preprocess_reformat.review_body))
# music_n_preprocess_reformat.show(5)

#Tokenize the review body with a regex and filter out rows with one token or fewer
regexTokenizer = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'review_body', outputCol = 'review_token')
music_p_preprocess_reformat_token = regexTokenizer.transform(music_p_preprocess_reformat)
music_p_preprocess_reformat_token_filter = music_p_preprocess_reformat_token.filter(f.size('review_token') > 1)
# music_p_preprocess_reformat_token_filter.show(5)
music_n_preprocess_reformat_token = regexTokenizer.transform(music_n_preprocess_reformat)
music_n_preprocess_reformat_token_filter = music_n_preprocess_reformat_token.filter(f.size('review_token') > 1)
# music_n_preprocess_reformat_token_filter.show(5)
Code Example #13
    def loadParquet(self):
        print(
            "-------------------Fetching the dates------------------------------------"
        )
        print(self.AttSalesActualsInput)
        s3keys = self.AttSalesActualsInput.split(':')[1].split('//')[1].split(
            '/')
        key = ""
        filefolderPrefix = s3keys[1] + "/" + s3keys[2]
        s3 = boto3.resource('s3')
        bucket = s3.Bucket(s3keys[0])
        listofObj = list(bucket.objects.filter(Prefix=filefolderPrefix))
        schema = StructType([
            StructField('Location', StringType(), False),
            StructField('Loc Id', StringType(), False),
            StructField('Dlr1 Code', StringType(), False),
            StructField('KPI', StringType(), False),
            StructField('KPI ID', StringType(), False),
            StructField('Dec 2017 Actual', StringType(), True),
            StructField('Dec 2017 Projected', StringType(), True),
            StructField('Dec 2017 Target', StringType(), True)
        ])
        dfATTSalesActuals = self.sparkSession.createDataFrame(
            self.sparkSession.sparkContext.emptyRDD(), schema)
        bucketWithPath = urlparse(self.AttSalesActualsInput)
        bucket = bucketWithPath.netloc
        spark = SparkSession.builder.appName("ATTSalesActuals").getOrCreate()
        for obj in listofObj:
            key = obj.key
            file = "s3://" + bucket + "/" + obj.key
            df1 = self.sparkSession.read.format("com.databricks.spark.csv").\
                option("header", "true").\
                option("treatEmptyValuesAsNulls", "true").\
                option("inferSchema", "false").\
                load(file, schema=schema)
            df1 = df1.filter(~(df1.Location.like('Histori%%')))
            df1 = df1.filter(~(df1.Location.like('AT&T MyRe%%')))
            df1 = df1.filter(~(df1.Location.like('AR : SPRI%%')))
            df1 = df1.filter(~(df1.Location.like('Generated%%')))
            df1 = df1.filter(~(df1.Location.like('Locat%%')))
            dfATTSalesActuals = df1.unionAll(dfATTSalesActuals)
            dfATTSalesActuals = dfATTSalesActuals.withColumnRenamed("Location", "attlocationname").\
                withColumnRenamed("Loc Id", "locid").\
                withColumnRenamed("Dlr1 Code", "dealercode").\
                withColumnRenamed("KPI", "kpiname").\
                withColumnRenamed("KPI ID", "kpiid").\
                withColumnRenamed("Dec 2017 Actual", "actual_value").\
                withColumnRenamed("Dec 2017 Projected", "projected_value").\
                withColumnRenamed("Dec 2017 Target", "target_value")

        filedate = key.split("/")[2].split("_")[3].split(".")[1]
        filedate = filedate[0:8]
        filedateS = datetime.strptime(filedate, '%Y%m%d').strftime('%m/%d/%Y')

        #########################################################################################################
        #                                 Reading the source data files                                         #
        #########################################################################################################

        FinalHistDF1 = dfATTSalesActuals
        FinalHistDF1 = FinalHistDF1.withColumn(
            "actualvalue",
            when(
                FinalHistDF1.actual_value.contains('%'),
                rtrim(regexp_replace("actual_value", '\\%|\\,', '')).cast(
                    DecimalType()) / 100).otherwise(
                        rtrim(regexp_replace("actual_value", '\\$|\\%|\\,',
                                             ''))))

        FinalHistDF1 = FinalHistDF1.withColumn(
            "projectedvalue",
            when(
                FinalHistDF1.actual_value.contains('%'),
                rtrim(regexp_replace("projected_value", '\\%|\\,', '')).cast(
                    DecimalType()) / 100).otherwise(
                        rtrim(
                            regexp_replace("projected_value", '\\$|\\%|\\,',
                                           ''))))

        FinalHistDF1 = FinalHistDF1.withColumn(
            "targetvalue",
            when(
                FinalHistDF1.actual_value.contains('%'),
                rtrim(regexp_replace("target_value", '\\%|\\,', '')).cast(
                    DecimalType()) / 100).otherwise(
                        rtrim(regexp_replace("target_value", '\\$|\\%|\\,',
                                             ''))))

        dfATTSalesActualsRPT = spark.read.format("com.databricks.spark.csv").\
            option("header", "true").\
            option("treatEmptyValuesAsNulls", "true").\
            option("inferSchema", "false").\
            load(self.ATTMyResultsInp2, schema=schema)

        dfATTSalesActualsRPT1 = dfATTSalesActualsRPT.filter(
            ~(dfATTSalesActualsRPT.Location.like('Histori%%')))
        dfATTSalesActualsRPT2 = dfATTSalesActualsRPT1.filter(
            ~(dfATTSalesActualsRPT.Location.like('AT&T MyRe%%')))
        dfATTSalesActualsRPT3 = dfATTSalesActualsRPT2.filter(
            ~(dfATTSalesActualsRPT.Location.like('AR : SPRI%%')))
        dfATTSalesActualsRPT4 = dfATTSalesActualsRPT3.filter(
            ~(dfATTSalesActualsRPT.Location.like('Generated%%')))
        dfAttSalesActualsInputRPT = dfATTSalesActualsRPT4.filter(
            ~(dfATTSalesActualsRPT.Location.like('Locat%%')))

        FinalRPTDF1 = dfAttSalesActualsInputRPT.withColumnRenamed("Location", "attlocationname2").\
            withColumnRenamed("Loc Id", "locid2").\
            withColumnRenamed("Dlr1 Code", "dealercode2").\
            withColumnRenamed("KPI", "kpiname2").\
            withColumnRenamed("KPI ID", "kpiid2").\
            withColumnRenamed("Dec 2017 Actual", "actual_value2").\
            withColumnRenamed("Dec 2017 Projected", "projected_value2").\
            withColumnRenamed("Dec 2017 Target", "target_value2")

        FinalRPTDF1 = FinalRPTDF1.withColumn(
            "actualvalue2",
            when(
                FinalRPTDF1.actual_value2.contains('%'),
                rtrim(regexp_replace("actual_value2", '\\%|\\,', '')).cast(
                    DecimalType()) / 100).otherwise(
                        rtrim(
                            regexp_replace("actual_value2", '\\$|\\%|\\,',
                                           ''))))

        FinalRPTDF1 = FinalRPTDF1.withColumn(
            "projectedvalue2",
            when(
                FinalRPTDF1.actual_value2.contains('%'),
                rtrim(regexp_replace("projected_value2", '\\%|\\,', '')).cast(
                    DecimalType()) / 100).otherwise(
                        rtrim(
                            regexp_replace("projected_value2", '\\$|\\%|\\,',
                                           ''))))

        FinalRPTDF1 = FinalRPTDF1.withColumn(
            "targetvalue2",
            when(
                FinalRPTDF1.actual_value2.contains('%'),
                rtrim(regexp_replace("target_value2", '\\%|\\,', '')).cast(
                    DecimalType()) / 100).otherwise(
                        rtrim(
                            regexp_replace("target_value2", '\\$|\\%|\\,',
                                           ''))))

        FinalHistDF1 = FinalHistDF1.withColumn('reportdate', lit(filedateS))
        FinalHistDF1.registerTempTable("HIST")
        FinalRPTDF1 = FinalRPTDF1.withColumn('reportdate2', lit(filedateS))
        FinalRPTDF1.registerTempTable("RPT")

        FinalHistDF = self.sparkSession.sql(
            "select attlocationname, locid, dealercode, kpiname, kpiid, actualvalue, projectedvalue, targetvalue, reportdate, YEAR(FROM_UNIXTIME(UNIX_TIMESTAMP())) as year, SUBSTR(FROM_UNIXTIME(UNIX_TIMESTAMP()),6,2) as month from HIST"
        )

        FinalRPTDF = self.sparkSession.sql(
            "select attlocationname2,locid2,dealercode2,kpiname2,kpiid2,actualvalue2 ,projectedvalue2,targetvalue2,reportdate2,YEAR(FROM_UNIXTIME(UNIX_TIMESTAMP())) as year,SUBSTR(FROM_UNIXTIME(UNIX_TIMESTAMP()),6,2) as month from RPT"
        )
        '''
        FinalHistDF = spark.sql("select attlocationname,locid,dealercode,kpiname,kpiid,actualvalue ,projectedvalue,"
                                "targetvalue,reportdate,"
                                "SUBSTR(reportdate,7,4) as year,SUBSTR(reportdate,1,2) as month  from HIST")
        FinalRPTDF = spark.sql("select attlocationname2,locid2,dealercode2,kpiname2,kpiid2,"
                               "actualvalue2 ,projectedvalue2,targetvalue2,reportdate2,"
                               "SUBSTR(reportdate2,7,4) as year,SUBSTR(reportdate2,1,2)  as month from RPT")
        '''
        FinalHistDF.show(100, False)
        FinalRPTDF.show(100, False)
        FinalHistDF.coalesce(1).select("*").write.mode("overwrite").parquet(
            self.AttSalesActualsOutput + '/' + 'Working1')

        FinalRPTDF.coalesce(1).select("*").write.mode("overwrite").parquet(
            self.AttSalesActualsOutput + '/' + 'Working2')

        FinalHistDF.coalesce(1).select("*").write.mode("append").partitionBy(
            'year', 'month').format('parquet').save(self.AttSalesActualsOutput)
        FinalRPTDF.coalesce(1).select("*").write.mode("append").partitionBy(
            'year', 'month').format('parquet').save(self.AttSalesActualsOutput)

        spark.stop()
Code Example #14
banner_phone = hive_context.sql(banner_phone_query)


#Clean up Advance data
#The data coming in from Advance has double quotes around each field. We need to clean them up. We use Pandas for this part because it has a handy applymap method that works elementwise.
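# clean_quotes() is not defined in this excerpt; a hedged sketch of what such a
# helper could look like (an assumption for illustration, not the project's actual code):
def clean_quotes(value):
    # strip a surrounding pair of double quotes from string cells, pass other values through
    if isinstance(value, str):
        return value.strip('"')
    return value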

advance_phone_pd = advance_phone.toPandas()
advance_phone_pd_clean = advance_phone_pd.applymap(lambda x: clean_quotes(x))
adv_phone_clean = hive_context.createDataFrame(advance_phone_pd_clean)
banner_full_phone = (banner_phone

    .withColumn('Full_Phone',
                sf.rtrim(
                    sf.concat(
                        safe_concat(banner_phone, 'Telephone_Area_Code', ''),
                        safe_concat(banner_phone, 'Telephone_Number', ''), )))

    .withColumn('Data_Source', sf.lit('Banner'))

    .select(
    sf.col('NetID'),
    sf.col('PIDM').alias('Source_ID'),
    sf.col('Telephone_Description').alias('Phone_Type'),
    sf.col('Full_Phone'),
    sf.col('Data_Source')))
hub_full_phone = (hub_phone
    .withColumn('Full_Phone',
                sf.rtrim(
                    sf.concat(
                        safe_concat(hub_phone, 'Country_Code', ''),
Code Example #15
snewdf = sdf.withColumn(
    "comment_text",
    F.regexp_replace(F.col("comment_text"), "[\$#,?."
                     "!@#$%^&*()0123456789:-=\+]", ""))
snewdf = snewdf.withColumn('comment_text',
                           F.regexp_replace(F.col("comment_text"), "\"", ""))
snewdf = snewdf.withColumn('comment_text',
                           F.regexp_replace(F.col("comment_text"), "\n", " "))
snewdf = snewdf.withColumn('comment_text',
                           F.regexp_replace(F.col("comment_text"), "\[", ""))
snewdf = snewdf.withColumn('comment_text',
                           F.regexp_replace(F.col("comment_text"), "\]", ""))
snewdf = snewdf.withColumn('comment_text',
                           F.regexp_replace(F.col("comment_text"), "\"+", ""))
snewdf = snewdf.withColumn('comment_text', F.lower(F.col('comment_text')))
snewdf = snewdf.withColumn('comment_text', F.rtrim(snewdf.comment_text))
snewdf = snewdf.withColumn('comment_text', F.ltrim(snewdf.comment_text))

from pyspark.ml.feature import Tokenizer, StopWordsRemover

tokenizer = Tokenizer(inputCol="comment_text", outputCol="tokenized")
tokenized_df = tokenizer.transform(snewdf)
tokenized_df.select("tokenized").show()

stopwordsremoved = StopWordsRemover(inputCol="tokenized",
                                    outputCol="comment_txt")
swr_df = stopwordsremoved.transform(tokenized_df)
swr_df.select("comment_txt").show()

from pyspark.ml.feature import HashingTF, IDF
Code Example #16
def compile_rstrip(t, expr, scope, **kwargs):
    op = expr.op()

    src_column = t.translate(op.arg, scope)
    return F.rtrim(src_column)
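This reads like an expression-compiler backend hook (the `t.translate`/`op.arg` pattern resembles ibis) that lowers an rstrip operation onto `F.rtrim`. For reference, a tiny standalone sketch of the equivalent direct PySpark call (names here are my own):

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[1]").appName("rtrim_demo").getOrCreate()
spark.createDataFrame([("value   ",)], ["raw"]) \
    .select(F.rtrim("raw").alias("rstripped")) \
    .show(truncate=False)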
Code Example #17
    def loadParquet(self):

        self.log.info('Exception Handling starts')

        validSourceFormat = self.isValidFormatInSource()

        if not validSourceFormat:
            self.log.error(" Source files not in csv format.")

        validSourceSchema = self.isValidSchemaInSource()
        if not validSourceSchema:
            self.log.error(
                " Source schema does not have all the required columns.")

        if not validSourceFormat or not validSourceSchema:
            self.log.error(
                "Copy the source files to data processing error path and return."
            )
            self.copyFile(
                self.EmpOprEffMasterListFile,
                self.dataProcessingErrorPath + self.EmpOprEffName +
                datetime.now().strftime('%Y%m%d%H%M') + self.fileFormat)

            return

        self.log.info('Source format and schema validation successful.')
        self.log.info('Reading the input parquet file')

        schema = StructType([
            StructField('Market', StringType(), True),
            StructField('Region', StringType(), True),
            StructField('District', StringType(), True),
            StructField('Location', StringType(), True),
            StructField('Sales Person', StringType(), True),
            StructField('WorkDay ID', StringType(), False),
            StructField('Total Loss', StringType(), False),
            StructField('Total Issues', StringType(), False),
            StructField('Action Taken', StringType(), False),
            StructField('HR Consulted Before Termination', StringType(),
                        False),
            StructField('Transaction Errors', StringType(), False),
            StructField('Total Errors', StringType(), False),
            StructField('NEXT Trades', StringType(), False),
            StructField('Total Devices12', StringType(), False),
            StructField('Hyla Loss', StringType(), False),
            StructField('Total Devices14', StringType(), False),
            StructField('Denied RMA Devices', StringType(), False),
            StructField('Total Devices16', StringType(), False),
            StructField('Cash Deposits', StringType(), False),
            StructField('Total Missing Deposits', StringType(), False),
            StructField('Total Short Deposits', StringType(), False),
            StructField('Shrinkage', StringType(), False),
            StructField('Comments', StringType(), False),
            StructField('Date', StringType(), False)
        ])
        df1 = self.sparkSession.read.format("com.databricks.spark.csv").option(
            "header", "true").option("multiLine", "true").option(
                "delimiter", ",").option("quotechar", '"').option(
                    "treatEmptyValuesAsNulls",
                    "true").option("escape", ",").option("escape", '"').option(
                        "encoding",
                        "UTF-8").option("inferSchema", "false").load(
                            self.OperationalEfficiencyScoreCard, schema=schema)
        df1 = df1.filter(~(df1.Market.like('%%Marke%%')))
        df1 = df1.filter(~(df1.Market.like('%%Operational%%')))
        df1 = df1.filter(~(df1.Market.like(' Regio%%')))
        df1 = df1.where(df1.Market != '')
        df1 = df1.where(df1.Region != '')
        df1 = df1.where(df1.District != '')
        df1 = df1.withColumnRenamed("Market", "market").withColumnRenamed(
            "Region", "region"
        ).withColumnRenamed("District", "district").withColumnRenamed(
            "Location", "location"
        ).withColumnRenamed("Sales Person", "salesperson").withColumnRenamed(
            "WorkDay ID", "workdayid"
        ).withColumnRenamed("Total Loss", "total_loss").withColumnRenamed(
            "Total Issues", "total_issues"
        ).withColumnRenamed("Action Taken", "actiontaken").withColumnRenamed(
            "HR Consulted Before Termination", "hrconsultedbefore_termination"
        ).withColumnRenamed(
            "Transaction Errors", "transaction_errors"
        ).withColumnRenamed("Total Errors", "total_errors").withColumnRenamed(
            "NEXT Trades", "next_trades").withColumnRenamed(
                "Total Devices12", "total_devices12"
            ).withColumnRenamed("Hyla Loss", "hyla_loss").withColumnRenamed(
                "Total Devices14", "total_devices14").withColumnRenamed(
                    "Denied RMA Devices",
                    "denied_rmadevices").withColumnRenamed(
                        "Total Devices16", "total_device16").withColumnRenamed(
                            "Cash Deposits",
                            "cash_deposits").withColumnRenamed(
                                "Total Missing Deposits",
                                "totalmissing_deposits").withColumnRenamed(
                                    "Total Short Deposits",
                                    "totalshort_deposits").withColumnRenamed(
                                        "Shrinkage",
                                        "Shrinkage").withColumnRenamed(
                                            "Comments",
                                            "comments").withColumnRenamed(
                                                "Date", "reportdate")

        df1 = df1.withColumn(
            "totalloss",
            rtrim(regexp_replace("total_loss", '\\(|\\)|\\$|\\,',
                                 '')).cast(DecimalType(15, 2)))
        df1 = df1.withColumn("totalissues",
                             df1["total_issues"].cast(IntegerType()))
        df1 = df1.withColumn(
            "hrconsultedbeforetermination",
            df1["hrconsultedbefore_termination"].cast(BooleanType()))
        df1 = df1.withColumn(
            "transactionerrors",
            rtrim(regexp_replace("transaction_errors", '\\(|\\)|\\$|\\,',
                                 '')).cast(DecimalType(15, 2)))
        df1 = df1.withColumn("totalerrors",
                             df1["total_errors"].cast(IntegerType()))
        df1 = df1.withColumn(
            "nexttrades",
            rtrim(regexp_replace("next_trades", '\\(|\\)|\\$',
                                 '')).cast(DecimalType(15, 2)))
        df1 = df1.withColumn("totaldevices12",
                             df1["total_devices12"].cast(IntegerType()))
        df1 = df1.withColumn(
            "hylaloss",
            rtrim(regexp_replace("hyla_loss", '\\(|\\)|\\$',
                                 '')).cast(DecimalType(15, 2)))
        df1 = df1.withColumn("totaldevices14",
                             df1["total_devices14"].cast(IntegerType()))
        df1 = df1.withColumn(
            "deniedrmadevices",
            rtrim(regexp_replace("denied_rmadevices", '\\(|\\)|\\$',
                                 '')).cast(DecimalType(15, 2)))
        df1 = df1.withColumn("totaldevice16",
                             df1["total_device16"].cast(IntegerType()))
        df1 = df1.withColumn(
            "cashdeposits",
            rtrim(regexp_replace("cash_deposits", '\\(|\\)|\\$',
                                 '')).cast(DecimalType(15, 2)))
        df1 = df1.withColumn("totalmissingdeposits",
                             df1["totalmissing_deposits"].cast(IntegerType()))
        df1 = df1.withColumn("totalshortdeposits",
                             df1["totalshort_deposits"].cast(IntegerType()))
        df1 = df1.withColumn(
            "shrinkage",
            rtrim(regexp_replace("Shrinkage", '\\(|\\)|\\$|\\,',
                                 '')).cast(DecimalType(15, 2)))

        df1.registerTempTable("empopreff")
        df2 = self.sparkSession.sql(
            "select e.market , e.region,e.district, e.location, e.salesperson, e.workdayid, e.totalloss, e.totalissues, e.actiontaken,e.hrconsultedbeforetermination , e.transactionerrors, e.totalerrors, e.nexttrades, e.totaldevices12, e.hylaloss, e.totaldevices14, e.deniedrmadevices, e.totaldevice16, e.cashdeposits, e.totalmissingdeposits, e.totalshortdeposits, e.shrinkage, e.comments, YEAR(FROM_UNIXTIME(UNIX_TIMESTAMP())) as year, SUBSTR(FROM_UNIXTIME(UNIX_TIMESTAMP()),6,2) as month, e.reportdate from empopreff e "
        )

        df2.coalesce(1).select("*").write.mode("append").partitionBy(
            'year',
            'month').parquet(self.OperationalEfficiencyScoreCardOutputArg)
        df2.coalesce(1).select("*").write.mode("overwrite").parquet(
            self.OperationalEfficiencyScoreCardOutputArg + '/' + 'Working')
        self.sparkSession.stop()

Code Example #18
# COMMAND ----------

from pyspark.sql.functions import lower, upper
df.select(col("Description"),
    lower(col("Description")),
    upper(lower(col("Description")))).show(2)


# COMMAND ----------

from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit("    HELLO    ")).alias("ltrim"),
    rtrim(lit("    HELLO    ")).alias("rtrim"),
    trim(lit("    HELLO    ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)


# COMMAND ----------

from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
  regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
  col("Description")).show(2)


# COMMAND ----------
Code Example #19
File: glue_job.py Project: tzaton/rate-my-post
            "favorite_count",
            "community_owned_date",
            "closed_date",
            "content_license")\
    .withColumn("creation_date",
                f.to_timestamp(f.col("creation_date")))\
    .withColumn("last_edit_date",
                f.to_timestamp(f.col("last_edit_date")))\
    .withColumn("last_activity_date",
                f.to_timestamp(f.col("last_activity_date")))\
    .withColumn("community_owned_date",
                f.to_timestamp(f.col("community_owned_date")))\
    .withColumn("closed_date",
                f.to_timestamp(f.col("closed_date")))\
    .withColumn("tags",
                f.split(f.rtrim(f.regexp_replace(f.regexp_replace(f.col("tags"), "<", ""), ">", " ")), " "))\
    .withColumn("creation_year", f.year("creation_date"))

save_table(posts, "posts", ["creation_year"])

# tags
tags = spark.read.format('xml')\
    .options(rowTag='row')\
    .load(f's3://{bucket_name}/{input_dir}/tags')\
    .withColumnRenamed("_Count", "count")\
    .withColumnRenamed("_ExcerptPostId", "excerpt_post_id")\
    .withColumnRenamed("_Id", "id")\
    .withColumnRenamed("_TagName", "tag_name")\
    .withColumnRenamed("_WikiPostId", "wiki_post_id")\
    .select("id",
            "tag_name",
Code Example #20
	StructField("country", StringType(), True),
	StructField("salary", StringType(), True)]);
	df = spark.read.csv(filepath, schema=strct);


	dfregex1 = df.select(df.workclass, df.finalweight, df.education, df.educationnum, df.maritalstatus, df.occupation, df.relationship
		, df.race , df.gender, df.capitalgain, df.capitalloss, df.hoursperweek, df.country, regexp_replace(df.salary, '(<=50K)', '50').alias('salary'));


	dfregex = dfregex1.select(dfregex1.workclass, dfregex1.finalweight, dfregex1.education, dfregex1.educationnum, dfregex1.maritalstatus, dfregex1.occupation
		, dfregex1.relationship, dfregex1.race , dfregex1.gender, dfregex1.capitalgain, dfregex1.capitalloss, dfregex1.hoursperweek, dfregex1.country
		, regexp_replace(dfregex1.salary, '(>50K)', '51').alias('salary'));

	dfbtrim = dfregex1.select(dfregex1.workclass, dfregex1.finalweight, dfregex1.education, dfregex1.educationnum, dfregex1.maritalstatus
		, dfregex1.occupation, dfregex1.relationship, dfregex1.race , dfregex1.gender, dfregex1.capitalgain, dfregex1.capitalloss
		, dfregex1.hoursperweek, dfregex1.country, ltrim(rtrim(dfregex1.salary)).alias('salary') );


	dfcast = dfbtrim.select(dfbtrim.workclass, dfbtrim.finalweight, dfbtrim.education, dfbtrim.educationnum, dfbtrim.maritalstatus, dfbtrim.occupation
		, dfbtrim.relationship, dfbtrim.race , dfbtrim.gender, dfbtrim.capitalgain, dfbtrim.capitalloss, dfbtrim.hoursperweek, dfbtrim.country
		, dfbtrim.salary.cast(IntegerType()).alias('intSal'));

	dfcast.createOrReplaceTempView("employees");

	query = "select workclass, education, maritalstatus, occupation, relationship, hoursperweek, country, avg(intSal) as sal_avg from employees " +
		"group by workclass, education, maritalstatus, occupation, relationship, hoursperweek, country";

	sqldf = spark.sql(query);
	sqldf.dropna().show(10);

	spark.stop();
Code Example #21
 def get_latest_yyyymm_table_name(self, table):
     # TODO fix import here
     import pyspark.sql.functions as F
     df = self.get_all_yyyymm_tables(table)
     return df.agg(F.max(F.rtrim(F.col('TableName'))).alias('max_partition')).collect()[0].max_partition.lower()
Code Example #22
File: MySQL_UC2.py Project: abhilash97sharma/O_Work
        delta = (start-end).total_seconds()
        return delta
# register as a UDF 
f1 = f.udf(time_delta)

def time_sub(y,x):
    z=y-x
    return z

f2 = f.udf(time_sub)

sc = SparkContext('local','MySQL_UC1')
sqlContext = SQLContext(sc)
device=sqlContext.read.format("jdbc").option("url","jdbc:mysql://localhost:3306/husain?useSSL=false").option("driver", "com.mysql.jdbc.Driver").option("dbtable","device").option("user","root").option("password","root").load()
emp=sqlContext.read.format("jdbc").option("url","jdbc:mysql://localhost:3306/husain?useSSL=false").option("driver", "com.mysql.jdbc.Driver").option("dbtable","employee").option("user","root").option("password","root").load()
emp = emp.withColumn('Timings',f.rtrim(emp.timings))
emp=emp.join(device,device.device_id == emp.device_id,"inner").select(emp.id,emp.name,emp.device_id,device.status,emp.Timings)
split_col = f.split(emp['Timings'],' ')
emp = emp.withColumn('Date',split_col.getItem(0).cast('date'))
#emp = emp.orderBy('Date','name')
#emp.show()
emp1 = emp.filter(emp['status']=='in')
emp1=emp1.orderBy('Date','name','Timings')
emp2 = emp.filter(emp.status=='out')
emp2 = emp2.orderBy('Date','name','Timings')
#
emp1 = emp1.withColumnRenamed('Timings','IN_TIME')
emp2 = emp2.withColumnRenamed('Timings','OUT_TIME')
#
#emp1=emp1.join(emp2,emp1.id == emp2.id,"full_outer").select(emp1.id,emp1.Date,emp1.name,emp1.device_id,emp1.status,emp1.IN_TIME,emp2.OUT_TIME)
#emp1 = emp1.
Code Example #23
File: 6-chapter.py Project: tarasowski/apache-spark
        lower(col('Description')),
        upper(col('Description'))).show(2)

df.selectExpr(
        'Description',
        'lower(Description)',
        'upper(lower(Description))').show(2)

# select description, lower(Description), upper(lower(Description)) from dfTable


from pyspark.sql.functions import ltrim, rtrim, rpad, lpad, trim

df.select(
        ltrim(lit('         HELLO           ')).alias('ltrim'),
        rtrim(lit('         HELLO           ')).alias('rtrim'),
        trim(lit('         HELLO           ')).alias('trim'),
        lpad(lit('HELLO'), 3, ' ').alias('lp'),
        rpad(lit('HELLO'), 10, ' ').alias('rp')).show(2)

df.selectExpr(
        'ltrim(         "HELLO"           ) as ltrim',
        'rtrim(         "HELLO"           ) as rtrim',
        'trim(         "HELLO"           )as trim',
        'lpad("HELLO", 3, " ") as lp',
        'rpad("HELLO", 3, " ")as rp').show(2)

# select 
#   ltrim('     HELLO       '),
#   rtrim('     HELLO       '),
#   trim('      HELLO       '),
Code Example #24
df = spark.read.format("csv")\
  .option("header","true")\
  .option("inferSchema","true")\
  .load('/databricks-datasets/definitive-guide/data/retail-data/by-day/2010-12-01.csv')
#check the data schema
df.printSchema()

#initcap : capitalize the first letter of each whitespace-separated word in the string
df.select(initcap(col("Description"))).show(2, False)
#lower // upper
df.select(lower(col("StockCode"))).show(2)
#add and remove whitespace (lit, ltrim, rtrim, rpad, lpad, trim)
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit("   HELLO   ")).alias("ltrim"),
    rtrim(lit("   HELLO   ")).alias("rtrim"),
    trim(lit("   HELLO   ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lpad"),
    rpad(lit("HELLP"), 10, " ").alias("rpad")).show(2)

##regular expressions
#replace color names in the Description column with COLOR
from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
    regexp_replace(col("Description"), regex_string,
                   "COLOR").alias("color_clean"), col("Description")).show(2)

#translate the given characters into other characters
from pyspark.sql.functions import translate
df.select(translate(col("Description"), "WHI", "123")).show(2)
Code Example #25
    def lsg_omni(self):
        start_date, end_date = date_period(self.period, self.start_date)

        table_name = 'datalake_omni.omni_hit_data'
        dt_col_name = 'hit_time_gmt_dt_key'
        _, bound_end_date = date_period(-1, end_date)
        bound_date_check(table_name, dt_col_name, start_date, bound_end_date,
                         self.env, 'YYYYMMDD', 'LSG')

        query = 'SELECT ' \
                'VS.visit_session_key AS session_key, ' \
                'HIT.post_visid_combined AS visit_id, ' \
                'HIT.visit_return_count AS visit_number, ' \
                'UPPER(TRIM(prod_list)) AS prod_list, ' \
                'HIT.hit_time_gmt_ts AS time_stamp, ' \
                "TRIM(SUBSTRING(TRIM(DEMANDBASE), 0, POSITION('|' IN TRIM(DEMANDBASE)))) AS " \
                "account_no " \
                'FROM datalake_omni.omni_hit_data HIT ' \
                'LEFT JOIN CDWDS.D_OMNI_VISIT_SESSION VS ON ' \
                '  VS.VISIT_RETURN_COUNT=HIT.VISIT_RETURN_COUNT AND VS.POST_VISID_COMBINED=HIT.POST_VISID_COMBINED ' \
                f'WHERE HIT.hit_time_gmt_dt_key<{start_date} AND HIT.hit_time_gmt_dt_key>={end_date} ' \
                'AND HIT.post_visid_combined IS NOT NULL ' \
                "AND prod_list IS NOT NULL AND prod_list NOT LIKE '%shipping-handling%' " \
                "AND TRIM(SUBSTRING(TRIM(DEMANDBASE), 0, POSITION('|' IN TRIM(DEMANDBASE)))) <> '' "

        schema = StructType([
            StructField('session_key', IntegerType(), True),
            StructField('visit_id', StringType(), True),
            StructField('visit_number', IntegerType(), True),
            StructField('time_stamp', StringType(), True),
            StructField('prod_list', StringType(), True),
            StructField('account_no', StringType(), True),
        ])

        df = redshift_cdw_read(query, db_type = 'RS', database = 'CDWDS', env = self.env, schema = schema). \
            withColumn('prod_id_untrimmed', explode(split('prod_list', ','))). \
            withColumn('prod_id', ltrim(rtrim(col('prod_id_untrimmed')))). \
            drop('prod_id_untrimmed'). \
            drop('prod_list'). \
            filter(col('prod_id').isNotNull()). \
            filter(col('prod_id') != ''). \
            distinct()

        if self.debug:
            print(f'row count for df = {df.count()}')

        # find active products
        query = 'SELECT	sku as prod_id, stk_type_cd '\
                'FROM cdwds.lsg_prod_v ' \
                "WHERE	stk_type_cd = 'D'"

        discontinued_prods = redshift_cdw_read(query,
                                               db_type='RS',
                                               database='CDWDS',
                                               env=self.env)

        df = df.join(discontinued_prods, ['prod_id'], how = 'left').\
            filter(col('stk_type_cd').isNull()).\
            drop('stk_type_cd')

        if self.debug:
            print(
                f'After filtering out discontinued SKUs, row count for df = {df.count()}'
            )

        query = 'SELECT UPPER(sku_nbr) AS prod_id, size_grp AS coupon ' \
                'FROM cdwds.f_web_prod_feature ' \
                "WHERE size_grp IS NOT NULL AND size_grp <> 'T' " \
                'GROUP BY sku_nbr, size_grp'

        coupons = redshift_cdw_read(query,
                                    db_type='RS',
                                    database='CDWDS',
                                    env=self.env)

        if coupons.count() == 0:
            raise DataValidityError(
                'No coupon information.  Please check the validity of size_grp column '
                'on cdwds.f_web_prod_feature.')

        df = df.join(broadcast(coupons), ['prod_id'], how = 'left').\
            withColumn('coupon', coalesce('coupon', 'prod_id'))

        prod_list = df.select('prod_id').distinct()
        coupons = coupons.union(df.select('prod_id', 'coupon')).\
            filter(col('prod_id').isNotNull()).\
            distinct().\
            withColumn("coupon_key", func.dense_rank().over(Window.orderBy('coupon')))

        df = df.join(coupons, ['prod_id', 'coupon'], how='left')

        if self.debug:
            coupons.show()
            df.show()
            print(
                f'row count for coupons = {coupons.select(col("coupon_key")).distinct().count()}'
            )

        return df, prod_list, coupons
Code Example #26
posts = spark.read.format("xml").option("rowTag", "row").load(xml_file_path)

# select only questions
questions = posts.filter(posts._PostTypeId == 1)

# drop irrelevant columns and clean up strings
questions = questions.select(
    [c for c in questions.columns if c in ['_Id', '_Title', '_Body', '_Tags']])
questions = questions.withColumn(
    'full_question', sf.concat(sf.col('_Title'), sf.lit(' '), sf.col('_Body')))
questions = questions.select([
    c for c in questions.columns if c in ['_Id', 'full_question', '_Tags']
]).withColumn("full_question", regexp_replace("full_question", "[\\n,]", " "))
questions = questions.withColumn("_Tags", regexp_replace("_Tags", "><", " "))
questions = questions.withColumn("_Tags", regexp_replace("_Tags", "(>|<)", ""))
questions = questions.withColumn('_Tags', rtrim(questions._Tags))
questions = questions.withColumn('_Tags', split(questions._Tags, " "))

# filter out to single tags in following list
tags_of_interest = [
    'azure-devops', 'azure-functions', 'azure-web-app-service',
    'azure-storage', 'azure-virtual-machine'
]


def intersect(xs):
    xs = set(xs)

    @udf("array<string>")
    def _(ys):
        return list(xs.intersection(ys))
Code Example #27
def trimStrings(dataframe, columns):
    for col in columns:
        dataframe = dataframe.withColumn(col, F.ltrim(F.rtrim(dataframe[col])))
    return dataframe
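Finally, a hedged usage sketch for the helper above, assuming `F` is `pyspark.sql.functions` (the sample rows are invented):

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[1]").appName("trim_strings_demo").getOrCreate()
people = spark.createDataFrame([("  Ada  ", "  Lovelace ")], ["first_name", "last_name"])
# both columns come back with leading and trailing spaces removed
trimStrings(people, ["first_name", "last_name"]).show(truncate=False)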