Code Example #1
def process_country_codes():
    input_data_file = os.path.join(s3, I94_CODES_DATA_PATH + COUNTRY_FILE)
    df_country = spark.read.format("csv").option("delimiter", "=").option(
        "header", "False").load(input_data_file)
    df_country = df_country.withColumnRenamed(
        "_c0", "country_code").withColumnRenamed("_c1", "country_name")
    df_country = df_country.withColumn(
        "country_name", F.regexp_replace(df_country.country_name, "'", ""))
    df_country = df_country.withColumn(
        "country_name", F.ltrim(F.rtrim(df_country.country_name)))
    df_country = df_country.withColumn(
        "country_code", F.ltrim(F.rtrim(df_country.country_code)))
    df_country = df_country.withColumn(
        "country_name",
        F.regexp_replace(df_country.country_name,
                         "^INVALID.*|Collapsed.*|No\ Country.*", "INVALID"))
    df_country.write.mode("overwrite").parquet(s3 +
                                               'data/processed/codes/country')
    return df_country
Code Example #2
def remove_space(df, col_name, position):
    # remove left side space
    if position == "l":
        return df.withColumn("tmp", ltrim(f.col(col_name))).drop(col_name).withColumnRenamed("tmp", col_name)
    # remove right side space
    elif position == "r":
        return df.withColumn("tmp", rtrim(f.col(col_name))).drop(col_name).withColumnRenamed("tmp", col_name)
    # remove space on both sides
    elif position == "a":
        return df.withColumn("tmp", trim(f.col(col_name))).drop(col_name).withColumnRenamed("tmp", col_name)
Code Example #3
def joinDataSet():

    spark = SparkSession.builder.appName('csv_parse').getOrCreate()

    #Load xml
    xml_df = spark.read.format('com.databricks.spark.xml') \
        .option("rootTag", "feed") \
        .option("rowTag", "doc") \
        .load(xml_s3_path) \
        .withColumn("title", f.ltrim(f.split(f.col("title"), ":").getItem(1))) \
        .withColumn("shortUrl", f.split(f.col("url"), "/"))

    selectedData = xml_df.select(
        "title", "url",
        f.element_at(f.col('shortUrl'), -1).alias('shortUrl'), "abstract")
    selectedData.repartition(1).write.option(
        "sep", "\t").format('csv').mode("overwrite").save(csv_output_path_2,
                                                          header='false')
    selectedData.createOrReplaceTempView("wiki_pages")

    #Load csv
    json_schema = ArrayType(
        StructType([
            StructField('name', StringType(), nullable=False),
            StructField('id', IntegerType(), nullable=False)
        ]))

    df = spark.read.option("header", True) \
        .option("quote", "\"") \
        .option("escape", "\"") \
        .option("multiLine", True) \
        .csv(csv_s3_path) \
        .withColumn("sanitizedTitle", f.regexp_replace(f.col("title"), "\\s+", "_")) \
        .withColumn("year", f.split(f.col("release_date"), "-").getItem(0)) \
        .withColumn("companiesList", f.from_json(f.col("production_companies"), json_schema)) \
        .withColumn("companiesList", f.concat_ws("-", f.col("companiesList.name")))

    csvSelectedData = df.select("title", "sanitizedTitle")
    csvSelectedData.repartition(1).write.option(
        "sep", "\t").format('csv').mode("overwrite").save(csv_output_path_3,
                                                          header='false')

    df.createOrReplaceTempView("movies_metadata")

    # Join datasets
    q = spark.sql(join_sql_query)

    # Write output to s3
    q.repartition(1).write.option(
        "sep", "\t").format('csv').mode("overwrite").save(csv_output_path,
                                                          header='false')
Code Example #4
def process_state_codes():
    input_data_file = os.path.join(s3, I94_CODES_DATA_PATH + STATE_FILE)
    df_state = spark.read.format("csv").option("delimiter", "=").option(
        "header", "False").load(input_data_file)
    df_state = df_state.withColumnRenamed("_c0",
                                          "state_code").withColumnRenamed(
                                              "_c1", "state_name")
    df_state = df_state.withColumn(
        "state_code", F.regexp_replace(df_state.state_code, "[^A-Z]", ""))
    df_state = df_state.withColumn(
        "state_name", F.regexp_replace(df_state.state_name, "'", ""))
    df_state = df_state.withColumn("state_name",
                                   F.ltrim(F.rtrim(df_state.state_name)))
    df_state.write.mode("overwrite").parquet(s3 +
                                             'data/processed/codes/us_state')
    return df_state
Code Example #5
def canonicaltokens(df, inputColumn, outputColumn):
    """
    Turn the input column of strings into a canonical format, returning the
    dataframe with the output column of tokens added.
    """
    # trim the value, then join single characters separated by spaces (e.g. spelled-out initials)
    newname = df.withColumn(
        "cleanname",
        f.regexp_replace(
            f.regexp_replace(f.rtrim(f.ltrim(f.col(inputColumn))),
                             r" (\w) (\w) ", "$1$2"),
            r"(\w) (\w) (\w)$", "$1$2$3"))

    newtokenizer = mlf.Tokenizer(inputCol="cleanname", outputCol="words")
    chtokenized = newtokenizer.transform(newname).drop("cleanname")

    stopwordremover = mlf.StopWordsRemover(inputCol="words", outputCol=outputColumn)
    canonicalname = stopwordremover.transform(chtokenized).drop("words")

    return canonicalname
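A rough invocation sketch, assuming the f and mlf aliases the snippet relies on (pyspark.sql.functions and pyspark.ml.feature) plus a toy frame:

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import pyspark.ml.feature as mlf

spark = SparkSession.builder.appName("canonicaltokens_demo").getOrCreate()
df = spark.createDataFrame([("  The Acme Holding Company  ",)], ["raw_name"])
canonicaltokens(df, "raw_name", "name_tokens").show(truncate=False)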
Code Example #6
def remove_space(df, col_name, position):
    if position not in ["l", "r", "a"]:
        raise ValueError("The position value must be l, r or a")
    # keep the original column order
    columns = df.columns
    # remove left side space
    if position == "l":
        return df.withColumn("tmp", ltrim(sql_fun.col(col_name))) \
            .drop(col_name).withColumnRenamed("tmp", col_name).select(*columns)
    # remove right side space
    elif position == "r":
        return df.withColumn("tmp", rtrim(sql_fun.col(col_name))) \
            .drop(col_name).withColumnRenamed("tmp", col_name).select(*columns)
    # remove space on both sides
    elif position == "a":
        return df.withColumn("tmp", trim(sql_fun.col(col_name))) \
            .drop(col_name).withColumnRenamed("tmp", col_name).select(*columns)
Code Example #7
def parse_message(col, eol="\n"):
    """
    Generate the expression that parses the email message into From, Subject, Body etc.

    Args:
        col - sqlf.col() column object
        eol - end of line character to use when parsing the email
    Returns:
        List of column expressions to be passed to select()
    """

    out_dict = [
        "Message-ID",
        "Date",
        "From",
        "To",
        "Subject",
        "Mime-Version",
        "Content-Type",
        "Content-Transfer-Encoding",
        "X-From",
        "X-To",
        "X-cc",
        "X-bcc",
        "X-Folder",
        "X-Origin",
        "X-FileName",
        eol,
    ]
    expr = []
    for i in range(0, len(out_dict) - 1):
        expr.append(
            sqlf.ltrim(
                sqlf.rtrim(sqlf.split(sqlf.split(col, out_dict[i] + ":")[1], eol)[0])
            ).alias(out_dict[i])
        )

    expr.append(sqlf.split(sqlf.split(col, "X-FileName:")[1], "nsf")[1].alias("Body"))

    return expr
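A minimal call sketch for the parser above; the sqlf alias matches the snippet, while the toy message and the column name "message" are invented for illustration:

import pyspark.sql.functions as sqlf
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("parse_message_demo").getOrCreate()
raw = spark.createDataFrame(
    [("Message-ID: <1.JavaMail>\nDate: Mon, 1 May 2000\nFrom: a@enron.com\nTo: b@enron.com\n"
      "Subject: test\nX-FileName: a.nsf\n\nHello there",)],
    ["message"])
parsed = raw.select(*parse_message(sqlf.col("message")))
parsed.select("From", "Subject", "Body").show(truncate=False)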
Code Example #8
def process_airport_codes():

    #transform airport codes
    input_data_file = os.path.join(s3, I94_CODES_DATA_PATH + AIRPORT_FILE)
    df_airport = spark.read.format("csv").option("delimiter", "=").option(
        "header", "False").load(input_data_file)
    df_airport = df_airport.withColumn(
        "_c0", F.regexp_replace(df_airport._c0, "'", "")).withColumn(
            "_c1", F.regexp_replace(df_airport._c1, "'", ""))
    split_col = F.split(df_airport._c1, ",")
    df_airport = df_airport.withColumn("city", split_col.getItem(0))
    df_airport = df_airport.withColumn("state_code", split_col.getItem(1))
    df_airport = df_airport.withColumnRenamed("_c0", "port_code")
    df_airport = df_airport.drop("_c1")
    df_airport = df_airport.withColumn(
        "port_code",
        F.regexp_replace(df_airport.port_code, "[^A-Z]", "")).withColumn(
            "city", F.ltrim(F.rtrim(df_airport.city))).withColumn(
                "state_code",
                F.regexp_replace(df_airport.state_code, "[^A-Z]", ""))
    df_state = process_state_codes()
    df_airport = df_airport.join(df_state, "state_code")
    df_airport.write.mode("overwrite").parquet(s3 +
                                               'data/processed/codes/us_ports')
Code Example #9
from pyspark.sql.functions import monotonically_increasing_id

df.select(monotonically_increasing_id()).show(10)

#working with strings

#perform case conversions

from pyspark.sql.functions import col, lit, initcap, lower, upper, ltrim, rtrim, trim, lpad, rpad
df.select(initcap(col("Description"))).show(5)
df.select(col("Description"), initcap(col("Description")),
          lower(col("Description")), upper(col("Description"))).show(5)

string_with_space = "     hello     "

df.select(ltrim(lit(string_with_space)), rtrim(lit(string_with_space)),
          trim(lit(string_with_space))).show()

#regular expressions
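#(added sketch, not part of the original walkthrough) regexp_replace / regexp_extract
#cover most regex needs; "Description" is the same column used above
from pyspark.sql.functions import regexp_replace, regexp_extract
df.select(regexp_replace(col("Description"), "WHITE|BLACK", "COLOR").alias("color_clean"),
          regexp_extract(col("Description"), "(WHITE|BLACK)", 1).alias("first_color")).show(5)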

#working with dates, timestamps
from pyspark.sql.functions import current_date, current_timestamp, date_add, date_sub, datediff, months_between, to_date, to_timestamp

dateDF = spark.range(10).withColumn("today", current_date()).withColumn(
    "now", current_timestamp())
dateDF.show()

dateDF.select(
    date_add(col("today"), 5).alias("today+5"),
    date_sub(col("today"), 5).alias("today-5")).show()
Code Example #10
File: batch_process.py  Project: jg4821/travel_safe
    def calculate_score(self):
        # transform and filter data
        mentions_df = self.spark.read.parquet(self.mentions_path).select(
            'GLOBALEVENTID', 'MentionTimeDate', 'MentionIdentifier',
            'Confidence')
        gkg_df = self.spark.read.parquet(self.gkg_path).select(
            'DocumentIdentifier', 'Date', 'V2Tone')

        # filter rows on mention date in 2019
        mentions_df = mentions_df.filter(
            mentions_df.MentionTimeDate.like('2019%'))
        gkg_df = gkg_df.filter(gkg_df.Date.like('2019%'))
        gkg_df = gkg_df.drop('Date')

        # type casting for mentions and gkg df
        mentions_df = mentions_df.withColumn(
            'GLOBALEVENTID', mentions_df.GLOBALEVENTID.cast('INT'))
        mentions_df = mentions_df.withColumn(
            'Confidence', mentions_df.Confidence.cast('INT'))
        mentions_df = mentions_df.withColumn(
            'mDate',
            F.to_date(mentions_df.MentionTimeDate,
                      format='yyyyMMddHHmmss')).drop('MentionTimeDate')
        mentions_df.printSchema()
        print(mentions_df.first())

        gkg_df = gkg_df.withColumn(
            'Tone',
            F.split(gkg_df.V2Tone, ',')[0].cast('FLOAT')).drop('V2Tone')
        gkg_df.printSchema()
        print(gkg_df.first())

        # register the DataFrame as a SQL temporary view
        mentions_df.createOrReplaceTempView('mentions_table')
        gkg_df.createOrReplaceTempView('gkg_table')

        # run sql query on 3 tables to calculate safety_score
        temp_df = self.spark.sql(
            'SELECT GLOBALEVENTID, mDate, avg(Confidence*0.01*Tone) as sentiment, count(*) as numOfMentions \
                                FROM mentions_table inner join gkg_table on mentions_table.MentionIdentifier = gkg_table.DocumentIdentifier \
                                GROUP BY GLOBALEVENTID, mDate')

        temp_df.explain()
        temp_df.printSchema()
        print(temp_df.first())

        temp_df.createOrReplaceTempView('temp_table')

        # clear cache of mentions and gkg df & table, read in event data
        self.spark.catalog.dropTempView('mentions_table')
        self.spark.catalog.dropTempView('gkg_table')
        mentions_df.unpersist()
        gkg_df.unpersist()

        # load event data and perform join and aggregation
        event_df = self.spark.read.parquet(self.event_path).select(
            'GLOBALEVENTID', 'GoldsteinScale', 'ActionGeo_FullName')
        event_df = event_df.withColumn('GLOBALEVENTID',
                                       event_df.GLOBALEVENTID.cast('INT'))
        event_df = event_df.withColumn('GoldsteinScale',
                                       event_df.GoldsteinScale.cast('FLOAT'))
        event_df = event_df.withColumn(
            'country',
            F.rtrim(F.ltrim(F.split(event_df.ActionGeo_FullName, ',')[2])))
        event_df = event_df.withColumn(
            'state',
            F.rtrim(F.ltrim(F.split(event_df.ActionGeo_FullName, ',')[1])))
        event_df = event_df.withColumn(
            'city',
            F.rtrim(F.ltrim(F.split(event_df.ActionGeo_FullName,
                                    ',')[0]))).drop('ActionGeo_FullName')
        event_df.printSchema()
        print(event_df.first())

        event_df.createOrReplaceTempView('event_table')

        # compute final safety score
        result_df = self.spark.sql(
            'SELECT event_table.GLOBALEVENTID, mDate, 0.5*(GoldsteinScale*10+temp_table.sentiment) as SafetyScore, numOfMentions, \
                                    country, state, city \
                            FROM event_table inner join temp_table on event_table.GLOBALEVENTID = temp_table.GLOBALEVENTID'
        )

        result_df.explain()
        result_df.printSchema()
        print(result_df.first())

        # free up memory and disk
        self.spark.catalog.dropTempView('temp_table')
        self.spark.catalog.dropTempView('event_table')
        temp_df.unpersist()
        event_df.unpersist()

        return result_df
Code Example #11
        for i in r_body:
            if(i != " "):
                reformat_list = [rid, i]
                final_result.append(reformat_list)
    return final_result

#Preprocess the rdd from stage 3 for both positive and negative
music_p_small_preprocess = music_p_small_rdd.mapPartitions(review_encode_preprocess).cache()
# music_p_small_preprocess.take(2)
music_n_small_preprocess = music_n_small_rdd.mapPartitions(review_encode_preprocess).cache()
# music_n_small_preprocess.take(2)

#Re-format the preprocessed rdd into a formatted dataframe
music_p_preprocess_reformat = spark.createDataFrame(music_p_small_preprocess)
music_p_preprocess_reformat = music_p_preprocess_reformat.withColumnRenamed('_1', 'review_id').withColumnRenamed('_2', 'review_body')
music_p_preprocess_reformat = music_p_preprocess_reformat.withColumn('review_body', f.ltrim(music_p_preprocess_reformat.review_body))
music_p_preprocess_reformat = music_p_preprocess_reformat.withColumn('review_body', f.rtrim(music_p_preprocess_reformat.review_body))
# music_p_preprocess_reformat.show(5)
music_n_preprocess_reformat = spark.createDataFrame(music_n_small_preprocess)
music_n_preprocess_reformat = music_n_preprocess_reformat.withColumnRenamed('_1', 'review_id').withColumnRenamed('_2', 'review_body')
music_n_preprocess_reformat = music_n_preprocess_reformat.withColumn('review_body', f.ltrim(music_n_preprocess_reformat.review_body))
music_n_preprocess_reformat = music_n_preprocess_reformat.withColumn('review_body', f.rtrim(music_n_preprocess_reformat.review_body))
# music_n_preprocess_reformat.show(5)

#Doing tokenizer with regex to separate every word in review body and filter if empty list
regexTokenizer = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'review_body', outputCol = 'review_token')
music_p_preprocess_reformat_token = regexTokenizer.transform(music_p_preprocess_reformat)
music_p_preprocess_reformat_token_filter = music_p_preprocess_reformat_token.filter(f.size('review_token') > 1)
# music_p_preprocess_reformat_token_filter.show(5)
music_n_preprocess_reformat_token = regexTokenizer.transform(music_n_preprocess_reformat)
music_n_preprocess_reformat_token_filter = music_n_preprocess_reformat_token.filter(f.size('review_token') > 1)
Code Example #12
df.printSchema()
df.show(3)
print(df.columns)
#header = df.first()
#print header
#header.show()

#print([str.strip(column) for column in df.columns])
#print(map(str.strip,df.columns))
#df.withColumnRenamed(" DATE1",str.strip(" DATE1")).columns
df_strip_spaces = df.toDF(*map(str.strip, df.columns))

df_strip_spaces.show(3)
print(df_strip_spaces.columns)

df_strip_spaces.select(ltrim(df_strip_spaces["SYMBOL"])).show()
df_strip_spaces.select(lower(df_strip_spaces["SYMBOL"])).show()
df_strip_spaces.select(upper(df_strip_spaces["SYMBOL"])).show()
df_strip_spaces.select(lpad(df_strip_spaces["SYMBOL"], 20, '0')).show()
df_strip_spaces.DATE1
#df_strip_spaces.select(to_date(df_strip_spaces.DATE1),'dd-mmm-yyyy').show()

df_strip_spaces.select(col('SYMBOL')).show(3)
#filtering,selection
df_strip_spaces.select('SYMBOL').show()
df_strip_spaces.select(col('SYMBOL')).show()

df.where(col("SYMBOL").startswith("KOTAK")).show()
df.where("SYMBOL like '%BANK%'").show()
df_strip_spaces.where(col('SYMBOL').like('KOTAKBANK') | col('SYMBOL').like("%YES%")) \
                .select('SYMBOL','DATE1','OPEN_PRICE','CLOSE_PRICE').show(3)
Code Example #13
	StructField("country", StringType(), True),
	StructField("salary", StringType(), True)]);
	df = spark.read.csv(filepath, schema=strct);


	dfregex1 = df.select(df.workclass, df.finalweight, df.education, df.educationnum, df.maritalstatus, df.occupation, df.relationship
		, df.race , df.gender, df.capitalgain, df.capitalloss, df.hoursperweek, df.country, regexp_replace(df.salary, '(<=50K)', '50').alias('salary'));


	dfregex = dfregex1.select(dfregex1.workclass, dfregex1.finalweight, dfregex1.education, dfregex1.educationnum, dfregex1.maritalstatus, dfregex1.occupation
		, dfregex1.relationship, dfregex1.race , dfregex1.gender, dfregex1.capitalgain, dfregex1.capitalloss, dfregex1.hoursperweek, dfregex1.country
		, regexp_replace(dfregex1.salary, '(>50K)', '51').alias('salary'));

	dfbtrim = dfregex1.select(dfregex1.workclass, dfregex1.finalweight, dfregex1.education, dfregex1.educationnum, dfregex1.maritalstatus
		, dfregex1.occupation, dfregex1.relationship, dfregex1.race , dfregex1.gender, dfregex1.capitalgain, dfregex1.capitalloss
		, dfregex1.hoursperweek, dfregex1.country, ltrim(rtrim(dfregex1.salary)).alias('salary') );


	dfcast = dfbtrim.select(dfbtrim.workclass, dfbtrim.finalweight, dfbtrim.education, dfbtrim.educationnum, dfbtrim.maritalstatus, dfbtrim.occupation
		, dfbtrim.relationship, dfbtrim.race , dfbtrim.gender, dfbtrim.capitalgain, dfbtrim.capitalloss, dfbtrim.hoursperweek, dfbtrim.country
		, dfbtrim.salary.cast(IntegerType()).alias('intSal'));

	dfcast.createOrReplaceTempView("employees");

	query = "select workclass, education, maritalstatus, occupation, relationship, hoursperweek, country, avg(intSal) as sal_avg from employees " +
		"group by workclass, education, maritalstatus, occupation, relationship, hoursperweek, country";

	sqldf = spark.sql(query);
	sqldf.dropna().show(10);

	spark.stop();
Code Example #14
        Person("Pratik", "Solanki", 22, 176.7, None),
        Person("Ashok ", "Pradhan", 62, None, None),
        Person(" ashok", "Pradhan", 42, 125.3, "Chemical Engineer"),
        Person("Pratik", "Solanki", 22, 222.2, "Teacher")
    ])

    people_df.show()
    people_df.groupBy("firstName").agg(first("weightInLbs")).show()
    people_df.groupBy(trim(lower(col('firstName')))).agg(first("weightInLbs")).show()
    people_df.groupBy(trim(lower(col("firstName")))).agg(first("weightInLbs", True)).show()
    people_df.sort(col("weightInLbs").desc()).groupBy(trim(lower(col("firstName")))).agg(first("weightInLbs", True)).show()
    people_df.sort(col("weightInLbs").asc_nulls_last()).groupBy(trim(lower(col("firstName")))).agg(first("weightInLbs", True)).show()

    corrected_people_df = people_df\
        .withColumn("firstName", initcap("firstName"))\
        .withColumn("firstName", ltrim(initcap("firstName")))\
        .withColumn("firstName", trim(initcap("firstName")))\

    corrected_people_df.groupBy("firstName").agg(first("weightInLbs")).show()

    corrected_people_df = corrected_people_df\
        .withColumn("fullName", format_string("%s %s", "firstName", "lastName"))\

    corrected_people_df.show()

    corrected_people_df = corrected_people_df\
        .withColumn("weightInLbs", coalesce("weightInLbs", lit(0)))\

    corrected_people_df.show()

    corrected_people_df\
Code Example #15
File: 6-chapter.py  Project: tarasowski/apache-spark
df.select(col('Description'),
        lower(col('Description')),
        upper(col('Description'))).show(2)

df.selectExpr(
        'Description',
        'lower(Description)',
        'upper(lower(Description))').show(2)

# select description, lower(Description), upper(lower(Description)) from dfTable


from pyspark.sql.functions import ltrim, rtrim, rpad, lpad, trim

df.select(
        ltrim(lit('         HELLO           ')).alias('ltrim'),
        rtrim(lit('         HELLO           ')).alias('rtrim'),
        trim(lit('         HELLO           ')).alias('trim'),
        lpad(lit('HELLO'), 3, ' ').alias('lp'),
        rpad(lit('HELLO'), 10, ' ').alias('rp')).show(2)

df.selectExpr(
        'ltrim(         "HELLO"           ) as ltrim',
        'rtrim(         "HELLO"           ) as rtrim',
        'trim(         "HELLO"           )as trim',
        'lpad("HELLO", 3, " ") as lp',
        'rpad("HELLO", 3, " ")as rp').show(2)

# select 
#   ltrim('     HELLO       '),
#   rtrim('     HELLO       '),
Code Example #16
    "comment_text",
    F.regexp_replace(F.col("comment_text"), "[\$#,?."
                     "!@#$%^&*()0123456789:-=\+]", ""))
snewdf = snewdf.withColumn('comment_text',
                           F.regexp_replace(F.col("comment_text"), "\"", ""))
snewdf = snewdf.withColumn('comment_text',
                           F.regexp_replace(F.col("comment_text"), "\n", " "))
snewdf = snewdf.withColumn('comment_text',
                           F.regexp_replace(F.col("comment_text"), "\[", ""))
snewdf = snewdf.withColumn('comment_text',
                           F.regexp_replace(F.col("comment_text"), "\]", ""))
snewdf = snewdf.withColumn('comment_text',
                           F.regexp_replace(F.col("comment_text"), "\"+", ""))
snewdf = snewdf.withColumn('comment_text', F.lower(F.col('comment_text')))
snewdf = snewdf.withColumn('comment_text', F.rtrim(snewdf.comment_text))
snewdf = snewdf.withColumn('comment_text', F.ltrim(snewdf.comment_text))

from pyspark.ml.feature import Tokenizer, StopWordsRemover

tokenizer = Tokenizer(inputCol="comment_text", outputCol="tokenized")
tokenized_df = tokenizer.transform(snewdf)
tokenized_df.select("tokenized").show()

stopwordsremoved = StopWordsRemover(inputCol="tokenized",
                                    outputCol="comment_txt")
swr_df = stopwordsremoved.transform(tokenized_df)
swr_df.select("comment_txt").show()

from pyspark.ml.feature import HashingTF, IDF

hashingTF = HashingTF().setNumFeatures(50).setInputCol(
Code Example #17
#data load
df = spark.read.format("csv")\
  .option("header","true")\
  .option("inferSchema","true")\
  .load('/databricks-datasets/definitive-guide/data/retail-data/by-day/2010-12-01.csv')
#check the data schema
df.printSchema()

#initcap: splits the given string on whitespace and capitalizes the first letter of each word
df.select(initcap(col("Description"))).show(2, False)
#lower // upper
df.select(lower(col("StockCode"))).show(2)
#add and remove whitespace (lit, ltrim, rtrim, rpad, lpad, trim)
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit("   HELLO   ")).alias("ltrim"),
    rtrim(lit("   HELLO   ")).alias("rtrim"),
    trim(lit("   HELLO   ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lpad"),
    rpad(lit("HELLP"), 10, " ").alias("rpad")).show(2)

##regular expressions
#replace color names in the Description column with the literal COLOR
from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
    regexp_replace(col("Description"), regex_string,
                   "COLOR").alias("color_clean"), col("Description")).show(2)

#translate given characters into other characters
from pyspark.sql.functions import translate
Code Example #18
File: bnbclean.py  Project: poojk/BnB-Pay
df = spark.read \
        .format("jdbc") \
        .option("url", "jdbc:postgresql://10.0.0.8:5432/my_db") \
        .option("dbtable", "airbnb") \
        .option("user", "test") \
        .option("password", "test") \
        .option("driver", "org.postgresql.Driver") \
        .load()

df = df.withColumn('bedrooms', F.round(df['bedrooms'], 0))
df = df.filter(~F.col('city').contains("/"))
df = df.filter(~F.col('city').contains(","))
df = df.filter(~F.col('city').contains("-"))
df = df.filter(~F.col('city').contains("^[0-9]*$"))
df = df.filter(~F.col('city').contains("*"))
df = df.withColumn('city', F.ltrim(df.city))
df = df.withColumn("city", F.initcap(F.col("city")))
df = df.filter(~df.city.rlike("[ ,;{}()\n\t=]"))
df = df.filter(~df.city.rlike("[^0-9A-Za-z]"))
df = df.filter(~F.col('city').contains("("))
df = df.groupBy('city', 'bedrooms').agg(
    F.avg('average').alias('average'), F.first('state'))
df = df.withColumnRenamed('first(state)', 'state')
df = df.sort('city')
df1 = df.withColumn('average', F.round(df['average'], 0))

df1.write \
        .format("jdbc") \
        .option("url", "jdbc:postgresql://10.0.0.8:5432/my_db") \
        .option("dbtable", "bnbclean") \
        .option("user", "test") \
Code Example #19
File: validate.py  Project: josephRog/deepframe_ntnu
def Validate(ngrams \
			, sampleSizes \
			, ctxSize \
			, sqc \
			, seqs \
			, outFile \
			, minval \
			, maxval \
			, avg \
			, nlines):

	accuracy = []
	gramSize = GramSize(ctxSize, lookahead)

	c1 = (((maxval - minval) * 1.0) / nlines) / avg
	c2 = ((minval * 1.0) / nlines) / avg
	print(seqs.count())
				


	ngrams = ngrams.repartition(1 << nPartLog)
	ngrams.cache()

	#we will validate separately for each vector size
	for vecSize in vecSizes:
		print('======TESTING FOR VECTOR SIZE', vecSize)
		#start fresh
		old_ngrams = ngrams
		ngrams = ngrams.withColumn('correct', lit(0))



		#use models from each sample
		modelId = 0
		for sampleSize in sampleSizes:

			w2v = Word2VecModel.load(w2vFile(outDir, ctxSize, sampleSize, vecSize))
			lrmodels = []
			for dim in range(0, vecSize):
				lrmodels.append(LinearRegressionModel.load(lrmFile(outDir, ctxSize, sampleSize, vecSize, dim)))

			success = 0
			fail = 0
			unopt = 0

			#add columns to store model success and failure
			modelSucc = 'succ_' + str(modelId)
			modelFail = 'fail_' + str(modelId)
			modelUnopt = 'unopt_' + str(modelId)
			seqs = seqs.withColumn(modelSucc, lit(0)) \
						.withColumn(modelFail, lit(0)) \
						.withColumn(modelUnopt, lit(0))
			modelId = modelId + 1



			ngrams = ngrams \
				.withColumn('predSeq', lit(''))

			#create initial feature vector
			#transform each word into a cluster center
			words, d, centers = ClusterWords(w2v \
											, seqs \
											)
		
			#record correctness for this model only
			old_ngrams = ngrams
			ngrams = ngrams.withColumn('sample_correct', lit(0)).withColumn('sample_confi', lit(1.0))

			for nextPos in range(0,lookahead):
				#build the feature vector
				ngrams = BuildSubstringFeature(ngrams, w2v, nextPos, nextPos + ctxSize, ctxSize, lookahead,)

				#build the prediction vector
				ngrams = BuildPredictionVector(ngrams, lrmodels, ctxSize, vecSize)


			

				#now assign a cluster id to each prediction vector
				old_ngrams = ngrams
				ngrams = centers.transform(ngrams).withColumnRenamed('cluster', 'predWord').withColumnRenamed('vector', 'predictionVector')
				
				
				#get the predicted word
				ngrams = ngrams.join(broadcast(words), words.cluster == ngrams.predWord, 'inner') \
								.drop('cluster') #\

				#calculate the cosine similarity between prediction vector and center vector 
				epsilon = 0.0001
				def CosineSimi (v1, v2):
					d1 = DenseVector(v1)
					d2 = DenseVector(v2)
					n1 = d1.norm(2)
					n2 = d2.norm(2)
					return float(d1.dot(d2) / (n1 * n2))
				cossim = udf(lambda v1, v2: CosineSimi(v1, v2), DoubleType())
				ngrams = ngrams.withColumn('simi', cossim('centerVector', 'predictionVector'))
				ngrams = ngrams.drop('centerVector').drop('predictionVector')


				#update predicted sequence
				ngrams = ngrams.withColumn('predSeq', concat_ws(' ', 'predSeq', 'word')) 
				ngrams = ngrams.withColumn('predSeq', ltrim(ngrams.predSeq))


				#get actual sequence
				ngrams = CreateSubstring(ngrams, 'sentence', 'actualSeq', gramSize, ' ', ctxSize, ctxSize + nextPos + 1)


				#now get the cluster id for the predicted word in the sentence
				ngrams = BuildLabelVector(ngrams, w2v, ctxSize, lookahead, nextPos).withColumnRenamed('labelVec', 'vector').drop('ngrams')
				ngrams = centers.transform(ngrams).drop('vector')

				#and host latency for actual word
				ngrams = ngrams.join(broadcast(words), 'cluster', 'inner') \
						.drop('word') \
						.drop('centerVector') #\
				
				
			
				#record correctness
				ngrams = ngrams.withColumn('round_correct', when((ngrams.predWord != ngrams.cluster) | (ngrams.simi < confidence), 0).otherwise(nextPos + 1)).drop('predWord').drop('cluster')
				ngrams = ngrams.withColumn('sample_correct', when(ngrams.sample_correct + 1 == ngrams.round_correct, ngrams.round_correct).otherwise(ngrams.sample_correct)) 




				#get overall correctness
				ngrams = ngrams.withColumn('correct', greatest('sample_correct', 'correct'))

				#get binary correctness
				ngrams = ngrams.withColumn('binary_correct', when(ngrams.correct >= nextPos + 1, 1).otherwise(0))
				ngrams = ngrams.withColumn('sample_confi', when(ngrams.binary_correct == 1, 1.0).otherwise(least(ngrams.simi, ngrams.sample_confi)))
				ngrams = ngrams.withColumn('simi', when(ngrams.binary_correct == 1, ngrams.simi).otherwise(ngrams.sample_confi))


				ngrams = ngrams.withColumn('predSeq', when((ngrams.binary_correct == 1) | (ngrams.simi < confidence), ngrams.actualSeq).otherwise(ngrams.predSeq))
				ngrams = ngrams.withColumn('succ_wt', when(ngrams.binary_correct == 1, ngrams.wt).otherwise(0))
				ngrams = ngrams.withColumn('fail_wt', when((ngrams.binary_correct == 1) | (ngrams.simi < confidence), 0).otherwise(ngrams.wt))
				ngrams = ngrams.withColumn('unopt_wt', when((ngrams.binary_correct == 0) & (ngrams.simi < confidence), ngrams.wt).otherwise(0))
				ngrams = ngrams.drop('simi')

				#now summarize success and failure rates by predicted sequence
				seqWts = ngrams.groupBy('predSeq').agg(sum('succ_wt').alias('succ_wt'), sum('fail_wt').alias('fail_wt'), sum('unopt_wt').alias('unopt_wt'))

				#update sequences table
				seqs = seqWts.join(broadcast(seqs), seqWts.predSeq==seqs.word, 'right_outer').drop('predSeq').fillna(-c2/c1, ['succ_wt', 'fail_wt', 'unopt_wt'])


				scaleback = udf(lambda s: float(s*c1 + c2), DoubleType())
				seqs = seqs.withColumn(modelSucc, col(modelSucc) + scaleback(seqs.succ_wt)).drop('succ_wt')
				seqs = seqs.withColumn(modelFail, col(modelFail) + scaleback(seqs.fail_wt)).drop('fail_wt')
				seqs = seqs.withColumn(modelUnopt, col(modelUnopt) + scaleback(seqs.unopt_wt)).drop('unopt_wt')
				seqs.cache()

				aggregated = seqs.agg(sum(modelSucc), sum(modelFail), sum(modelUnopt))
				aggregated.cache()
				new_success = aggregated.head()['sum(' + modelSucc + ')']
				new_fail = aggregated.head()['sum(' + modelFail + ')']
				new_unopt = aggregated.head()['sum(' + modelUnopt + ')']
				print(nextPos, new_success - success, new_fail - fail, new_unopt - unopt)
				success = new_success
				fail = new_fail
				unopt = new_unopt


		#end for testing for each model for a particular vector size

	#end for each vector size


	seqs.orderBy('succ_0', ascending=False).write.mode('overwrite').csv(outputFile(outDir, ctxSize, vecSize, sampleSizes))


	return accuracy
Code Example #20
def fix_null(x):
    return F.when(
        F.col(x).isNotNull() & (F.lower(F.col(x)) != "null") &
        (F.ltrim(F.col(x)) != ""), F.col(x)).otherwise(None)
df.select(initcap(col("Description"))).show()


# COMMAND ----------

from pyspark.sql.functions import lower, upper
df.select(col("Description"),
    lower(col("Description")),
    upper(lower(col("Description")))).show(2)


# COMMAND ----------

from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit("    HELLO    ")).alias("ltrim"),
    rtrim(lit("    HELLO    ")).alias("rtrim"),
    trim(lit("    HELLO    ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)


# COMMAND ----------

from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
  regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
  col("Description")).show(2)

Code Example #22
    def lsg_omni(self):
        start_date, end_date = date_period(self.period, self.start_date)

        table_name = 'datalake_omni.omni_hit_data'
        dt_col_name = 'hit_time_gmt_dt_key'
        _, bound_end_date = date_period(-1, end_date)
        bound_date_check(table_name, dt_col_name, start_date, bound_end_date,
                         self.env, 'YYYYMMDD', 'LSG')

        query = 'SELECT ' \
                'VS.visit_session_key AS session_key, ' \
                'HIT.post_visid_combined AS visit_id, ' \
                'HIT.visit_return_count AS visit_number, ' \
                'UPPER(TRIM(prod_list)) AS prod_list, ' \
                'HIT.hit_time_gmt_ts AS time_stamp, ' \
                "TRIM(SUBSTRING(TRIM(DEMANDBASE), 0, POSITION('|' IN TRIM(DEMANDBASE)))) AS " \
                "account_no " \
                'FROM datalake_omni.omni_hit_data HIT ' \
                'LEFT JOIN CDWDS.D_OMNI_VISIT_SESSION VS ON ' \
                '  VS.VISIT_RETURN_COUNT=HIT.VISIT_RETURN_COUNT AND VS.POST_VISID_COMBINED=HIT.POST_VISID_COMBINED ' \
                f'WHERE HIT.hit_time_gmt_dt_key<{start_date} AND HIT.hit_time_gmt_dt_key>={end_date} ' \
                'AND HIT.post_visid_combined IS NOT NULL ' \
                "AND prod_list IS NOT NULL AND prod_list NOT LIKE '%shipping-handling%' " \
                "AND TRIM(SUBSTRING(TRIM(DEMANDBASE), 0, POSITION('|' IN TRIM(DEMANDBASE)))) <> '' "

        schema = StructType([
            StructField('session_key', IntegerType(), True),
            StructField('visit_id', StringType(), True),
            StructField('visit_number', IntegerType(), True),
            StructField('time_stamp', StringType(), True),
            StructField('prod_list', StringType(), True),
            StructField('account_no', StringType(), True),
        ])

        df = redshift_cdw_read(query, db_type = 'RS', database = 'CDWDS', env = self.env, schema = schema). \
            withColumn('prod_id_untrimmed', explode(split('prod_list', ','))). \
            withColumn('prod_id', ltrim(rtrim(col('prod_id_untrimmed')))). \
            drop('prod_id_untrimmed'). \
            drop('prod_list'). \
            filter(col('prod_id').isNotNull()). \
            filter(col('prod_id') != ''). \
            distinct()

        if self.debug:
            print(f'row count for df = {df.count()}')

        # find active products
        query = 'SELECT	sku as prod_id, stk_type_cd '\
                'FROM cdwds.lsg_prod_v ' \
                "WHERE	stk_type_cd = 'D'"

        discontinued_prods = redshift_cdw_read(query,
                                               db_type='RS',
                                               database='CDWDS',
                                               env=self.env)

        df = df.join(discontinued_prods, ['prod_id'], how = 'left').\
            filter(col('stk_type_cd').isNull()).\
            drop('stk_type_cd')

        if self.debug:
            print(
                f'After filtering out discontinued SKUs, row count for df = {df.count()}'
            )

        query = 'SELECT UPPER(sku_nbr) AS prod_id, size_grp AS coupon ' \
                'FROM cdwds.f_web_prod_feature ' \
                "WHERE size_grp IS NOT NULL AND size_grp <> 'T' " \
                'GROUP BY sku_nbr, size_grp'

        coupons = redshift_cdw_read(query,
                                    db_type='RS',
                                    database='CDWDS',
                                    env=self.env)

        if coupons.count() == 0:
            raise DataValidityError(
                'No coupon information.  Please check the validity of size_grp column '
                'on cdwds.f_web_prod_feature.')

        df = df.join(broadcast(coupons), ['prod_id'], how = 'left').\
            withColumn('coupon', coalesce('coupon', 'prod_id'))

        prod_list = df.select('prod_id').distinct()
        coupons = coupons.union(df.select('prod_id', 'coupon')).\
            filter(col('prod_id').isNotNull()).\
            distinct().\
            withColumn("coupon_key", func.dense_rank().over(Window.orderBy('coupon')))

        df = df.join(coupons, ['prod_id', 'coupon'], how='left')

        if self.debug:
            coupons.show()
            df.show()
            print(
                f'row count for coupons = {coupons.select(col("coupon_key")).distinct().count()}'
            )

        return df, prod_list, coupons
Code Example #23
from pyspark.sql import functions as F
df = spark.read.text("s3://wagal/bigdata/shakespeare.txt")
textLowerDf = df.select(F.lower(F.col("value")).alias("words_lower"))
textSplitDf = textLowerDf.select(
    F.split(F.col("words_lower"), " ").alias("words_split"))
textExplodedDf = textSplitDf.select(
    F.explode(F.col("words_split")).alias("word"))
textExplodedDf = textExplodedDf.where(F.ltrim(F.col("word")) != "")
textExplodedDf = textExplodedDf.select(
    F.regexp_extract(F.col("word"), "[a-z]+", 0).alias("word"))
textWordCounts = textExplodedDf.groupBy("word").count().orderBy(
    F.col("count").desc())
textWordCounts.show()
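#(added note, not in the original) words with no a-z characters extract to an empty
#string above; that row could be dropped from the counts, e.g.:
textWordCounts.filter(F.col("word") != "").show()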
Code Example #24
def compile_lstrip(t, expr, scope, **kwargs):
    op = expr.op()

    src_column = t.translate(op.arg, scope)
    return F.ltrim(src_column)
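A minimal sketch of the column-level equivalence this translation rule targets (the SparkSession and the toy frame are assumptions; the surrounding compiler plumbing is omitted):

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("  abc",)], ["s"])
df.select(F.ltrim("s").alias("lstripped")).show()  # "  abc" -> "abc"

Note that Spark's ltrim strips only the plain space character, unlike Python's str.lstrip, which strips all whitespace by default.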
Code Example #25
def trimStrings(dataframe, columns):
    for col in columns:
        dataframe = dataframe.withColumn(col, F.ltrim(F.rtrim(dataframe[col])))
    return dataframe
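A brief usage sketch for the helper above (the SparkSession, the F alias, and the sample data are assumptions):

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(" a ", " b ")], ["x", "y"])
trimStrings(df, ["x", "y"]).show()  # both columns trimmed on each side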